JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
parse comments (13 passing tests)
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # it is talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50 # browser
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
# When no `module.exports` is available (the browser case named above), create
# `window.wheic` and point a stand-in `module.exports` at it.
unless module?.exports?
        window.wheic = {}
        module = exports: window.wheic
56
# Each node is an object of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
TYPE_COMMENT = 2
TYPE_DOCTYPE = 3
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_END_TAG = 5 # name
TYPE_EOF = 6
TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm

# namespace constants (the values stored in a Node's `namespace` property)
NS_HTML = 1
NS_MATHML = 2
NS_SVG = 3
73
# In-memory buffer of debug messages, in the order they were logged.
g_debug_log = []

# Throw away everything logged so far by swapping in a fresh buffer.
debug_log_reset = -> g_debug_log = []

# Append one message to the buffer.
debug_log = (str) -> g_debug_log.push str

# Call `cb` once per buffered message, oldest first.
debug_log_each = (cb) -> (cb entry for entry in g_debug_log)
82
# Counter behind the auto-assigned Node ids.
prev_node_id = 0

# One class is used for tree nodes, for the tokens emitted by the tokenizer and
# for the marker/bookmark placeholders; `type` (a TYPE_* constant) tells them
# apart.
class Node
        # type: one of the TYPE_* constants
        # args: optional fields -- name, text, attrs, attrs_a, children,
        #       namespace, parent, token, id
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                # attrs in progress, TYPE_START_TAG only. Read the key that matches
                # the property name; `attr_k` (the key this constructor used to
                # read) is still honoured so existing callers keep working.
                @attrs_a = args.attrs_a ? args.attr_k ? []
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
                @token = args.token ? null
                if args.id?
                        # an id was supplied (shallow_clone does this): mark the copy with "+"
                        @id = "#{args.id}+"
                else
                        @id = "#{++prev_node_id}"

        # Return a new node that's the same except without the children or parent.
        # The attrs object is copied, so the clone's attributes can be changed
        # independently.
        shallow_clone: ->
                # WARNING this doesn't work right on open tags that are still being parsed
                attrs = {}
                attrs[k] = v for k, v of @attrs
                return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token

        # Record that a self-closing flag was acknowledged, on the originating
        # token when there is one, otherwise on this node.
        acknowledge_self_closing: ->
                if @token?
                        @token.flag 'did_self_close'
                else
                        @flag 'did_self_close', true

        # Placeholder: accepts and ignores its arguments.
        flag: ->
                # fixfull

        # Render this node (and, unless `shallow`, its attributes and children)
        # as a compact string. For unit tests.
        #   shallow:  for tags, stop after the name (and id)
        #   show_ids: for tags, include "#<id>," after the name
        serialize: (shallow = false, show_ids = false) ->
                ret = ''
                switch @type
                        when TYPE_TAG
                                ret += 'tag:'
                                ret += JSON.stringify @name
                                ret += ','
                                if show_ids
                                        ret += "##{@id},"
                                unless shallow
                                        # attributes are sorted by name so the output is stable
                                        attr_keys = (k for k of @attrs).sort()
                                        attr_strs = ("#{JSON.stringify k}:#{JSON.stringify @attrs[k]}" for k in attr_keys)
                                        child_strs = (c.serialize shallow, show_ids for c in @children)
                                        ret += '{' + attr_strs.join(',') + '},[' + child_strs.join(',') + ']'
                        when TYPE_TEXT
                                ret += 'text:'
                                ret += JSON.stringify @text
                        when TYPE_COMMENT
                                ret += 'comment:'
                                ret += JSON.stringify @text
                        when TYPE_DOCTYPE
                                ret += 'doctype'
                                # FIXME
                        when TYPE_AFE_MARKER
                                ret += 'marker'
                        when TYPE_AAA_BOOKMARK
                                ret += 'aaa_bookmark'
                        else
                                ret += 'unknown:'
                                # Log a few scalar fields rather than JSON.stringify(this):
                                # `parent` and `children` can refer back to each other, and
                                # JSON.stringify throws on cyclic structures.
                                console.log "unknown: type=#{@type} id=#{@id} name=#{JSON.stringify @name}"
                return ret
156
# Factory helpers. Each takes only what the parser normally knows at the moment
# it creates the node/token.
new_open_tag = (name) -> new Node TYPE_START_TAG, {name}
new_end_tag = (name) -> new Node TYPE_END_TAG, {name}
new_element = (name) -> new Node TYPE_TAG, {name}
new_text_node = (txt) -> new Node TYPE_TEXT, text: txt
# character tokens and text nodes are built the same way
new_character_token = new_text_node
new_comment_token = (txt) -> new Node TYPE_COMMENT, text: txt
new_eof_token = -> new Node TYPE_EOF
new_afe_marker = -> new Node TYPE_AFE_MARKER
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
175
# Character classes, each spelled out as a string of its member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# some SVG elements have dashes in them, so "-" is allowed in tag names
tag_name_chars = alnum + "-"
184
# The HTML "space characters": tab, LF, FF, CR and space.
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# True when `txt` is exactly one character long and that character is one of
# the space characters.
is_space = (txt) ->
        return false unless txt.length is 1
        return space_chars.indexOf(txt) isnt -1

# Same test for a token: it must be a text token whose text is a single space
# character.
is_space_tok = (t) ->
        return false unless t.type is TYPE_TEXT
        return is_space t.text
191
# Unicode whitespace characters (a wider set than space_chars)
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"

# These are the character references that don't need a terminating semicolon
# min length: 2, max: 6, none are a prefix of any other.
# Keys are the reference names (no "&", no ";"); values are the decoded text.
legacy_char_refs = {
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
        aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
        aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
        curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
        ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
        euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
        igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
        lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
        Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
        Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
        Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
        pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
        shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
        times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
        ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
        yen: '¥', yuml: 'ÿ'
}
217
# Element-name lists, grouped by how the element's content is treated.
void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
raw_text_elements = ['script', 'style']
escapable_raw_text_elements = ['textarea', 'title']
# SVG element names (case-sensitive, e.g. 'foreignObject')
# http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
svg_elements = [
        'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
        'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
        'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
        'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
        'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
        'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
        'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
        'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
        'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
        'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
        'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
        'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
        'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
        'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
        'view', 'vkern'
]
239
# MathML element names
# http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# NOTE(review): 'mi' is listed twice below; harmless for membership tests.
mathml_elements = [
        'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
        'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
        'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
        'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
        'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
        'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
        'determinant', 'diff', 'divergence', 'divide', 'domain',
        'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
        'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
        'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
        'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
        'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
        'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
        'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
        'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
        'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
        'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
        'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
        'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
        'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
        'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
        'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
        'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
        'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
        'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
        'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
        'vectorproduct', 'xor'
]
271 # foreign_elements = [svg_elements..., mathml_elements...]
272 #normal_elements = All other allowed HTML elements are normal elements.
273
# Elements in the "special" category: tag name -> the namespace in which that
# name is special.
special_elements = {
        # HTML:
        address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
        aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
        blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
        caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
        details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
        embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
        footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
        h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
        header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
        img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
        listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
        noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
        ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
        script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
        style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
        template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
        thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
        wbr:NS_HTML, xmp:NS_HTML,

        # MathML:
        mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
        'annotation-xml':NS_MATHML,

        # SVG:
        # NOTE(review): `title` is also keyed in the HTML group above. An object
        # keeps only the last value for a repeated key, so after this line
        # special_elements.title is NS_SVG and the HTML entry is lost.
        foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
}
302
# Tag names of the formatting elements (used as a set: name -> true)
formatting_elements = {
         a: true, b: true, big: true, code: true, em: true, font: true, i: true,
         nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
         u: true
}

# Tag names that act as foster parenting targets (used as a set: name -> true)
foster_parenting_targets = {
        table: true
        tbody: true
        tfoot: true
        thead: true
        tr: true
}

# Tag names whose end tag can be implied (used as a set: name -> true).
# all html I presume
end_tag_implied = {
        dd: true
        dt: true
        li: true
        option: true
        optgroup: true
        p: true
        rb: true
        rp: true
        rt: true
        rtc: true
}
330
# True when `e` (any object with `name` and `namespace`) belongs to the
# "special" category for its own namespace.
el_is_special = (e) ->
        # `title` is special in both the HTML and the SVG namespace, but
        # special_elements maps each name to a single namespace (a repeated key
        # keeps only its last value, which is the SVG one), so the HTML case has
        # to be recognised explicitly.
        return true if e.name is 'title' and e.namespace is NS_HTML
        return special_elements[e.name] is e.namespace
333
334 # decode_named_char_ref()
335 #
336 # The list of named character references is _huge_ so ask the browser to decode
337 # for us instead of wasting bandwidth/space on including the table here.
338 #
339 # Pass without the "&" but with the ";" examples:
340 #    for "&amp" pass "amp;"
341 #    for "&#x2032" pass "x2032;"
# Shared state for decode_named_char_ref(): a scratch <textarea> the browser
# decodes into, and a cache of "&name;" -> decoded text.
g_dncr =
        cache: {}
        textarea: document.createElement('textarea')

# TODO test this in IE8
# Decode one character reference. `txt` comes without the leading "&" but with
# the trailing ";". Returns the decoded text, or null when the browser hands
# the input back unchanged (i.e. it is not a known reference).
decode_named_char_ref = (txt) ->
        txt = "&#{txt}"
        cached = g_dncr.cache[txt]
        if cached?
                return cached
        scratch = g_dncr.textarea
        scratch.innerHTML = txt
        result = scratch.value
        if result is txt
                return null
        g_dncr.cache[txt] = result
        return result
355
356 parse_html = (txt, parse_error_cb = null) ->
        # Parser state shared by the nested helper functions of parse_html.
        cur = 0 # index of next char in txt to be parsed
        # declare doc and tokenizer variables so they're in scope below
        doc = null
        open_els = null # stack of open elements
        afe = null # active formatting elements
        template_insertion_modes = null
        insertion_mode = null
        original_insertion_mode = null
        tok_state = null
        tok_cur_tag = null # partially parsed tag
        flag_scripting = null
        flag_frameset_ok = null
        flag_parsing = null # set to false by stop_parsing()
        flag_foster_parenting = null
        form_element_pointer = null
        temporary_buffer = null
        pending_table_character_tokens = null
        head_element_pointer = null
        flag_fragment_parsing = null
        context_element = null # returned by adjusted_current_node() in the fragment case
377
        # Clear the flag_parsing flag.
        stop_parsing = ->
                flag_parsing = false
380
381         parse_error = ->
382                 if parse_error_cb?
383                         parse_error_cb cur
384                 else
385                         console.log "Parse error at character #{cur} of #{txt.length}"
386
387         afe_push = (new_el) ->
388                 matches = 0
389                 for el, i in afe
390                         if el.name is new_el.name and el.namespace is new_el.namespace
391                                 for k, v of el.attrs
392                                         continue unless new_el.attrs[k] is v
393                                 for k, v of new_el.attrs
394                                         continue unless el.attrs[k] is v
395                                 matches += 1
396                                 if matches is 3
397                                         afe.splice i, 1
398                                         break
399                 afe.unshift new_el
        # Put a new marker at the front (index 0) of the active formatting
        # elements list.
        afe_push_marker = ->
                afe.unshift new_afe_marker()
402
403         # the functions below implement the Tree Construction algorithm
404         # http://www.w3.org/TR/html5/syntax.html#tree-construction
405
406         # But first... the helpers
407         template_tag_is_open = ->
408                 for t in open_els
409                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
410                                 return true
411                 return false
412         is_in_scope_x = (tag_name, scope, namespace) ->
413                 for t in open_els
414                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
415                                 return true
416                         if scope[t.name] is t.namespace
417                                 return false
418                 return false
419         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
420                 for t in open_els
421                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
422                                 return true
423                         if scope[t.name] is t.namespace
424                                 return false
425                         if scope2[t.name] is t.namespace
426                                 return false
427                 return false
        # Scope-boundary tables: tag name -> namespace of the element that ends
        # a scope search.
        # NOTE(review): the lookups compare each entry against the element's
        # namespace, so the FIXME below may be stale -- confirm.
        standard_scopers = { # FIXME these are supposed to be namespace specific
                applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
                td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
                template: NS_HTML, mi: NS_MATHML,

                mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
                'annotation-xml': NS_MATHML,

                foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
        }
        button_scopers = button: NS_HTML # extra boundary for button scope
        li_scopers = ol: NS_HTML, ul: NS_HTML
        table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML # table scope uses these alone
        # Scope checks by tag name. `namespace` is optional; null (the default)
        # matches an element of that name in any namespace.

        # scope bounded by standard_scopers
        is_in_scope = (tag_name, namespace = null) ->
                return is_in_scope_x tag_name, standard_scopers, namespace
        # scope bounded by standard_scopers plus button_scopers
        is_in_button_scope = (tag_name, namespace = null) ->
                return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
        # scope bounded by table_scopers only
        is_in_table_scope = (tag_name, namespace = null) ->
                return is_in_scope_x tag_name, table_scopers, namespace
447         is_in_select_scope = (tag_name, namespace = null) ->
448                 for t in open_els
449                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
450                                 return true
451                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
452                                 return false
453                 return false
454         # this checks for a particular element, not by name
455         el_is_in_scope = (el) ->
456                 for t in open_els
457                         if t is el
458                                 return true
459                         if standard_scopers[t.name] is t.namespace
460                                 return false
461                 return false
462
463         clear_to_table_stopers = {
464                 'table': true
465                 'template': true
466                 'html': true
467         }
468         clear_stack_to_table_context = ->
469                 loop
470                         if clear_to_table_stopers[open_els[0].name]?
471                                 break
472                         open_els.shift()
473                 return
474         clear_to_table_body_stopers = {
475                 'tbody': true
476                 'tfoot': true
477                 'thead': true
478                 'template': true
479                 'html': true
480         }
481         clear_stack_to_table_body_context = ->
482                 loop
483                         if clear_to_table_body_stopers[open_els[0].name]?
484                                 break
485                         open_els.shift()
486                 return
487         clear_to_table_row_stopers = {
488                 'tr': true
489                 'template': true
490                 'html': true
491         }
492         clear_stack_to_table_row_context = ->
493                 loop
494                         if clear_to_table_row_stopers[open_els[0].name]?
495                                 break
496                         open_els.shift()
497                 return
498         clear_afe_to_marker = ->
499                 loop
500                         el = afe.shift()
501                         if el.type is TYPE_AFE_MARKER
502                                 return
503
504         # 8.2.3.1 ...
505         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
        # Choose the insertion mode from what is currently on the stack of open
        # elements, walking from the current node (index 0) towards the first
        # node (highest index). Sets `insertion_mode` and returns nothing.
        # The numbered comments are the steps of the spec algorithm linked above;
        # fragment-case and template steps are marked fixfull and not implemented.
        reset_insertion_mode = ->
                # 1. Let last be false.
                last = false
                # 2. Let node be the last node in the stack of open elements.
                node_i = 0
                node = open_els[node_i]
                # 3. Loop: If node is the first node in the stack of open elements,
                # then set last to true, and, if the parser was originally created as
                # part of the HTML fragment parsing algorithm (fragment case) set node
                # to the context element.
                loop
                        if node_i is open_els.length - 1
                                last = true
                                # fixfull (fragment case)

                        # 4. If node is a select element, run these substeps:
                        if node.name is 'select'
                                # 1. If last is true, jump to the step below labeled done.
                                unless last
                                        # 2. Let ancestor be node.
                                        ancestor_i = node_i
                                        ancestor = node
                                        # 3. Loop: If ancestor is the first node in the stack of
                                        # open elements, jump to the step below labeled done.
                                        loop
                                                if ancestor_i is open_els.length - 1
                                                        break
                                                # 4. Let ancestor be the node before ancestor in the stack
                                                # of open elements.
                                                ancestor_i += 1
                                                ancestor = open_els[ancestor_i]
                                                # 5. If ancestor is a template node, jump to the step below
                                                # labeled done.
                                                if ancestor.name is 'template'
                                                        break
                                                # 6. If ancestor is a table node, switch the insertion mode
                                                # to "in select in table" and abort these steps.
                                                if ancestor.name is 'table'
                                                        insertion_mode = ins_mode_in_select_in_table
                                                        return
                                                # 7. Jump back to the step labeled loop.
                                # 8. Done: Switch the insertion mode to "in select" and abort
                                # these steps.
                                insertion_mode = ins_mode_in_select
                                return
                        # 5. If node is a td or th element and last is false, then switch
                        # the insertion mode to "in cell" and abort these steps.
                        if (node.name is 'td' or node.name is 'th') and last is false
                                insertion_mode = ins_mode_in_cell
                                return
                        # 6. If node is a tr element, then switch the insertion mode to "in
                        # row" and abort these steps.
                        if node.name is 'tr'
                                insertion_mode = ins_mode_in_row
                                return
                        # 7. If node is a tbody, thead, or tfoot element, then switch the
                        # insertion mode to "in table body" and abort these steps.
                        if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
                                insertion_mode = ins_mode_in_table_body
                                return
                        # 8. If node is a caption element, then switch the insertion mode
                        # to "in caption" and abort these steps.
                        if node.name is 'caption'
                                insertion_mode = ins_mode_in_caption
                                return
                        # 9. If node is a colgroup element, then switch the insertion mode
                        # to "in column group" and abort these steps.
                        if node.name is 'colgroup'
                                insertion_mode = ins_mode_in_column_group
                                return
                        # 10. If node is a table element, then switch the insertion mode to
                        # "in table" and abort these steps.
                        if node.name is 'table'
                                insertion_mode = ins_mode_in_table
                                return
                        # 11. If node is a template element, then switch the insertion mode
                        # to the current template insertion mode and abort these steps.
                        # fixfull (template insertion mode stack)

                        # 12. If node is a head element and last is true, then switch the
                        # insertion mode to "in body" ("in body"! not "in head"!) and abort
                        # these steps. (fragment case)
                        if node.name is 'head' and last
                                insertion_mode = ins_mode_in_body
                                return
                        # 13. If node is a head element and last is false, then switch the
                        # insertion mode to "in head" and abort these steps.
                        if node.name is 'head' and last is false
                                insertion_mode = ins_mode_in_head
                                return
                        # 14. If node is a body element, then switch the insertion mode to
                        # "in body" and abort these steps.
                        if node.name is 'body'
                                insertion_mode = ins_mode_in_body
                                return
                        # 15. If node is a frameset element, then switch the insertion mode
                        # to "in frameset" and abort these steps. (fragment case)
                        if node.name is 'frameset'
                                insertion_mode = ins_mode_in_frameset
                                return
                        # 16. If node is an html element, run these substeps:
                        if node.name is 'html'
                                # 1. If the head element pointer is null, switch the insertion
                                # mode to "before head" and abort these steps. (fragment case)
                                # fixfull (fragment case)

                                # 2. Otherwise, the head element pointer is not null, switch
                                # the insertion mode to "after head" and abort these steps.
                                insertion_mode = ins_mode_in_body # FIXME fixfull
                                return
                        # 17. If last is true, then switch the insertion mode to "in body"
                        # and abort these steps. (fragment case)
                        if last
                                insertion_mode = ins_mode_in_body
                                return
                        # 18. Let node now be the node before node in the stack of open
                        # elements.
                        node_i += 1
                        node = open_els[node_i]
                        # 19. Return to the step labeled loop.
626
627         # 8.2.3.2
628
629         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
630         # While parsing a fragment with exactly one open element this is the
631         # context element; in every other case it is open_els[0].
632         adjusted_current_node = ->
633                 if flag_fragment_parsing and open_els.length is 1 then context_element else open_els[0]
634
635         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
636         # Follows the steps at the link above, with the jumps between labels
637         # written as two while-loops. Capitalized words in the comments are
638         # the "labels" used at that link. Index 0 of afe is its newest entry.
639         reconstruct_active_formatting_elements = ->
640                 return unless afe.length > 0
641                 # newest entry is a marker or is already open: nothing to rebuild
642                 return if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
643                 # Rewind: move to higher indexes for as long as the entry found
644                 # there is neither a marker nor in open_els. Peeking at afe[i + 1]
645                 # before moving stands in for stepping onto it and then Advancing.
646                 i = 0
647                 while i < afe.length - 1
648                         break if afe[i + 1].type is TYPE_AFE_MARKER or afe[i + 1] in open_els
649                         i += 1
650                 # Create: clone entry i, insert the clone into the tree (which also
651                 # puts it on open_els) and store the clone back at index i.
652                 # Advance: repeat for each lower index, finishing with index 0.
653                 while i >= 0
654                         el = afe[i].shallow_clone()
655                         tree_insert_element el
656                         afe[i] = el
657                         i -= 1
658
659         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
660         # adoption agency algorithm. subject is a tag name (it is compared with element names).
661         # Rearranges open_els, afe and the tree in place; every exit is a bare return. Overview here:
662         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
663         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
664         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
665         adoption_agency = (subject) ->
666                 debug_log "adoption_agency()"
667                 debug_log "tree: #{serialize_els doc.children, false, true}"
668                 debug_log "open_els: #{serialize_els open_els, true, true}"
669                 debug_log "afe: #{serialize_els afe, true, true}"
670                 if open_els[0].name is subject # the current node already has the subject's name
671                         el = open_els[0]
672                         open_els.shift()
673                         # remove it from the list of active formatting elements (if found)
674                         for t, i in afe
675                                 if t is el
676                                         afe.splice i, 1
677                                         break
678                         debug_log "aaa: starting off with subject on top of stack, exiting"
679                         return
680                 outer = 0 # outer loop counter
681                 loop
682                         if outer >= 8 # at most eight passes of the outer loop are run
683                                 return
684                         outer += 1
685                         # 5. Let formatting element be the last element in the list of
686                         # active formatting elements that: is between the end of the list
687                         # and the last scope marker in the list, if any, or the start of
688                         # the list otherwise, and has the tag name subject.
689                         fe = null
690                         for t, fe_of_afe in afe # after a break, fe_of_afe is the index the scan stopped at
691                                 if t.type is TYPE_AFE_MARKER
692                                         break
693                                 if t.name is subject
694                                         fe = t
695                                         break
696                         # If there is no such element, then abort these steps and instead
697                         # act as described in the "any other end tag" entry above.
698                         if fe is null
699                                 debug_log "aaa: fe not found in afe"
700                                 in_body_any_other_end_tag subject
701                                 return
702                         # 6. If formatting element is not in the stack of open elements,
703                         # then this is a parse error; remove the element from the list, and
704                         # abort these steps.
705                         in_open_els = false
706                         for t, fe_of_open_els in open_els # after the break, fe_of_open_els is fe's index in open_els
707                                 if t is fe
708                                         in_open_els = true
709                                         break
710                         unless in_open_els
711                                 debug_log "aaa: fe not found in open_els"
712                                 parse_error()
713                                 # "remove it from the list" must mean afe, since it's not in open_els
714                                 afe.splice fe_of_afe, 1
715                                 return
716                         # 7. If formatting element is in the stack of open elements, but
717                         # the element is not in scope, then this is a parse error; abort
718                         # these steps.
719                         unless el_is_in_scope fe
720                                 debug_log "aaa: fe not in scope"
721                                 parse_error()
722                                 return
723                         # 8. If formatting element is not the current node, this is a parse
724                         # error. (But do not abort these steps.)
725                         unless open_els[0] is fe
726                                 parse_error()
727                                 # no return here: the algorithm carries on
728                         # 9. Let furthest block be the topmost node in the stack of open
729                         # elements that is lower in the stack than formatting element, and
730                         # is an element in the special category. There might not be one.
731                         fb = null
732                         fb_of_open_els = null
733                         for t, i in open_els # from the current node (index 0) up to, but not including, fe
734                                 if t is fe
735                                         break
736                                 if el_is_special t
737                                         fb = t
738                                         fb_of_open_els = i
739                                         # and continue, to see if there's one that's more "topmost"
740                         # 10. If there is no furthest block, then the UA must first pop all
741                         # the nodes from the bottom of the stack of open elements, from the
742                         # current node up to and including formatting element, then remove
743                         # formatting element from the list of active formatting elements,
744                         # and finally abort these steps.
745                         if fb is null
746                                 debug_log "aaa: no fb"
747                                 loop # keeps popping until fe itself has been popped
748                                         t = open_els.shift()
749                                         if t is fe
750                                                 afe.splice fe_of_afe, 1
751                                                 return
752                         # 11. Let common ancestor be the element immediately above
753                         # formatting element in the stack of open elements.
754                         ca = open_els[fe_of_open_els + 1] # common ancestor
755
756                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
757                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
758                         bookmark = new_aaa_bookmark()
759                         for t, i in afe
760                                 if t is fe
761                                         afe.splice i, 0, bookmark # bookmark takes index i; fe moves to i + 1
762                                         break
763                         node = last_node = fb
764                         inner = 0 # inner loop counter
765                         loop # inner loop
766                                 inner += 1
767                                 # 3. Let node be the element immediately above node in the
768                                 # stack of open elements, or if node is no longer in the stack
769                                 # of open elements (e.g. because it got removed by this
770                                 # algorithm), the element that was immediately above node in
771                                 # the stack of open elements before node was removed.
772                                 node_next = null
773                                 for t, i in open_els
774                                         if t is node
775                                                 node_next = open_els[i + 1]
776                                                 break
777                                 node = node_next ? node_above # falls back to node_above when the search above produced nothing
778                                 debug_log "inner loop #{inner}"
779                                 debug_log "tree: #{serialize_els doc.children, false, true}"
780                                 debug_log "open_els: #{serialize_els open_els, true, true}"
781                                 debug_log "afe: #{serialize_els afe, true, true}"
782                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
783                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
784                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
785                                 debug_log "node: #{node.serialize true, true}"
786                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
787
788                                 # 4. If node is formatting element, then go to the next step in
789                                 # the overall algorithm.
790                                 if node is fe
791                                         break # leaves the inner loop only
792                                 debug_log "the meat"
793                                 # 5. If inner loop counter is greater than three and node is in
794                                 # the list of active formatting elements, then remove node from
795                                 # the list of active formatting elements.
796                                 node_in_afe = false
797                                 for t, i in afe
798                                         if t is node
799                                                 if inner > 3
800                                                         afe.splice i, 1
801                                                         debug_log "max out inner"
802                                                 else
803                                                         node_in_afe = true
804                                                         debug_log "in afe"
805                                                 break
806                                 # 6. If node is not in the list of active formatting elements,
807                                 # then remove node from the stack of open elements and then go
808                                 # back to the step labeled inner loop.
809                                 unless node_in_afe
810                                         debug_log "not in afe"
811                                         for t, i in open_els
812                                                 if t is node
813                                                         node_above = open_els[i + 1] # remembered before node is removed
814                                                         open_els.splice i, 1
815                                                         break
816                                         continue # back to the top of the inner loop
817                                 debug_log "the bones"
818                                 # 7. create an element for the token for which the element node
819                                 # was created, in the HTML namespace, with common ancestor as
820                                 # the intended parent; replace the entry for node in the list
821                                 # of active formatting elements with an entry for the new
822                                 # element, replace the entry for node in the stack of open
823                                 # elements with an entry for the new element, and let node be
824                                 # the new element.
825                                 new_node = node.shallow_clone()
826                                 for t, i in afe
827                                         if t is node
828                                                 afe[i] = new_node
829                                                 debug_log "replaced in afe"
830                                                 break
831                                 for t, i in open_els
832                                         if t is node
833                                                 node_above = open_els[i + 1]
834                                                 open_els[i] = new_node
835                                                 debug_log "replaced in open_els"
836                                                 break
837                                 node = new_node
838                                 # 8. If last node is furthest block, then move the
839                                 # aforementioned bookmark to be immediately after the new node
840                                 # in the list of active formatting elements.
841                                 if last_node is fb
842                                         for t, i in afe
843                                                 if t is bookmark
844                                                         afe.splice i, 1
845                                                         debug_log "removed bookmark"
846                                                         break
847                                         for t, i in afe
848                                                 if t is node
849                                                         # "after" means a lower index
850                                                         afe.splice i, 0, bookmark # bookmark lands at i; node moves to i + 1
851                                                         debug_log "placed bookmark after node"
852                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
853                                                         break
854                                 # 9. Insert last node into node, first removing it from its
855                                 # previous parent node if any.
856                                 if last_node.parent?
857                                         debug_log "last_node has parent"
858                                         for c, i in last_node.parent.children
859                                                 if c is last_node
860                                                         debug_log "removing last_node from parent"
861                                                         last_node.parent.children.splice i, 1
862                                                         break
863                                 node.children.push last_node
864                                 last_node.parent = node
865                                 # 10. Let last node be node.
866                                 last_node = node
867                                 debug_log "at last"
868                                 # 11. Return to the step labeled inner loop.
869                         # 14. Insert whatever last node ended up being in the previous step
870                         # at the appropriate place for inserting a node, but using common
871                         # ancestor as the override target.
872
873                         # In the case where fe is immediately followed by fb:
874                         #   * inner loop exits out early (node==fe)
875                         #   * last_node is fb
876                         #   * last_node is still in the tree (not a duplicate)
877                         if last_node.parent?
878                                 debug_log "FEFIRST? last_node has parent"
879                                 for c, i in last_node.parent.children
880                                         if c is last_node
881                                                 debug_log "removing last_node from parent"
882                                                 last_node.parent.children.splice i, 1
883                                                 break
884
885                         debug_log "after aaa inner loop"
886                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
887                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
888                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
889                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
890                         debug_log "tree: #{serialize_els doc.children, false, true}"
891
892                         debug_log "insert"
893
894
895                         # can't use standard insert token thing, because it's already in
896                         # open_els and must stay at its current position in open_els
897                         dest = adjusted_insertion_location ca
898                         dest[0].children.splice dest[1], 0, last_node
899                         last_node.parent = dest[0]
900
901
902                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
903                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
904                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
905                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
906                         debug_log "tree: #{serialize_els doc.children, false, true}"
907
908                         # 15. Create an element for the token for which formatting element
909                         # was created, in the HTML namespace, with furthest block as the
910                         # intended parent.
911                         new_element = fe.shallow_clone() # FIXME intended parent thing
912                         # 16. Take all of the child nodes of furthest block and append them
913                         # to the element created in the last step.
914                         while fb.children.length
915                                 t = fb.children.shift()
916                                 t.parent = new_element
917                                 new_element.children.push t
918                         # 17. Append that new element to furthest block.
919                         new_element.parent = fb
920                         fb.children.push new_element
921                         # 18. Remove formatting element from the list of active formatting
922                         # elements, and insert the new element into the list of active
923                         # formatting elements at the position of the aforementioned
924                         # bookmark.
925                         for t, i in afe
926                                 if t is fe
927                                         afe.splice i, 1
928                                         break
929                         for t, i in afe
930                                 if t is bookmark
931                                         afe[i] = new_element # the bookmark entry itself is overwritten
932                                         break
933                         # 19. Remove formatting element from the stack of open elements,
934                         # and insert the new element into the stack of open elements
935                         # immediately below the position of furthest block in that stack.
936                         for t, i in open_els
937                                 if t is fe
938                                         open_els.splice i, 1
939                                         break
940                         for t, i in open_els
941                                 if t is fb
942                                         open_els.splice i, 0, new_element # new_element takes index i; fb moves to i + 1
943                                         break
944                         # 20. Jump back to the step labeled outer loop.
945                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
946                         debug_log "tree: #{serialize_els doc.children, false, true}"
947                         debug_log "open_els: #{serialize_els open_els, true, true}"
948                         debug_log "afe: #{serialize_els afe, true, true}"
949                 debug_log "AAA DONE" # NOTE(review): looks unreachable, the loop above is only ever left via return
950
951         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
952         # Pops open_els until a p element has been popped. The last remaining
953         # entry is never popped, so a stack without a p keeps one element.
954         close_p_element = ->
955                 generate_implied_end_tags 'p' # the argument is the exception
956                 parse_error() unless open_els[0].name is 'p'
957                 while open_els.length > 1 # guard, just in case no p is found
958                         el = open_els.shift()
959                         return if el.name is 'p'
960         # runs close_p_element, but only when a p is in button scope
961         close_p_if_in_button_scope = ->
962                 close_p_element() if is_in_button_scope 'p'
963
964         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
965         # aka insert_a_character = (t) ->
966         # t.text is appended to the child just before the insertion point when
967         # that child is a text node; otherwise t itself is spliced in there.
968         insert_character = (t) ->
969                 dest = adjusted_insertion_location()
970                 # fixfull check for Document node
971                 prev = dest[0].children[dest[1] - 1] if dest[1] > 0
972                 return dest[0].children.splice dest[1], 0, t unless dest[1] > 0 and prev.type is TYPE_TEXT
973                 prev.text += t.text
974                 return
975
976         # 8.2.5.1
977         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
978         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
979         # Works out where the next node goes. override_target, when given, is
980         # used in place of the current node (open_els[0]) as the target.
981         # Returns [node, index]: insert into node.children at that index.
982         adjusted_insertion_location = (override_target = null) ->
983                 # 1. Let target be the override target if one was specified,
984                 # otherwise the current node.
985                 target = override_target ? open_els[0]
986                 # 2. Determine the adjusted insertion location using the first matching
987                 # steps from the following list:
988                 #
989                 # If foster parenting is enabled and target is a table, tbody, tfoot,
990                 # thead, or tr element (foster parenting happens when content is
991                 # misnested in tables):
992                 if flag_foster_parenting and foster_parenting_targets[target.name]
993                         loop # once. this is here so we can ``break`` to "abort these substeps"
994                                 # 1. Let last template be the last template element in the
995                                 # stack of open elements, if any.
996                                 last_template = null
997                                 last_template_i = null
998                                 for el, i in open_els
999                                         if el.name is 'template'
1000                                                 last_template = el
1001                                                 last_template_i = i
1002                                                 break
1003                                 # 2. Let last table be the last table element in the stack of
1004                                 # open elements, if any.
1005                                 last_table = null
1006                                 last_table_i = null
1007                                 for el, i in open_els
1008                                         if el.name is 'table'
1009                                                 last_table = el
1010                                                 last_table_i = i
1011                                                 break
1012                                 # 3. If there is a last template and either there is no last
1013                                 # table, or there is one, but last template is lower (more
1014                                 # recently added) than last table in the stack of open
1015                                 # elements, then: let adjusted insertion location be inside
1016                                 # last template's template contents, after its last child (if
1017                                 # any), and abort these substeps.
1018                                 if last_template and (last_table is null or last_template_i < last_table_i)
1019                                         target = last_template # fixfull should be its contents
1020                                         target_i = target.children.length
1021                                         break
1022                                 # 4. If there is no last table, then let adjusted insertion
1023                                 # location be inside the first element in the stack of open
1024                                 # elements (the html element), after its last child (if any),
1025                                 # and abort these substeps. (fragment case)
1026                                 if last_table is null
1027                                         target = open_els[open_els.length - 1] # "first" in the spec's wording is the highest index
1028                                         target_i = target.children.length
1029                                         break # abort these substeps: the steps below dereference last_table
1030                                 # 5. If last table has a parent element, then let adjusted
1031                                 # insertion location be inside last table's parent element,
1032                                 # immediately before last table, and abort these substeps.
1033                                 if last_table.parent?
1034                                         for c, i in last_table.parent.children
1035                                                 if c is last_table
1036                                                         target = last_table.parent
1037                                                         target_i = i
1038                                                         break
1039                                         break
1040                                 # 6. Let previous element be the element immediately above last
1041                                 # table in the stack of open elements.
1042                                 #
1043                                 # huh? how could it not have a parent?
1044                                 previous_element = open_els[last_table_i + 1]
1045                                 # 7. Let adjusted insertion location be inside previous
1046                                 # element, after its last child (if any).
1047                                 target = previous_element
1048                                 target_i = target.children.length
1049                                 # Note: These steps are involved in part because it's possible
1050                                 # for elements, the table element in this case in particular,
1051                                 # to have been moved by a script around in the DOM, or indeed
1052                                 # removed from the DOM entirely, after the element was inserted
1053                                 # by the parser.
1054                                 break # don't really loop
1055                 else
1056                         # Otherwise Let adjusted insertion location be inside target, after
1057                         # its last child (if any).
1058                         target_i = target.children.length
1059
1060                 # 3. If the adjusted insertion location is inside a template element,
1061                 # let it instead be inside the template element's template contents,
1062                 # after its last child (if any).
1063                 # fixfull (template)
1064
1065                 # 4. Return the adjusted insertion location.
1066                 return [target, target_i]
1067
1068         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1069         # aka create_an_element_for_token
1070         # Builds a TYPE_TAG Node for token t in the given namespace and returns
1071         # it. Side effects on t: its type becomes TYPE_TAG and its attrs_a list
1072         # is emptied. intended_parent is accepted but not used here.
1073         token_to_element = (t, namespace, intended_parent) ->
1074                 t.type = TYPE_TAG # not TYPE_START_TAG
1075                 # move the [name, value] pairs into a hash, taking them off the end
1076                 # of attrs_a first, so for a repeated name the earliest pair wins
1077                 attrs = {}
1078                 until t.attrs_a.length is 0
1079                         a = t.attrs_a.pop()
1080                         attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1081                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1082                 # TODO 2. If the newly created element has an xmlns attribute in the XMLNS
1083                 # namespace whose value is not exactly the same as the element's namespace,
1084                 # that is a parse error. Similarly, if it has an xmlns:xlink attribute in the
1085                 # XMLNS namespace whose value is not the XLink Namespace, that is a parse error.
1086                 # fixfull: the spec says stuff about form pointers and ownerDocument
1087                 return el
1088
1089         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1090         # Builds an element for token in namespace, splices it in at the adjusted
1091         # insertion location, puts it on open_els at index 0 and returns it.
1092         insert_foreign_element = (token, namespace) ->
1093                 [ail_el, ail_i] = ail = adjusted_insertion_location()
1094                 el = token_to_element token, namespace, ail_el
1095                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1096                 ail_el.children.splice ail_i, 0, el
1097                 el.parent = ail_el
1098                 open_els.unshift el
1099                 return el
1100         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1101         insert_html_element = insert_foreign_element # (token, namespace) ->
1102
1103         # FIXME read and implement the "foster parenting" part
1104         # FIXME read spec, do this right
1105         # FIXME implement the override target thing
1106         # note: this assumes it's an open tag
1107         # FIXME what part of the spec is this?
1108         # TODO look through all callers of this, and see what they should really be doing.
1109         #   eg probably insert_html_element for tokens
1110         tree_insert_element = (el, override_target = null, namespace = null) ->
1111                 if namespace?
1112                         el.namespace = namespace
1113                 dest = adjusted_insertion_location override_target
1114                 if el.type is TYPE_START_TAG # means it's a "token"
1115                         el = token_to_element el, namespace, dest[0]
1116                 unless el.namespace?
1117                         namespace = dest.namespace
1118                 # fixfull: Document nodes sometimes can't accept more chidren
1119                 dest[0].children.splice dest[1], 0, el
1120                 el.parent = dest[0]
1121                 open_els.unshift el
1122                 return el
1123
1124         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1125         # position should be [node, index_within_children]
1126         insert_comment = (t, position = null) ->
1127                 position ?= adjusted_insertion_location()
1128                 position[0].children.splice position[1], 0, t
1129
1130         # 8.2.5.2
1131         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1132         parse_generic_raw_text = (t) ->
1133                 insert_html_element t
1134                 tok_state = tok_state_rawtext
1135                 original_insertion_mode = insertion_mode
1136                 insertion_mode = ins_mode_text
1137         parse_generic_rcdata_text = (t) ->
1138                 insert_html_element t
1139                 tok_state = tok_state_rcdata
1140                 original_insertion_mode = insertion_mode
1141                 insertion_mode = ins_mode_text
1142
1143         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1144         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1145         generate_implied_end_tags = (except = null) ->
1146                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1147                         open_els.shift()
1148
1149         # 8.2.5.4 The rules for parsing tokens in HTML content
1150         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1151
1152         # 8.2.5.4.1 The "initial" insertion mode
1153         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1154         ins_mode_initial = (t) ->
1155                 if is_space_tok t
1156                         return
1157                 if t.type is TYPE_COMMENT
1158                         # fixfull this is supposed to be "the last child of the document object"
1159                         doc.children.push t
1160                         return
1161                 if t.type is TYPE_DOCTYPE
1162                         # fixfull
1163                         t.name = 'html'
1164                         doc.children.push t
1165                         insertion_mode = ins_mode_before_html
1166                         return
1167                 # Anything else
1168                 #fixfull (iframe, quirks)
1169                 insertion_mode = ins_mode_before_html
1170                 insertion_mode t # reprocess the token
1171                 return
1172
1173         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1174         ins_mode_before_html = (t) ->
1175                 if t.type is TYPE_DOCTYPE
1176                         parse_error()
1177                         return
1178                 if t.type is TYPE_COMMENT
1179                         doc.children.push t
1180                         return
1181                 if is_space_tok t
1182                         return
1183                 if t.type is TYPE_START_TAG and t.name is 'html'
1184                         el = token_to_element t, NS_HTML, doc
1185                         open_els.unshift(el)
1186                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1187                         insertion_mode = ins_mode_before_head
1188                         return
1189                 if t.type is TYPE_END_TAG
1190                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1191                                 # fall through to "anything else"
1192                         else
1193                                 parse_error()
1194                                 return
1195                 # Anything else
1196                 html_tok = new_open_tag 'html'
1197                 el = token_to_element html_tok, NS_HTML, doc
1198                 doc.children.push el
1199                 open_els.unshift el
1200                 # ?fixfull browsing context
1201                 insertion_mode = ins_mode_before_head
1202                 insertion_mode t
1203                 return
1204
1205         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1206         ins_mode_before_head = (t) ->
1207                 if is_space_tok t
1208                         return
1209                 if t.type is TYPE_COMMENT
1210                         insert_comment t
1211                         return
1212                 if t.type is TYPE_DOCTYPE
1213                         parse_error()
1214                         return
1215                 if t.type is TYPE_START_TAG and t.name is 'html'
1216                         ins_mode_in_body t
1217                         return
1218                 if t.type is TYPE_START_TAG and t.name is 'head'
1219                         el = insert_html_element t
1220                         head_element_pointer = el
1221                         insertion_mode = ins_mode_in_head
1222                 if t.type is TYPE_END_TAG
1223                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1224                                 # fall through to Anything else below
1225                         else
1226                                 parse_error()
1227                                 return
1228                 # Anything else
1229                 head_tok = new_open_tag 'head'
1230                 el = insert_html_element head_tok
1231                 head_element_pointer = el
1232                 insertion_mode = ins_mode_in_head
1233                 insertion_mode t # reprocess current token
1234
1235         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1236         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1237                 open_els.shift() # spec says this will be a 'head' node
1238                 insertion_mode = ins_mode_after_head
1239                 insertion_mode t
1240         ins_mode_in_head = (t) ->
1241                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1242                         insert_character t
1243                         return
1244                 if t.type is TYPE_COMMENT
1245                         insert_comment t
1246                         return
1247                 if t.type is TYPE_DOCTYPE
1248                         parse_error()
1249                         return
1250                 if t.type is TYPE_START_TAG and t.name is 'html'
1251                         ins_mode_in_body t
1252                         return
1253                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1254                         el = insert_html_element t
1255                         open_els.shift()
1256                         t.acknowledge_self_closing()
1257                         return
1258                 if t.type is TYPE_START_TAG and t.name is 'meta'
1259                         el = insert_html_element t
1260                         open_els.shift()
1261                         t.acknowledge_self_closing()
1262                         # fixfull encoding stuff
1263                         return
1264                 if t.type is TYPE_START_TAG and t.name is 'title'
1265                         parse_generic_rcdata_element t
1266                         return
1267                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1268                         parse_generic_raw_text t
1269                         return
1270                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1271                         insert_html_element t
1272                         insertion_mode = in_head_noscript # FIXME implement
1273                         return
1274                 if t.type is TYPE_START_TAG and t.name is 'script'
1275                         ail = adjusted_insertion_location()
1276                         el = token_to_element t, NS_HTML, ail
1277                         el.flag_parser_inserted true # FIXME implement
1278                         # fixfull frament case
1279                         ail[0].children.splice ail[1], 0, el
1280                         open_els.unshift el
1281                         tok_state = tok_state_script_data
1282                         original_insertion_mode = insertion_mode # make sure orig... is defined
1283                         insertion_mode = ins_mode_text # FIXME implement
1284                         return
1285                 if t.type is TYPE_END_TAG and t.name is 'head'
1286                         open_els.shift() # will be a head element... spec says so
1287                         insertion_mode = ins_mode_after_head
1288                         return
1289                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1290                         ins_mode_in_head_else t
1291                         return
1292                 if t.type is TYPE_START_TAG and t.name is 'template'
1293                         insert_html_element t
1294                         afe_push_marker()
1295                         flag_frameset_ok = false
1296                         insertion_mode = ins_mode_in_template
1297                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1298                         return
1299                 if t.type is TYPE_END_TAG and t.name is 'template'
1300                         if template_tag_is_open()
1301                                 generate_implied_end_tags
1302                                 if open_els[0].name isnt 'template'
1303                                         parse_error()
1304                                 loop
1305                                         el = open_els.shift()
1306                                         if el.name is 'template'
1307                                                 break
1308                                 clear_afe_to_marker()
1309                                 template_insertion_modes.shift()
1310                                 reset_insertion_mode()
1311                         else
1312                                 parse_error()
1313                         return
1314                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1315                         parse_error()
1316                         return
1317                 ins_mode_in_head_else t
1318
1319         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1320         ins_mode_in_head_noscript = (t) ->
1321                 # FIXME ?fixfull
1322                 console.log "ins_mode_in_head_noscript unimplemented"
1323
1324         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1325         ins_mode_after_head_else = (t) ->
1326                 body_tok = new_open_tag 'body'
1327                 insert_html_element body_tok
1328                 insertion_mode = ins_mode_in_body
1329                 insertion_mode t # reprocess token
1330                 return
1331         ins_mode_after_head = (t) ->
1332                 if is_space_tok t
1333                         insert_character t
1334                         return
1335                 if t.type is TYPE_COMMENT
1336                         insert_comment t
1337                         return
1338                 if t.type is TYPE_DOCTYPE
1339                         parse_error()
1340                         return
1341                 if t.type is TYPE_START_TAG and t.name is 'html'
1342                         ins_mode_in_body t
1343                         return
1344                 if t.type is TYPE_START_TAG and t.name is 'body'
1345                         insert_html_element t
1346                         flag_frameset_ok = false
1347                         insertion_mode = ins_mode_in_body
1348                         return
1349                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1350                         insert_html_element t
1351                         insertion_mode = ins_mode_in_frameset
1352                         return
1353                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1354                         parse_error()
1355                         open_els.unshift head_element_pointer
1356                         ins_mode_in_head t
1357                         for el, i of open_els
1358                                 if el is head_element_pointer
1359                                         open_els.splice i, 1
1360                                         return
1361                         console.log "warning: 23904 couldn't find head element in open_els"
1362                         return
1363                 if t.type is TYPE_END_TAG and t.name is 'template'
1364                         ins_mode_in_head t
1365                         return
1366                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1367                         ins_mode_after_head_else t
1368                         return
1369                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1370                         parse_error()
1371                         return
1372                 # Anything else
1373                 ins_mode_after_head_else t
1374
1375         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1376         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1377                 for node, i in open_els
1378                         if node.name is name # FIXME check namespace too
1379                                 generate_implied_end_tags name # arg is exception
1380                                 parse_error() unless i is 0
1381                                 while i >= 0
1382                                         open_els.shift()
1383                                         i -= 1
1384                                 return
1385                         if special_elements[node.name]? # FIXME check namespac too
1386                                 parse_error()
1387                                 return
1388         ins_mode_in_body = (t) ->
1389                 switch t.type
1390                         when TYPE_TEXT
1391                                 switch t.text
1392                                         when "\u0000"
1393                                                 parse_error()
1394                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1395                                                 reconstruct_active_formatting_elements()
1396                                                 insert_character t
1397                                         else
1398                                                 reconstruct_active_formatting_elements()
1399                                                 insert_character t
1400                                                 flag_frameset_ok = false
1401                         when TYPE_COMMENT
1402                                 insert_comment t
1403                         when TYPE_DOCTYPE
1404                                 parse_error()
1405                         when TYPE_START_TAG
1406                                 switch t.name
1407                                         when 'html'
1408                                                 parse_error()
1409                                                 return if template_tag_is_open()
1410                                                 root_attrs = open_els[open_els.length - 1].attrs
1411                                                 for k, v of t.attrs
1412                                                         root_attrs[k] = v unless root_attrs[k]?
1413                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1414                                                 # FIXME also do this for </template> (end tag)
1415                                                 return ins_mode_in_head t
1416                                         when 'body'
1417                                                 parse_error()
1418                                                 # TODO
1419                                         when 'frameset'
1420                                                 parse_error()
1421                                                 # TODO
1422                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1423                                                 close_p_if_in_button_scope()
1424                                                 insert_html_element t
1425                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1426                                                 close_p_if_in_button_scope()
1427                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1428                                                         parse_error()
1429                                                         open_els.shift()
1430                                                 insert_html_element t
1431                                         # TODO lots more to implement here
1432                                         when 'a'
1433                                                 # If the list of active formatting elements
1434                                                 # contains an a element between the end of the list and
1435                                                 # the last marker on the list (or the start of the list
1436                                                 # if there is no marker on the list), then this is a
1437                                                 # parse error; run the adoption agency algorithm for
1438                                                 # the tag name "a", then remove that element from the
1439                                                 # list of active formatting elements and the stack of
1440                                                 # open elements if the adoption agency algorithm didn't
1441                                                 # already remove it (it might not have if the element
1442                                                 # is not in table scope).
1443                                                 found = false
1444                                                 for el in afe
1445                                                         if el.type is TYPE_AFE_MARKER
1446                                                                 break
1447                                                         if el.name is 'a'
1448                                                                 found = el
1449                                                 if found?
1450                                                         parse_error()
1451                                                         adoption_agency 'a'
1452                                                         for el, i in afe
1453                                                                 if el is found
1454                                                                         afe.splice i, 1
1455                                                         for el, i in open_els
1456                                                                 if el is found
1457                                                                         open_els.splice i, 1
1458                                                 reconstruct_active_formatting_elements()
1459                                                 el = insert_html_element t
1460                                                 afe_push el
1461                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1462                                                 reconstruct_active_formatting_elements()
1463                                                 el = insert_html_element t
1464                                                 afe_push el
1465                                         when 'table'
1466                                                 # fixfull quirksmode thing
1467                                                 close_p_if_in_button_scope()
1468                                                 insert_html_element t
1469                                                 insertion_mode = ins_mode_in_table
1470                                         # TODO lots more to implement here
1471                                         else # any other start tag
1472                                                 reconstruct_active_formatting_elements()
1473                                                 insert_html_element t
1474                         when TYPE_EOF
1475                                 ok_tags = {
1476                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1477                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1478                                 }
1479                                 for t in open_els
1480                                         unless ok_tags[t.name]?
1481                                                 parse_error()
1482                                                 break
1483                                 # TODO stack of template insertion modes thing
1484                                 stop_parsing()
1485                         when TYPE_END_TAG
1486                                 switch t.name
1487                                         when 'body'
1488                                                 unless is_in_scope 'body'
1489                                                         parse_error()
1490                                                         return
1491                                                 # TODO implement parse error and move to tree_after_body
1492                                         when 'html'
1493                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1494                                                         parse_error()
1495                                                         return
1496                                                 # TODO implement parse error and move to tree_after_body, reprocess
1497                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1498                                                 unless is_in_scope t.name, NS_HTML
1499                                                         parse_error()
1500                                                         return
1501                                                 generate_implied_end_tags()
1502                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1503                                                         parse_error()
1504                                                 loop
1505                                                         el = open_els.shift()
1506                                                         if el.name is t.name and el.namespace is NS_HTML
1507                                                                 return
1508                                         # TODO lots more close tags to implement here
1509                                         when 'p'
1510                                                 unless is_in_button_scope 'p'
1511                                                         parse_error()
1512                                                         insert_html_element new_open_tag 'p'
1513                                                 close_p_element()
1514                                         # TODO lots more close tags to implement here
1515                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1516                                                 adoption_agency t.name
1517                                         # TODO lots more close tags to implement here
1518                                         else
1519                                                 in_body_any_other_end_tag t.name
1520                 return
1521
1522         ins_mode_in_table_else = (t) ->
1523                 parse_error()
1524                 flag_foster_parenting = true # FIXME
1525                 ins_mode_in_body t
1526                 flag_foster_parenting = false
1527         can_in_table = { # FIXME do this inline like everywhere else
1528                 'table': true
1529                 'tbody': true
1530                 'tfoot': true
1531                 'thead': true
1532                 'tr': true
1533         }
1534
1535         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1536         ins_mode_text = (t) ->
1537                 if t.type is TYPE_TEXT
1538                         insert_character t
1539                         return
1540                 if t.type is TYPE_EOF
1541                         parse_error()
1542                         if open_els[0].name is 'script'
1543                                 open_els[0].flag 'already started', true
1544                         open_els.shift()
1545                         insertion_mode = original_insertion_mode
1546                         insertion_mode t
1547                         return
1548                 if t.type is TYPE_END_TAG and t.name is 'script'
1549                         open_els.shift()
1550                         insertion_mode = original_insertion_mode
1551                         # fixfull the spec seems to assume that I'm going to run the script
1552                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1553                         return
1554                 if t.type is TYPE_END_TAG
1555                         open_els.shift()
1556                         insertion_mode = original_insertion_mode
1557                         return
1558                 console.log 'warning: end of ins_mode_text reached'
1559
1560         # the functions below implement the tokenizer states described here:
1561         # http://www.w3.org/TR/html5/syntax.html#tokenization
1562
1563         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1564         ins_mode_in_table = (t) ->
1565                 switch t.type
1566                         when TYPE_TEXT
1567                                 if can_in_table[t.name]
1568                                         original_insertion_mode = insertion_mode
1569                                         insertion_mode = ins_mode_in_table_text
1570                                         insertion_mode t
1571                                 else
1572                                         ins_mode_in_table_else t
1573                         when TYPE_COMMENT
1574                                 insert_comment t
1575                         when TYPE_DOCTYPE
1576                                 parse_error()
1577                         when TYPE_START_TAG
1578                                 switch t.name
1579                                         when 'caption'
1580                                                 clear_stack_to_table_context()
1581                                                 afe_push_marker()
1582                                                 insert_html_element t
1583                                                 insertion_mode = ins_mode_in_caption
1584                                         when 'colgroup'
1585                                                 clear_stack_to_table_context()
1586                                                 insert_html_element t
1587                                                 insertion_mode = ins_mode_in_column_group
1588                                         when 'col'
1589                                                 clear_stack_to_table_context()
1590                                                 insert_html_element new_open_tag 'colgroup'
1591                                                 insertion_mode = ins_mode_in_column_group
1592                                                 insertion_mode t
1593                                         when 'tbody', 'tfoot', 'thead'
1594                                                 clear_stack_to_table_context()
1595                                                 insert_html_element t
1596                                                 insertion_mode = ins_mode_in_table_body
1597                                         when 'td', 'th', 'tr'
1598                                                 clear_stack_to_table_context()
1599                                                 insert_html_element new_open_tag 'tbody'
1600                                                 insertion_mode = ins_mode_in_table_body
1601                                                 insertion_mode t
1602                                         when 'table'
1603                                                 parse_error()
1604                                                 if is_in_table_scope 'table'
1605                                                         loop
1606                                                                 el = open_els.shift()
1607                                                                 if el.name is 'table'
1608                                                                         break
1609                                                         reset_insertion_mode()
1610                                                         insertion_mode t
1611                                         when 'style', 'script', 'template'
1612                                                 ins_mode_in_head t
1613                                         when 'input'
1614                                                 if token_is_input_hidden t
1615                                                         ins_mode_in_table_else t
1616                                                 else
1617                                                         parse_error()
1618                                                         el = insert_html_element t
1619                                                         open_els.shift()
1620                                                         t.acknowledge_self_closing()
1621                                         when 'form'
1622                                                 parse_error()
1623                                                 if form_element_pointer?
1624                                                         return
1625                                                 if template_tag_is_open()
1626                                                         return
1627                                                 form_element_pointer = insert_html_element t
1628                                                 open_els.shift()
1629                                         else
1630                                                 ins_mode_in_table_else t
1631                         when TYPE_END_TAG
1632                                 switch t.name
1633                                         when 'table'
1634                                                 if is_in_table_scope 'table'
1635                                                         loop
1636                                                                 el = open_els.shift()
1637                                                                 if el.name is 'table'
1638                                                                         break
1639                                                         reset_insertion_mode()
1640                                                 else
1641                                                         parse_error
1642                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1643                                                 parse_error()
1644                                         when 'template'
1645                                                 ins_mode_in_head t
1646                                         else
1647                                                 ins_mode_in_table_else t
1648                         when TYPE_EOF
1649                                 ins_mode_in_body t
1650                         else
1651                                 ins_mode_in_table_else t
1652
1653
1654         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1655         ins_mode_in_table_text = (t) ->
                     # "In table text" insertion mode: queue text tokens in
                     # pending_table_character_tokens until a non-text token arrives, then
                     # flush the queue, restore original_insertion_mode and reprocess t there.
1656                 if t.type is TYPE_TEXT and t.text is "\u0000"
1657                         # huh? I thought the tokenizer didn't emit these
1658                         parse_error()
1659                         return
1660                 if t.type is TYPE_TEXT
1661                         pending_table_character_tokens.push t
1662                         return
1663                 # Anything else
1664                 all_space = true
1665                 for old in pending_table_character_tokens
1666                         unless is_space_tok old
1667                                 all_space = false
1668                                 break
1669                 if all_space
1670                         for old in pending_table_character_tokens
1671                                 insert_character old
1672                 else
                             # A queued token failed is_space_tok, so replay every queued token
                             # through ins_mode_in_table_else, the handler the "in table" mode
                             # itself falls back to.
1673                         for old in pending_table_character_tokens
1674                                 ins_mode_in_table_else old
1675                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1676                 insertion_mode = original_insertion_mode
1677                 insertion_mode t
1678
1679         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1680         ins_mode_in_caption = (t) ->
                     # "In caption" insertion mode: handles token t while a <caption> is open.
                     # Closing the caption pops the stack through 'caption', calls
                     # clear_afe_to_marker() and hands control back to ins_mode_in_table.
                     # Tokens not matched here are processed by ins_mode_in_body.
1681                 if t.type is TYPE_END_TAG and t.name is 'caption'
1682                         if is_in_table_scope 'caption'
1683                                 generate_implied_end_tags()
1684                                 if open_els[0].name isnt 'caption'
1685                                         parse_error()
1686                                 loop
1687                                         el = open_els.shift()
1688                                         if el.name is 'caption'
1689                                                 break
1690                                 clear_afe_to_marker()
1691                                 insertion_mode = ins_mode_in_table
1692                         else
1693                                 parse_error()
1694                                 # fragment case
1695                         return
1696                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1697                         parse_error()
1698                         if is_in_table_scope 'caption'
1699                                 loop
1700                                         el = open_els.shift()
1701                                         if el.name is 'caption'
1702                                                 break
1703                                 clear_afe_to_marker()
                                     # implicitly close the caption, then let the table mode
                                     # reprocess the token that forced the close
1704                                 insertion_mode = ins_mode_in_table
1705                                 insertion_mode t
1706                         # else fragment case
1707                         return
1708                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1709                         parse_error()
1710                         return
1711                 # Anything else
1712                 ins_mode_in_body t
1713
1714         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1715         ins_mode_in_column_group = (t) ->
                     # "In column group" insertion mode: handles token t while a <colgroup>
                     # is expected to be the current node (open_els[0]). Any token not
                     # matched below closes the colgroup and is reprocessed by
                     # ins_mode_in_table.
1716                 if is_space_tok t
1717                         insert_character t
1718                         return
1719                 if t.type is TYPE_COMMENT
1720                         insert_comment t
1721                         return
1722                 if t.type is TYPE_DOCTYPE
1723                         parse_error()
1724                         return
1725                 if t.type is TYPE_START_TAG and t.name is 'html'
1726                         ins_mode_in_body t
1727                         return
1728                 if t.type is TYPE_START_TAG and t.name is 'col'
1729                         el = insert_html_element t
                             # <col> is inserted and immediately popped again
1730                         open_els.shift()
1731                         t.acknowledge_self_closing()
1732                         return
1733                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1734                         if open_els[0].name is 'colgroup'
                                     # pop the colgroup off the stack of open elements (shift
                                     # belongs to the stack array, not to the element node)
1735                                 open_els.shift()
1736                                 insertion_mode = ins_mode_in_table
1737                         else
1738                                 parse_error()
1739                         return
1740                 if t.type is TYPE_END_TAG and t.name is 'col'
1741                         parse_error()
1742                         return
1743                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1744                         ins_mode_in_head t
1745                         return
1746                 if t.type is TYPE_EOF
1747                         ins_mode_in_body t
1748                         return
1749                 # Anything else
1750                 if open_els[0].name isnt 'colgroup'
1751                         parse_error()
1752                         return
1753                 open_els.shift()
1754                 insertion_mode = ins_mode_in_table
1755                 insertion_mode t
1756                 return
1757
1758         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1759         ins_mode_in_table_body = (t) ->
                     # "In table body" insertion mode: handles token t while the parser is
                     # inside a tbody/thead/tfoot. Branches are tried in order and the first
                     # match wins; tokens matched by none are handed to ins_mode_in_table.
1760                 if t.type is TYPE_START_TAG and t.name is 'tr'
1761                         clear_stack_to_table_body_context()
1762                         insert_html_element t
1763                         insertion_mode = ins_mode_in_row
1764                         return
1765                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1766                         parse_error()
1767                         clear_stack_to_table_body_context()
                             # a cell without a row: open an implied <tr>, then let the row
                             # mode reprocess the cell's start tag
1768                         insert_html_element new_open_tag 'tr'
1769                         insertion_mode = ins_mode_in_row
1770                         insertion_mode t
1771                         return
1772                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1773                         unless is_in_table_scope t.name # fixfull check namespace
1774                                 parse_error()
1775                                 return
1776                         clear_stack_to_table_body_context()
1777                         open_els.shift()
1778                         insertion_mode = ins_mode_in_table
1779                         return
1780                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
                             # Scan the stack from index 0 for a tbody/tfoot/thead, giving up at
                             # the first element whose name is listed in table_scopers.
1781                         has = false
1782                         for el in open_els
1783                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1784                                         has = true
1785                                         break
1786                                 if table_scopers[el.name]
1787                                         break
1788                         if !has
1789                                 parse_error()
1790                                 return
                             # close the current section, then let the table mode reprocess t
1791                         clear_stack_to_table_body_context()
1792                         open_els.shift()
1793                         insertion_mode = ins_mode_in_table
1794                         insertion_mode t
1795                         return
1796                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1797                         parse_error()
1798                         return
1799                 # Anything else
1800                 ins_mode_in_table t
1801
1802         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1803         ins_mode_in_row = (t) ->
                     # "In row" insertion mode: handles token t while a <tr> is open.
                     # Branches are tried in order and the first match wins; tokens matched
                     # by none are handed to ins_mode_in_table.
1804                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1805                         clear_stack_to_table_row_context()
1806                         insert_html_element t
1807                         insertion_mode = ins_mode_in_cell
1808                         afe_push_marker()
1809                         return
1810                 if t.type is TYPE_END_TAG and t.name is 'tr'
1811                         if is_in_table_scope 'tr'
1812                                 clear_stack_to_table_row_context()
1813                                 open_els.shift()
1814                                 insertion_mode = ins_mode_in_table_body
1815                         else
1816                                 parse_error()
1817                         return
1818                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1819                         if is_in_table_scope 'tr'
                                     # close the row, then let the table-body mode reprocess t
1820                                 clear_stack_to_table_row_context()
1821                                 open_els.shift()
1822                                 insertion_mode = ins_mode_in_table_body
1823                                 insertion_mode t
1824                         else
1825                                 parse_error()
1826                         return
1827                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1828                         if is_in_table_scope t.name # fixfull namespace
                                     # when the named section is in scope but no tr is, the
                                     # token is dropped without reporting an error
1829                                 if is_in_table_scope 'tr'
1830                                         clear_stack_to_table_row_context()
1831                                         open_els.shift()
1832                                         insertion_mode = ins_mode_in_table_body
1833                                         insertion_mode t
1834                         else
1835                                 parse_error()
1836                         return
1837                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1838                         parse_error()
1839                         return
1840                 # Anything else
1841                 ins_mode_in_table t
1842
1843         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1844         close_the_cell = ->
                     # Closes the open table cell: pops open elements down to and including
                     # the nearest td/th, calls clear_afe_to_marker() and switches the
                     # insertion mode to ins_mode_in_row.
1845                 generate_implied_end_tags()
                     # Compare element names on both sides: comparing the node object itself
                     # to 'th' is never true and would report an error for every <th> cell.
1846                 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1847                         parse_error()
1848                 loop
1849                         el = open_els.shift()
1850                         if el.name is 'td' or el.name is 'th'
1851                                 break
1852                 clear_afe_to_marker()
1853                 insertion_mode = ins_mode_in_row
1854
1855         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1856         ins_mode_in_cell = (t) ->
                     # "In cell" insertion mode: handles token t while a <td>/<th> is open.
                     # Table-structure tags close the cell (via close_the_cell) and are then
                     # reprocessed; tokens matched by no branch go to ins_mode_in_body.
1857                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1858                         if is_in_table_scope t.name
1859                                 generate_implied_end_tags()
1860                                 if open_els[0].name isnt t.name
                                             # must be invoked with (): a bare name is only a
                                             # reference and reports nothing
1861                                         parse_error()
1862                                 loop
1863                                         el = open_els.shift()
1864                                         if el.name is t.name
1865                                                 break
1866                                 clear_afe_to_marker()
1867                                 insertion_mode = ins_mode_in_row
1868                         else
1869                                 parse_error()
1870                         return
1871                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
                             # Scan the stack from index 0 for a td/th, giving up at the first
                             # element whose name is listed in table_scopers.
1872                         has = false
1873                         for el in open_els
1874                                 if el.name is 'td' or el.name is 'th'
1875                                         has = true
1876                                         break
1877                                 if table_scopers[el.name]
1878                                         break
1879                         if !has
1880                                 parse_error()
1881                                 return
                             # close_the_cell() changes insertion_mode, so t is reprocessed
                             # by the mode it selected
1882                         close_the_cell()
1883                         insertion_mode t
1884                         return
1885                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1886                         parse_error()
1887                         return
1888                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1889                         if is_in_table_scope t.name # fixfull namespace
1890                                 close_the_cell()
1891                                 insertion_mode t
1892                         else
1893                                 parse_error()
1894                         return
1895                 # Anything Else
1896                 ins_mode_in_body t
1897
1898         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1899         ins_mode_in_select = (t) ->
                     # "In select" insertion mode: handles token t while a <select> is open.
                     # Branches are tried in order and the first match wins; any token
                     # matched by none is reported via parse_error() and dropped.
1900                 if t.type is TYPE_TEXT and t.text is "\u0000"
1901                         parse_error()
1902                         return
1903                 if t.type is TYPE_TEXT
1904                         insert_character t
1905                         return
1906                 if t.type is TYPE_COMMENT
1907                         insert_comment t
1908                         return
1909                 if t.type is TYPE_DOCTYPE
1910                         parse_error()
1911                         return
1912                 if t.type is TYPE_START_TAG and t.name is 'html'
1913                         ins_mode_in_body t
1914                         return
1915                 if t.type is TYPE_START_TAG and t.name is 'option'
1916                         if open_els[0].name is 'option'
1917                                 open_els.shift()
1918                         insert_html_element t
1919                         return
1920                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1921                         if open_els[0].name is 'option'
1922                                 open_els.shift()
1923                         if open_els[0].name is 'optgroup'
1924                                 open_els.shift()
1925                         insert_html_element t
1926                         return
1927                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
                             # ?. guards against a stack holding only the option element
1928                         if open_els[0].name is 'option' and open_els[1]?.name is 'optgroup'
1929                                 open_els.shift()
1930                         if open_els[0].name is 'optgroup'
1931                                 open_els.shift()
1932                         else
1933                                 parse_error()
1934                         return
1935                 if t.type is TYPE_END_TAG and t.name is 'option'
1936                         if open_els[0].name is 'option'
1937                                 open_els.shift()
1938                         else
1939                                 parse_error()
1940                         return
1941                 if t.type is TYPE_END_TAG and t.name is 'select'
1942                         if is_in_select_scope 'select'
1943                                 loop
1944                                         el = open_els.shift()
1945                                         if el.name is 'select'
1946                                                 break
1947                                 reset_insertion_mode()
1948                         else
1949                                 parse_error()
1950                         return
1951                 if t.type is TYPE_START_TAG and t.name is 'select'
1952                         parse_error()
1953                         loop
1954                                 el = open_els.shift()
1955                                 if el.name is 'select'
1956                                         break
1957                         reset_insertion_mode()
1958                         # spec says that this is the same as </select> but it doesn't say
1959                         # to check scope first
1960                         return
1961                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1962                         parse_error()
                             # fragment case: with no select in select scope there is nothing
                             # to close, so the token is ignored; otherwise the select is
                             # closed and the token reprocessed
1963                         unless is_in_select_scope 'select'
1964                                 return
1965                         loop
1966                                 el = open_els.shift()
1967                                 if el.name is 'select'
1968                                         break
1969                         reset_insertion_mode()
1970                         insertion_mode t
1971                         return
                     # </template> is routed to the head rules as well, matching how the
                     # table and column-group modes treat template tags
1972                 if (t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')) or (t.type is TYPE_END_TAG and t.name is 'template')
1973                         ins_mode_in_head t
1974                         return
1975                 if t.type is TYPE_EOF
1976                         ins_mode_in_body t
1977                         return
1978                 # Anything else
1979                 parse_error()
1980                 return
1981
1982         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1983         ins_mode_in_select_in_table = (t) ->
                     # "In select in table" insertion mode: table-structure tags close the
                     # open <select> (pop the stack through 'select', reset the insertion
                     # mode, reprocess t); every other token goes to ins_mode_in_select.
1984                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1985                         parse_error()
1986                         loop
1987                                 el = open_els.shift()
1988                                 if el.name is 'select'
1989                                         break
1990                         reset_insertion_mode()
1991                         insertion_mode t
1992                         return
1993                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1994                         parse_error()
                             # an end tag only closes the select when the element it names
                             # is in table scope; otherwise the token is ignored
1995                         unless is_in_table_scope t.name, NS_HTML
1996                                 return
1997                         loop
1998                                 el = open_els.shift()
1999                                 if el.name is 'select'
2000                                         break
2001                         reset_insertion_mode()
2002                         insertion_mode t
2003                         return
2004                 # Anything else
2005                 ins_mode_in_select t
2006                 return
2007
2008         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2009         ins_mode_in_template = (t) ->
                     # "In template" insertion mode: for a start tag, replaces the entry at
                     # index 0 of template_insertion_modes with the mode suited to that tag,
                     # switches to it and reprocesses t. At EOF an unclosed template is
                     # popped and t is reprocessed by the mode reset_insertion_mode() picks.
2010                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2011                         ins_mode_in_body t
2012                         return
2013                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2014                         ins_mode_in_head t
2015                         return
2016                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2017                         template_insertion_modes.shift()
2018                         template_insertion_modes.unshift ins_mode_in_table
2019                         insertion_mode = ins_mode_in_table
2020                         insertion_mode t
2021                         return
2022                 if t.type is TYPE_START_TAG and t.name is 'col'
2023                         template_insertion_modes.shift()
2024                         template_insertion_modes.unshift ins_mode_in_column_group
2025                         insertion_mode = ins_mode_in_column_group
2026                         insertion_mode t
2027                         return
2028                 if t.type is TYPE_START_TAG and t.name is 'tr'
2029                         template_insertion_modes.shift()
2030                         template_insertion_modes.unshift ins_mode_in_table_body
2031                         insertion_mode = ins_mode_in_table_body
2032                         insertion_mode t
2033                         return
2034                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2035                         template_insertion_modes.shift()
2036                         template_insertion_modes.unshift ins_mode_in_row
2037                         insertion_mode = ins_mode_in_row
2038                         insertion_mode t
2039                         return
2040                 if t.type is TYPE_START_TAG
2041                         template_insertion_modes.shift()
2042                         template_insertion_modes.unshift ins_mode_in_body
2043                         insertion_mode = ins_mode_in_body
2044                         insertion_mode t
2045                         return
2046                 if t.type is TYPE_END_TAG
2047                         parse_error()
2048                         return
                     # token types are the TYPE_* constants; a bare EOF never matches
2049                 if t.type is TYPE_EOF
2050                         unless template_tag_is_open()
2051                                 stop_parsing()
2052                                 return
2053                         parse_error()
2054                         loop
2055                                 el = open_els.shift()
2056                                 if el.name is 'template' # fixfull check namespace
2057                                         break
2058                         clear_afe_to_marker()
2059                         template_insertion_modes.shift()
2060                         reset_insertion_mode()
2061                         insertion_mode t
2062
2063         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2064         ins_mode_after_body = (t) ->
                     # "After body" insertion mode: handles token t once the body has been
                     # closed. Anything other than whitespace, comments, doctypes, html tags
                     # and EOF is an error that sends the parser back to ins_mode_in_body.
2065                 if is_space_tok t
2066                         ins_mode_in_body t
2067                         return
2068                 if t.type is TYPE_COMMENT
                             # The comment becomes the last child of the FIRST element in the
                             # stack of open elements. The stack grows downward (index 0 is
                             # the current node), so "first" is the highest index.
2069                         insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2070                         return
2071                 if t.type is TYPE_DOCTYPE
2072                         parse_error()
2073                         return
2074                 if t.type is TYPE_START_TAG and t.name is 'html'
2075                         ins_mode_in_body t
2076                         return
2077                 if t.type is TYPE_END_TAG and t.name is 'html'
2078                         # fixfull fragment case
2079                         insertion_mode = ins_mode_after_after_body
2080                         return
2081                 if t.type is TYPE_EOF
2082                         stop_parsing()
2083                         return
2084                 # Anything else
2085                 parse_error()
2086                 insertion_mode = ins_mode_in_body
2087                 insertion_mode t
2088
2089         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2090         ins_mode_in_frameset = (t) ->
                     # "In frameset" insertion mode: handles token t while a <frameset> is
                     # open. Any token matched by no branch is reported via parse_error()
                     # and dropped.
2091                 if is_space_tok t
2092                         insert_character t
2093                         return
2094                 if t.type is TYPE_COMMENT
2095                         insert_comment t
2096                         return
2097                 if t.type is TYPE_DOCTYPE
2098                         parse_error()
2099                         return
2100                 if t.type is TYPE_START_TAG and t.name is 'html'
2101                         ins_mode_in_body t
2102                         return
2103                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2104                         insert_html_element t
2105                         return
2106                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2107                         # TODO ?correct for: "if the current node is the root html element"
2108                         if open_els.length is 1
2109                                 parse_error()
2110                                 return # fragment case
2111                         open_els.shift()
2112                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2113                                 insertion_mode = ins_mode_after_frameset
2114                         return
2115                 if t.type is TYPE_START_TAG and t.name is 'frame'
2116                         insert_html_element t
                             # <frame> is inserted and immediately popped again
2117                         open_els.shift()
2118                         t.acknowledge_self_closing()
2119                         return
                     # the constant is TYPE_START_TAG; with a space in the name the
                     # expression would parse as a call to TYPE_START
2120                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2121                         ins_mode_in_head t
2122                         return
2123                 if t.type is TYPE_EOF
2124                         # TODO ?correct for: "if the current node is not the root html element"
2125                         if open_els.length isnt 1
2126                                 parse_error()
2127                         stop_parsing()
2128                         return
2129                 # Anything else
2130                 parse_error()
2131                 return
2132
2133         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2134         ins_mode_after_frameset = (t) ->
                     # "After frameset" insertion mode: handles token t after the outermost
                     # <frameset> has been closed. Any token matched by no branch is
                     # reported via parse_error() and dropped.
2135                 if is_space_tok t
2136                         insert_character t
2137                         return
2138                 if t.type is TYPE_COMMENT
2139                         insert_comment t
2140                         return
2141                 if t.type is TYPE_DOCTYPE
2142                         parse_error()
2143                         return
2144                 if t.type is TYPE_START_TAG and t.name is 'html'
2145                         ins_mode_in_body t
2146                         return
2147                 if t.type is TYPE_END_TAG and t.name is 'html'
                             # must assign the shared insertion_mode variable; any other name
                             # only creates an unused local and the mode never changes
2148                         insertion_mode = ins_mode_after_after_frameset
2149                         return
2150                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2151                         ins_mode_in_head t
2152                         return
2153                 if t.type is TYPE_EOF
2154                         stop_parsing()
2155                         return
2156                 # Anything else
2157                 parse_error()
2158                 return
2159
2160         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2161         ins_mode_after_after_body = (t) ->
                     # "After after body" insertion mode: handles token t after </html>.
                     # Comments are appended to doc; doctypes, whitespace and <html> start
                     # tags use the in-body rules; EOF stops parsing. Anything else is an
                     # error that sends the parser back to ins_mode_in_body.
2162                 if t.type is TYPE_COMMENT
2163                         insert_comment t, [doc, doc.children.length]
2164                         return
2165                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2166                         ins_mode_in_body t
2167                         return
2168                 if t.type is TYPE_EOF
2169                         stop_parsing()
2170                         return
2171                 # Anything else
2172                 parse_error()
                     # switch back to "in body" AND reprocess the token there, as
                     # ins_mode_after_body does; switching alone would drop the token
2173                 insertion_mode = ins_mode_in_body
2174                 insertion_mode t
2175
2176         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2177         ins_mode_after_after_frameset = (t) ->
2178                 if t.type is TYPE_COMMENT
2179                         insert_comment t, [doc, doc.children.length]
2180                         return
2181                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2182                         ins_mode_in_body t
2183                         return
2184                 if t.type is TYPE_EOF
2185                         stop_parsing()
2186                         return
2187                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2188                         ins_mode_in_head t
2189                         return
2190                 # Anything else
2191                 parse_error()
2192                 return
2193
2194
2195
2196
2197
2198         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2199         tok_state_data = ->
2200                 switch c = txt.charAt(cur++)
2201                         when '&'
2202                                 return new_text_node parse_character_reference()
2203                         when '<'
2204                                 tok_state = tok_state_tag_open
2205                         when "\u0000"
2206                                 parse_error()
2207                                 return new_text_node c
2208                         when '' # EOF
2209                                 return new_eof_token()
2210                         else
2211                                 return new_text_node c
2212                 return null
2213
2214         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2215         # not needed: tok_state_character_reference_in_data = ->
2216         # just call parse_character_reference()
2217
2218         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2219         tok_state_rcdata = ->
2220                 switch c = txt.charAt(cur++)
2221                         when '&'
2222                                 return new_text_node parse_character_reference()
2223                         when '<'
2224                                 tok_state = tok_state_rcdata_less_than_sign
2225                         when "\u0000"
2226                                 parse_error()
2227                                 return new_character_token "\ufffd"
2228                         when '' # EOF
2229                                 return new_eof_token()
2230                         else
2231                                 return new_character_token c
2232                 return null
2233
2234         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2235         # not needed: tok_state_character_reference_in_rcdata = ->
2236         # just call parse_character_reference()
2237
2238         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2239         tok_state_rawtext = ->
2240                 switch c = txt.charAt(cur++)
2241                         when '<'
2242                                 tok_state = tok_state_rawtext_less_than_sign
2243                         when "\u0000"
2244                                 parse_error()
2245                                 return new_character_token "\ufffd"
2246                         when '' # EOF
2247                                 return new_eof_token()
2248                         else
2249                                 return new_character_token c
2250                 return null
2251
2252         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2253         tok_state_script_data = ->
2254                 switch c = txt.charAt(cur++)
2255                         when '<'
2256                                 tok_state = tok_state_script_data_less_than_sign
2257                         when "\u0000"
2258                                 parse_error()
2259                                 return new_character_token "\ufffd"
2260                         when '' # EOF
2261                                 return new_eof_token()
2262                         else
2263                                 return new_character_token c
2264                 return null
2265
2266         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2267         tok_state_plaintext = ->
2268                 switch c = txt.charAt(cur++)
2269                         when "\u0000"
2270                                 parse_error()
2271                                 return new_character_token "\ufffd"
2272                         when '' # EOF
2273                                 return new_eof_token()
2274                         else
2275                                 return new_character_token c
2276                 return null
2277
2278
2279         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2280         tok_state_tag_open = ->
2281                 switch c = txt.charAt(cur++)
2282                         when '!'
2283                                 tok_state = tok_state_markup_declaration_open
2284                         when '/'
2285                                 tok_state = tok_state_end_tag_open
2286                         when '?'
2287                                 parse_error()
2288                                 tok_cur_tag = new_comment_token '?'
2289                                 tok_state = tok_state_bogus_comment
2290                         else
2291                                 if lc_alpha.indexOf(c) > -1
2292                                         tok_cur_tag = new_open_tag c
2293                                         tok_state = tok_state_tag_name
2294                                 else if uc_alpha.indexOf(c) > -1
2295                                         tok_cur_tag = new_open_tag c.toLowerCase()
2296                                         tok_state = tok_state_tag_name
2297                                 else
2298                                         parse_error()
2299                                         tok_state = tok_state_data
2300                                         cur -= 1 # we didn't parse/handle the char after <
2301                                         return new_text_node '<'
2302                 return null
2303
2304         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2305         tok_state_end_tag_open = ->
2306                 switch c = txt.charAt(cur++)
2307                         when '>'
2308                                 parse_error()
2309                                 tok_state = tok_state_data
2310                         when '' # EOF
2311                                 parse_error()
2312                                 tok_state = tok_state_data
2313                                 return new_text_node '</'
2314                         else
2315                                 if uc_alpha.indexOf(c) > -1
2316                                         tok_cur_tag = new_end_tag c.toLowerCase()
2317                                         tok_state = tok_state_tag_name
2318                                 else if lc_alpha.indexOf(c) > -1
2319                                         tok_cur_tag = new_end_tag c
2320                                         tok_state = tok_state_tag_name
2321                                 else
2322                                         parse_error()
2323                                         tok_cur_tag = new_comment_token '/'
2324                                         tok_state = tok_state_bogus_comment
2325                 return null
2326
2327         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2328         tok_state_tag_name = ->
2329                 switch c = txt.charAt(cur++)
2330                         when "\t", "\n", "\u000c", ' '
2331                                 tok_state = tok_state_before_attribute_name
2332                         when '/'
2333                                 tok_state = tok_state_self_closing_start_tag
2334                         when '>'
2335                                 tok_state = tok_state_data
2336                                 tmp = tok_cur_tag
2337                                 tok_cur_tag = null
2338                                 return tmp
2339                         when "\u0000"
2340                                 parse_error()
2341                                 tok_cur_tag.name += "\ufffd"
2342                         when '' # EOF
2343                                 parse_error()
2344                                 tok_state = tok_state_data
2345                         else
2346                                 if uc_alpha.indexOf(c) > -1
2347                                         tok_cur_tag.name += c.toLowerCase()
2348                                 else
2349                                         tok_cur_tag.name += c
2350                 return null
2351
2352         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2353         tok_state_rcdata_less_than_sign = ->
2354                 c = txt.charAt(cur++)
2355                 if c is '/'
2356                         temporary_buffer = ''
2357                         tok_state = tok_state_rcdata_end_tag_open
2358                         return null
2359                 # Anything else
2360                 tok_state = tok_state_rcdata
2361                 cur -= 1 # reconsume the input character
2362                 return new_character_token '<'
2363
2364         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2365         tok_state_rcdata_end_tag_open = ->
2366                 c = txt.charAt(cur++)
2367                 if uc_alpha.indexOf(c) > -1
2368                         tok_cur_tag = new_end_tag c.toLowerCase()
2369                         temporary_buffer += c
2370                         tok_state = tok_state_rcdata_end_tag_name
2371                         return null
2372                 if lc_alpha.indexOf(c) > -1
2373                         tok_cur_tag = new_end_tag c
2374                         temporary_buffer += c
2375                         tok_state = tok_state_rcdata_end_tag_name
2376                         return null
2377                 # Anything else
2378                 tok_state = tok_state_rcdata
2379                 cur -= 1 # reconsume the input character
2380                 return new_character_token "</" # fixfull separate these
2381
2382         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2383         is_appropriate_end_tag = (t) ->
2384                 # spec says to check against "the tag name of the last start tag to
2385                 # have been emitted from this tokenizer", but this is only called from
2386                 # the various "raw" states, which I'm pretty sure all push the start
2387                 # token onto open_els. TODO: verify this after the script data states
2388                 # are implemented
2389                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2390                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2391
2392         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2393         tok_state_rcdata_end_tag_name = ->
2394                 c = txt.charAt(cur++)
2395                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2396                         if is_appropriate_end_tag tok_cur_tag
2397                                 tok_state = tok_state_before_attribute_name
2398                                 return
2399                         # else fall through to "Anything else"
2400                 if c is '/'
2401                         if is_appropriate_end_tag tok_cur_tag
2402                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2403                                 return
2404                         # else fall through to "Anything else"
2405                 if c is '>'
2406                         if is_appropriate_end_tag tok_cur_tag
2407                                 tok_state = tok_state_data
2408                                 return tok_cur_tag
2409                         # else fall through to "Anything else"
2410                 if uc_alpha.indexOf(c) > -1
2411                         tok_cur_tag.name += c.toLowerCase()
2412                         temporary_buffer += c
2413                         return null
2414                 if lc_alpha.indexOf(c) > -1
2415                         tok_cur_tag.name += c
2416                         temporary_buffer += c
2417                         return null
2418                 # Anything else
2419                 tok_state = tok_state_rcdata
2420                 cur -= 1 # reconsume the input character
2421                 return new_character_token '</' + temporary_buffer # fixfull separate these
2422
2423         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2424         tok_state_rawtext_less_than_sign = ->
2425                 c = txt.charAt(cur++)
2426                 if c is '/'
2427                         temporary_buffer = ''
2428                         tok_state = tok_state_rawtext_end_tag_open
2429                         return null
2430                 # Anything else
2431                 tok_state = tok_state_rawtext
2432                 cur -= 1 # reconsume the input character
2433                 return new_character_token '<'
2434
2435         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2436         tok_state_rawtext_end_tag_open = ->
2437                 c = txt.charAt(cur++)
2438                 if uc_alpha.indexOf(c) > -1
2439                         tok_cur_tag = new_end_tag c.toLowerCase()
2440                         temporary_buffer += c
2441                         tok_state = tok_state_rawtext_end_tag_name
2442                         return null
2443                 if lc_alpha.indexOf(c) > -1
2444                         tok_cur_tag = new_end_tag c
2445                         temporary_buffer += c
2446                         tok_state = tok_state_rawtext_end_tag_name
2447                         return null
2448                 # Anything else
2449                 tok_state = tok_state_rawtext
2450                 cur -= 1 # reconsume the input character
2451                 return new_character_token "</" # fixfull separate these
2452
2453         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2454         tok_state_rawtext_end_tag_name = ->
2455                 c = txt.charAt(cur++)
2456                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2457                         if is_appropriate_end_tag tok_cur_tag
2458                                 tok_state = tok_state_before_attribute_name
2459                                 return
2460                         # else fall through to "Anything else"
2461                 if c is '/'
2462                         if is_appropriate_end_tag tok_cur_tag
2463                                 tok_state = tok_state_self_closing_start_tag
2464                                 return
2465                         # else fall through to "Anything else"
2466                 if c is '>'
2467                         if is_appropriate_end_tag tok_cur_tag
2468                                 tok_state = tok_state_data
2469                                 return tok_cur_tag
2470                         # else fall through to "Anything else"
2471                 if uc_alpha.indexOf(c) > -1
2472                         tok_cur_tag.name += c.toLowerCase()
2473                         temporary_buffer += c
2474                         return null
2475                 if lc_alpha.indexOf(c) > -1
2476                         tok_cur_tag.name += c
2477                         temporary_buffer += c
2478                         return null
2479                 # Anything else
2480                 tok_state = tok_state_rawtext
2481                 cur -= 1 # reconsume the input character
2482                 return new_character_token '</' + temporary_buffer # fixfull separate these
2483
2484         # TODO _all_ of the missing states here (17-33) are for parsing script tags
2485
2486         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2487         tok_state_before_attribute_name = ->
2488                 attr_name = null
2489                 switch c = txt.charAt(cur++)
2490                         when "\t", "\n", "\u000c", ' '
2491                                 return null
2492                         when '/'
2493                                 tok_state = tok_state_self_closing_start_tag
2494                                 return null
2495                         when '>'
2496                                 tok_state = tok_state_data
2497                                 tmp = tok_cur_tag
2498                                 tok_cur_tag = null
2499                                 return tmp
2500                         when "\u0000"
2501                                 parse_error()
2502                                 attr_name = "\ufffd"
2503                         when '"', "'", '<', '='
2504                                 parse_error()
2505                                 attr_name = c
2506                         when '' # EOF
2507                                 parse_error()
2508                                 tok_state = tok_state_data
2509                         else
2510                                 if uc_alpha.indexOf(c) > -1
2511                                         attr_name = c.toLowerCase()
2512                                 else
2513                                         attr_name = c
2514                 if attr_name?
2515                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2516                         tok_state = tok_state_attribute_name
2517                 return null
2518
2519         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2520         tok_state_attribute_name = ->
2521                 switch c = txt.charAt(cur++)
2522                         when "\t", "\n", "\u000c", ' '
2523                                 tok_state = tok_state_after_attribute_name
2524                         when '/'
2525                                 tok_state = tok_state_self_closing_start_tag
2526                         when '='
2527                                 tok_state = tok_state_before_attribute_value
2528                         when '>'
2529                                 tok_state = tok_state_data
2530                                 tmp = tok_cur_tag
2531                                 tok_cur_tag = null
2532                                 return tmp
2533                         when "\u0000"
2534                                 parse_error()
2535                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2536                         when '"', "'", '<'
2537                                 parse_error()
2538                                 tok_cur_tag.attrs_a[0][0] = c
2539                         when '' # EOF
2540                                 parse_error()
2541                                 tok_state = tok_state_data
2542                         else
2543                                 if uc_alpha.indexOf(c) > -1
2544                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2545                                 else
2546                                         tok_cur_tag.attrs_a[0][0] += c
2547                 return null
2548
2549         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2550         tok_state_after_attribute_name = ->
2551                 c = txt.charAt(cur++)
2552                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2553                         return
2554                 if c is '/'
2555                         tok_state = tok_state_self_closing_start_tag
2556                         return
2557                 if c is '='
2558                         tok_state = tok_state_before_attribute_value
2559                         return
2560                 if c is '>'
2561                         tok_state = tok_state_data
2562                         return
2563                 if uc_alpha.indexOf(c) > -1
2564                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2565                         tok_state = tok_state_attribute_name
2566                         return
2567                 if c is "\u0000"
2568                         parse_error()
2569                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2570                         tok_state = tok_state_attribute_name
2571                         return
2572                 if c is '' # EOF
2573                         parse_error()
2574                         tok_state = tok_state_data
2575                         cur -= 1 # reconsume
2576                         return
2577                 if c is '"' or c is "'" or c is '<'
2578                         parse_error()
2579                         # fall through to Anything else
2580                 # Anything else
2581                 tok_cur_tag.attrs_a.unshift [c, '']
2582                 tok_state = tok_state_attribute_name
2583
2584         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2585         tok_state_before_attribute_value = ->
2586                 switch c = txt.charAt(cur++)
2587                         when "\t", "\n", "\u000c", ' '
2588                                 return null
2589                         when '"'
2590                                 tok_state = tok_state_attribute_value_double_quoted
2591                         when '&'
2592                                 tok_state = tok_state_attribute_value_unquoted
2593                                 cur -= 1
2594                         when "'"
2595                                 tok_state = tok_state_attribute_value_single_quoted
2596                         when "\u0000"
2597                                 # Parse error
2598                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2599                                 tok_state = tok_state_attribute_value_unquoted
2600                         when '>'
2601                                 # Parse error
2602                                 tok_state = tok_state_data
2603                                 tmp = tok_cur_tag
2604                                 tok_cur_tag = null
2605                                 return tmp
2606                         when '' # EOF
2607                                 parse_error()
2608                                 tok_state = tok_state_data
2609                         else
2610                                 tok_cur_tag.attrs_a[0][1] += c
2611                                 tok_state = tok_state_attribute_value_unquoted
2612                 return null
2613
2614         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2615         tok_state_attribute_value_double_quoted = ->
2616                 switch c = txt.charAt(cur++)
2617                         when '"'
2618                                 tok_state = tok_state_after_attribute_value_quoted
2619                         when '&'
2620                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2621                         when "\u0000"
2622                                 # Parse error
2623                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2624                         when '' # EOF
2625                                 parse_error()
2626                                 tok_state = tok_state_data
2627                         else
2628                                 tok_cur_tag.attrs_a[0][1] += c
2629                 return null
2630
2631         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2632         tok_state_attribute_value_single_quoted = ->
2633                 switch c = txt.charAt(cur++)
2634                         when "'"
2635                                 tok_state = tok_state_after_attribute_value_quoted
2636                         when '&'
2637                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2638                         when "\u0000"
2639                                 # Parse error
2640                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2641                         when '' # EOF
2642                                 parse_error()
2643                                 tok_state = tok_state_data
2644                         else
2645                                 tok_cur_tag.attrs_a[0][1] += c
2646                 return null
2647
2648         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2649         tok_state_attribute_value_unquoted = ->
2650                 switch c = txt.charAt(cur++)
2651                         when "\t", "\n", "\u000c", ' '
2652                                 tok_state = tok_state_before_attribute_name
2653                         when '&'
2654                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2655                         when '>'
2656                                 tok_state = tok_state_data
2657                                 tmp = tok_cur_tag
2658                                 tok_cur_tag = null
2659                                 return tmp
2660                         when "\u0000"
2661                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2662                         when '' # EOF
2663                                 parse_error()
2664                                 tok_state = tok_state_data
2665                         else
2666                                 # Parse Error if ', <, = or ` (backtick)
2667                                 tok_cur_tag.attrs_a[0][1] += c
2668                 return null
2669
2670         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2671         tok_state_after_attribute_value_quoted = ->
2672                 switch c = txt.charAt(cur++)
2673                         when "\t", "\n", "\u000c", ' '
2674                                 tok_state = tok_state_before_attribute_name
2675                         when '/'
2676                                 tok_state = tok_state_self_closing_start_tag
2677                         when '>'
2678                                 tok_state = tok_state_data
2679                                 tmp = tok_cur_tag
2680                                 tok_cur_tag = null
2681                                 return tmp
2682                         when '' # EOF
2683                                 parse_error()
2684                                 tok_state = tok_state_data
2685                         else
2686                                 # Parse Error
2687                                 tok_state = tok_state_before_attribute_name
2688                                 cur -= 1 # we didn't handle that char
2689                 return null
2690
2691         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
2692         # WARNING: put a comment token in tok_cur_tag before setting this state
2693         tok_state_bogus_comment = ->
2694                 next_gt = txt.indexOf '>', cur
2695                 if next_gt is -1
2696                         val = txt.substr cur
2697                         cur = txt.length
2698                 else
2699                         val = txt.substr cur, (next_gt - cur)
2700                         cur = next_gt + 1
2701                 val = val.replace "\u0000", "\ufffd"
2702                 tok_cur_tag.text += val
2703                 tok_state = tok_state_data
2704                 return tok_cur_tag
2705
2706         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
2707         tok_state_markup_declaration_open = ->
2708                 if txt.substr(cur, 2) is '--'
2709                         cur += 2
2710                         tok_cur_tag = new_comment_token ''
2711                         tok_state = tok_state_comment_start
2712                         return
2713                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
2714                         cur += 7
2715                         tok_state = tok_state_doctype
2716                         return
2717                 acn = adjusted_current_node()
2718                 if acn and acn.namespace isnt NS_HTML and text.substr(cur, 7) is '[CDATA['
2719                         cur += 7
2720                         tok_state = tok_state_cdata_section
2721                         return
2722                 # Otherwise
2723                 parse_errer()
2724                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
2725                 tok_state = tok_state_bogus_comment
2726                 return
2727
2728         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
2729         tok_state_comment_start = ->
2730                 switch c = txt.charAt(cur++)
2731                         when '-'
2732                                 tok_state = tok_state_comment_start_dash
2733                         when "\u0000"
2734                                 parse_error()
2735                                 return new_character_token "\ufffd"
2736                         when '>'
2737                                 parse_error()
2738                                 tok_state = tok_state_data
2739                                 return tok_cur_tag
2740                         when '' # EOF
2741                                 parse_error()
2742                                 tok_state = tok_state_data
2743                                 cur -= 1 # Reconsume
2744                                 return tok_cur_tag
2745                         else
2746                                 tok_cur_tag.text += c
2747                 return null
2748
2749         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
2750         tok_state_comment_start_dash = ->
2751                 switch c = txt.charAt(cur++)
2752                         when '-'
2753                                 tok_state = tok_state_comment_end
2754                         when "\u0000"
2755                                 parse_error()
2756                                 tok_cur_tag.text += "-\ufffd"
2757                                 tok_state = tok_state_comment
2758                         when '>'
2759                                 parse_error()
2760                                 tok_state = tok_state_data
2761                                 return tok_cur_tag
2762                         when '' # EOF
2763                                 parse_error()
2764                                 tok_state = tok_state_data
2765                                 cur -= 1 # Reconsume
2766                                 return tok_cur_tag
2767                         else
2768                                 tok_cur_tag.text += "-#{c}"
2769                                 tok_state = tok_state_comment
2770                 return null
2771
2772         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
2773         tok_state_comment = ->
2774                 switch c = txt.charAt(cur++)
2775                         when '-'
2776                                 tok_state = tok_state_comment_end_dash
2777                         when "\u0000"
2778                                 parse_error()
2779                                 tok_cur_tag.text += "\ufffd"
2780                         when '' # EOF
2781                                 parse_error()
2782                                 tok_state = tok_state_data
2783                                 cur -= 1 # Reconsume
2784                                 return tok_cur_tag
2785                         else
2786                                 tok_cur_tag.text += c
2787                 return null
2788
2789         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
2790         tok_state_comment_end_dash = ->
2791                 switch c = txt.charAt(cur++)
2792                         when '-'
2793                                 tok_state = tok_state_comment_end
2794                         when "\u0000"
2795                                 parse_error()
2796                                 tok_cur_tag.text += "-\ufffd"
2797                                 tok_state = tok_state_comment
2798                         when '' # EOF
2799                                 parse_error()
2800                                 tok_state = tok_state_data
2801                                 cur -= 1 # Reconsume
2802                                 return tok_cur_tag
2803                         else
2804                                 tok_cur_tag.text += "-#{c}"
2805                                 tok_state = tok_state_comment
2806                 return null
2807
2808         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
2809         tok_state_comment_end = ->
2810                 switch c = txt.charAt(cur++)
2811                         when '>'
2812                                 tok_state = tok_state_data
2813                                 return tok_cur_tag
2814                         when "\u0000"
2815                                 parse_error()
2816                                 tok_cur_tag.text += "--\ufffd"
2817                                 tok_state = tok_state_comment
2818                         when '!'
2819                                 parse_error()
2820                                 tok_state = tok_state_comment_end_bang
2821                         when '-'
2822                                 parse_error()
2823                                 tok_cur_tag.text += '-'
2824                         when '' # EOF
2825                                 parse_error()
2826                                 tok_state = tok_state_data
2827                                 cur -= 1 # Reconsume
2828                                 return tok_cur_tag
2829                         else
2830                                 parse_error()
2831                                 tok_cur_tag.text += "--#{c}"
2832                                 tok_state = tok_state_comment
2833                 return null
2834
2835         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
2836         tok_state_comment_end_bang = ->
2837                 switch c = txt.charAt(cur++)
2838                         when '-'
2839                                 tok_cur_tag.text += "--!#{c}"
2840                                 tok_state = tok_state_comment_end_dash
2841                         when '>'
2842                                 tok_state = tok_state_data
2843                                 return tok_cur_tag
2844                         when "\u0000"
2845                                 parse_error()
2846                                 tok_cur_tag.text += "--!\ufffd"
2847                                 tok_state = tok_state_comment
2848                         when '' # EOF
2849                                 parse_error()
2850                                 tok_state = tok_state_data
2851                                 cur -= 1 # Reconsume
2852                                 return tok_cur_tag
2853                         else
2854                                 tok_cur_tag.text += "--!#{c}"
2855                                 tok_state = tok_state_comment
2856                 return null
2857
2858
2859         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2860         # Don't set this as a state, just call it
2861         # returns a string (NOT a text node)
2862         parse_character_reference = (allowed_char = null, in_attr = false) ->
2863                 if cur >= txt.length
2864                         return '&'
2865                 switch c = txt.charAt(cur)
2866                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2867                                 # explicitly not a parse error
2868                                 return '&'
2869                         when ';'
2870                                 # there has to be "one or more" alnums between & and ; to be a parse error
2871                                 return '&'
2872                         when '#'
2873                                 if cur + 1 >= txt.length
2874                                         return '&'
2875                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
2876                                         prefix = '#x'
2877                                         charset = hex_chars
2878                                         start = cur + 2
2879                                 else
2880                                         charset = digits
2881                                         start = cur + 1
2882                                         prefix = '#'
2883                                 i = 0
2884                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2885                                         i += 1
2886                                 if i is 0
2887                                         return '&'
2888                                 if txt.charAt(start + i) is ';'
2889                                         i += 1
2890                                 # FIXME This is supposed to generate parse errors for some chars
2891                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2892                                 if decoded?
2893                                         cur = start + i
2894                                         return decoded
2895                                 return '&'
2896                         else
2897                                 for i in [0...31]
2898                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
2899                                                 break
2900                                 if i is 0
2901                                         # exit early, because parse_error() below needs at least one alnum
2902                                         return '&'
2903                                 if txt.charAt(cur + i) is ';'
2904                                         i += 1 # include ';' terminator in value
2905                                         decoded = decode_named_char_ref txt.substr(cur, i)
2906                                         if decoded?
2907                                                 cur += i
2908                                                 return decoded
2909                                         parse_error()
2910                                         return '&'
2911                                 else
2912                                         # no ';' terminator (only legacy char refs)
2913                                         max = i
2914                                         for i in [2..max] # no prefix matches, so ok to check shortest first
2915                                                 c = legacy_char_refs[txt.substr(cur, i)]
2916                                                 if c?
2917                                                         if in_attr
2918                                                                 if txt.charAt(cur + i) is '='
2919                                                                         # "because some legacy user agents will
2920                                                                         # misinterpret the markup in those cases"
2921                                                                         parse_error()
2922                                                                         return '&'
2923                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
2924                                                                         # this makes attributes forgiving about url args
2925                                                                         return '&'
2926                                                         # ok, and besides the weird exceptions for attributes...
2927                                                         # return the matching char
2928                                                         cur += i # consume entity chars
2929                                                         parse_error() # because no terminating ";"
2930                                                         return c
2931                                         parse_error()
2932                                         return '&'
2933                 return # never reached
2934
2935         # tree constructor initialization
2936         # see comments on TYPE_TAG/etc for the structure of this data
2937         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2938         open_els = [doc]
2939         afe = [] # active formatting elements
2940         template_insertion_modes = []
2941         insertion_mode = ins_mode_initial
2942         original_insertion_mode = insertion_mode # TODO check spec
2943         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2944         flag_frameset_ok = true
2945         flag_parsing = true
2946         flag_foster_parenting = false
2947         form_element_pointer = null
2948         temporary_buffer = null
2949         pending_table_character_tokens = []
2950         head_element_pointer = null
2951         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
2952         context_element = null # FIXME initialize from args.fragment
2953
2954         # tokenizer initialization
2955         tok_state = tok_state_data
2956
2957         # proccess input
2958         while flag_parsing
2959                 t = tok_state()
2960                 if t?
2961                         insertion_mode t
2962                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
2963         return doc.children
2964
# Serialize every element of els by calling its serialize(shallow, show_ids)
# method, and return the results joined with commas ('' for an empty list).
serialize_els = (els, shallow, show_ids) ->
        # "#{...}" coerces each result to a string the same way += would
        return ("#{t.serialize shallow, show_ids}" for t in els).join ','
2973
# TODO export TYPE_*
# Public API of this module: the parser entry point, the debug-log helpers and
# the node type constants.
do ->
        api = {parse_html, debug_log_reset, debug_log_each, TYPE_TAG, TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE}
        for own key, val of api
                module.exports[key] = val
        return