JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implement noa's ark, junk after attribute name
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50
51 # Each node is an obect of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
54 TYPE_COMMENT = 2
55 TYPE_DOCTYPE = 3
56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
59 TYPE_EOF = 6
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
62
63 # namespace constants
64 NS_HTML = 1
65 NS_MATHML = 2
66 NS_SVG = 3
67
68 g_debug_log = []
69 debug_log_reset = ->
70         g_debug_log = []
71 debug_log = (str) ->
72         g_debug_log.push str
73 debug_log_each = (cb) ->
74         for str in g_debug_log
75                 cb str
76
77 prev_node_id = 0
78 class Node
79         constructor: (type, args = {}) ->
80                 @type = type # one of the TYPE_* constants above
81                 @name = args.name ? '' # tag name
82                 @text = args.text ? '' # contents for text/comment nodes
83                 @attrs = args.attrs ? {}
84                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85                 @children = args.children ? []
86                 @namespace = args.namespace ? NS_HTML
87                 @parent = args.parent ? null
88                 if args.id?
89                         @id = "#{args.id}+"
90                 else
91                         @id = "#{++prev_node_id}"
92         shallow_clone: -> # return a new node that's the same except without the children or parent
93                 # WARNING this doesn't work right on open tags that are still being parsed
94                 attrs = {}
95                 attrs[k] = v for k, v of @attrs
96                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97         serialize: (shallow = false, show_ids = false) -> # for unit tests
98                 ret = ''
99                 switch @type
100                         when TYPE_TAG
101                                 ret += 'tag:'
102                                 ret += JSON.stringify @name
103                                 ret += ','
104                                 if show_ids
105                                         ret += "##{@id},"
106                                 if shallow
107                                         break
108                                 attr_keys = []
109                                 for k of @attrs
110                                         attr_keys.push k
111                                 attr_keys.sort()
112                                 ret += '{'
113                                 sep = ''
114                                 for k in attr_keys
115                                         ret += sep
116                                         sep = ','
117                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
118                                 ret += '},['
119                                 sep = ''
120                                 for c in @children
121                                         ret += sep
122                                         sep = ','
123                                         ret += c.serialize shallow, show_ids
124                                 ret += ']'
125                         when TYPE_TEXT
126                                 ret += 'text:'
127                                 ret += JSON.stringify @text
128                         when TYPE_COMMENT
129                                 ret += 'comment:'
130                                 ret += JSON.stringify @text
131                         when TYPE_DOCTYPE
132                                 ret += 'doctype'
133                                 # FIXME
134                         when TYPE_AFE_MARKER
135                                 ret += 'marker'
136                         when TYPE_AAA_BOOKMARK
137                                 ret += 'aaa_bookmark'
138                         else
139                                 ret += 'unknown:'
140                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
141                 return ret
142
143 # helpers: (only take args that are normally known when parser creates nodes)
144 new_open_tag = (name) ->
145         return new Node TYPE_START_TAG, name: name
146 new_end_tag = (name) ->
147         return new Node TYPE_END_TAG, name: name
148 new_element = (name) ->
149         return new Node TYPE_TAG, name: name
150 new_text_node = (txt) ->
151         return new Node TYPE_TEXT, text: txt
152 new_comment_node = (txt) ->
153         return new Node TYPE_COMMENT, text: txt
154 new_eof_token = ->
155         return new Node TYPE_EOF
156 new_afe_marker = ->
157         return new Node TYPE_AFE_MARKER
158 new_aaa_bookmark = ->
159         return new Node TYPE_AAA_BOOKMARK
160
161 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
162 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
163 digits = "0123456789"
164 alnum = lc_alpha + uc_alpha + digits
165 hex_chars = digits + "abcdefABCDEF"
166
167 # some SVG elements have dashes in them
168 tag_name_chars = alnum + "-"
169
170 # http://www.w3.org/TR/html5/infrastructure.html#space-character
171 space_chars = "\u0009\u000a\u000c\u000d\u0020"
172
173 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
174 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
175
176 # These are the character references that don't need a terminating semicolon
177 # min length: 2, max: 6, none are a prefix of any other.
178 legacy_char_refs = {
179         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
180         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
181         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
182         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
183         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
184         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
185         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
186         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
187         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
188         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
189         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
190         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
191         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
192         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
193         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
194         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
195         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
196         yen: '¥', yuml: 'ÿ'
197 }
198
199 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
200 raw_text_elements = ['script', 'style']
201 escapable_raw_text_elements = ['textarea', 'title']
202 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
203 svg_elements = [
204         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
205         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
206         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
207         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
208         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
209         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
210         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
211         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
212         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
213         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
214         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
215         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
216         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
217         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
218         'view', 'vkern'
219 ]
220
221 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
222 mathml_elements = [
223         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
224         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
225         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
226         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
227         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
228         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
229         'determinant', 'diff', 'divergence', 'divide', 'domain',
230         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
231         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
232         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
233         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
234         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
235         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
236         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
237         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
238         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
239         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
240         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
241         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
242         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
243         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
244         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
245         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
246         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
247         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
248         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
249         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
250         'vectorproduct', 'xor'
251 ]
252 # foreign_elements = [svg_elements..., mathml_elements...]
253 #normal_elements = All other allowed HTML elements are normal elements.
254
255 special_elements = {
256         # HTML:
257         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
258         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
259         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
260         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
261         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
262         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
263         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
264         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
265         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
266         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
267         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
268         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
269         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
270         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
271         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
272         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
273         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
274         wbr:NS_HTML, xmp:NS_HTML,
275
276         # MathML:
277         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
278         'annotation-xml':NS_MATHML,
279
280         # SVG:
281         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
282 }
283
284 formatting_elements = {
285          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
286          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
287          u: true
288 }
289
290 foster_parenting_targets = {
291         table: true
292         tbody: true
293         tfoot: true
294         thead: true
295         tr: true
296 }
297
298 # all html I presume
299 end_tag_implied = {
300         dd: true
301         dt: true
302         li: true
303         option: true
304         optgroup: true
305         p: true
306         rb: true
307         rp: true
308         rt: true
309         rtc: true
310 }
311
312 el_is_special = (e) ->
313         return special_elements[e.name]?
314         # FIXME it should really be:
315         #return special_elements[e.name] is e.namespace
316
317 # decode_named_char_ref()
318 #
319 # The list of named character references is _huge_ so ask the browser to decode
320 # for us instead of wasting bandwidth/space on including the table here.
321 #
322 # Pass without the "&" but with the ";" examples:
323 #    for "&amp" pass "amp;"
324 #    for "&#x2032" pass "x2032;"
325 g_dncr = {
326         cache: {}
327         textarea: document.createElement('textarea')
328 }
329 # TODO test this in IE8
330 decode_named_char_ref = (txt) ->
331         txt = "&#{txt}"
332         decoded = g_dncr.cache[txt]
333         return decoded if decoded?
334         g_dncr.textarea.innerHTML = txt
335         decoded = g_dncr.textarea.value
336         return null if decoded is txt
337         return g_dncr.cache[txt] = decoded
338
339 parse_html = (txt, parse_error_cb = null) ->
340         cur = 0 # index of next char in txt to be parsed
341         # declare tree and tokenizer variables so they're in scope below
342         tree = null
343         open_els = [] # stack of open elements
344         insertion_mode = null
345         tok_state = null
346         tok_cur_tag = null # partially parsed tag
347         flag_frameset_ok = null
348         flag_parsing = null
349         flag_foster_parenting = null
350         form_element_pointer = null
351         afe = [] # active formatting elements
352
353         parse_error = ->
354                 if parse_error_cb?
355                         parse_error_cb cur
356                 else
357                         console.log "Parse error at character #{cur} of #{txt.length}"
358
359         afe_push = (new_el) ->
360                 matches = 0
361                 for el, i in afe
362                         if el.name is new_el.name and el.namespace is new_el.namespace
363                                 for k, v of el.attrs
364                                         continue unless new_el.attrs[k] is v
365                                 for k, v of new_el.attrs
366                                         continue unless el.attrs[k] is v
367                                 matches += 1
368                                 if matches is 3
369                                         afe.splice i, 1
370                                         break
371                 afe.unshift new_el
372         afe_push_marker = ->
373                 afe.unshift new_afe_marker()
374
375         # the functions below impliment the Tree Contstruction algorithm
376         # http://www.w3.org/TR/html5/syntax.html#tree-construction
377
378         # But first... the helpers
379         template_tag_is_open = ->
380                 for t in open_els
381                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
382                                 return true
383                 return false
384         is_in_scope_x = (tag_name, scope, namespace) ->
385                 for t in open_els
386                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
387                                 return true
388                         if scope[t.name] is t.namespace
389                                 return false
390                 return false
391         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
392                 for t in open_els
393                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
394                                 return true
395                         if scope[t.name] is t.namespace
396                                 return false
397                         if scope2[t.name] is t.namespace
398                                 return false
399                 return false
400         standard_scopers = { # FIXME these are supposed to be namespace specific
401                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
402                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
403                 template: NS_HTML, mi: NS_MATHML,
404
405                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
406                 'annotation-xml': NS_MATHML,
407
408                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
409         }
410         button_scopers = button: NS_HTML
411         li_scopers = ol: NS_HTML, ul: NS_HTML
412         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
413         is_in_scope = (tag_name, namespace = null) ->
414                 return is_in_scope_x tag_name, standard_scopers, namespace
415         is_in_button_scope = (tag_name, namespace = null) ->
416                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
417         is_in_table_scope = (tag_name, namespace = null) ->
418                 return is_in_scope_x tag_name, table_scopers, namespace
419         is_in_select_scope = (tag_name, namespace = null) ->
420                 for t in open_els
421                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
422                                 return true
423                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
424                                 return false
425                 return false
426         # this checks for a particular element, not by name
427         el_is_in_scope = (el) ->
428                 for t in open_els
429                         if t is el
430                                 return true
431                         if standard_scopers[t.name] is t.namespace
432                                 return false
433                 return false
434
435         # 8.2.3.1 ...
436         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
437         reset_insertion_mode = ->
438                 # 1. Let last be false.
439                 last = false
440                 # 2. Let node be the last node in the stack of open elements.
441                 node_i = 0
442                 node = open_els[node_i]
443                 # 3. Loop: If node is the first node in the stack of open elements,
444                 # then set last to true, and, if the parser was originally created as
445                 # part of the HTML fragment parsing algorithm (fragment case) set node
446                 # to the context element.
447                 loop
448                         if node_i is open_els.length - 1
449                                 last = true
450                                 # fixfull (fragment case)
451
452                         # 4. If node is a select element, run these substeps:
453                         if node.name is 'select'
454                                 # 1. If last is true, jump to the step below labeled done.
455                                 unless last
456                                         # 2. Let ancestor be node.
457                                         ancestor_i = node_i
458                                         ancestor = node
459                                         # 3. Loop: If ancestor is the first node in the stack of
460                                         # open elements, jump to the step below labeled done.
461                                         loop
462                                                 if ancestor_i is open_els.length - 1
463                                                         break
464                                                 # 4. Let ancestor be the node before ancestor in the stack
465                                                 # of open elements.
466                                                 ancestor_i += 1
467                                                 ancestor = open_els[ancestor_i]
468                                                 # 5. If ancestor is a template node, jump to the step below
469                                                 # labeled done.
470                                                 if ancestor.name is 'template'
471                                                         break
472                                                 # 6. If ancestor is a table node, switch the insertion mode
473                                                 # to "in select in table" and abort these steps.
474                                                 if ancestor.name is 'table'
475                                                         insertion_mode = ins_mode_in_select_in_table
476                                                         return
477                                                 # 7. Jump back to the step labeled loop.
478                                 # 8. Done: Switch the insertion mode to "in select" and abort
479                                 # these steps.
480                                 insertion_mode = ins_mode_in_select
481                                 return
482                         # 5. If node is a td or th element and last is false, then switch
483                         # the insertion mode to "in cell" and abort these steps.
484                         if (node.name is 'td' or node.name is 'th') and last is false
485                                 insertion_mode = ins_mode_in_cell
486                                 return
487                         # 6. If node is a tr element, then switch the insertion mode to "in
488                         # row" and abort these steps.
489                         if node.name is 'tr'
490                                 insertion_mode = ins_mode_in_row
491                                 return
492                         # 7. If node is a tbody, thead, or tfoot element, then switch the
493                         # insertion mode to "in table body" and abort these steps.
494                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
495                                 insertion_mode = ins_mode_in_table_body
496                                 return
497                         # 8. If node is a caption element, then switch the insertion mode
498                         # to "in caption" and abort these steps.
499                         if node.name is 'caption'
500                                 insertion_mode = ins_mode_in_caption
501                                 return
502                         # 9. If node is a colgroup element, then switch the insertion mode
503                         # to "in column group" and abort these steps.
504                         if node.name is 'colgroup'
505                                 insertion_mode = ins_mode_in_column_group
506                                 return
507                         # 10. If node is a table element, then switch the insertion mode to
508                         # "in table" and abort these steps.
509                         if node.name is 'table'
510                                 insertion_mode = ins_mode_in_table
511                                 return
512                         # 11. If node is a template element, then switch the insertion mode
513                         # to the current template insertion mode and abort these steps.
514                         # fixfull (template insertion mode stack)
515
516                         # 12. If node is a head element and last is true, then switch the
517                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
518                         # these steps. (fragment case)
519                         if node.name is 'head' and last
520                                 insertion_mode = ins_mode_in_body
521                                 return
522                         # 13. If node is a head element and last is false, then switch the
523                         # insertion mode to "in head" and abort these steps.
524                         if node.name is 'head' and last is false
525                                 insertion_mode = ins_mode_in_head
526                                 return
527                         # 14. If node is a body element, then switch the insertion mode to
528                         # "in body" and abort these steps.
529                         if node.name is 'body'
530                                 insertion_mode = ins_mode_in_body
531                                 return
532                         # 15. If node is a frameset element, then switch the insertion mode
533                         # to "in frameset" and abort these steps. (fragment case)
534                         if node.name is 'frameset'
535                                 insertion_mode = ins_mode_in_frameset
536                                 return
537                         # 16. If node is an html element, run these substeps:
538                         if node.name is 'html'
539                                 # 1. If the head element pointer is null, switch the insertion
540                                 # mode to "before head" and abort these steps. (fragment case)
541                                 # fixfull (fragment case)
542
543                                 # 2. Otherwise, the head element pointer is not null, switch
544                                 # the insertion mode to "after head" and abort these steps.
545                                 insertion_mode = ins_mode_in_body # FIXME fixfull
546                                 return
547                         # 17. If last is true, then switch the insertion mode to "in body"
548                         # and abort these steps. (fragment case)
549                         if last
550                                 insertion_mode = ins_mode_in_body
551                                 return
552                         # 18. Let node now be the node before node in the stack of open
553                         # elements.
554                         node_i += 1
555                         node = open_els[node_i]
556                         # 19. Return to the step labeled loop.
557
558         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
559         # this implementation is structured (mostly) as described at the link above.
560         # capitalized comments are the "labels" described at the link above.
561         reconstruct_active_formatting_elements = ->
562                 return if afe.length is 0
563                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
564                         return
565                 # Rewind
566                 i = 0
567                 loop
568                         if i is afe.length - 1
569                                 break
570                         i += 1
571                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
572                                 i -= 1 # Advance
573                                 break
574                 # Create
575                 loop
576                         el = afe[i].shallow_clone()
577                         tree_insert_element el
578                         afe[i] = el
579                         break if i is 0
580                         i -= 1 # Advance
581
582         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
583         # adoption agency algorithm
584         # overview here:
585         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
586         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
587         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
588         adoption_agency = (subject) ->
589                 debug_log "adoption_agency()"
590                 debug_log "tree: #{serialize_els tree.children, false, true}"
591                 debug_log "open_els: #{serialize_els open_els, true, true}"
592                 debug_log "afe: #{serialize_els afe, true, true}"
593                 if open_els[0].name is subject
594                         el = open_els[0]
595                         open_els.shift()
596                         # remove it from the list of active formatting elements (if found)
597                         for t, i in afe
598                                 if t is el
599                                         afe.splice i, 1
600                                         break
601                         debug_log "aaa: starting off with subject on top of stack, exiting"
602                         return
603                 outer = 0
604                 loop
605                         if outer >= 8
606                                 return
607                         outer += 1
608                         # 5. Let formatting element be the last element in the list of
609                         # active formatting elements that: is between the end of the list
610                         # and the last scope marker in the list, if any, or the start of
611                         # the list otherwise, and  has the tag name subject.
612                         fe = null
613                         for t, fe_of_afe in afe
614                                 if t.type is TYPE_AFE_MARKER
615                                         break
616                                 if t.name is subject
617                                         fe = t
618                                         break
619                         # If there is no such element, then abort these steps and instead
620                         # act as described in the "any other end tag" entry above.
621                         if fe is null
622                                 debug_log "aaa: fe not found in afe"
623                                 in_body_any_other_end_tag subject
624                                 return
625                         # 6. If formatting element is not in the stack of open elements,
626                         # then this is a parse error; remove the element from the list, and
627                         # abort these steps.
628                         in_open_els = false
629                         for t, fe_of_open_els in open_els
630                                 if t is fe
631                                         in_open_els = true
632                                         break
633                         unless in_open_els
634                                 debug_log "aaa: fe not found in open_els"
635                                 parse_error()
636                                 # "remove it from the list" must mean afe, since it's not in open_els
637                                 afe.splice fe_of_afe, 1
638                                 return
639                         # 7. If formatting element is in the stack of open elements, but
640                         # the element is not in scope, then this is a parse error; abort
641                         # these steps.
642                         unless el_is_in_scope fe
643                                 debug_log "aaa: fe not in scope"
644                                 parse_error()
645                                 return
646                         # 8. If formatting element is not the current node, this is a parse
647                         # error. (But do not abort these steps.)
648                         unless open_els[0] is fe
649                                 parse_error()
650                                 # continue
651                         # 9. Let furthest block be the topmost node in the stack of open
652                         # elements that is lower in the stack than formatting element, and
653                         # is an element in the special category. There might not be one.
654                         fb = null
655                         fb_of_open_els = null
656                         for t, i in open_els
657                                 if t is fe
658                                         break
659                                 if el_is_special t
660                                         fb = t
661                                         fb_of_open_els = i
662                                         # and continue, to see if there's one that's more "topmost"
663                         # 10. If there is no furthest block, then the UA must first pop all
664                         # the nodes from the bottom of the stack of open elements, from the
665                         # current node up to and including formatting element, then remove
666                         # formatting element from the list of active formatting elements,
667                         # and finally abort these steps.
668                         if fb is null
669                                 debug_log "aaa: no fb"
670                                 loop
671                                         t = open_els.shift()
672                                         if t is fe
673                                                 afe.splice fe_of_afe, 1
674                                                 return
675                         # 11. Let common ancestor be the element immediately above
676                         # formatting element in the stack of open elements.
677                         ca = open_els[fe_of_open_els + 1] # common ancestor
678
679                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
680                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
681                         bookmark = new_aaa_bookmark()
682                         for t, i in afe
683                                 if t is fe
684                                         afe.splice i, 0, bookmark
685                                         break
686                         node = last_node = fb
687                         inner = 0
688                         loop
689                                 inner += 1
690                                 # 3. Let node be the element immediately above node in the
691                                 # stack of open elements, or if node is no longer in the stack
692                                 # of open elements (e.g. because it got removed by this
693                                 # algorithm), the element that was immediately above node in
694                                 # the stack of open elements before node was removed.
695                                 node_next = null
696                                 for t, i in open_els
697                                         if t is node
698                                                 node_next = open_els[i + 1]
699                                                 break
700                                 node = node_next ? node_above
701                                 debug_log "inner loop #{inner}"
702                                 debug_log "tree: #{serialize_els tree.children, false, true}"
703                                 debug_log "open_els: #{serialize_els open_els, true, true}"
704                                 debug_log "afe: #{serialize_els afe, true, true}"
705                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
706                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
707                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
708                                 debug_log "node: #{node.serialize true, true}"
709                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
710
711                                 # 4. If node is formatting element, then go to the next step in
712                                 # the overall algorithm.
713                                 if node is fe
714                                         break
715                                 debug_log "the meat"
716                                 # 5. If inner loop counter is greater than three and node is in
717                                 # the list of active formatting elements, then remove node from
718                                 # the list of active formatting elements.
719                                 node_in_afe = false
720                                 for t, i in afe
721                                         if t is node
722                                                 if inner > 3
723                                                         afe.splice i, 1
724                                                         debug_log "max out inner"
725                                                 else
726                                                         node_in_afe = true
727                                                         debug_log "in afe"
728                                                 break
729                                 # 6. If node is not in the list of active formatting elements,
730                                 # then remove node from the stack of open elements and then go
731                                 # back to the step labeled inner loop.
732                                 unless node_in_afe
733                                         debug_log "not in afe"
734                                         for t, i in open_els
735                                                 if t is node
736                                                         node_above = open_els[i + 1]
737                                                         open_els.splice i, 1
738                                                         break
739                                         continue
740                                 debug_log "the bones"
741                                 # 7. create an element for the token for which the element node
742                                 # was created, in the HTML namespace, with common ancestor as
743                                 # the intended parent; replace the entry for node in the list
744                                 # of active formatting elements with an entry for the new
745                                 # element, replace the entry for node in the stack of open
746                                 # elements with an entry for the new element, and let node be
747                                 # the new element.
748                                 new_node = node.shallow_clone()
749                                 for t, i in afe
750                                         if t is node
751                                                 afe[i] = new_node
752                                                 debug_log "replaced in afe"
753                                                 break
754                                 for t, i in open_els
755                                         if t is node
756                                                 node_above = open_els[i + 1]
757                                                 open_els[i] = new_node
758                                                 debug_log "replaced in open_els"
759                                                 break
760                                 node = new_node
761                                 # 8. If last node is furthest block, then move the
762                                 # aforementioned bookmark to be immediately after the new node
763                                 # in the list of active formatting elements.
764                                 if last_node is fb
765                                         for t, i in afe
766                                                 if t is bookmark
767                                                         afe.splice i, 1
768                                                         debug_log "removed bookmark"
769                                                         break
770                                         for t, i in afe
771                                                 if t is node
772                                                         # "after" means lower
773                                                         afe.splice i, 0, bookmark # "after as <-
774                                                         debug_log "placed bookmark after node"
775                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
776                                                         break
777                                 # 9. Insert last node into node, first removing it from its
778                                 # previous parent node if any.
779                                 if last_node.parent?
780                                         debug_log "last_node has parent"
781                                         for c, i in last_node.parent.children
782                                                 if c is last_node
783                                                         debug_log "removing last_node from parent"
784                                                         last_node.parent.children.splice i, 1
785                                                         break
786                                 node.children.push last_node
787                                 last_node.parent = node
788                                 # 10. Let last node be node.
789                                 last_node = node
790                                 debug_log "at last"
791                                 # 11. Return to the step labeled inner loop.
792                         # 14. Insert whatever last node ended up being in the previous step
793                         # at the appropriate place for inserting a node, but using common
794                         # ancestor as the override target.
795
796                         # JASON: In the case where fe is immediately followed by fb:
797                         #   * inner loop exits out early (node==fe)
798                         #   * last_node is fb
799                         #   * last_node is still in the tree (not a duplicate)
800                         if last_node.parent?
801                                 debug_log "FEFIRST? last_node has parent"
802                                 for c, i in last_node.parent.children
803                                         if c is last_node
804                                                 debug_log "removing last_node from parent"
805                                                 last_node.parent.children.splice i, 1
806                                                 break
807
808                         debug_log "after aaa inner loop"
809                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
810                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
811                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
812                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
813                         debug_log "tree: #{serialize_els tree.children, false, true}"
814
815                         debug_log "insert"
816
817
818                         # can't use standard insert token thing, because it's already in
819                         # open_els and must stay at it's current position in open_els
820                         dest = adjusted_insertion_location ca
821                         dest[0].children.splice dest[1], 0, last_node
822                         last_node.parent = dest[0]
823
824
825                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
826                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
827                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
828                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
829                         debug_log "tree: #{serialize_els tree.children, false, true}"
830
831                         # 15. Create an element for the token for which formatting element
832                         # was created, in the HTML namespace, with furthest block as the
833                         # intended parent.
834                         new_element = fe.shallow_clone() # FIXME intended parent thing
835                         # 16. Take all of the child nodes of furthest block and append them
836                         # to the element created in the last step.
837                         while fb.children.length
838                                 t = fb.children.shift()
839                                 t.parent = new_element
840                                 new_element.children.push t
841                         # 17. Append that new element to furthest block.
842                         new_element.parent = fb
843                         fb.children.push new_element
844                         # 18. Remove formatting element from the list of active formatting
845                         # elements, and insert the new element into the list of active
846                         # formatting elements at the position of the aforementioned
847                         # bookmark.
848                         for t, i in afe
849                                 if t is fe
850                                         afe.splice i, 1
851                                         break
852                         for t, i in afe
853                                 if t is bookmark
854                                         afe[i] = new_element
855                                         break
856                         # 19. Remove formatting element from the stack of open elements,
857                         # and insert the new element into the stack of open elements
858                         # immediately below the position of furthest block in that stack.
859                         for t, i in open_els
860                                 if t is fe
861                                         open_els.splice i, 1
862                                         break
863                         for t, i in open_els
864                                 if t is fb
865                                         open_els.splice i, 0, new_element
866                                         break
867                         # 20. Jump back to the step labeled outer loop.
868                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
869                         debug_log "tree: #{serialize_els tree.children, false, true}"
870                         debug_log "open_els: #{serialize_els open_els, true, true}"
871                         debug_log "afe: #{serialize_els afe, true, true}"
872                 debug_log "AAA DONE"
873
874         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
875         # FIXME test this (particularly emplied end tags)
876         close_p_element = ->
877                 generate_implied_end_tags 'p' # arg is exception
878                 if open_els[0].name isnt 'p'
879                         parse_error()
880                 while open_els.length > 1 # just in case
881                         el = open_els.shift()
882                         if el.name is 'p'
883                                 return
884         close_p_if_in_button_scope = ->
885                 if is_in_button_scope 'p'
886                         close_p_element()
887
888         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
889         tree_insert_text = (t) ->
890                 dest = adjusted_insertion_location()
891                 # fixfull check for Document node
892                 if dest[1] > 0
893                         prev = dest[0].children[dest[1] - 1]
894                         if prev.type is TYPE_TEXT
895                                 prev.text += t.text
896                                 return
897                 dest[0].children.splice dest[1], 0, t
898
899         # 8.2.5.1
900         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
901         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
902         adjusted_insertion_location = (override_target = null) ->
903                 # 1. If there was an override target specified, then let target be the
904                 # override target.
905                 if override_target?
906                         target = override_target
907                 else # Otherwise, let target be the current node.
908                         target = open_els[0]
909                 # 2. Determine the adjusted insertion location using the first matching
910                 # steps from the following list:
911                 #
912                 # If foster parenting is enabled and target is a table, tbody, tfoot,
913                 # thead, or tr element Foster parenting happens when content is
914                 # misnested in tables.
915                 if flag_foster_parenting and foster_parenting_targets[target.name]
916                         loop # once. this is here so we can ``break`` to "abort these substeps"
917                                 # 1. Let last template be the last template element in the
918                                 # stack of open elements, if any.
919                                 last_template = null
920                                 last_template_i = null
921                                 for el, i in open_els
922                                         if el.name is 'template'
923                                                 last_template = el
924                                                 last_template_i = i
925                                                 break
926                                 # 2. Let last table be the last table element in the stack of
927                                 # open elements, if any.
928                                 last_table = null
929                                 last_table_i
930                                 for el, i in open_els
931                                         if el.name is 'table'
932                                                 last_table = el
933                                                 last_table_i = i
934                                                 break
935                                 # 3. If there is a last template and either there is no last
936                                 # table, or there is one, but last template is lower (more
937                                 # recently added) than last table in the stack of open
938                                 # elements, then: let adjusted insertion location be inside
939                                 # last template's template contents, after its last child (if
940                                 # any), and abort these substeps.
941                                 if last_template and (last_table is null or last_template_i < last_table_i)
942                                         target = template # fixfull should be it's contents
943                                         target_i = target.children.length
944                                         break
945                                 # 4. If there is no last table, then let adjusted insertion
946                                 # location be inside the first element in the stack of open
947                                 # elements (the html element), after its last child (if any),
948                                 # and abort these substeps. (fragment case)
949                                 if last_table is null
950                                         # this is odd
951                                         target = open_els[open_els.length - 1]
952                                         target_i = target.children.length
953                                 # 5. If last table has a parent element, then let adjusted
954                                 # insertion location be inside last table's parent element,
955                                 # immediately before last table, and abort these substeps.
956                                 if last_table.parent?
957                                         for c, i in last_table.parent.children
958                                                 if c is last_table
959                                                         target = last_table.parent
960                                                         target_i = i
961                                                         break
962                                         break
963                                 # 6. Let previous element be the element immediately above last
964                                 # table in the stack of open elements.
965                                 #
966                                 # huh? how could it not have a parent?
967                                 previous_element = open_els[last_table_i + 1]
968                                 # 7. Let adjusted insertion location be inside previous
969                                 # element, after its last child (if any).
970                                 target = previous_element
971                                 target_i = target.children.length
972                                 # Note: These steps are involved in part because it's possible
973                                 # for elements, the table element in this case in particular,
974                                 # to have been moved by a script around in the DOM, or indeed
975                                 # removed from the DOM entirely, after the element was inserted
976                                 # by the parser.
977                                 break # don't really loop
978                 else
979                         # Otherwise Let adjusted insertion location be inside target, after
980                         # its last child (if any).
981                         target_i = target.children.length
982
983                 # 3. If the adjusted insertion location is inside a template element,
984                 # let it instead be inside the template element's template contents,
985                 # after its last child (if any).
986                 # fixfull (template)
987
988                 # 4. Return the adjusted insertion location.
989                 return [target, target_i]
990
991         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
992         # aka create_an_element_for_token
993         token_to_element = (t, namespace, intended_parent) ->
994                 t.type = TYPE_TAG # not TYPE_START_TAG
995                 # convert attributes into a hash
996                 attrs = {}
997                 while t.attrs_a.length
998                         a = t.attrs_a.pop()
999                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1000                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
1001
1002                 # TODO 2. If the newly created element has an xmlns attribute in the
1003                 # XMLNS namespace whose value is not exactly the same as the element's
1004                 # namespace, that is a parse error. Similarly, if the newly created
1005                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1006                 # value is not the XLink Namespace, that is a parse error.
1007
1008                 # fixfull: the spec says stuff about form pointers and ownerDocument
1009
1010                 return el
1011
1012         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1013         insert_foreign_element = (token, namespace) ->
1014                 ail = adjusted_insertion_location()
1015                 ail_el = ail[0]
1016                 ail_i = ail[1]
1017                 el = token_to_element token, namespace, ail_el
1018                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1019                 el.parent = ail_el
1020                 ail_el.children.splice ail_i, 0, el
1021                 open_els.unshift el
1022                 return el
1023         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1024         insert_html_element = insert_foreign_element # (token, namespace) ->
1025
1026         # FIXME read implement "foster parenting" part
1027         # FIXME read spec, do this right
1028         # FIXME implement the override target thing
1029         # note: this assumes it's an open tag
1030         # FIXME what part of the spec is this?
1031         # TODO look through all callers of this, and see what they should really be doing.
1032         #   eg probably insert_html_element for tokens
1033         tree_insert_element = (el, override_target = null, namespace = null) ->
1034                 if namespace?
1035                         el.namespace = namespace
1036                 dest = adjusted_insertion_location override_target
1037                 if el.type is TYPE_START_TAG # means it's a "token"
1038                         el = token_to_element el, namespace, dest[0]
1039                 unless el.namespace?
1040                         namespace = dest.namespace
1041                 # fixfull: Document nodes sometimes can't accept more chidren
1042                 dest[0].children.splice dest[1], 0, el
1043                 el.parent = dest[0]
1044                 open_els.unshift el
1045                 return el
1046
1047         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1048         # position should be [node, index_within_children]
1049         tree_insert_a_comment = (t, position = null) ->
1050                 position ?= adjusted_insertion_location()
1051                 position[0].children.splice position[1], 0, t
1052
1053         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1054         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1055         generate_implied_end_tags = (except = null) ->
1056                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1057                         open_els.shift()
1058
1059         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1060         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1061                 for node, i in open_els
1062                         if node.name is name # FIXME check namespace too
1063                                 generate_implied_end_tags name # arg is exception
1064                                 parse_error() unless i is 0
1065                                 while i >= 0
1066                                         open_els.shift()
1067                                         i -= 1
1068                                 return
1069                         if special_elements[node.name]? # FIXME check namespac too
1070                                 parse_error()
1071                                 return
1072         ins_mode_in_body = (t) ->
1073                 switch t.type
1074                         when TYPE_TEXT
1075                                 switch t.text
1076                                         when "\u0000"
1077                                                 parse_error()
1078                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1079                                                 reconstruct_active_formatting_elements()
1080                                                 tree_insert_text t
1081                                         else
1082                                                 reconstruct_active_formatting_elements()
1083                                                 tree_insert_text t
1084                                                 flag_frameset_ok = false
1085                         when TYPE_COMMENT
1086                                 tree_insert_a_comment t
1087                         when TYPE_DOCTYPE
1088                                 parse_error()
1089                         when TYPE_START_TAG
1090                                 switch t.name
1091                                         when 'html'
1092                                                 parse_error()
1093                                                 return if template_tag_is_open()
1094                                                 root_attrs = open_els[open_els.length - 1].attrs
1095                                                 for k, v of t.attrs
1096                                                         root_attrs[k] = v unless root_attrs[k]?
1097                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1098                                                 # FIXME also do this for </template> (end tag)
1099                                                 return tree_in_head t
1100                                         when 'body'
1101                                                 parse_error()
1102                                                 # TODO
1103                                         when 'frameset'
1104                                                 parse_error()
1105                                                 # TODO
1106                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1107                                                 close_p_if_in_button_scope()
1108                                                 insert_html_element t
1109                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1110                                                 close_p_if_in_button_scope()
1111                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1112                                                         parse_error()
1113                                                         open_els.shift()
1114                                                 insert_html_element t
1115                                         # TODO lots more to implement here
1116                                         when 'a'
1117                                                 # If the list of active formatting elements
1118                                                 # contains an a element between the end of the list and
1119                                                 # the last marker on the list (or the start of the list
1120                                                 # if there is no marker on the list), then this is a
1121                                                 # parse error; run the adoption agency algorithm for
1122                                                 # the tag name "a", then remove that element from the
1123                                                 # list of active formatting elements and the stack of
1124                                                 # open elements if the adoption agency algorithm didn't
1125                                                 # already remove it (it might not have if the element
1126                                                 # is not in table scope).
1127                                                 found = false
1128                                                 for el in afe
1129                                                         if el.type is TYPE_AFE_MARKER
1130                                                                 break
1131                                                         if el.name is 'a'
1132                                                                 found = el
1133                                                 if found?
1134                                                         parse_error()
1135                                                         adoption_agency 'a'
1136                                                         for el, i in afe
1137                                                                 if el is found
1138                                                                         afe.splice i, 1
1139                                                         for el, i in open_els
1140                                                                 if el is found
1141                                                                         open_els.splice i, 1
1142                                                 reconstruct_active_formatting_elements()
1143                                                 el = insert_html_element t
1144                                                 afe_push el
1145                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1146                                                 reconstruct_active_formatting_elements()
1147                                                 el = insert_html_element t
1148                                                 afe_push el
1149                                         when 'table'
1150                                                 # fixfull quirksmode thing
1151                                                 close_p_if_in_button_scope()
1152                                                 insert_html_element t
1153                                                 insertion_mode = ins_mode_in_table
1154                                         # TODO lots more to implement here
1155                                         else # any other start tag
1156                                                 reconstruct_active_formatting_elements()
1157                                                 insert_html_element t
1158                         when TYPE_EOF
1159                                 ok_tags = {
1160                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1161                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1162                                 }
1163                                 for t in open_els
1164                                         unless ok_tags[t.name]?
1165                                                 parse_error()
1166                                                 break
1167                                 # TODO stack of template insertion modes thing
1168                                 flag_parsing = false # stop parsing
1169                         when TYPE_END_TAG
1170                                 switch t.name
1171                                         when 'body'
1172                                                 unless is_in_scope 'body'
1173                                                         parse_error()
1174                                                         return
1175                                                 # TODO implement parse error and move to tree_after_body
1176                                         when 'html'
1177                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1178                                                         parse_error()
1179                                                         return
1180                                                 # TODO implement parse error and move to tree_after_body, reprocess
1181                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1182                                                 unless is_in_scope t.name, NS_HTML
1183                                                         parse_error()
1184                                                         return
1185                                                 generate_implied_end_tags()
1186                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1187                                                         parse_error()
1188                                                 loop
1189                                                         el = open_els.shift()
1190                                                         if el.name is t.name and el.namespace is NS_HTML
1191                                                                 return
1192                                         # TODO lots more close tags to implement here
1193                                         when 'p'
1194                                                 unless is_in_button_scope 'p'
1195                                                         parse_error()
1196                                                         insert_html_element new_open_tag 'p'
1197                                                 close_p_element()
1198                                         # TODO lots more close tags to implement here
1199                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1200                                                 adoption_agency t.name
1201                                         # TODO lots more close tags to implement here
1202                                         else
1203                                                 in_body_any_other_end_tag t.name
1204                 return
1205
1206         ins_mode_in_table_else = (t) ->
1207                 parse_error()
1208                 flag_foster_parenting = true # FIXME
1209                 ins_mode_in_body t
1210                 flag_foster_parenting = false
1211         can_in_table = {
1212                 'table': true
1213                 'tbody': true
1214                 'tfoot': true
1215                 'thead': true
1216                 'tr': true
1217         }
1218         clear_to_table_stopers = {
1219                 'table': true
1220                 'template': true
1221                 'html': true
1222         }
1223         clear_stack_to_table_context = ->
1224                 loop
1225                         if clear_to_table_stopers[open_els[0].name]?
1226                                 break
1227                         open_els.shift()
1228                 return
1229         clear_to_table_body_stopers = {
1230                 'tbody': true
1231                 'tfoot': true
1232                 'thead': true
1233                 'template': true
1234                 'html': true
1235         }
1236         clear_stack_to_table_body_context = ->
1237                 loop
1238                         if clear_to_table_body_stopers[open_els[0].name]?
1239                                 break
1240                         open_els.shift()
1241                 return
1242         clear_to_table_row_stopers = {
1243                 'tr': true
1244                 'template': true
1245                 'html': true
1246         }
1247         clear_stack_to_table_row_context = ->
1248                 loop
1249                         if clear_to_table_row_stopers[open_els[0].name]?
1250                                 break
1251                         open_els.shift()
1252                 return
1253         clear_afe_to_marker = ->
1254                 loop
1255                         el = afe.shift()
1256                         if el.type is TYPE_AFE_MARKER
1257                                 return
1258         ins_mode_in_table = (t) ->
1259                 switch t.type
1260                         when TYPE_TEXT
1261                                 if can_in_table[t.name]
1262                                         original_insertion_mode = insertion_mode
1263                                         insertion_mode = ins_mode_in_table_text
1264                                         insertion_mode t
1265                                 else
1266                                         ins_mode_in_table_else t
1267                         when TYPE_COMMENT
1268                                 tree_insert_a_comment t
1269                         when TYPE_DOCTYPE
1270                                 parse_error()
1271                         when TYPE_START_TAG
1272                                 switch t.name
1273                                         when 'caption'
1274                                                 clear_stack_to_table_context()
1275                                                 afe_push_marker()
1276                                                 insert_html_element t
1277                                                 insertion_mode = ins_mode_in_caption
1278                                         when 'colgroup'
1279                                                 clear_stack_to_table_context()
1280                                                 insert_html_element t
1281                                                 insertion_mode = ins_mode_in_column_group
1282                                         when 'col'
1283                                                 clear_stack_to_table_context()
1284                                                 insert_html_element new_open_tag 'colgroup'
1285                                                 insertion_mode = ins_mode_in_column_group
1286                                                 insertion_mode t
1287                                         when 'tbody', 'tfoot', 'thead'
1288                                                 clear_stack_to_table_context()
1289                                                 insert_html_element t
1290                                                 insertion_mode = ins_mode_in_table_body
1291                                         when 'td', 'th', 'tr'
1292                                                 clear_stack_to_table_context()
1293                                                 insert_html_element new_open_tag 'tbody'
1294                                                 insertion_mode = ins_mode_in_table_body
1295                                                 insertion_mode t
1296                                         when 'table'
1297                                                 parse_error()
1298                                                 if is_in_table_scope 'table'
1299                                                         loop
1300                                                                 el = open_els.shift()
1301                                                                 if el.name is 'table'
1302                                                                         break
1303                                                         reset_insertion_mode()
1304                                                         insertion_mode t
1305                                         when 'style', 'script', 'template'
1306                                                 ins_mode_in_head t
1307                                         when 'input'
1308                                                 if token_is_input_hidden t
1309                                                         ins_mode_in_table_else t
1310                                                 else
1311                                                         parse_error()
1312                                                         insert_html_element t
1313                                                         open_els.shift()
1314                                                         # fixfull acknowledge sef-closing flag
1315                                         when 'form'
1316                                                 parse_error()
1317                                                 if form_element_pointer?
1318                                                         return
1319                                                 if template_tag_is_open()
1320                                                         return
1321                                                 form_element_pointer = insert_html_element t
1322                                                 open_els.shift()
1323                                         else
1324                                                 ins_mode_in_table_else t
1325                         when TYPE_END_TAG
1326                                 switch t.name
1327                                         when 'table'
1328                                                 if is_in_table_scope 'table'
1329                                                         loop
1330                                                                 el = open_els.shift()
1331                                                                 if el.name is 'table'
1332                                                                         break
1333                                                         reset_insertion_mode()
1334                                                 else
1335                                                         parse_error
1336                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1337                                                 parse_error()
1338                                         when 'template'
1339                                                 ins_mode_in_head t
1340                                         else
1341                                                 ins_mode_in_table_else t
1342                         when TYPE_EOF
1343                                 ins_mode_in_body t
1344                         else
1345                                 ins_mode_in_table_else t
1346
1347
1348         ins_mode_in_table_text = (t) ->
1349                 switch t.type
1350                         when TYPE_TEXT
1351                                 switch t.text
1352                                         when "\u0000"
1353                                                 parse_error()
1354                                                 return
1355                 console.log "unimplemented ins_mode_in_table_text"
1356                 # FIXME CONTINUE
1357
1358         ins_mode_in_table_body = (t) ->
1359                 if t.type is TYPE_START_TAG and t.name is 'tr'
1360                         clear_stack_to_table_body_context()
1361                         insert_html_element t
1362                         return
1363                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1364                         parse_error()
1365                         clear_stack_to_table_body_context()
1366                         insert_html_element new_open_tag 'tr'
1367                         insertion_mode = ins_mode_in_row
1368                         insertion_mode t
1369                         return
1370                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1371                         unless is_in_table_scope t.name # fixfull check namespace
1372                                 parse_error()
1373                                 return
1374                         clear_stack_to_table_body_context()
1375                         open_els.shift()
1376                         insertion_mode = ins_mode_in_table
1377                         return
1378                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1379                         has = false
1380                         for el in open_els
1381                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1382                                         has = true
1383                                         break
1384                                 if table_scopers[el.name]
1385                                         break
1386                         if !has
1387                                 parse_error()
1388                                 return
1389                         clear_stack_to_table_body_context()
1390                         open_els.shift()
1391                         insertion_mode = ins_mode_in_table
1392                         insertion_mode t
1393                         return
1394                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1395                         parse_error()
1396                         return
1397                 # Anything else
1398                 ins_mode_in_table t
1399
1400         ins_mode_in_row = (t) ->
1401                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1402                         clear_stack_to_table_row_context()
1403                         insert_html_element t
1404                         insertion_mode = ins_mode_in_cell
1405                         afe_push_marker()
1406                         return
1407                 if t.type is TYPE_END_TAG and t.name is 'tr'
1408                         if is_in_table_scope 'tr'
1409                                 clear_stack_to_table_row_context()
1410                                 open_els.shift()
1411                                 insertion_mode = ins_mode_in_table_body
1412                         else
1413                                 parse_error()
1414                         return
1415                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1416                         if is_in_table_scope 'tr'
1417                                 clear_stack_to_table_row_context()
1418                                 open_els.shift()
1419                                 insertion_mode = ins_mode_in_table_body
1420                                 insertion_mode t
1421                         else
1422                                 parse_error()
1423                         return
1424                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1425                         if is_in_table_scope t.name # fixfull namespace
1426                                 if is_in_table_scope 'tr'
1427                                         clear_stack_to_table_row_context()
1428                                         open_els.shift()
1429                                         insertion_mode = ins_mode_in_table_body
1430                                         insertion_mode t
1431                         else
1432                                 parse_error()
1433                         return
1434                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1435                         parse_error()
1436                         return
1437                 # Anything else
1438                 ins_mode_in_table t
1439
1440         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1441         close_the_cell = ->
1442                 generate_implied_end_tags()
1443                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1444                         parse_error()
1445                 loop
1446                         el = open_els.shift()
1447                         if el.name is 'td' or el.name is 'th'
1448                                 break
1449                 clear_afe_to_marker()
1450                 insertion_mode = ins_mode_in_row
1451
1452         # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1453         ins_mode_in_cell = (t) ->
1454                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1455                         if is_in_table_scope t.name
1456                                 generate_implied_end_tags()
1457                                 if open_els[0].name isnt t.name
1458                                         parse_error
1459                                 loop
1460                                         el = open_els.shift()
1461                                         if el.name is t.name
1462                                                 break
1463                                 clear_afe_to_marker()
1464                                 insertion_mode = ins_mode_in_row
1465                         else
1466                                 parse_error()
1467                         return
1468                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1469                         has = false
1470                         for el in open_els
1471                                 if el.name is 'td' or el.name is 'th'
1472                                         has = true
1473                                         break
1474                                 if table_scopers[el.name]
1475                                         break
1476                         if !has
1477                                 parse_error()
1478                                 return
1479                         close_the_cell()
1480                         insertion_mode t
1481                         return
1482                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1483                         parse_error()
1484                         return
1485                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1486                         if is_in_table_scope t.name # fixfull namespace
1487                                 close_the_cell()
1488                                 insertion_mode t
1489                         else
1490                                 parse_error()
1491                         return
1492                 # Anything Else
1493                 ins_mode_in_body t
1494
1495
1496         # the functions below implement the tokenizer stats described here:
1497         # http://www.w3.org/TR/html5/syntax.html#tokenization
1498
1499         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1500         tok_state_data = ->
1501                 switch c = txt.charAt(cur++)
1502                         when '&'
1503                                 return new_text_node tokenize_character_reference()
1504                         when '<'
1505                                 tok_state = tok_state_tag_open
1506                         when "\u0000"
1507                                 parse_error()
1508                                 return new_text_node c
1509                         when '' # EOF
1510                                 return new_eof_token()
1511                         else
1512                                 return new_text_node c
1513                 return null
1514
1515         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1516         # not needed: tok_state_character_reference_in_data = ->
1517         # just call tok_state_character_reference_in_data()
1518
1519         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1520         tok_state_tag_open = ->
1521                 switch c = txt.charAt(cur++)
1522                         when '!'
1523                                 tok_state = tok_state_markup_declaration_open
1524                         when '/'
1525                                 tok_state = tok_state_end_tag_open
1526                         when '?'
1527                                 parse_error()
1528                                 tok_state = tok_state_bogus_comment
1529                         else
1530                                 if lc_alpha.indexOf(c) > -1
1531                                         tok_cur_tag = new_open_tag c
1532                                         tok_state = tok_state_tag_name
1533                                 else if uc_alpha.indexOf(c) > -1
1534                                         tok_cur_tag = new_open_tag c.toLowerCase()
1535                                         tok_state = tok_state_tag_name
1536                                 else
1537                                         parse_error()
1538                                         tok_state = tok_state_data
1539                                         cur -= 1 # we didn't parse/handle the char after <
1540                                         return new_text_node '<'
1541                 return null
1542
1543         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1544         tok_state_end_tag_open = ->
1545                 switch c = txt.charAt(cur++)
1546                         when '>'
1547                                 parse_error()
1548                                 tok_state = tok_state_data
1549                         when '' # EOF
1550                                 parse_error()
1551                                 tok_state = tok_state_data
1552                                 return new_text_node '</'
1553                         else
1554                                 if uc_alpha.indexOf(c) > -1
1555                                         tok_cur_tag = new_end_tag c.toLowerCase()
1556                                         tok_state = tok_state_tag_name
1557                                 else if lc_alpha.indexOf(c) > -1
1558                                         tok_cur_tag = new_end_tag c
1559                                         tok_state = tok_state_tag_name
1560                                 else
1561                                         parse_error()
1562                                         tok_state = tok_state_bogus_comment
1563                 return null
1564
1565         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1566         tok_state_tag_name = ->
1567                 switch c = txt.charAt(cur++)
1568                         when "\t", "\n", "\u000c", ' '
1569                                 tok_state = tok_state_before_attribute_name
1570                         when '/'
1571                                 tok_state = tok_state_self_closing_start_tag
1572                         when '>'
1573                                 tok_state = tok_state_data
1574                                 tmp = tok_cur_tag
1575                                 tok_cur_tag = null
1576                                 return tmp
1577                         when "\u0000"
1578                                 parse_error()
1579                                 tok_cur_tag.name += "\ufffd"
1580                         when '' # EOF
1581                                 parse_error()
1582                                 tok_state = tok_state_data
1583                         else
1584                                 if uc_alpha.indexOf(c) > -1
1585                                         tok_cur_tag.name += c.toLowerCase()
1586                                 else
1587                                         tok_cur_tag.name += c
1588                 return null
1589
1590         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1591         tok_state_before_attribute_name = ->
1592                 attr_name = null
1593                 switch c = txt.charAt(cur++)
1594                         when "\t", "\n", "\u000c", ' '
1595                                 return null
1596                         when '/'
1597                                 tok_state = tok_state_self_closing_start_tag
1598                                 return null
1599                         when '>'
1600                                 tok_state = tok_state_data
1601                                 tmp = tok_cur_tag
1602                                 tok_cur_tag = null
1603                                 return tmp
1604                         when "\u0000"
1605                                 parse_error()
1606                                 attr_name = "\ufffd"
1607                         when '"', "'", '<', '='
1608                                 parse_error()
1609                                 attr_name = c
1610                         when '' # EOF
1611                                 parse_error()
1612                                 tok_state = tok_state_data
1613                         else
1614                                 if uc_alpha.indexOf(c) > -1
1615                                         attr_name = c.toLowerCase()
1616                                 else
1617                                         attr_name = c
1618                 if attr_name?
1619                         tok_cur_tag.attrs_a.unshift [attr_name, '']
1620                         tok_state = tok_state_attribute_name
1621                 return null
1622
1623         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1624         tok_state_attribute_name = ->
1625                 switch c = txt.charAt(cur++)
1626                         when "\t", "\n", "\u000c", ' '
1627                                 tok_state = tok_state_after_attribute_name
1628                         when '/'
1629                                 tok_state = tok_state_self_closing_start_tag
1630                         when '='
1631                                 tok_state = tok_state_before_attribute_value
1632                         when '>'
1633                                 tok_state = tok_state_data
1634                                 tmp = tok_cur_tag
1635                                 tok_cur_tag = null
1636                                 return tmp
1637                         when "\u0000"
1638                                 parse_error()
1639                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
1640                         when '"', "'", '<'
1641                                 parse_error()
1642                                 tok_cur_tag.attrs_a[0][0] = c
1643                         when '' # EOF
1644                                 parse_error()
1645                                 tok_state = tok_state_data
1646                         else
1647                                 if uc_alpha.indexOf(c) > -1
1648                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
1649                                 else
1650                                         tok_cur_tag.attrs_a[0][0] += c
1651                 return null
1652
1653         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
1654         tok_state_after_attribute_name = ->
1655                 c = txt.charAt(cur++)
1656                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1657                         return
1658                 if c is '/'
1659                         tok_state = tok_state_self_closing_start_tag
1660                         return
1661                 if c is '='
1662                         tok_state = tok_state_before_attribute_value
1663                         return
1664                 if c is '>'
1665                         tok_state = tok_state_data
1666                         return
1667                 if uc_alpha.indexOf(c) > -1
1668                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
1669                         tok_state = tok_state_attribute_name
1670                         return
1671                 if c is "\u0000"
1672                         parse_error()
1673                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
1674                         tok_state = tok_state_attribute_name
1675                         return
1676                 if c is '' # EOF
1677                         parse_error()
1678                         tok_state = tok_state_data
1679                         cur -= 1 # reconsume
1680                         return
1681                 if c is '"' or c is "'" or c is '<'
1682                         parse_error()
1683                         # fall through to Anything else
1684                 # Anything else
1685                 tok_cur_tag.attrs_a.unshift [c, '']
1686                 tok_state = tok_state_attribute_name
1687
1688         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
1689         tok_state_before_attribute_value = ->
1690                 switch c = txt.charAt(cur++)
1691                         when "\t", "\n", "\u000c", ' '
1692                                 return null
1693                         when '"'
1694                                 tok_state = tok_state_attribute_value_double_quoted
1695                         when '&'
1696                                 tok_state = tok_state_attribute_value_unquoted
1697                                 cur -= 1
1698                         when "'"
1699                                 tok_state = tok_state_attribute_value_single_quoted
1700                         when "\u0000"
1701                                 # Parse error
1702                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1703                                 tok_state = tok_state_attribute_value_unquoted
1704                         when '>'
1705                                 # Parse error
1706                                 tok_state = tok_state_data
1707                                 tmp = tok_cur_tag
1708                                 tok_cur_tag = null
1709                                 return tmp
1710                         when '' # EOF
1711                                 parse_error()
1712                                 tok_state = tok_state_data
1713                         else
1714                                 tok_cur_tag.attrs_a[0][1] += c
1715                                 tok_state = tok_state_attribute_value_unquoted
1716                 return null
1717
1718         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
1719         tok_state_attribute_value_double_quoted = ->
1720                 switch c = txt.charAt(cur++)
1721                         when '"'
1722                                 tok_state = tok_state_after_attribute_value_quoted
1723                         when '&'
1724                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
1725                         when "\u0000"
1726                                 # Parse error
1727                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1728                         when '' # EOF
1729                                 parse_error()
1730                                 tok_state = tok_state_data
1731                         else
1732                                 tok_cur_tag.attrs_a[0][1] += c
1733                 return null
1734
1735         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
1736         tok_state_attribute_value_single_quoted = ->
1737                 switch c = txt.charAt(cur++)
1738                         when "'"
1739                                 tok_state = tok_state_after_attribute_value_quoted
1740                         when '&'
1741                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
1742                         when "\u0000"
1743                                 # Parse error
1744                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1745                         when '' # EOF
1746                                 parse_error()
1747                                 tok_state = tok_state_data
1748                         else
1749                                 tok_cur_tag.attrs_a[0][1] += c
1750                 return null
1751
1752         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
1753         tok_state_attribute_value_unquoted = ->
1754                 switch c = txt.charAt(cur++)
1755                         when "\t", "\n", "\u000c", ' '
1756                                 tok_state = tok_state_before_attribute_name
1757                         when '&'
1758                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
1759                         when '>'
1760                                 tok_state = tok_state_data
1761                                 tmp = tok_cur_tag
1762                                 tok_cur_tag = null
1763                                 return tmp
1764                         when "\u0000"
1765                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1766                         when '' # EOF
1767                                 parse_error()
1768                                 tok_state = tok_state_data
1769                         else
1770                                 # Parse Error if ', <, = or ` (backtick)
1771                                 tok_cur_tag.attrs_a[0][1] += c
1772                 return null
1773
1774         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
1775         tok_state_after_attribute_value_quoted = ->
1776                 switch c = txt.charAt(cur++)
1777                         when "\t", "\n", "\u000c", ' '
1778                                 tok_state = tok_state_before_attribute_name
1779                         when '/'
1780                                 tok_state = tok_state_self_closing_start_tag
1781                         when '>'
1782                                 tok_state = tok_state_data
1783                                 tmp = tok_cur_tag
1784                                 tok_cur_tag = null
1785                                 return tmp
1786                         when '' # EOF
1787                                 parse_error()
1788                                 tok_state = tok_state_data
1789                         else
1790                                 # Parse Error
1791                                 tok_state = tok_state_before_attribute_name
1792                                 cur -= 1 # we didn't handle that char
1793                 return null
1794
1795         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1796         # Don't set this as a state, just call it
1797         # returns a string (NOT a text node)
1798         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1799                 if cur >= txt.length
1800                         return '&'
1801                 switch c = txt.charAt(cur)
1802                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1803                                 # explicitly not a parse error
1804                                 return '&'
1805                         when ';'
1806                                 # there has to be "one or more" alnums between & and ; to be a parse error
1807                                 return '&'
1808                         when '#'
1809                                 if cur + 1 >= txt.length
1810                                         return '&'
1811                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
1812                                         prefix = '#x'
1813                                         charset = hex_chars
1814                                         start = cur + 2
1815                                 else
1816                                         charset = digits
1817                                         start = cur + 1
1818                                         prefix = '#'
1819                                 i = 0
1820                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1821                                         i += 1
1822                                 if i is 0
1823                                         return '&'
1824                                 if txt.charAt(start + i) is ';'
1825                                         i += 1
1826                                 # FIXME This is supposed to generate parse errors for some chars
1827                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1828                                 if decoded?
1829                                         cur = start + i
1830                                         return decoded
1831                                 return '&'
1832                         else
1833                                 for i in [0...31]
1834                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
1835                                                 break
1836                                 if i is 0
1837                                         # exit early, because parse_error() below needs at least one alnum
1838                                         return '&'
1839                                 if txt.charAt(cur + i) is ';'
1840                                         i += 1 # include ';' terminator in value
1841                                         decoded = decode_named_char_ref txt.substr(cur, i)
1842                                         if decoded?
1843                                                 cur += i
1844                                                 return decoded
1845                                         parse_error()
1846                                         return '&'
1847                                 else
1848                                         # no ';' terminator (only legacy char refs)
1849                                         max = i
1850                                         for i in [2..max] # no prefix matches, so ok to check shortest first
1851                                                 c = legacy_char_refs[txt.substr(cur, i)]
1852                                                 if c?
1853                                                         if in_attr
1854                                                                 if txt.charAt(cur + i) is '='
1855                                                                         # "because some legacy user agents will
1856                                                                         # misinterpret the markup in those cases"
1857                                                                         parse_error()
1858                                                                         return '&'
1859                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
1860                                                                         # this makes attributes forgiving about url args
1861                                                                         return '&'
1862                                                         # ok, and besides the weird exceptions for attributes...
1863                                                         # return the matching char
1864                                                         cur += i # consume entity chars
1865                                                         parse_error() # because no terminating ";"
1866                                                         return c
1867                                         parse_error()
1868                                         return '&'
1869                 return # never reached
1870
1871         # tree constructor initialization
1872         # see comments on TYPE_TAG/etc for the structure of this data
1873         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
1874         open_els = [tree]
1875         insertion_mode = ins_mode_in_body
1876         flag_frameset_ok = true
1877         flag_parsing = true
1878         flag_foster_parenting = false
1879         form_element_pointer = null
1880         afe = [] # active formatting elements
1881
1882         # tokenizer initialization
1883         tok_state = tok_state_data
1884
1885         # proccess input
1886         while flag_parsing
1887                 t = tok_state()
1888                 if t?
1889                         insertion_mode t
1890         return tree.children
1891
1892 # everything below is tests on the above
1893 test_equals = (description, output, expected_output) ->
1894         if output is expected_output
1895                 console.log "passed." # don't say name, so smart consoles can merge all of these
1896         else
1897                 console.log "FAILED: \"#{description}\""
1898                 console.log "   Expected: #{expected_output}"
1899                 console.log "     Actual: #{output}"
1900 serialize_els = (els, shallow, show_ids) ->
1901         serialized = ''
1902         sep = ''
1903         for t in els
1904                 serialized += sep
1905                 sep = ','
1906                 serialized += t.serialize shallow, show_ids
1907         return serialized
1908 test_parser = (args) ->
1909         debug_log_reset()
1910         parse_errors = []
1911         errors_cb = (i) ->
1912                 parse_errors.push i
1913         prev_node_id = 0 # reset counter
1914         parsed = parse_html args.html, errors_cb
1915         serialized = serialize_els parsed, false, false
1916         if serialized isnt args.expected
1917                 debug_log_each (str) ->
1918                         console.log str
1919                 console.log "FAILED: \"#{args.name}\""
1920                 console.log "      Input: #{args.html}"
1921                 console.log "    Correct: #{args.expected}"
1922                 console.log "     Output: #{serialized}"
1923                 if parse_errors.length > 0
1924                         console.log " parse errs: #{JSON.stringify parse_errors}"
1925                 else
1926                         console.log "   No parse errors"
1927         else
1928                 console.log "passed \"#{args.name}\""
1929
1930 test_parser name: "empty", \
1931         html: "",
1932         expected: ''
1933 test_parser name: "just text", \
1934         html: "abc",
1935         expected: 'text:"abc"'
1936 test_parser name: "named entity", \
1937         html: "a&amp;1234",
1938         expected: 'text:"a&1234"'
1939 test_parser name: "broken named character references", \
1940         html: "1&amp2&&amp;3&aabbcc;",
1941         expected: 'text:"1&2&&3&aabbcc;"'
1942 test_parser name: "numbered entity overrides", \
1943         html: "1&#X80&#x80; &#x83",
1944         expected: 'text:"1€€ ƒ"'
1945 test_parser name: "open tag", \
1946         html: "foo<span>bar",
1947         expected: 'text:"foo",tag:"span",{},[text:"bar"]'
1948 test_parser name: "open tag with attributes", \
1949         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1950         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
1951 test_parser name: "open tag with attributes of various quotings", \
1952         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1953         expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
1954 test_parser name: "attribute entity exceptions dq", \
1955         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
1956         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1957 test_parser name: "attribute entity exceptions sq", \
1958         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
1959         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1960 test_parser name: "attribute entity exceptions uq", \
1961         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
1962         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1963 test_parser name: "matching closing tags", \
1964         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1965         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
1966 test_parser name: "missing closing tag inside", \
1967         html: "foo<div>bar<span>baz</div>qux",
1968         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
1969 test_parser name: "mis-matched closing tags", \
1970         html: "<span>12<div>34</span>56</div>78",
1971         expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
1972 test_parser name: "mis-matched formatting elements", \
1973         html: "12<b>34<i>56</b>78</i>90",
1974         expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
1975 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
1976         html: '<p>1<b>2<i>3</b>4</i>5</p>',
1977         expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
1978 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
1979         html: '<b>1<p>2</b>3</p>',
1980         expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
1981 test_parser name: "crazy formatting elements test", \
1982         html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1983         # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1984         # firefox does this:
1985         expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
1986 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
1987 test_parser name: "html5lib aaa 1", \
1988         html: '<a><p></a></p>',
1989         expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
1990 test_parser name: "html5lib aaa 2", \
1991         html: '<a>1<p>2</a>3</p>',
1992         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
1993 test_parser name: "html5lib aaa 3", \
1994         html: '<a>1<button>2</a>3</button>',
1995         expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
1996 test_parser name: "html5lib aaa 4", \
1997         html: '<a>1<b>2</a>3</b>',
1998         expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
1999 test_parser name: "html5lib aaa 5 (two divs deep)", \
2000         html: '<a>1<div>2<div>3</a>4</div>5</div>',
2001         expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
2002 test_parser name: "html5lib aaa 6 (foster parenting)", \
2003         html: '<table><a>1<p>2</a>3</p>',
2004         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
2005 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2006         html: '<b><b><a><p></a>',
2007         expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2008 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2009         html: '<b><a><b><p></a>',
2010         expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2011 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2012         html: '<a><b><b><p></a>',
2013         expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2014 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2015         html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2016         expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
2017 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2018         html: '<table><a>1<td>2</td>3</table>',
2019         expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
2020 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2021         html: '<table>A<td>B</td>C</table>',
2022         expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2023 # TODO implement svg and namespacing
2024 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2025 #       html: '<a><svg><tr><input></a>',
2026 #       expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
2027 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2028         html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2029         expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2030 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2031         html: '<div><a><b><u><i><code><div></a>',
2032         expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2033 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2034         html: '<b><b><b><b>x</b></b></b></b>y',
2035         expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
2036 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2037         html: '<p><b><b><b><b><p>x',
2038         expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
2039 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2040         html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2041         expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
2042 test_parser name: "junk after attribute close-quote", \
2043         html: '<p><b c="d", e="f">foo<p>x',
2044         expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'