JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix implied_end_tags and </p>
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50
51 # Each node is an obect of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
54 TYPE_COMMENT = 2
55 TYPE_DOCTYPE = 3
56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
59 TYPE_EOF = 6
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
62
63 # namespace constants
64 NS_HTML = 1
65 NS_MATHML = 2
66 NS_SVG = 3
67
68 g_debug_log = []
69 debug_log_reset = ->
70         g_debug_log = []
71 debug_log = (str) ->
72         g_debug_log.push str
73 debug_log_each = (cb) ->
74         for str in g_debug_log
75                 cb str
76
77 prev_node_id = 0
78 class Node
79         constructor: (type, args = {}) ->
80                 @type = type # one of the TYPE_* constants above
81                 @name = args.name ? '' # tag name
82                 @text = args.text ? '' # contents for text/comment nodes
83                 @attrs = args.attrs ? {}
84                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85                 @children = args.children ? []
86                 @namespace = args.namespace ? NS_HTML
87                 @parent = args.parent ? null
88                 if args.id?
89                         @id = "#{args.id}+"
90                 else
91                         @id = "#{++prev_node_id}"
92         shallow_clone: -> # return a new node that's the same except without the children or parent
93                 # WARNING this doesn't work right on open tags that are still being parsed
94                 attrs = {}
95                 attrs[k] = v for k, v of @attrs
96                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97         serialize: (shallow = false, show_ids = false) -> # for unit tests
98                 ret = ''
99                 switch @type
100                         when TYPE_TAG
101                                 ret += 'tag:'
102                                 ret += JSON.stringify @name
103                                 ret += ','
104                                 if show_ids
105                                         ret += "##{@id},"
106                                 if shallow
107                                         break
108                                 ret += JSON.stringify @attrs
109                                 ret += ',['
110                                 sep = ''
111                                 for c in @children
112                                         ret += sep
113                                         sep = ','
114                                         ret += c.serialize shallow, show_ids
115                                 ret += ']'
116                         when TYPE_TEXT
117                                 ret += 'text:'
118                                 ret += JSON.stringify @text
119                         when TYPE_COMMENT
120                                 ret += 'comment:'
121                                 ret += JSON.stringify @text
122                         when TYPE_DOCTYPE
123                                 ret += 'doctype'
124                                 # FIXME
125                         when TYPE_AFE_MARKER
126                                 ret += 'marker'
127                         when TYPE_AAA_BOOKMARK
128                                 ret += 'aaa_bookmark'
129                         else
130                                 ret += 'unknown:'
131                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
132                 return ret
133
134 # helpers: (only take args that are normally known when parser creates nodes)
135 new_open_tag = (name) ->
136         return new Node TYPE_START_TAG, name: name
137 new_end_tag = (name) ->
138         return new Node TYPE_END_TAG, name: name
139 new_element = (name) ->
140         return new Node TYPE_TAG, name: name
141 new_text_node = (txt) ->
142         return new Node TYPE_TEXT, text: txt
143 new_comment_node = (txt) ->
144         return new Node TYPE_COMMENT, text: txt
145 new_eof_token = ->
146         return new Node TYPE_EOF
147 new_afe_marker = ->
148         return new Node TYPE_AFE_MARKER
149 new_aaa_bookmark = ->
150         return new Node TYPE_AAA_BOOKMARK
151
152 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
153 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
154 digits = "0123456789"
155 alnum = lc_alpha + uc_alpha + digits
156 hex_chars = digits + "abcdefABCDEF"
157
158 # some SVG elements have dashes in them
159 tag_name_chars = alnum + "-"
160
161 # http://www.w3.org/TR/html5/infrastructure.html#space-character
162 space_chars = "\u0009\u000a\u000c\u000d\u0020"
163
164 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
165 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
166
167 # These are the character references that don't need a terminating semicolon
168 # min length: 2, max: 6, none are a prefix of any other.
169 legacy_char_refs = {
170         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
171         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
172         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
173         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
174         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
175         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
176         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
177         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
178         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
179         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
180         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
181         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
182         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
183         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
184         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
185         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
186         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
187         yen: '¥', yuml: 'ÿ'
188 }
189
190 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
191 raw_text_elements = ['script', 'style']
192 escapable_raw_text_elements = ['textarea', 'title']
193 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
194 svg_elements = [
195         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
196         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
197         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
198         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
199         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
200         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
201         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
202         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
203         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
204         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
205         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
206         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
207         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
208         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
209         'view', 'vkern'
210 ]
211
212 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
213 mathml_elements = [
214         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
215         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
216         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
217         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
218         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
219         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
220         'determinant', 'diff', 'divergence', 'divide', 'domain',
221         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
222         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
223         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
224         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
225         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
226         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
227         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
228         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
229         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
230         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
231         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
232         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
233         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
234         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
235         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
236         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
237         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
238         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
239         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
240         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
241         'vectorproduct', 'xor'
242 ]
243 # foreign_elements = [svg_elements..., mathml_elements...]
244 #normal_elements = All other allowed HTML elements are normal elements.
245
246 special_elements = {
247         # HTML:
248         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
249         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
250         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
251         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
252         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
253         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
254         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
255         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
256         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
257         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
258         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
259         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
260         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
261         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
262         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
263         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
264         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
265         wbr:NS_HTML, xmp:NS_HTML,
266
267         # MathML:
268         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
269         'annotation-xml':NS_MATHML,
270
271         # SVG:
272         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
273 }
274
275 formatting_elements = {
276          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
277          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
278          u: true
279 }
280
281 foster_parenting_targets = {
282         table: true
283         tbody: true
284         tfoot: true
285         thead: true
286         tr: true
287 }
288
289 # all html I presume
290 end_tag_implied = {
291         dd: true
292         dt: true
293         li: true
294         option: true
295         optgroup: true
296         p: true
297         rb: true
298         rp: true
299         rt: true
300         rtc: true
301 }
302
303 el_is_special = (e) ->
304         return special_elements[e.name]?
305         # FIXME it should really be:
306         #return special_elements[e.name] is e.namespace
307
308 # decode_named_char_ref()
309 #
310 # The list of named character references is _huge_ so ask the browser to decode
311 # for us instead of wasting bandwidth/space on including the table here.
312 #
313 # Pass without the "&" but with the ";" examples:
314 #    for "&amp" pass "amp;"
315 #    for "&#x2032" pass "x2032;"
316 g_dncr = {
317         cache: {}
318         textarea: document.createElement('textarea')
319 }
320 # TODO test this in IE8
321 decode_named_char_ref = (txt) ->
322         txt = "&#{txt}"
323         decoded = g_dncr.cache[txt]
324         return decoded if decoded?
325         g_dncr.textarea.innerHTML = txt
326         decoded = g_dncr.textarea.value
327         return null if decoded is txt
328         return g_dncr.cache[txt] = decoded
329
330 parse_html = (txt, parse_error_cb = null) ->
331         cur = 0 # index of next char in txt to be parsed
332         # declare tree and tokenizer variables so they're in scope below
333         tree = null
334         open_els = [] # stack of open elements
335         insertion_mode = null
336         tok_state = null
337         tok_cur_tag = null # partially parsed tag
338         flag_frameset_ok = null
339         flag_parsing = null
340         flag_foster_parenting = null
341         form_element_pointer = null
342         afe = [] # active formatting elements
343
344         parse_error = ->
345                 if parse_error_cb?
346                         parse_error_cb cur
347                 else
348                         console.log "Parse error at character #{cur} of #{txt.length}"
349
350
351         # the functions below impliment the Tree Contstruction algorithm
352         # http://www.w3.org/TR/html5/syntax.html#tree-construction
353
354         # But first... the helpers
355         template_tag_is_open = ->
356                 for t in open_els
357                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
358                                 return true
359                 return false
360         is_in_scope_x = (tag_name, scope, namespace) ->
361                 for t in open_els
362                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
363                                 return true
364                         if scope[t.name] is t.namespace
365                                 return false
366                 return false
367         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
368                 for t in open_els
369                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
370                                 return true
371                         if scope[t.name] is t.namespace
372                                 return false
373                         if scope2[t.name] is t.namespace
374                                 return false
375                 return false
376         standard_scopers = { # FIXME these are supposed to be namespace specific
377                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
378                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
379                 template: NS_HTML, mi: NS_MATHML,
380
381                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
382                 'annotation-xml': NS_MATHML,
383
384                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
385         }
386         button_scopers = button: NS_HTML
387         li_scopers = ol: NS_HTML, ul: NS_HTML
388         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
389         is_in_scope = (tag_name, namespace = null) ->
390                 return is_in_scope_x tag_name, standard_scopers, namespace
391         is_in_button_scope = (tag_name, namespace = null) ->
392                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
393         is_in_table_scope = (tag_name, namespace = null) ->
394                 return is_in_scope_x tag_name, table_scopers, namespace
395         is_in_select_scope = (tag_name, namespace = null) ->
396                 for t in open_els
397                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
398                                 return true
399                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
400                                 return false
401                 return false
402         # this checks for a particular element, not by name
403         el_is_in_scope = (el) ->
404                 for t in open_els
405                         if t is el
406                                 return true
407                         if standard_scopers[t.name] is t.namespace
408                                 return false
409                 return false
410
411         # 8.2.3.1 ...
412         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
413         reset_insertion_mode = ->
414                 # 1. Let last be false.
415                 last = false
416                 # 2. Let node be the last node in the stack of open elements.
417                 node_i = 0
418                 node = open_els[node_i]
419                 # 3. Loop: If node is the first node in the stack of open elements,
420                 # then set last to true, and, if the parser was originally created as
421                 # part of the HTML fragment parsing algorithm (fragment case) set node
422                 # to the context element.
423                 loop
424                         if node_i is open_els.length - 1
425                                 last = true
426                                 # fixfull (fragment case)
427
428                         # 4. If node is a select element, run these substeps:
429                         if node.name is 'select'
430                                 # 1. If last is true, jump to the step below labeled done.
431                                 unless last
432                                         # 2. Let ancestor be node.
433                                         ancestor_i = node_i
434                                         ancestor = node
435                                         # 3. Loop: If ancestor is the first node in the stack of
436                                         # open elements, jump to the step below labeled done.
437                                         loop
438                                                 if ancestor_i is open_els.length - 1
439                                                         break
440                                                 # 4. Let ancestor be the node before ancestor in the stack
441                                                 # of open elements.
442                                                 ancestor_i += 1
443                                                 ancestor = open_els[ancestor_i]
444                                                 # 5. If ancestor is a template node, jump to the step below
445                                                 # labeled done.
446                                                 if ancestor.name is 'template'
447                                                         break
448                                                 # 6. If ancestor is a table node, switch the insertion mode
449                                                 # to "in select in table" and abort these steps.
450                                                 if ancestor.name is 'table'
451                                                         insertion_mode = ins_mode_in_select_in_table
452                                                         return
453                                                 # 7. Jump back to the step labeled loop.
454                                 # 8. Done: Switch the insertion mode to "in select" and abort
455                                 # these steps.
456                                 insertion_mode = ins_mode_in_select
457                                 return
458                         # 5. If node is a td or th element and last is false, then switch
459                         # the insertion mode to "in cell" and abort these steps.
460                         if (node.name is 'td' or node.name is 'th') and last is false
461                                 insertion_mode = ins_mode_in_cell
462                                 return
463                         # 6. If node is a tr element, then switch the insertion mode to "in
464                         # row" and abort these steps.
465                         if node.name is 'tr'
466                                 insertion_mode = ins_mode_in_row
467                                 return
468                         # 7. If node is a tbody, thead, or tfoot element, then switch the
469                         # insertion mode to "in table body" and abort these steps.
470                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
471                                 insertion_mode = ins_mode_in_table_body
472                                 return
473                         # 8. If node is a caption element, then switch the insertion mode
474                         # to "in caption" and abort these steps.
475                         if node.name is 'caption'
476                                 insertion_mode = ins_mode_in_caption
477                                 return
478                         # 9. If node is a colgroup element, then switch the insertion mode
479                         # to "in column group" and abort these steps.
480                         if node.name is 'colgroup'
481                                 insertion_mode = ins_mode_in_column_group
482                                 return
483                         # 10. If node is a table element, then switch the insertion mode to
484                         # "in table" and abort these steps.
485                         if node.name is 'table'
486                                 insertion_mode = ins_mode_in_table
487                                 return
488                         # 11. If node is a template element, then switch the insertion mode
489                         # to the current template insertion mode and abort these steps.
490                         # fixfull (template insertion mode stack)
491
492                         # 12. If node is a head element and last is true, then switch the
493                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
494                         # these steps. (fragment case)
495                         if node.name is 'head' and last
496                                 insertion_mode = ins_mode_in_body
497                                 return
498                         # 13. If node is a head element and last is false, then switch the
499                         # insertion mode to "in head" and abort these steps.
500                         if node.name is 'head' and last is false
501                                 insertion_mode = ins_mode_in_head
502                                 return
503                         # 14. If node is a body element, then switch the insertion mode to
504                         # "in body" and abort these steps.
505                         if node.name is 'body'
506                                 insertion_mode = ins_mode_in_body
507                                 return
508                         # 15. If node is a frameset element, then switch the insertion mode
509                         # to "in frameset" and abort these steps. (fragment case)
510                         if node.name is 'frameset'
511                                 insertion_mode = ins_mode_in_frameset
512                                 return
513                         # 16. If node is an html element, run these substeps:
514                         if node.name is 'html'
515                                 # 1. If the head element pointer is null, switch the insertion
516                                 # mode to "before head" and abort these steps. (fragment case)
517                                 # fixfull (fragment case)
518
519                                 # 2. Otherwise, the head element pointer is not null, switch
520                                 # the insertion mode to "after head" and abort these steps.
521                                 insertion_mode = ins_mode_in_body # FIXME fixfull
522                                 return
523                         # 17. If last is true, then switch the insertion mode to "in body"
524                         # and abort these steps. (fragment case)
525                         if last
526                                 insertion_mode = ins_mode_in_body
527                                 return
528                         # 18. Let node now be the node before node in the stack of open
529                         # elements.
530                         node_i += 1
531                         node = open_els[node_i]
532                         # 19. Return to the step labeled loop.
533
534         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
535         # this implementation is structured (mostly) as described at the link above.
536         # capitalized comments are the "labels" described at the link above.
537         reconstruct_active_formatting_elements = ->
538                 return if afe.length is 0
539                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
540                         return
541                 # Rewind
542                 i = 0
543                 loop
544                         if i is afe.length - 1
545                                 break
546                         i += 1
547                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
548                                 i -= 1 # Advance
549                                 break
550                 # Create
551                 loop
552                         el = afe[i].shallow_clone()
553                         tree_insert_element el
554                         afe[i] = el
555                         break if i is 0
556                         i -= 1 # Advance
557
558         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
559         # adoption agency algorithm
560         # overview here:
561         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
562         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
563         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
564         adoption_agency = (subject) ->
565                 debug_log "adoption_agency()"
566                 debug_log "tree: #{serialize_els tree.children, false, true}"
567                 debug_log "open_els: #{serialize_els open_els, true, true}"
568                 debug_log "afe: #{serialize_els afe, true, true}"
569                 if open_els[0].name is subject
570                         el = open_els[0]
571                         open_els.shift()
572                         # remove it from the list of active formatting elements (if found)
573                         for t, i in afe
574                                 if t is el
575                                         afe.splice i, 1
576                                         break
577                         debug_log "aaa: starting off with subject on top of stack, exiting"
578                         return
579                 outer = 0
580                 loop
581                         if outer >= 8
582                                 return
583                         outer += 1
584                         # 5. Let formatting element be the last element in the list of
585                         # active formatting elements that: is between the end of the list
586                         # and the last scope marker in the list, if any, or the start of
587                         # the list otherwise, and  has the tag name subject.
588                         fe = null
589                         for t, fe_of_afe in afe
590                                 if t.type is TYPE_AFE_MARKER
591                                         break
592                                 if t.name is subject
593                                         fe = t
594                                         break
595                         # If there is no such element, then abort these steps and instead
596                         # act as described in the "any other end tag" entry above.
597                         if fe is null
598                                 debug_log "aaa: fe not found in afe"
599                                 in_body_any_other_end_tag subject
600                                 return
601                         # 6. If formatting element is not in the stack of open elements,
602                         # then this is a parse error; remove the element from the list, and
603                         # abort these steps.
604                         in_open_els = false
605                         for t, fe_of_open_els in open_els
606                                 if t is fe
607                                         in_open_els = true
608                                         break
609                         unless in_open_els
610                                 debug_log "aaa: fe not found in open_els"
611                                 parse_error()
612                                 # "remove it from the list" must mean afe, since it's not in open_els
613                                 afe.splice fe_of_afe, 1
614                                 return
615                         # 7. If formatting element is in the stack of open elements, but
616                         # the element is not in scope, then this is a parse error; abort
617                         # these steps.
618                         unless el_is_in_scope fe
619                                 debug_log "aaa: fe not in scope"
620                                 parse_error()
621                                 return
622                         # 8. If formatting element is not the current node, this is a parse
623                         # error. (But do not abort these steps.)
624                         unless open_els[0] is fe
625                                 parse_error()
626                                 # continue
627                         # 9. Let furthest block be the topmost node in the stack of open
628                         # elements that is lower in the stack than formatting element, and
629                         # is an element in the special category. There might not be one.
630                         fb = null
631                         fb_of_open_els = null
632                         for t, i in open_els
633                                 if t is fe
634                                         break
635                                 if el_is_special t
636                                         fb = t
637                                         fb_of_open_els = i
638                                         # and continue, to see if there's one that's more "topmost"
639                         # 10. If there is no furthest block, then the UA must first pop all
640                         # the nodes from the bottom of the stack of open elements, from the
641                         # current node up to and including formatting element, then remove
642                         # formatting element from the list of active formatting elements,
643                         # and finally abort these steps.
644                         if fb is null
645                                 debug_log "aaa: no fb"
646                                 loop
647                                         t = open_els.shift()
648                                         if t is fe
649                                                 afe.splice fe_of_afe, 1
650                                                 return
651                         # 11. Let common ancestor be the element immediately above
652                         # formatting element in the stack of open elements.
653                         ca = open_els[fe_of_open_els + 1] # common ancestor
654
655                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
656                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
657                         bookmark = new_aaa_bookmark()
658                         for t, i in afe
659                                 if t is fe
660                                         afe.splice i, 0, bookmark
661                                         break
662                         node = last_node = fb
663                         inner = 0
664                         loop
665                                 inner += 1
666                                 # 3. Let node be the element immediately above node in the
667                                 # stack of open elements, or if node is no longer in the stack
668                                 # of open elements (e.g. because it got removed by this
669                                 # algorithm), the element that was immediately above node in
670                                 # the stack of open elements before node was removed.
671                                 node_next = null
672                                 for t, i in open_els
673                                         if t is node
674                                                 node_next = open_els[i + 1]
675                                                 break
676                                 node = node_next ? node_above
677                                 debug_log "inner loop #{inner}"
678                                 debug_log "tree: #{serialize_els tree.children, false, true}"
679                                 debug_log "open_els: #{serialize_els open_els, true, true}"
680                                 debug_log "afe: #{serialize_els afe, true, true}"
681                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
682                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
683                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
684                                 debug_log "node: #{node.serialize true, true}"
685                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
686
687                                 # 4. If node is formatting element, then go to the next step in
688                                 # the overall algorithm.
689                                 if node is fe
690                                         break
691                                 debug_log "the meat"
692                                 # 5. If inner loop counter is greater than three and node is in
693                                 # the list of active formatting elements, then remove node from
694                                 # the list of active formatting elements.
695                                 node_in_afe = false
696                                 for t, i in afe
697                                         if t is node
698                                                 if inner > 3
699                                                         afe.splice i, 1
700                                                         debug_log "max out inner"
701                                                 else
702                                                         node_in_afe = true
703                                                         debug_log "in afe"
704                                                 break
705                                 # 6. If node is not in the list of active formatting elements,
706                                 # then remove node from the stack of open elements and then go
707                                 # back to the step labeled inner loop.
708                                 unless node_in_afe
709                                         debug_log "not in afe"
710                                         for t, i in open_els
711                                                 if t is node
712                                                         node_above = open_els[i + 1]
713                                                         open_els.splice i, 1
714                                                         break
715                                         continue
716                                 debug_log "the bones"
717                                 # 7. create an element for the token for which the element node
718                                 # was created, in the HTML namespace, with common ancestor as
719                                 # the intended parent; replace the entry for node in the list
720                                 # of active formatting elements with an entry for the new
721                                 # element, replace the entry for node in the stack of open
722                                 # elements with an entry for the new element, and let node be
723                                 # the new element.
724                                 new_node = node.shallow_clone()
725                                 for t, i in afe
726                                         if t is node
727                                                 afe[i] = new_node
728                                                 debug_log "replaced in afe"
729                                                 break
730                                 for t, i in open_els
731                                         if t is node
732                                                 node_above = open_els[i + 1]
733                                                 open_els[i] = new_node
734                                                 debug_log "replaced in open_els"
735                                                 break
736                                 node = new_node
737                                 # 8. If last node is furthest block, then move the
738                                 # aforementioned bookmark to be immediately after the new node
739                                 # in the list of active formatting elements.
740                                 if last_node is fb
741                                         for t, i in afe
742                                                 if t is bookmark
743                                                         afe.splice i, 1
744                                                         debug_log "removed bookmark"
745                                                         break
746                                         for t, i in afe
747                                                 if t is node
748                                                         # "after" means lower
749                                                         afe.splice i, 0, bookmark # "after as <-
750                                                         debug_log "placed bookmark after node"
751                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
752                                                         break
753                                 # 9. Insert last node into node, first removing it from its
754                                 # previous parent node if any.
755                                 if last_node.parent?
756                                         debug_log "last_node has parent"
757                                         for c, i in last_node.parent.children
758                                                 if c is last_node
759                                                         debug_log "removing last_node from parent"
760                                                         last_node.parent.children.splice i, 1
761                                                         break
762                                 node.children.push last_node
763                                 last_node.parent = node
764                                 # 10. Let last node be node.
765                                 last_node = node
766                                 debug_log "at last"
767                                 # 11. Return to the step labeled inner loop.
768                         # 14. Insert whatever last node ended up being in the previous step
769                         # at the appropriate place for inserting a node, but using common
770                         # ancestor as the override target.
771
772                         # JASON: In the case where fe is immediately followed by fb:
773                         #   * inner loop exits out early (node==fe)
774                         #   * last_node is fb
775                         #   * last_node is still in the tree (not a duplicate)
776                         if last_node.parent?
777                                 debug_log "FEFIRST? last_node has parent"
778                                 for c, i in last_node.parent.children
779                                         if c is last_node
780                                                 debug_log "removing last_node from parent"
781                                                 last_node.parent.children.splice i, 1
782                                                 break
783
784                         debug_log "after aaa inner loop"
785                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
786                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
787                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
788                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
789                         debug_log "tree: #{serialize_els tree.children, false, true}"
790
791                         debug_log "insert"
792
793
794                         # can't use standard insert token thing, because it's already in
795                         # open_els and must stay at it's current position in open_els
796                         dest = adjusted_insertion_location ca
797                         dest[0].children.splice dest[1], 0, last_node
798                         last_node.parent = dest[0]
799
800
801                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
802                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
803                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
804                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
805                         debug_log "tree: #{serialize_els tree.children, false, true}"
806
807                         # 15. Create an element for the token for which formatting element
808                         # was created, in the HTML namespace, with furthest block as the
809                         # intended parent.
810                         new_element = fe.shallow_clone() # FIXME intended parent thing
811                         # 16. Take all of the child nodes of furthest block and append them
812                         # to the element created in the last step.
813                         while fb.children.length
814                                 t = fb.children.shift()
815                                 t.parent = new_element
816                                 new_element.children.push t
817                         # 17. Append that new element to furthest block.
818                         new_element.parent = fb
819                         fb.children.push new_element
820                         # 18. Remove formatting element from the list of active formatting
821                         # elements, and insert the new element into the list of active
822                         # formatting elements at the position of the aforementioned
823                         # bookmark.
824                         for t, i in afe
825                                 if t is fe
826                                         afe.splice i, 1
827                                         break
828                         for t, i in afe
829                                 if t is bookmark
830                                         afe[i] = new_element
831                                         break
832                         # 19. Remove formatting element from the stack of open elements,
833                         # and insert the new element into the stack of open elements
834                         # immediately below the position of furthest block in that stack.
835                         for t, i in open_els
836                                 if t is fe
837                                         open_els.splice i, 1
838                                         break
839                         for t, i in open_els
840                                 if t is fb
841                                         open_els.splice i, 0, new_element
842                                         break
843                         # 20. Jump back to the step labeled outer loop.
844                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
845                         debug_log "tree: #{serialize_els tree.children, false, true}"
846                         debug_log "open_els: #{serialize_els open_els, true, true}"
847                         debug_log "afe: #{serialize_els afe, true, true}"
848                 debug_log "AAA DONE"
849
850         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
851         # FIXME test this (particularly emplied end tags)
852         close_p_element = ->
853                 generate_implied_end_tags 'p' # arg is exception
854                 if open_els[0].name isnt 'p'
855                         parse_error()
856                 while open_els.length > 1 # just in case
857                         el = open_els.shift()
858                         if el.name is 'p'
859                                 return
860         close_p_if_in_button_scope = ->
861                 if is_in_button_scope 'p'
862                         close_a_p_element()
863
864         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
865         tree_insert_text = (t) ->
866                 dest = adjusted_insertion_location()
867                 # fixfull check for Document node
868                 if dest[1] > 0
869                         prev = dest[0].children[dest[1] - 1]
870                         if prev.type is TYPE_TEXT
871                                 prev.text += t.text
872                                 return
873                 dest[0].children.splice dest[1], 0, t
874
875         # 8.2.5.1
876         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
877         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
878         adjusted_insertion_location = (override_target = null) ->
879                 # 1. If there was an override target specified, then let target be the
880                 # override target.
881                 if override_target?
882                         target = override_target
883                 else # Otherwise, let target be the current node.
884                         target = open_els[0]
885                 # 2. Determine the adjusted insertion location using the first matching
886                 # steps from the following list:
887                 #
888                 # If foster parenting is enabled and target is a table, tbody, tfoot,
889                 # thead, or tr element Foster parenting happens when content is
890                 # misnested in tables.
891                 if flag_foster_parenting and foster_parenting_targets[target.name]
892                         loop # once. this is here so we can ``break`` to "abort these substeps"
893                                 # 1. Let last template be the last template element in the
894                                 # stack of open elements, if any.
895                                 last_template = null
896                                 last_template_i = null
897                                 for el, i in open_els
898                                         if el.name is 'template'
899                                                 last_template = el
900                                                 last_template_i = i
901                                                 break
902                                 # 2. Let last table be the last table element in the stack of
903                                 # open elements, if any.
904                                 last_table = null
905                                 last_table_i
906                                 for el, i in open_els
907                                         if el.name is 'table'
908                                                 last_table = el
909                                                 last_table_i = i
910                                                 break
911                                 # 3. If there is a last template and either there is no last
912                                 # table, or there is one, but last template is lower (more
913                                 # recently added) than last table in the stack of open
914                                 # elements, then: let adjusted insertion location be inside
915                                 # last template's template contents, after its last child (if
916                                 # any), and abort these substeps.
917                                 if last_template and (last_table is null or last_template_i < last_table_i)
918                                         target = template # fixfull should be it's contents
919                                         target_i = target.children.length
920                                         break
921                                 # 4. If there is no last table, then let adjusted insertion
922                                 # location be inside the first element in the stack of open
923                                 # elements (the html element), after its last child (if any),
924                                 # and abort these substeps. (fragment case)
925                                 if last_table is null
926                                         # this is odd
927                                         target = open_els[open_els.length - 1]
928                                         target_i = target.children.length
929                                 # 5. If last table has a parent element, then let adjusted
930                                 # insertion location be inside last table's parent element,
931                                 # immediately before last table, and abort these substeps.
932                                 if last_table.parent?
933                                         for c, i in last_table.parent.children
934                                                 if c is last_table
935                                                         target = last_table.parent
936                                                         target_i = i
937                                                         break
938                                         break
939                                 # 6. Let previous element be the element immediately above last
940                                 # table in the stack of open elements.
941                                 #
942                                 # huh? how could it not have a parent?
943                                 previous_element = open_els[last_table_i + 1]
944                                 # 7. Let adjusted insertion location be inside previous
945                                 # element, after its last child (if any).
946                                 target = previous_element
947                                 target_i = target.children.length
948                                 # Note: These steps are involved in part because it's possible
949                                 # for elements, the table element in this case in particular,
950                                 # to have been moved by a script around in the DOM, or indeed
951                                 # removed from the DOM entirely, after the element was inserted
952                                 # by the parser.
953                                 break # don't really loop
954                 else
955                         # Otherwise Let adjusted insertion location be inside target, after
956                         # its last child (if any).
957                         target_i = target.children.length
958
959                 # 3. If the adjusted insertion location is inside a template element,
960                 # let it instead be inside the template element's template contents,
961                 # after its last child (if any).
962                 # fixfull (template)
963
964                 # 4. Return the adjusted insertion location.
965                 return [target, target_i]
966
967         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
968         # aka create_an_element_for_token
969         token_to_element = (t, namespace, intended_parent) ->
970                 t.type = TYPE_TAG # not TYPE_START_TAG
971                 # convert attributes into a hash
972                 attrs = {}
973                 while t.attrs_a.length
974                         a = t.attrs_a.pop()
975                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
976                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
977
978                 # TODO 2. If the newly created element has an xmlns attribute in the
979                 # XMLNS namespace whose value is not exactly the same as the element's
980                 # namespace, that is a parse error. Similarly, if the newly created
981                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
982                 # value is not the XLink Namespace, that is a parse error.
983
984                 # fixfull: the spec says stuff about form pointers and ownerDocument
985
986                 return el
987
988         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
989         insert_foreign_element = (token, namespace) ->
990                 ail = adjusted_insertion_location()
991                 ail_el = ail[0]
992                 ail_i = ail[1]
993                 el = token_to_element token, namespace, ail_el
994                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
995                 el.parent = ail_el
996                 ail_el.children.splice ail_i, 0, el
997                 open_els.unshift el
998                 return el
999         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1000         insert_html_element = insert_foreign_element # (token, namespace) ->
1001
1002         # FIXME read implement "foster parenting" part
1003         # FIXME read spec, do this right
1004         # FIXME implement the override target thing
1005         # note: this assumes it's an open tag
1006         # FIXME what part of the spec is this?
1007         # TODO look through all callers of this, and see what they should really be doing.
1008         #   eg probably insert_html_element for tokens
1009         tree_insert_element = (el, override_target = null, namespace = null) ->
1010                 if namespace?
1011                         el.namespace = namespace
1012                 dest = adjusted_insertion_location override_target
1013                 if el.type is TYPE_START_TAG # means it's a "token"
1014                         el = token_to_element el, namespace, dest[0]
1015                 unless el.namespace?
1016                         namespace = dest.namespace
1017                 # fixfull: Document nodes sometimes can't accept more chidren
1018                 dest[0].children.splice dest[1], 0, el
1019                 el.parent = dest[0]
1020                 open_els.unshift el
1021                 return el
1022
1023         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1024         # position should be [node, index_within_children]
1025         tree_insert_a_comment = (t, position = null) ->
1026                 position ?= adjusted_insertion_location()
1027                 position[0].children.splice position[1], 0, t
1028
1029         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1030         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1031         generate_implied_end_tags = (except = null) ->
1032                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1033                         open_els.shift()
1034
1035         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1036         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1037                 for node, i in open_els
1038                         if node.name is name # FIXME check namespace too
1039                                 generate_implied_end_tags name # arg is exception
1040                                 parse_error() unless i is 0
1041                                 while i >= 0
1042                                         open_els.shift()
1043                                         i -= 1
1044                                 return
1045                         if special_elements[node.name]? # FIXME check namespac too
1046                                 parse_error()
1047                                 return
1048         ins_mode_in_body = (t) ->
1049                 switch t.type
1050                         when TYPE_TEXT
1051                                 switch t.text
1052                                         when "\u0000"
1053                                                 parse_error()
1054                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1055                                                 reconstruct_active_formatting_elements()
1056                                                 tree_insert_text t
1057                                         else
1058                                                 reconstruct_active_formatting_elements()
1059                                                 tree_insert_text t
1060                                                 flag_frameset_ok = false
1061                         when TYPE_COMMENT
1062                                 tree_insert_a_comment t
1063                         when TYPE_DOCTYPE
1064                                 parse_error()
1065                         when TYPE_START_TAG
1066                                 switch t.name
1067                                         when 'html'
1068                                                 parse_error()
1069                                                 return if template_tag_is_open()
1070                                                 root_attrs = open_els[open_els.length - 1].attrs
1071                                                 for k, v of t.attrs
1072                                                         root_attrs[k] = v unless root_attrs[k]?
1073                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1074                                                 # FIXME also do this for </template> (end tag)
1075                                                 return tree_in_head t
1076                                         when 'body'
1077                                                 parse_error()
1078                                                 # TODO
1079                                         when 'frameset'
1080                                                 parse_error()
1081                                                 # TODO
1082                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1083                                                 close_p_if_in_button_scope()
1084                                                 insert_html_element t
1085                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1086                                                 close_p_if_in_button_scope()
1087                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1088                                                         parse_error()
1089                                                         open_els.shift()
1090                                                 insert_html_element t
1091                                         # TODO lots more to implement here
1092                                         when 'a'
1093                                                 # If the list of active formatting elements
1094                                                 # contains an a element between the end of the list and
1095                                                 # the last marker on the list (or the start of the list
1096                                                 # if there is no marker on the list), then this is a
1097                                                 # parse error; run the adoption agency algorithm for
1098                                                 # the tag name "a", then remove that element from the
1099                                                 # list of active formatting elements and the stack of
1100                                                 # open elements if the adoption agency algorithm didn't
1101                                                 # already remove it (it might not have if the element
1102                                                 # is not in table scope).
1103                                                 found = false
1104                                                 for el in afe
1105                                                         if el.type is TYPE_AFE_MARKER
1106                                                                 break
1107                                                         if el.name is 'a'
1108                                                                 found = el
1109                                                 if found?
1110                                                         parse_error()
1111                                                         adoption_agency 'a'
1112                                                         for el, i in afe
1113                                                                 if el is found
1114                                                                         afe.splice i, 1
1115                                                         for el, i in open_els
1116                                                                 if el is found
1117                                                                         open_els.splice i, 1
1118                                                 reconstruct_active_formatting_elements()
1119                                                 el = insert_html_element t
1120                                                 afe.unshift el
1121                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1122                                                 reconstruct_active_formatting_elements()
1123                                                 el = insert_html_element t
1124                                                 afe.unshift el
1125                                         when 'table'
1126                                                 # fixfull quirksmode thing
1127                                                 close_p_if_in_button_scope()
1128                                                 insert_html_element t
1129                                                 insertion_mode = ins_mode_in_table
1130                                         # TODO lots more to implement here
1131                                         else # any other start tag
1132                                                 reconstruct_active_formatting_elements()
1133                                                 insert_html_element t
1134                         when TYPE_EOF
1135                                 ok_tags = {
1136                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1137                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1138                                 }
1139                                 for t in open_els
1140                                         unless ok_tags[t.name]?
1141                                                 parse_error()
1142                                                 break
1143                                 # TODO stack of template insertion modes thing
1144                                 flag_parsing = false # stop parsing
1145                         when TYPE_END_TAG
1146                                 switch t.name
1147                                         when 'body'
1148                                                 unless is_in_scope 'body'
1149                                                         parse_error()
1150                                                         return
1151                                                 # TODO implement parse error and move to tree_after_body
1152                                         when 'html'
1153                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1154                                                         parse_error()
1155                                                         return
1156                                                 # TODO implement parse error and move to tree_after_body, reprocess
1157                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1158                                                 unless is_in_scope t.name, NS_HTML
1159                                                         parse_error()
1160                                                         return
1161                                                 generate_implied_end_tags()
1162                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1163                                                         parse_error()
1164                                                 loop
1165                                                         el = open_els.shift()
1166                                                         if el.name is t.name and el.namespace is NS_HTML
1167                                                                 return
1168                                         # TODO lots more close tags to implement here
1169                                         when 'p'
1170                                                 unless is_in_button_scope 'p'
1171                                                         parse_error()
1172                                                         insert_html_element new_open_tag 'p'
1173                                                 close_p_element()
1174                                         # TODO lots more close tags to implement here
1175                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1176                                                 adoption_agency t.name
1177                                         # TODO lots more close tags to implement here
1178                                         else
1179                                                 in_body_any_other_end_tag t.name
1180                 return
1181
1182         ins_mode_in_table_else = (t) ->
1183                 parse_error()
1184                 flag_foster_parenting = true # FIXME
1185                 ins_mode_in_body t
1186                 flag_foster_parenting = false
1187         can_in_table = {
1188                 'table': true
1189                 'tbody': true
1190                 'tfoot': true
1191                 'thead': true
1192                 'tr': true
1193         }
1194         clear_to_table_stopers = {
1195                 'table': true
1196                 'template': true
1197                 'html': true
1198         }
1199         clear_stack_to_table_context = ->
1200                 loop
1201                         if clear_to_table_stopers[open_els[0].name]?
1202                                 break
1203                         open_els.shift()
1204                 return
1205         clear_to_table_body_stopers = {
1206                 'tbody': true
1207                 'tfoot': true
1208                 'thead': true
1209                 'template': true
1210                 'html': true
1211         }
1212         clear_stack_to_table_body_context = ->
1213                 loop
1214                         if clear_to_table_body_stopers[open_els[0].name]?
1215                                 break
1216                         open_els.shift()
1217                 return
1218         clear_to_table_row_stopers = {
1219                 'tr': true
1220                 'template': true
1221                 'html': true
1222         }
1223         clear_stack_to_table_row_context = ->
1224                 loop
1225                         if clear_to_table_row_stopers[open_els[0].name]?
1226                                 break
1227                         open_els.shift()
1228                 return
1229         clear_afe_to_marker = ->
1230                 loop
1231                         el = afe.shift()
1232                         if el.type is TYPE_AFE_MARKER
1233                                 return
1234         ins_mode_in_table = (t) ->
1235                 switch t.type
1236                         when TYPE_TEXT
1237                                 if can_in_table[t.name]
1238                                         original_insertion_mode = insertion_mode
1239                                         insertion_mode = ins_mode_in_table_text
1240                                         insertion_mode t
1241                                 else
1242                                         ins_mode_in_table_else t
1243                         when TYPE_COMMENT
1244                                 tree_insert_a_comment t
1245                         when TYPE_DOCTYPE
1246                                 parse_error()
1247                         when TYPE_START_TAG
1248                                 switch t.name
1249                                         when 'caption'
1250                                                 clear_stack_to_table_context()
1251                                                 afe.unshift new_afe_marker()
1252                                                 insert_html_element t
1253                                                 insertion_mode = ins_mode_in_caption
1254                                         when 'colgroup'
1255                                                 clear_stack_to_table_context()
1256                                                 insert_html_element t
1257                                                 insertion_mode = ins_mode_in_column_group
1258                                         when 'col'
1259                                                 clear_stack_to_table_context()
1260                                                 insert_html_element new_open_tag 'colgroup'
1261                                                 insertion_mode = ins_mode_in_column_group
1262                                                 insertion_mode t
1263                                         when 'tbody', 'tfoot', 'thead'
1264                                                 clear_stack_to_table_context()
1265                                                 insert_html_element t
1266                                                 insertion_mode = ins_mode_in_table_body
1267                                         when 'td', 'th', 'tr'
1268                                                 clear_stack_to_table_context()
1269                                                 insert_html_element new_open_tag 'tbody'
1270                                                 insertion_mode = ins_mode_in_table_body
1271                                                 insertion_mode t
1272                                         when 'table'
1273                                                 parse_error()
1274                                                 if is_in_table_scope 'table'
1275                                                         loop
1276                                                                 el = open_els.shift()
1277                                                                 if el.name is 'table'
1278                                                                         break
1279                                                         reset_insertion_mode()
1280                                                         insertion_mode t
1281                                         when 'style', 'script', 'template'
1282                                                 ins_mode_in_head t
1283                                         when 'input'
1284                                                 if token_is_input_hidden t
1285                                                         ins_mode_in_table_else t
1286                                                 else
1287                                                         parse_error()
1288                                                         insert_html_element t
1289                                                         open_els.shift()
1290                                                         # fixfull acknowledge sef-closing flag
1291                                         when 'form'
1292                                                 parse_error()
1293                                                 if form_element_pointer?
1294                                                         return
1295                                                 if template_tag_is_open()
1296                                                         return
1297                                                 form_element_pointer = insert_html_element t
1298                                                 open_els.shift()
1299                                         else
1300                                                 ins_mode_in_table_else t
1301                         when TYPE_END_TAG
1302                                 switch t.name
1303                                         when 'table'
1304                                                 if is_in_table_scope 'table'
1305                                                         loop
1306                                                                 el = open_els.shift()
1307                                                                 if el.name is 'table'
1308                                                                         break
1309                                                         reset_insertion_mode()
1310                                                 else
1311                                                         parse_error
1312                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1313                                                 parse_error()
1314                                         when 'template'
1315                                                 ins_mode_in_head t
1316                                         else
1317                                                 ins_mode_in_table_else t
1318                         when TYPE_EOF
1319                                 ins_mode_in_body t
1320                         else
1321                                 ins_mode_in_table_else t
1322
1323
1324         ins_mode_in_table_text = (t) ->
1325                 switch t.type
1326                         when TYPE_TEXT
1327                                 switch t.text
1328                                         when "\u0000"
1329                                                 parse_error()
1330                                                 return
1331                 console.log "unimplemented ins_mode_in_table_text"
1332                 # FIXME CONTINUE
1333
1334         ins_mode_in_table_body = (t) ->
1335                 if t.type is TYPE_START_TAG and t.name is 'tr'
1336                         clear_stack_to_table_body_context()
1337                         insert_html_element t
1338                         return
1339                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1340                         parse_error()
1341                         clear_stack_to_table_body_context()
1342                         insert_html_element new_open_tag 'tr'
1343                         insertion_mode = ins_mode_in_row
1344                         insertion_mode t
1345                         return
1346                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1347                         unless is_in_table_scope t.name # fixfull check namespace
1348                                 parse_error()
1349                                 return
1350                         clear_stack_to_table_body_context()
1351                         open_els.shift()
1352                         insertion_mode = ins_mode_in_table
1353                         return
1354                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1355                         has = false
1356                         for el in open_els
1357                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1358                                         has = true
1359                                         break
1360                                 if table_scopers[el.name]
1361                                         break
1362                         if !has
1363                                 parse_error()
1364                                 return
1365                         clear_stack_to_table_body_context()
1366                         open_els.shift()
1367                         insertion_mode = ins_mode_in_table
1368                         insertion_mode t
1369                         return
1370                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1371                         parse_error()
1372                         return
1373                 # Anything else
1374                 ins_mode_in_table t
1375
1376         ins_mode_in_row = (t) ->
1377                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1378                         clear_stack_to_table_row_context()
1379                         insert_html_element t
1380                         insertion_mode = ins_mode_in_cell
1381                         afe.unshift new_afe_marker()
1382                         return
1383                 if t.type is TYPE_END_TAG and t.name is 'tr'
1384                         if is_in_table_scope 'tr'
1385                                 clear_stack_to_table_row_context()
1386                                 open_els.shift()
1387                                 insertion_mode = ins_mode_in_table_body
1388                         else
1389                                 parse_error()
1390                         return
1391                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1392                         if is_in_table_scope 'tr'
1393                                 clear_stack_to_table_row_context()
1394                                 open_els.shift()
1395                                 insertion_mode = ins_mode_in_table_body
1396                                 insertion_mode t
1397                         else
1398                                 parse_error()
1399                         return
1400                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1401                         if is_in_table_scope t.name # fixfull namespace
1402                                 if is_in_table_scope 'tr'
1403                                         clear_stack_to_table_row_context()
1404                                         open_els.shift()
1405                                         insertion_mode = ins_mode_in_table_body
1406                                         insertion_mode t
1407                         else
1408                                 parse_error()
1409                         return
1410                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1411                         parse_error()
1412                         return
1413                 # Anything else
1414                 ins_mode_in_table t
1415
1416         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1417         close_the_cell = ->
1418                 generate_implied_end_tags()
1419                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1420                         parse_error()
1421                 loop
1422                         el = open_els.shift()
1423                         if el.name is 'td' or el.name is 'th'
1424                                 break
1425                 clear_afe_to_marker()
1426                 insertion_mode = ins_mode_in_row
1427
1428         # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1429         ins_mode_in_cell = (t) ->
1430                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1431                         if is_in_table_scope t.name
1432                                 generate_implied_end_tags()
1433                                 if open_els[0].name isnt t.name
1434                                         parse_error
1435                                 loop
1436                                         el = open_els.shift()
1437                                         if el.name is t.name
1438                                                 break
1439                                 clear_afe_to_marker()
1440                                 insertion_mode = ins_mode_in_row
1441                         else
1442                                 parse_error()
1443                         return
1444                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1445                         has = false
1446                         for el in open_els
1447                                 if el.name is 'td' or el.name is 'th'
1448                                         has = true
1449                                         break
1450                                 if table_scopers[el.name]
1451                                         break
1452                         if !has
1453                                 parse_error()
1454                                 return
1455                         close_the_cell()
1456                         insertion_mode t
1457                         return
1458                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1459                         parse_error()
1460                         return
1461                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1462                         if is_in_table_scope t.name # fixfull namespace
1463                                 close_the_cell()
1464                                 insertion_mode t
1465                         else
1466                                 parse_error()
1467                         return
1468                 # Anything Else
1469                 ins_mode_in_body t
1470
1471
1472         # the functions below implement the tokenizer stats described here:
1473         # http://www.w3.org/TR/html5/syntax.html#tokenization
1474
1475         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1476         tok_state_data = ->
1477                 switch c = txt.charAt(cur++)
1478                         when '&'
1479                                 return new_text_node tokenize_character_reference()
1480                         when '<'
1481                                 tok_state = tok_state_tag_open
1482                         when "\u0000"
1483                                 parse_error()
1484                                 return new_text_node c
1485                         when '' # EOF
1486                                 return new_eof_token()
1487                         else
1488                                 return new_text_node c
1489                 return null
1490
1491         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1492         # not needed: tok_state_character_reference_in_data = ->
1493         # just call tok_state_character_reference_in_data()
1494
1495         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1496         tok_state_tag_open = ->
1497                 switch c = txt.charAt(cur++)
1498                         when '!'
1499                                 tok_state = tok_state_markup_declaration_open
1500                         when '/'
1501                                 tok_state = tok_state_end_tag_open
1502                         when '?'
1503                                 parse_error()
1504                                 tok_state = tok_state_bogus_comment
1505                         else
1506                                 if lc_alpha.indexOf(c) > -1
1507                                         tok_cur_tag = new_open_tag c
1508                                         tok_state = tok_state_tag_name
1509                                 else if uc_alpha.indexOf(c) > -1
1510                                         tok_cur_tag = new_open_tag c.toLowerCase()
1511                                         tok_state = tok_state_tag_name
1512                                 else
1513                                         parse_error()
1514                                         tok_state = tok_state_data
1515                                         cur -= 1 # we didn't parse/handle the char after <
1516                                         return new_text_node '<'
1517                 return null
1518
1519         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1520         tok_state_end_tag_open = ->
1521                 switch c = txt.charAt(cur++)
1522                         when '>'
1523                                 parse_error()
1524                                 tok_state = tok_state_data
1525                         when '' # EOF
1526                                 parse_error()
1527                                 tok_state = tok_state_data
1528                                 return new_text_node '</'
1529                         else
1530                                 if uc_alpha.indexOf(c) > -1
1531                                         tok_cur_tag = new_end_tag c.toLowerCase()
1532                                         tok_state = tok_state_tag_name
1533                                 else if lc_alpha.indexOf(c) > -1
1534                                         tok_cur_tag = new_end_tag c
1535                                         tok_state = tok_state_tag_name
1536                                 else
1537                                         parse_error()
1538                                         tok_state = tok_state_bogus_comment
1539                 return null
1540
1541         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1542         tok_state_tag_name = ->
1543                 switch c = txt.charAt(cur++)
1544                         when "\t", "\n", "\u000c", ' '
1545                                 tok_state = tok_state_before_attribute_name
1546                         when '/'
1547                                 tok_state = tok_state_self_closing_start_tag
1548                         when '>'
1549                                 tok_state = tok_state_data
1550                                 tmp = tok_cur_tag
1551                                 tok_cur_tag = null
1552                                 return tmp
1553                         when "\u0000"
1554                                 parse_error()
1555                                 tok_cur_tag.name += "\ufffd"
1556                         when '' # EOF
1557                                 parse_error()
1558                                 tok_state = tok_state_data
1559                         else
1560                                 if uc_alpha.indexOf(c) > -1
1561                                         tok_cur_tag.name += c.toLowerCase()
1562                                 else
1563                                         tok_cur_tag.name += c
1564                 return null
1565
1566         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1567         tok_state_before_attribute_name = ->
1568                 attr_name = null
1569                 switch c = txt.charAt(cur++)
1570                         when "\t", "\n", "\u000c", ' '
1571                                 return null
1572                         when '/'
1573                                 tok_state = tok_state_self_closing_start_tag
1574                                 return null
1575                         when '>'
1576                                 tok_state = tok_state_data
1577                                 tmp = tok_cur_tag
1578                                 tok_cur_tag = null
1579                                 return tmp
1580                         when "\u0000"
1581                                 parse_error()
1582                                 attr_name = "\ufffd"
1583                         when '"', "'", '<', '='
1584                                 parse_error()
1585                                 attr_name = c
1586                         when '' # EOF
1587                                 parse_error()
1588                                 tok_state = tok_state_data
1589                         else
1590                                 if uc_alpha.indexOf(c) > -1
1591                                         attr_name = c.toLowerCase()
1592                                 else
1593                                         attr_name = c
1594                 if attr_name?
1595                         tok_cur_tag.attrs_a.unshift [attr_name, '']
1596                         tok_state = tok_state_attribute_name
1597                 return null
1598
1599         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1600         tok_state_attribute_name = ->
1601                 switch c = txt.charAt(cur++)
1602                         when "\t", "\n", "\u000c", ' '
1603                                 tok_state = tok_state_after_attribute_name
1604                         when '/'
1605                                 tok_state = tok_state_self_closing_start_tag
1606                         when '='
1607                                 tok_state = tok_state_before_attribute_value
1608                         when '>'
1609                                 tok_state = tok_state_data
1610                                 tmp = tok_cur_tag
1611                                 tok_cur_tag = null
1612                                 return tmp
1613                         when "\u0000"
1614                                 parse_error()
1615                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
1616                         when '"', "'", '<'
1617                                 parse_error()
1618                                 tok_cur_tag.attrs_a[0][0] = c
1619                         when '' # EOF
1620                                 parse_error()
1621                                 tok_state = tok_state_data
1622                         else
1623                                 if uc_alpha.indexOf(c) > -1
1624                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
1625                                 else
1626                                         tok_cur_tag.attrs_a[0][0] += c
1627                 return null
1628
1629         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
1630         tok_state_before_attribute_value = ->
1631                 switch c = txt.charAt(cur++)
1632                         when "\t", "\n", "\u000c", ' '
1633                                 return null
1634                         when '"'
1635                                 tok_state = tok_state_attribute_value_double_quoted
1636                         when '&'
1637                                 tok_state = tok_state_attribute_value_unquoted
1638                                 cur -= 1
1639                         when "'"
1640                                 tok_state = tok_state_attribute_value_single_quoted
1641                         when "\u0000"
1642                                 # Parse error
1643                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1644                                 tok_state = tok_state_attribute_value_unquoted
1645                         when '>'
1646                                 # Parse error
1647                                 tok_state = tok_state_data
1648                                 tmp = tok_cur_tag
1649                                 tok_cur_tag = null
1650                                 return tmp
1651                         when '' # EOF
1652                                 parse_error()
1653                                 tok_state = tok_state_data
1654                         else
1655                                 tok_cur_tag.attrs_a[0][1] += c
1656                                 tok_state = tok_state_attribute_value_unquoted
1657                 return null
1658
1659         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
1660         tok_state_attribute_value_double_quoted = ->
1661                 switch c = txt.charAt(cur++)
1662                         when '"'
1663                                 tok_state = tok_state_after_attribute_value_quoted
1664                         when '&'
1665                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
1666                         when "\u0000"
1667                                 # Parse error
1668                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1669                         when '' # EOF
1670                                 parse_error()
1671                                 tok_state = tok_state_data
1672                         else
1673                                 tok_cur_tag.attrs_a[0][1] += c
1674                 return null
1675
1676         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
1677         tok_state_attribute_value_single_quoted = ->
1678                 switch c = txt.charAt(cur++)
1679                         when "'"
1680                                 tok_state = tok_state_after_attribute_value_quoted
1681                         when '&'
1682                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
1683                         when "\u0000"
1684                                 # Parse error
1685                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1686                         when '' # EOF
1687                                 parse_error()
1688                                 tok_state = tok_state_data
1689                         else
1690                                 tok_cur_tag.attrs_a[0][1] += c
1691                 return null
1692
1693         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
1694         tok_state_attribute_value_unquoted = ->
1695                 switch c = txt.charAt(cur++)
1696                         when "\t", "\n", "\u000c", ' '
1697                                 tok_state = tok_state_before_attribute_name
1698                         when '&'
1699                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
1700                         when '>'
1701                                 tok_state = tok_state_data
1702                                 tmp = tok_cur_tag
1703                                 tok_cur_tag = null
1704                                 return tmp
1705                         when "\u0000"
1706                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1707                         when '' # EOF
1708                                 parse_error()
1709                                 tok_state = tok_state_data
1710                         else
1711                                 # Parse Error if ', <, = or ` (backtick)
1712                                 tok_cur_tag.attrs_a[0][1] += c
1713                 return null
1714
1715         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
1716         tok_state_after_attribute_value_quoted = ->
1717                 switch c = txt.charAt(cur++)
1718                         when "\t", "\n", "\u000c", ' '
1719                                 tok_state = tok_state_before_attribute_name
1720                         when '/'
1721                                 tok_state = tok_state_self_closing_start_tag
1722                         when '>'
1723                                 tok_state = tok_state_data
1724                                 tmp = tok_cur_tag
1725                                 tok_cur_tag = null
1726                                 return tmp
1727                         when '' # EOF
1728                                 parse_error()
1729                                 tok_state = tok_state_data
1730                         else
1731                                 # Parse Error
1732                                 tok_state = tok_state_before_attribute_name
1733                                 cur -= 1 # we didn't handle that char
1734                 return null
1735
1736         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1737         # Don't set this as a state, just call it
1738         # returns a string (NOT a text node)
1739         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1740                 if cur >= txt.length
1741                         return '&'
1742                 switch c = txt.charAt(cur)
1743                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1744                                 # explicitly not a parse error
1745                                 return '&'
1746                         when ';'
1747                                 # there has to be "one or more" alnums between & and ; to be a parse error
1748                                 return '&'
1749                         when '#'
1750                                 if cur + 1 >= txt.length
1751                                         return '&'
1752                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
1753                                         prefix = '#x'
1754                                         charset = hex_chars
1755                                         start = cur + 2
1756                                 else
1757                                         charset = digits
1758                                         start = cur + 1
1759                                         prefix = '#'
1760                                 i = 0
1761                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1762                                         i += 1
1763                                 if i is 0
1764                                         return '&'
1765                                 if txt.charAt(start + i) is ';'
1766                                         i += 1
1767                                 # FIXME This is supposed to generate parse errors for some chars
1768                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1769                                 if decoded?
1770                                         cur = start + i
1771                                         return decoded
1772                                 return '&'
1773                         else
1774                                 for i in [0...31]
1775                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
1776                                                 break
1777                                 if i is 0
1778                                         # exit early, because parse_error() below needs at least one alnum
1779                                         return '&'
1780                                 if txt.charAt(cur + i) is ';'
1781                                         i += 1 # include ';' terminator in value
1782                                         decoded = decode_named_char_ref txt.substr(cur, i)
1783                                         if decoded?
1784                                                 cur += i
1785                                                 return decoded
1786                                         parse_error()
1787                                         return '&'
1788                                 else
1789                                         # no ';' terminator (only legacy char refs)
1790                                         max = i
1791                                         for i in [2..max] # no prefix matches, so ok to check shortest first
1792                                                 c = legacy_char_refs[txt.substr(cur, i)]
1793                                                 if c?
1794                                                         if in_attr
1795                                                                 if txt.charAt(cur + i) is '='
1796                                                                         # "because some legacy user agents will
1797                                                                         # misinterpret the markup in those cases"
1798                                                                         parse_error()
1799                                                                         return '&'
1800                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
1801                                                                         # this makes attributes forgiving about url args
1802                                                                         return '&'
1803                                                         # ok, and besides the weird exceptions for attributes...
1804                                                         # return the matching char
1805                                                         cur += i # consume entity chars
1806                                                         parse_error() # because no terminating ";"
1807                                                         return c
1808                                         parse_error()
1809                                         return '&'
1810                 return # never reached
1811
1812         # tree constructor initialization
1813         # see comments on TYPE_TAG/etc for the structure of this data
1814         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
1815         open_els = [tree]
1816         insertion_mode = ins_mode_in_body
1817         flag_frameset_ok = true
1818         flag_parsing = true
1819         flag_foster_parenting = false
1820         form_element_pointer = null
1821         afe = [] # active formatting elements
1822
1823         # tokenizer initialization
1824         tok_state = tok_state_data
1825
1826         # proccess input
1827         while flag_parsing
1828                 t = tok_state()
1829                 if t?
1830                         insertion_mode t
1831         return tree.children
1832
1833 # everything below is tests on the above
1834 test_equals = (description, output, expected_output) ->
1835         if output is expected_output
1836                 console.log "passed." # don't say name, so smart consoles can merge all of these
1837         else
1838                 console.log "FAILED: \"#{description}\""
1839                 console.log "   Expected: #{expected_output}"
1840                 console.log "     Actual: #{output}"
1841 serialize_els = (els, shallow, show_ids) ->
1842         serialized = ''
1843         sep = ''
1844         for t in els
1845                 serialized += sep
1846                 sep = ','
1847                 serialized += t.serialize shallow, show_ids
1848         return serialized
1849 test_parser = (args) ->
1850         debug_log_reset()
1851         parse_errors = []
1852         errors_cb = (i) ->
1853                 parse_errors.push i
1854         prev_node_id = 0 # reset counter
1855         parsed = parse_html args.html, errors_cb
1856         serialized = serialize_els parsed, false, false
1857         if serialized isnt args.expected
1858                 debug_log_each (str) ->
1859                         console.log str
1860                 console.log "FAILED: \"#{args.name}\""
1861                 console.log "      Input: #{args.html}"
1862                 console.log "    Correct: #{args.expected}"
1863                 console.log "     Output: #{serialized}"
1864                 if parse_errors.length > 0
1865                         console.log " parse errs: #{JSON.stringify parse_errors}"
1866                 else
1867                         console.log "   No parse errors"
1868         else
1869                 console.log "passed \"#{args.name}\""
1870
1871 test_parser name: "empty", \
1872         html: "",
1873         expected: ''
1874 test_parser name: "just text", \
1875         html: "abc",
1876         expected: 'text:"abc"'
1877 test_parser name: "named entity", \
1878         html: "a&amp;1234",
1879         expected: 'text:"a&1234"'
1880 test_parser name: "broken named character references", \
1881         html: "1&amp2&&amp;3&aabbcc;",
1882         expected: 'text:"1&2&&3&aabbcc;"'
1883 test_parser name: "numbered entity overrides", \
1884         html: "1&#X80&#x80; &#x83",
1885         expected: 'text:"1€€ ƒ"'
1886 test_parser name: "open tag", \
1887         html: "foo<span>bar",
1888         expected: 'text:"foo",tag:"span",{},[text:"bar"]'
1889 test_parser name: "open tag with attributes", \
1890         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1891         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
1892 test_parser name: "open tag with attributes of various quotings", \
1893         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1894         expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
1895 test_parser name: "attribute entity exceptions dq", \
1896         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
1897         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1898 test_parser name: "attribute entity exceptions sq", \
1899         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
1900         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1901 test_parser name: "attribute entity exceptions uq", \
1902         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
1903         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1904 test_parser name: "matching closing tags", \
1905         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1906         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
1907 test_parser name: "missing closing tag inside", \
1908         html: "foo<div>bar<span>baz</div>qux",
1909         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
1910 test_parser name: "mis-matched closing tags", \
1911         html: "<span>12<div>34</span>56</div>78",
1912         expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
1913 test_parser name: "mis-matched formatting elements", \
1914         html: "12<b>34<i>56</b>78</i>90",
1915         expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
1916 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
1917         html: '<p>1<b>2<i>3</b>4</i>5</p>',
1918         expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
1919 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
1920         html: '<b>1<p>2</b>3</p>',
1921         expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
1922 test_parser name: "crazy formatting elements test", \
1923         html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1924         # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1925         # firefox does this:
1926         expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
1927 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
1928 test_parser name: "html5lib aaa 1", \
1929         html: '<a><p></a></p>',
1930         expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
1931 test_parser name: "html5lib aaa 2", \
1932         html: '<a>1<p>2</a>3</p>',
1933         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
1934 test_parser name: "html5lib aaa 3", \
1935         html: '<a>1<button>2</a>3</button>',
1936         expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
1937 test_parser name: "html5lib aaa 4", \
1938         html: '<a>1<b>2</a>3</b>',
1939         expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
1940 test_parser name: "html5lib aaa 5 (two divs deep)", \
1941         html: '<a>1<div>2<div>3</a>4</div>5</div>',
1942         expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
1943 test_parser name: "html5lib aaa 6 (foster parenting)", \
1944         html: '<table><a>1<p>2</a>3</p>',
1945         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
1946 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
1947         html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
1948         expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
1949 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
1950         html: '<table><a>1<td>2</td>3</table>',
1951         expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'