JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix test case (for predictable attrs serialization)
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of Nodes.
26
27
28 # stacks/lists
29 #
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
33 # (both as stacks)
34 #
35 # stacks grow downward (current element is index=0)
36 #
37 # example: open_els = [a, b, c, d, e, f, g]
38 #
39 # "grows downwards" means it's visualized like this: (index: el, names)
40 #
41 #   6: g "start of the list", "topmost", "first"
42 #   5: f
43 #   4: e "previous" (to d), "above", "before"
44 #   3: d   (previous/next are relative to this element)
45 #   2: c "next", "after", "lower", "below"
46 #   1: b
47 #   0: a "end of the list", "current node", "bottommost", "last"
48
49
50
51 # Each node is an obect of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
54 TYPE_COMMENT = 2
55 TYPE_DOCTYPE = 3
56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
59 TYPE_EOF = 6
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
62
63 # namespace constants
64 NS_HTML = 1
65 NS_MATHML = 2
66 NS_SVG = 3
67
68 g_debug_log = []
69 debug_log_reset = ->
70         g_debug_log = []
71 debug_log = (str) ->
72         g_debug_log.push str
73 debug_log_each = (cb) ->
74         for str in g_debug_log
75                 cb str
76
77 prev_node_id = 0
78 class Node
79         constructor: (type, args = {}) ->
80                 @type = type # one of the TYPE_* constants above
81                 @name = args.name ? '' # tag name
82                 @text = args.text ? '' # contents for text/comment nodes
83                 @attrs = args.attrs ? {}
84                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85                 @children = args.children ? []
86                 @namespace = args.namespace ? NS_HTML
87                 @parent = args.parent ? null
88                 if args.id?
89                         @id = "#{args.id}+"
90                 else
91                         @id = "#{++prev_node_id}"
92         shallow_clone: -> # return a new node that's the same except without the children or parent
93                 # WARNING this doesn't work right on open tags that are still being parsed
94                 attrs = {}
95                 attrs[k] = v for k, v of @attrs
96                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97         acknowledge_self_closing: ->
98                 # fixfull
99         serialize: (shallow = false, show_ids = false) -> # for unit tests
100                 ret = ''
101                 switch @type
102                         when TYPE_TAG
103                                 ret += 'tag:'
104                                 ret += JSON.stringify @name
105                                 ret += ','
106                                 if show_ids
107                                         ret += "##{@id},"
108                                 if shallow
109                                         break
110                                 attr_keys = []
111                                 for k of @attrs
112                                         attr_keys.push k
113                                 attr_keys.sort()
114                                 ret += '{'
115                                 sep = ''
116                                 for k in attr_keys
117                                         ret += sep
118                                         sep = ','
119                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
120                                 ret += '},['
121                                 sep = ''
122                                 for c in @children
123                                         ret += sep
124                                         sep = ','
125                                         ret += c.serialize shallow, show_ids
126                                 ret += ']'
127                         when TYPE_TEXT
128                                 ret += 'text:'
129                                 ret += JSON.stringify @text
130                         when TYPE_COMMENT
131                                 ret += 'comment:'
132                                 ret += JSON.stringify @text
133                         when TYPE_DOCTYPE
134                                 ret += 'doctype'
135                                 # FIXME
136                         when TYPE_AFE_MARKER
137                                 ret += 'marker'
138                         when TYPE_AAA_BOOKMARK
139                                 ret += 'aaa_bookmark'
140                         else
141                                 ret += 'unknown:'
142                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
143                 return ret
144
145 # helpers: (only take args that are normally known when parser creates nodes)
146 new_open_tag = (name) ->
147         return new Node TYPE_START_TAG, name: name
148 new_end_tag = (name) ->
149         return new Node TYPE_END_TAG, name: name
150 new_element = (name) ->
151         return new Node TYPE_TAG, name: name
152 new_text_node = (txt) ->
153         return new Node TYPE_TEXT, text: txt
154 new_character_token = new_text_node
155 new_comment_node = (txt) ->
156         return new Node TYPE_COMMENT, text: txt
157 new_eof_token = ->
158         return new Node TYPE_EOF
159 new_afe_marker = ->
160         return new Node TYPE_AFE_MARKER
161 new_aaa_bookmark = ->
162         return new Node TYPE_AAA_BOOKMARK
163
164 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
165 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
166 digits = "0123456789"
167 alnum = lc_alpha + uc_alpha + digits
168 hex_chars = digits + "abcdefABCDEF"
169
170 # some SVG elements have dashes in them
171 tag_name_chars = alnum + "-"
172
173 # http://www.w3.org/TR/html5/infrastructure.html#space-character
174 space_chars = "\u0009\u000a\u000c\u000d\u0020"
175
176 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
177 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
178
179 # These are the character references that don't need a terminating semicolon
180 # min length: 2, max: 6, none are a prefix of any other.
181 legacy_char_refs = {
182         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
183         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
184         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
185         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
186         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
187         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
188         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
189         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
190         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
191         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
192         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
193         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
194         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
195         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
196         shy: '­', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
197         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
198         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
199         yen: '¥', yuml: 'ÿ'
200 }
201
202 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
203 raw_text_elements = ['script', 'style']
204 escapable_raw_text_elements = ['textarea', 'title']
205 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
206 svg_elements = [
207         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
208         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
209         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
210         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
211         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
212         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
213         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
214         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
215         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
216         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
217         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
218         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
219         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
220         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
221         'view', 'vkern'
222 ]
223
224 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
225 mathml_elements = [
226         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
227         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
228         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
229         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
230         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
231         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
232         'determinant', 'diff', 'divergence', 'divide', 'domain',
233         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
234         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
235         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
236         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
237         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
238         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
239         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
240         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
241         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
242         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
243         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
244         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
245         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
246         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
247         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
248         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
249         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
250         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
251         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
252         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
253         'vectorproduct', 'xor'
254 ]
255 # foreign_elements = [svg_elements..., mathml_elements...]
256 #normal_elements = All other allowed HTML elements are normal elements.
257
258 special_elements = {
259         # HTML:
260         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
261         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
262         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
263         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
264         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
265         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
266         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
267         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
268         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
269         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
270         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
271         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
272         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
273         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
274         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
275         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
276         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
277         wbr:NS_HTML, xmp:NS_HTML,
278
279         # MathML:
280         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
281         'annotation-xml':NS_MATHML,
282
283         # SVG:
284         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
285 }
286
287 formatting_elements = {
288          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
289          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
290          u: true
291 }
292
293 foster_parenting_targets = {
294         table: true
295         tbody: true
296         tfoot: true
297         thead: true
298         tr: true
299 }
300
301 # all html I presume
302 end_tag_implied = {
303         dd: true
304         dt: true
305         li: true
306         option: true
307         optgroup: true
308         p: true
309         rb: true
310         rp: true
311         rt: true
312         rtc: true
313 }
314
315 el_is_special = (e) ->
316         return special_elements[e.name]?
317         # FIXME it should really be:
318         #return special_elements[e.name] is e.namespace
319
320 # decode_named_char_ref()
321 #
322 # The list of named character references is _huge_ so ask the browser to decode
323 # for us instead of wasting bandwidth/space on including the table here.
324 #
325 # Pass without the "&" but with the ";" examples:
326 #    for "&amp" pass "amp;"
327 #    for "&#x2032" pass "x2032;"
328 g_dncr = {
329         cache: {}
330         textarea: document.createElement('textarea')
331 }
332 # TODO test this in IE8
333 decode_named_char_ref = (txt) ->
334         txt = "&#{txt}"
335         decoded = g_dncr.cache[txt]
336         return decoded if decoded?
337         g_dncr.textarea.innerHTML = txt
338         decoded = g_dncr.textarea.value
339         return null if decoded is txt
340         return g_dncr.cache[txt] = decoded
341
342 parse_html = (txt, parse_error_cb = null) ->
343         cur = 0 # index of next char in txt to be parsed
344         # declare tree and tokenizer variables so they're in scope below
345         tree = null
346         open_els = null # stack of open elements
347         afe = null # active formatting elements
348         template_insertion_modes = null
349         insertion_mode = null
350         original_insertion_mode = null
351         tok_state = null
352         tok_cur_tag = null # partially parsed tag
353         flag_scripting = null
354         flag_frameset_ok = null
355         flag_parsing = null
356         flag_foster_parenting = null
357         form_element_pointer = null
358         temporary_buffer = null
359
360         parse_error = ->
361                 if parse_error_cb?
362                         parse_error_cb cur
363                 else
364                         console.log "Parse error at character #{cur} of #{txt.length}"
365
366         afe_push = (new_el) ->
367                 matches = 0
368                 for el, i in afe
369                         if el.name is new_el.name and el.namespace is new_el.namespace
370                                 for k, v of el.attrs
371                                         continue unless new_el.attrs[k] is v
372                                 for k, v of new_el.attrs
373                                         continue unless el.attrs[k] is v
374                                 matches += 1
375                                 if matches is 3
376                                         afe.splice i, 1
377                                         break
378                 afe.unshift new_el
379         afe_push_marker = ->
380                 afe.unshift new_afe_marker()
381
382         # the functions below impliment the Tree Contstruction algorithm
383         # http://www.w3.org/TR/html5/syntax.html#tree-construction
384
385         # But first... the helpers
386         template_tag_is_open = ->
387                 for t in open_els
388                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
389                                 return true
390                 return false
391         is_in_scope_x = (tag_name, scope, namespace) ->
392                 for t in open_els
393                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
394                                 return true
395                         if scope[t.name] is t.namespace
396                                 return false
397                 return false
398         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
399                 for t in open_els
400                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
401                                 return true
402                         if scope[t.name] is t.namespace
403                                 return false
404                         if scope2[t.name] is t.namespace
405                                 return false
406                 return false
407         standard_scopers = { # FIXME these are supposed to be namespace specific
408                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
409                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
410                 template: NS_HTML, mi: NS_MATHML,
411
412                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
413                 'annotation-xml': NS_MATHML,
414
415                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
416         }
417         button_scopers = button: NS_HTML
418         li_scopers = ol: NS_HTML, ul: NS_HTML
419         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
420         is_in_scope = (tag_name, namespace = null) ->
421                 return is_in_scope_x tag_name, standard_scopers, namespace
422         is_in_button_scope = (tag_name, namespace = null) ->
423                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
424         is_in_table_scope = (tag_name, namespace = null) ->
425                 return is_in_scope_x tag_name, table_scopers, namespace
426         is_in_select_scope = (tag_name, namespace = null) ->
427                 for t in open_els
428                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
429                                 return true
430                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
431                                 return false
432                 return false
433         # this checks for a particular element, not by name
434         el_is_in_scope = (el) ->
435                 for t in open_els
436                         if t is el
437                                 return true
438                         if standard_scopers[t.name] is t.namespace
439                                 return false
440                 return false
441
442         # 8.2.3.1 ...
443         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
444         reset_insertion_mode = ->
445                 # 1. Let last be false.
446                 last = false
447                 # 2. Let node be the last node in the stack of open elements.
448                 node_i = 0
449                 node = open_els[node_i]
450                 # 3. Loop: If node is the first node in the stack of open elements,
451                 # then set last to true, and, if the parser was originally created as
452                 # part of the HTML fragment parsing algorithm (fragment case) set node
453                 # to the context element.
454                 loop
455                         if node_i is open_els.length - 1
456                                 last = true
457                                 # fixfull (fragment case)
458
459                         # 4. If node is a select element, run these substeps:
460                         if node.name is 'select'
461                                 # 1. If last is true, jump to the step below labeled done.
462                                 unless last
463                                         # 2. Let ancestor be node.
464                                         ancestor_i = node_i
465                                         ancestor = node
466                                         # 3. Loop: If ancestor is the first node in the stack of
467                                         # open elements, jump to the step below labeled done.
468                                         loop
469                                                 if ancestor_i is open_els.length - 1
470                                                         break
471                                                 # 4. Let ancestor be the node before ancestor in the stack
472                                                 # of open elements.
473                                                 ancestor_i += 1
474                                                 ancestor = open_els[ancestor_i]
475                                                 # 5. If ancestor is a template node, jump to the step below
476                                                 # labeled done.
477                                                 if ancestor.name is 'template'
478                                                         break
479                                                 # 6. If ancestor is a table node, switch the insertion mode
480                                                 # to "in select in table" and abort these steps.
481                                                 if ancestor.name is 'table'
482                                                         insertion_mode = ins_mode_in_select_in_table
483                                                         return
484                                                 # 7. Jump back to the step labeled loop.
485                                 # 8. Done: Switch the insertion mode to "in select" and abort
486                                 # these steps.
487                                 insertion_mode = ins_mode_in_select
488                                 return
489                         # 5. If node is a td or th element and last is false, then switch
490                         # the insertion mode to "in cell" and abort these steps.
491                         if (node.name is 'td' or node.name is 'th') and last is false
492                                 insertion_mode = ins_mode_in_cell
493                                 return
494                         # 6. If node is a tr element, then switch the insertion mode to "in
495                         # row" and abort these steps.
496                         if node.name is 'tr'
497                                 insertion_mode = ins_mode_in_row
498                                 return
499                         # 7. If node is a tbody, thead, or tfoot element, then switch the
500                         # insertion mode to "in table body" and abort these steps.
501                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
502                                 insertion_mode = ins_mode_in_table_body
503                                 return
504                         # 8. If node is a caption element, then switch the insertion mode
505                         # to "in caption" and abort these steps.
506                         if node.name is 'caption'
507                                 insertion_mode = ins_mode_in_caption
508                                 return
509                         # 9. If node is a colgroup element, then switch the insertion mode
510                         # to "in column group" and abort these steps.
511                         if node.name is 'colgroup'
512                                 insertion_mode = ins_mode_in_column_group
513                                 return
514                         # 10. If node is a table element, then switch the insertion mode to
515                         # "in table" and abort these steps.
516                         if node.name is 'table'
517                                 insertion_mode = ins_mode_in_table
518                                 return
519                         # 11. If node is a template element, then switch the insertion mode
520                         # to the current template insertion mode and abort these steps.
521                         # fixfull (template insertion mode stack)
522
523                         # 12. If node is a head element and last is true, then switch the
524                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
525                         # these steps. (fragment case)
526                         if node.name is 'head' and last
527                                 insertion_mode = ins_mode_in_body
528                                 return
529                         # 13. If node is a head element and last is false, then switch the
530                         # insertion mode to "in head" and abort these steps.
531                         if node.name is 'head' and last is false
532                                 insertion_mode = ins_mode_in_head
533                                 return
534                         # 14. If node is a body element, then switch the insertion mode to
535                         # "in body" and abort these steps.
536                         if node.name is 'body'
537                                 insertion_mode = ins_mode_in_body
538                                 return
539                         # 15. If node is a frameset element, then switch the insertion mode
540                         # to "in frameset" and abort these steps. (fragment case)
541                         if node.name is 'frameset'
542                                 insertion_mode = ins_mode_in_frameset
543                                 return
544                         # 16. If node is an html element, run these substeps:
545                         if node.name is 'html'
546                                 # 1. If the head element pointer is null, switch the insertion
547                                 # mode to "before head" and abort these steps. (fragment case)
548                                 # fixfull (fragment case)
549
550                                 # 2. Otherwise, the head element pointer is not null, switch
551                                 # the insertion mode to "after head" and abort these steps.
552                                 insertion_mode = ins_mode_in_body # FIXME fixfull
553                                 return
554                         # 17. If last is true, then switch the insertion mode to "in body"
555                         # and abort these steps. (fragment case)
556                         if last
557                                 insertion_mode = ins_mode_in_body
558                                 return
559                         # 18. Let node now be the node before node in the stack of open
560                         # elements.
561                         node_i += 1
562                         node = open_els[node_i]
563                         # 19. Return to the step labeled loop.
564
565         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
566         # this implementation is structured (mostly) as described at the link above.
567         # capitalized comments are the "labels" described at the link above.
568         reconstruct_active_formatting_elements = ->
569                 return if afe.length is 0
570                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
571                         return
572                 # Rewind
573                 i = 0
574                 loop
575                         if i is afe.length - 1
576                                 break
577                         i += 1
578                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
579                                 i -= 1 # Advance
580                                 break
581                 # Create
582                 loop
583                         el = afe[i].shallow_clone()
584                         tree_insert_element el
585                         afe[i] = el
586                         break if i is 0
587                         i -= 1 # Advance
588
589         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
590         # adoption agency algorithm
591         # overview here:
592         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
593         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
594         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
595         adoption_agency = (subject) ->
596                 debug_log "adoption_agency()"
597                 debug_log "tree: #{serialize_els tree.children, false, true}"
598                 debug_log "open_els: #{serialize_els open_els, true, true}"
599                 debug_log "afe: #{serialize_els afe, true, true}"
600                 if open_els[0].name is subject
601                         el = open_els[0]
602                         open_els.shift()
603                         # remove it from the list of active formatting elements (if found)
604                         for t, i in afe
605                                 if t is el
606                                         afe.splice i, 1
607                                         break
608                         debug_log "aaa: starting off with subject on top of stack, exiting"
609                         return
610                 outer = 0
611                 loop
612                         if outer >= 8
613                                 return
614                         outer += 1
615                         # 5. Let formatting element be the last element in the list of
616                         # active formatting elements that: is between the end of the list
617                         # and the last scope marker in the list, if any, or the start of
618                         # the list otherwise, and  has the tag name subject.
619                         fe = null
620                         for t, fe_of_afe in afe
621                                 if t.type is TYPE_AFE_MARKER
622                                         break
623                                 if t.name is subject
624                                         fe = t
625                                         break
626                         # If there is no such element, then abort these steps and instead
627                         # act as described in the "any other end tag" entry above.
628                         if fe is null
629                                 debug_log "aaa: fe not found in afe"
630                                 in_body_any_other_end_tag subject
631                                 return
632                         # 6. If formatting element is not in the stack of open elements,
633                         # then this is a parse error; remove the element from the list, and
634                         # abort these steps.
635                         in_open_els = false
636                         for t, fe_of_open_els in open_els
637                                 if t is fe
638                                         in_open_els = true
639                                         break
640                         unless in_open_els
641                                 debug_log "aaa: fe not found in open_els"
642                                 parse_error()
643                                 # "remove it from the list" must mean afe, since it's not in open_els
644                                 afe.splice fe_of_afe, 1
645                                 return
646                         # 7. If formatting element is in the stack of open elements, but
647                         # the element is not in scope, then this is a parse error; abort
648                         # these steps.
649                         unless el_is_in_scope fe
650                                 debug_log "aaa: fe not in scope"
651                                 parse_error()
652                                 return
653                         # 8. If formatting element is not the current node, this is a parse
654                         # error. (But do not abort these steps.)
655                         unless open_els[0] is fe
656                                 parse_error()
657                                 # continue
658                         # 9. Let furthest block be the topmost node in the stack of open
659                         # elements that is lower in the stack than formatting element, and
660                         # is an element in the special category. There might not be one.
661                         fb = null
662                         fb_of_open_els = null
663                         for t, i in open_els
664                                 if t is fe
665                                         break
666                                 if el_is_special t
667                                         fb = t
668                                         fb_of_open_els = i
669                                         # and continue, to see if there's one that's more "topmost"
670                         # 10. If there is no furthest block, then the UA must first pop all
671                         # the nodes from the bottom of the stack of open elements, from the
672                         # current node up to and including formatting element, then remove
673                         # formatting element from the list of active formatting elements,
674                         # and finally abort these steps.
675                         if fb is null
676                                 debug_log "aaa: no fb"
677                                 loop
678                                         t = open_els.shift()
679                                         if t is fe
680                                                 afe.splice fe_of_afe, 1
681                                                 return
682                         # 11. Let common ancestor be the element immediately above
683                         # formatting element in the stack of open elements.
684                         ca = open_els[fe_of_open_els + 1] # common ancestor
685
686                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
687                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
688                         bookmark = new_aaa_bookmark()
689                         for t, i in afe
690                                 if t is fe
691                                         afe.splice i, 0, bookmark
692                                         break
693                         node = last_node = fb
694                         inner = 0
695                         loop
696                                 inner += 1
697                                 # 3. Let node be the element immediately above node in the
698                                 # stack of open elements, or if node is no longer in the stack
699                                 # of open elements (e.g. because it got removed by this
700                                 # algorithm), the element that was immediately above node in
701                                 # the stack of open elements before node was removed.
702                                 node_next = null
703                                 for t, i in open_els
704                                         if t is node
705                                                 node_next = open_els[i + 1]
706                                                 break
707                                 node = node_next ? node_above
708                                 debug_log "inner loop #{inner}"
709                                 debug_log "tree: #{serialize_els tree.children, false, true}"
710                                 debug_log "open_els: #{serialize_els open_els, true, true}"
711                                 debug_log "afe: #{serialize_els afe, true, true}"
712                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
713                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
714                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
715                                 debug_log "node: #{node.serialize true, true}"
716                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
717
718                                 # 4. If node is formatting element, then go to the next step in
719                                 # the overall algorithm.
720                                 if node is fe
721                                         break
722                                 debug_log "the meat"
723                                 # 5. If inner loop counter is greater than three and node is in
724                                 # the list of active formatting elements, then remove node from
725                                 # the list of active formatting elements.
726                                 node_in_afe = false
727                                 for t, i in afe
728                                         if t is node
729                                                 if inner > 3
730                                                         afe.splice i, 1
731                                                         debug_log "max out inner"
732                                                 else
733                                                         node_in_afe = true
734                                                         debug_log "in afe"
735                                                 break
736                                 # 6. If node is not in the list of active formatting elements,
737                                 # then remove node from the stack of open elements and then go
738                                 # back to the step labeled inner loop.
739                                 unless node_in_afe
740                                         debug_log "not in afe"
741                                         for t, i in open_els
742                                                 if t is node
743                                                         node_above = open_els[i + 1]
744                                                         open_els.splice i, 1
745                                                         break
746                                         continue
747                                 debug_log "the bones"
748                                 # 7. create an element for the token for which the element node
749                                 # was created, in the HTML namespace, with common ancestor as
750                                 # the intended parent; replace the entry for node in the list
751                                 # of active formatting elements with an entry for the new
752                                 # element, replace the entry for node in the stack of open
753                                 # elements with an entry for the new element, and let node be
754                                 # the new element.
755                                 new_node = node.shallow_clone()
756                                 for t, i in afe
757                                         if t is node
758                                                 afe[i] = new_node
759                                                 debug_log "replaced in afe"
760                                                 break
761                                 for t, i in open_els
762                                         if t is node
763                                                 node_above = open_els[i + 1]
764                                                 open_els[i] = new_node
765                                                 debug_log "replaced in open_els"
766                                                 break
767                                 node = new_node
768                                 # 8. If last node is furthest block, then move the
769                                 # aforementioned bookmark to be immediately after the new node
770                                 # in the list of active formatting elements.
771                                 if last_node is fb
772                                         for t, i in afe
773                                                 if t is bookmark
774                                                         afe.splice i, 1
775                                                         debug_log "removed bookmark"
776                                                         break
777                                         for t, i in afe
778                                                 if t is node
779                                                         # "after" means lower
780                                                         afe.splice i, 0, bookmark # "after as <-
781                                                         debug_log "placed bookmark after node"
782                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
783                                                         break
784                                 # 9. Insert last node into node, first removing it from its
785                                 # previous parent node if any.
786                                 if last_node.parent?
787                                         debug_log "last_node has parent"
788                                         for c, i in last_node.parent.children
789                                                 if c is last_node
790                                                         debug_log "removing last_node from parent"
791                                                         last_node.parent.children.splice i, 1
792                                                         break
793                                 node.children.push last_node
794                                 last_node.parent = node
795                                 # 10. Let last node be node.
796                                 last_node = node
797                                 debug_log "at last"
798                                 # 11. Return to the step labeled inner loop.
799                         # 14. Insert whatever last node ended up being in the previous step
800                         # at the appropriate place for inserting a node, but using common
801                         # ancestor as the override target.
802
803                         # JASON: In the case where fe is immediately followed by fb:
804                         #   * inner loop exits out early (node==fe)
805                         #   * last_node is fb
806                         #   * last_node is still in the tree (not a duplicate)
807                         if last_node.parent?
808                                 debug_log "FEFIRST? last_node has parent"
809                                 for c, i in last_node.parent.children
810                                         if c is last_node
811                                                 debug_log "removing last_node from parent"
812                                                 last_node.parent.children.splice i, 1
813                                                 break
814
815                         debug_log "after aaa inner loop"
816                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
817                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
818                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
819                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
820                         debug_log "tree: #{serialize_els tree.children, false, true}"
821
822                         debug_log "insert"
823
824
825                         # can't use standard insert token thing, because it's already in
826                         # open_els and must stay at it's current position in open_els
827                         dest = adjusted_insertion_location ca
828                         dest[0].children.splice dest[1], 0, last_node
829                         last_node.parent = dest[0]
830
831
832                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
833                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
834                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
835                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
836                         debug_log "tree: #{serialize_els tree.children, false, true}"
837
838                         # 15. Create an element for the token for which formatting element
839                         # was created, in the HTML namespace, with furthest block as the
840                         # intended parent.
841                         new_element = fe.shallow_clone() # FIXME intended parent thing
842                         # 16. Take all of the child nodes of furthest block and append them
843                         # to the element created in the last step.
844                         while fb.children.length
845                                 t = fb.children.shift()
846                                 t.parent = new_element
847                                 new_element.children.push t
848                         # 17. Append that new element to furthest block.
849                         new_element.parent = fb
850                         fb.children.push new_element
851                         # 18. Remove formatting element from the list of active formatting
852                         # elements, and insert the new element into the list of active
853                         # formatting elements at the position of the aforementioned
854                         # bookmark.
855                         for t, i in afe
856                                 if t is fe
857                                         afe.splice i, 1
858                                         break
859                         for t, i in afe
860                                 if t is bookmark
861                                         afe[i] = new_element
862                                         break
863                         # 19. Remove formatting element from the stack of open elements,
864                         # and insert the new element into the stack of open elements
865                         # immediately below the position of furthest block in that stack.
866                         for t, i in open_els
867                                 if t is fe
868                                         open_els.splice i, 1
869                                         break
870                         for t, i in open_els
871                                 if t is fb
872                                         open_els.splice i, 0, new_element
873                                         break
874                         # 20. Jump back to the step labeled outer loop.
875                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
876                         debug_log "tree: #{serialize_els tree.children, false, true}"
877                         debug_log "open_els: #{serialize_els open_els, true, true}"
878                         debug_log "afe: #{serialize_els afe, true, true}"
879                 debug_log "AAA DONE"
880
881         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
882         close_p_element = ->
883                 generate_implied_end_tags 'p' # arg is exception
884                 if open_els[0].name isnt 'p'
885                         parse_error()
886                 while open_els.length > 1 # just in case
887                         el = open_els.shift()
888                         if el.name is 'p'
889                                 return
890         close_p_if_in_button_scope = ->
891                 if is_in_button_scope 'p'
892                         close_p_element()
893
894         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
895         # aka insert_a_character = (t) ->
896         insert_character = (t) ->
897                 dest = adjusted_insertion_location()
898                 # fixfull check for Document node
899                 if dest[1] > 0
900                         prev = dest[0].children[dest[1] - 1]
901                         if prev.type is TYPE_TEXT
902                                 prev.text += t.text
903                                 return
904                 dest[0].children.splice dest[1], 0, t
905
906         # 8.2.5.1
907         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
908         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
909         adjusted_insertion_location = (override_target = null) ->
910                 # 1. If there was an override target specified, then let target be the
911                 # override target.
912                 if override_target?
913                         target = override_target
914                 else # Otherwise, let target be the current node.
915                         target = open_els[0]
916                 # 2. Determine the adjusted insertion location using the first matching
917                 # steps from the following list:
918                 #
919                 # If foster parenting is enabled and target is a table, tbody, tfoot,
920                 # thead, or tr element Foster parenting happens when content is
921                 # misnested in tables.
922                 if flag_foster_parenting and foster_parenting_targets[target.name]
923                         loop # once. this is here so we can ``break`` to "abort these substeps"
924                                 # 1. Let last template be the last template element in the
925                                 # stack of open elements, if any.
926                                 last_template = null
927                                 last_template_i = null
928                                 for el, i in open_els
929                                         if el.name is 'template'
930                                                 last_template = el
931                                                 last_template_i = i
932                                                 break
933                                 # 2. Let last table be the last table element in the stack of
934                                 # open elements, if any.
935                                 last_table = null
936                                 last_table_i
937                                 for el, i in open_els
938                                         if el.name is 'table'
939                                                 last_table = el
940                                                 last_table_i = i
941                                                 break
942                                 # 3. If there is a last template and either there is no last
943                                 # table, or there is one, but last template is lower (more
944                                 # recently added) than last table in the stack of open
945                                 # elements, then: let adjusted insertion location be inside
946                                 # last template's template contents, after its last child (if
947                                 # any), and abort these substeps.
948                                 if last_template and (last_table is null or last_template_i < last_table_i)
949                                         target = template # fixfull should be it's contents
950                                         target_i = target.children.length
951                                         break
952                                 # 4. If there is no last table, then let adjusted insertion
953                                 # location be inside the first element in the stack of open
954                                 # elements (the html element), after its last child (if any),
955                                 # and abort these substeps. (fragment case)
956                                 if last_table is null
957                                         # this is odd
958                                         target = open_els[open_els.length - 1]
959                                         target_i = target.children.length
960                                 # 5. If last table has a parent element, then let adjusted
961                                 # insertion location be inside last table's parent element,
962                                 # immediately before last table, and abort these substeps.
963                                 if last_table.parent?
964                                         for c, i in last_table.parent.children
965                                                 if c is last_table
966                                                         target = last_table.parent
967                                                         target_i = i
968                                                         break
969                                         break
970                                 # 6. Let previous element be the element immediately above last
971                                 # table in the stack of open elements.
972                                 #
973                                 # huh? how could it not have a parent?
974                                 previous_element = open_els[last_table_i + 1]
975                                 # 7. Let adjusted insertion location be inside previous
976                                 # element, after its last child (if any).
977                                 target = previous_element
978                                 target_i = target.children.length
979                                 # Note: These steps are involved in part because it's possible
980                                 # for elements, the table element in this case in particular,
981                                 # to have been moved by a script around in the DOM, or indeed
982                                 # removed from the DOM entirely, after the element was inserted
983                                 # by the parser.
984                                 break # don't really loop
985                 else
986                         # Otherwise Let adjusted insertion location be inside target, after
987                         # its last child (if any).
988                         target_i = target.children.length
989
990                 # 3. If the adjusted insertion location is inside a template element,
991                 # let it instead be inside the template element's template contents,
992                 # after its last child (if any).
993                 # fixfull (template)
994
995                 # 4. Return the adjusted insertion location.
996                 return [target, target_i]
997
998         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
999         # aka create_an_element_for_token
1000         token_to_element = (t, namespace, intended_parent) ->
1001                 t.type = TYPE_TAG # not TYPE_START_TAG
1002                 # convert attributes into a hash
1003                 attrs = {}
1004                 while t.attrs_a.length
1005                         a = t.attrs_a.pop()
1006                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1007                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
1008
1009                 # TODO 2. If the newly created element has an xmlns attribute in the
1010                 # XMLNS namespace whose value is not exactly the same as the element's
1011                 # namespace, that is a parse error. Similarly, if the newly created
1012                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1013                 # value is not the XLink Namespace, that is a parse error.
1014
1015                 # fixfull: the spec says stuff about form pointers and ownerDocument
1016
1017                 return el
1018
1019         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1020         insert_foreign_element = (token, namespace) ->
1021                 ail = adjusted_insertion_location()
1022                 ail_el = ail[0]
1023                 ail_i = ail[1]
1024                 el = token_to_element token, namespace, ail_el
1025                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1026                 el.parent = ail_el
1027                 ail_el.children.splice ail_i, 0, el
1028                 open_els.unshift el
1029                 return el
1030         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1031         insert_html_element = insert_foreign_element # (token, namespace) ->
1032
1033         # FIXME read implement "foster parenting" part
1034         # FIXME read spec, do this right
1035         # FIXME implement the override target thing
1036         # note: this assumes it's an open tag
1037         # FIXME what part of the spec is this?
1038         # TODO look through all callers of this, and see what they should really be doing.
1039         #   eg probably insert_html_element for tokens
1040         tree_insert_element = (el, override_target = null, namespace = null) ->
1041                 if namespace?
1042                         el.namespace = namespace
1043                 dest = adjusted_insertion_location override_target
1044                 if el.type is TYPE_START_TAG # means it's a "token"
1045                         el = token_to_element el, namespace, dest[0]
1046                 unless el.namespace?
1047                         namespace = dest.namespace
1048                 # fixfull: Document nodes sometimes can't accept more chidren
1049                 dest[0].children.splice dest[1], 0, el
1050                 el.parent = dest[0]
1051                 open_els.unshift el
1052                 return el
1053
1054         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1055         # position should be [node, index_within_children]
1056         tree_insert_comment = (t, position = null) ->
1057                 position ?= adjusted_insertion_location()
1058                 position[0].children.splice position[1], 0, t
1059
1060         # 8.2.5.2
1061         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1062         parse_generic_raw_text = (t) ->
1063                 insert_html_element t
1064                 tok_state = tok_state_rawtext
1065                 original_insertion_mode = insertion_mode
1066                 insertion_mode = ins_mode_text
1067         parse_generic_rcdata_text = (t) ->
1068                 insert_html_element t
1069                 tok_state = tok_state_rcdata
1070                 original_insertion_mode = insertion_mode
1071                 insertion_mode = ins_mode_text
1072
1073         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1074         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1075         generate_implied_end_tags = (except = null) ->
1076                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1077                         open_els.shift()
1078
1079         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1080         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1081                 open_els.shift() # spec says this will be a 'head' node
1082                 insertion_mode = ins_mode_after_head
1083                 insertion_mode t
1084         ins_mode_in_head = (t) ->
1085                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1086                         insert_character t
1087                         return
1088                 if t.type is TYPE_COMMENT
1089                         tree_insert_comment t
1090                         return
1091                 if t.type is TYPE_DOCTYPE
1092                         parse_error()
1093                         return
1094                 if t.type is TYPE_START_TAG and t.name is 'html'
1095                         ins_mode_in_body t
1096                         return
1097                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1098                         el = insert_html_element t
1099                         open_els.shift()
1100                         el.acknowledge_self_closing()
1101                         return
1102                 if t.type is TYPE_START_TAG and t.name is 'meta'
1103                         el = insert_html_element t
1104                         open_els.shift()
1105                         el.acknowledge_self_closing()
1106                         # fixfull encoding stuff
1107                         return
1108                 if t.type is TYPE_START_TAG and t.name is 'title'
1109                         parse_generic_rcdata_element t
1110                         return
1111                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1112                         parse_generic_raw_text t
1113                         return
1114                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1115                         insert_html_element t
1116                         insertion_mode = in_head_noscript # FIXME implement
1117                         return
1118                 if t.type is TYPE_START_TAG and t.name is 'script'
1119                         ail = adjusted_insertion_location()
1120                         el = token_to_element t, NS_HTML, ail
1121                         el.flag_parser_inserted true # FIXME implement
1122                         # fixfull frament case
1123                         ail[0].children.splice ail[1], 0, el
1124                         open_els.unshift el
1125                         tok_state = tok_state_script_data
1126                         original_insertion_mode = insertion_mode # make sure orig... is defined
1127                         insertion_mode = ins_mode_text # FIXME implement
1128                         return
1129                 if t.type is TYPE_END_TAG and t.name is 'head'
1130                         open_els.shift() # will be a head element... spec says so
1131                         insertion_mode = ins_mode_after_head
1132                         return
1133                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1134                         ins_mode_in_head_else t
1135                         return
1136                 if t.type is TYPE_START_TAG and t.name is 'template'
1137                         insert_html_element t
1138                         afe_push_marker()
1139                         flag_frameset_ok = false
1140                         insertion_mode = ins_mode_in_template
1141                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1142                         return
1143                 if t.type is TYPE_END_TAG and t.name is 'template'
1144                         if template_tag_is_open()
1145                                 generate_implied_end_tags
1146                                 if open_els[0].name isnt 'template'
1147                                         parse_error()
1148                                 loop
1149                                         el = open_els.shift()
1150                                         if el.name is 'template'
1151                                                 break
1152                                 clear_afe_to_marker()
1153                                 template_insertion_modes.shift()
1154                                 reset_insertion_mode()
1155                         else
1156                                 parse_error()
1157                         return
1158                 if (t.type is TYPE_OPEN_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1159                         parse_error()
1160                         return
1161                 ins_mode_in_head_else t
1162
1163         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1164         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1165                 for node, i in open_els
1166                         if node.name is name # FIXME check namespace too
1167                                 generate_implied_end_tags name # arg is exception
1168                                 parse_error() unless i is 0
1169                                 while i >= 0
1170                                         open_els.shift()
1171                                         i -= 1
1172                                 return
1173                         if special_elements[node.name]? # FIXME check namespac too
1174                                 parse_error()
1175                                 return
1176         ins_mode_in_body = (t) ->
1177                 switch t.type
1178                         when TYPE_TEXT
1179                                 switch t.text
1180                                         when "\u0000"
1181                                                 parse_error()
1182                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1183                                                 reconstruct_active_formatting_elements()
1184                                                 insert_character t
1185                                         else
1186                                                 reconstruct_active_formatting_elements()
1187                                                 insert_character t
1188                                                 flag_frameset_ok = false
1189                         when TYPE_COMMENT
1190                                 tree_insert_comment t
1191                         when TYPE_DOCTYPE
1192                                 parse_error()
1193                         when TYPE_START_TAG
1194                                 switch t.name
1195                                         when 'html'
1196                                                 parse_error()
1197                                                 return if template_tag_is_open()
1198                                                 root_attrs = open_els[open_els.length - 1].attrs
1199                                                 for k, v of t.attrs
1200                                                         root_attrs[k] = v unless root_attrs[k]?
1201                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1202                                                 # FIXME also do this for </template> (end tag)
1203                                                 return ins_mode_in_head t
1204                                         when 'body'
1205                                                 parse_error()
1206                                                 # TODO
1207                                         when 'frameset'
1208                                                 parse_error()
1209                                                 # TODO
1210                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1211                                                 close_p_if_in_button_scope()
1212                                                 insert_html_element t
1213                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1214                                                 close_p_if_in_button_scope()
1215                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1216                                                         parse_error()
1217                                                         open_els.shift()
1218                                                 insert_html_element t
1219                                         # TODO lots more to implement here
1220                                         when 'a'
1221                                                 # If the list of active formatting elements
1222                                                 # contains an a element between the end of the list and
1223                                                 # the last marker on the list (or the start of the list
1224                                                 # if there is no marker on the list), then this is a
1225                                                 # parse error; run the adoption agency algorithm for
1226                                                 # the tag name "a", then remove that element from the
1227                                                 # list of active formatting elements and the stack of
1228                                                 # open elements if the adoption agency algorithm didn't
1229                                                 # already remove it (it might not have if the element
1230                                                 # is not in table scope).
1231                                                 found = false
1232                                                 for el in afe
1233                                                         if el.type is TYPE_AFE_MARKER
1234                                                                 break
1235                                                         if el.name is 'a'
1236                                                                 found = el
1237                                                 if found?
1238                                                         parse_error()
1239                                                         adoption_agency 'a'
1240                                                         for el, i in afe
1241                                                                 if el is found
1242                                                                         afe.splice i, 1
1243                                                         for el, i in open_els
1244                                                                 if el is found
1245                                                                         open_els.splice i, 1
1246                                                 reconstruct_active_formatting_elements()
1247                                                 el = insert_html_element t
1248                                                 afe_push el
1249                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1250                                                 reconstruct_active_formatting_elements()
1251                                                 el = insert_html_element t
1252                                                 afe_push el
1253                                         when 'table'
1254                                                 # fixfull quirksmode thing
1255                                                 close_p_if_in_button_scope()
1256                                                 insert_html_element t
1257                                                 insertion_mode = ins_mode_in_table
1258                                         # TODO lots more to implement here
1259                                         else # any other start tag
1260                                                 reconstruct_active_formatting_elements()
1261                                                 insert_html_element t
1262                         when TYPE_EOF
1263                                 ok_tags = {
1264                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1265                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1266                                 }
1267                                 for t in open_els
1268                                         unless ok_tags[t.name]?
1269                                                 parse_error()
1270                                                 break
1271                                 # TODO stack of template insertion modes thing
1272                                 flag_parsing = false # stop parsing
1273                         when TYPE_END_TAG
1274                                 switch t.name
1275                                         when 'body'
1276                                                 unless is_in_scope 'body'
1277                                                         parse_error()
1278                                                         return
1279                                                 # TODO implement parse error and move to tree_after_body
1280                                         when 'html'
1281                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1282                                                         parse_error()
1283                                                         return
1284                                                 # TODO implement parse error and move to tree_after_body, reprocess
1285                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1286                                                 unless is_in_scope t.name, NS_HTML
1287                                                         parse_error()
1288                                                         return
1289                                                 generate_implied_end_tags()
1290                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1291                                                         parse_error()
1292                                                 loop
1293                                                         el = open_els.shift()
1294                                                         if el.name is t.name and el.namespace is NS_HTML
1295                                                                 return
1296                                         # TODO lots more close tags to implement here
1297                                         when 'p'
1298                                                 unless is_in_button_scope 'p'
1299                                                         parse_error()
1300                                                         insert_html_element new_open_tag 'p'
1301                                                 close_p_element()
1302                                         # TODO lots more close tags to implement here
1303                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1304                                                 adoption_agency t.name
1305                                         # TODO lots more close tags to implement here
1306                                         else
1307                                                 in_body_any_other_end_tag t.name
1308                 return
1309
1310         ins_mode_in_table_else = (t) ->
1311                 parse_error()
1312                 flag_foster_parenting = true # FIXME
1313                 ins_mode_in_body t
1314                 flag_foster_parenting = false
1315         can_in_table = {
1316                 'table': true
1317                 'tbody': true
1318                 'tfoot': true
1319                 'thead': true
1320                 'tr': true
1321         }
1322         clear_to_table_stopers = {
1323                 'table': true
1324                 'template': true
1325                 'html': true
1326         }
1327         clear_stack_to_table_context = ->
1328                 loop
1329                         if clear_to_table_stopers[open_els[0].name]?
1330                                 break
1331                         open_els.shift()
1332                 return
1333         clear_to_table_body_stopers = {
1334                 'tbody': true
1335                 'tfoot': true
1336                 'thead': true
1337                 'template': true
1338                 'html': true
1339         }
1340         clear_stack_to_table_body_context = ->
1341                 loop
1342                         if clear_to_table_body_stopers[open_els[0].name]?
1343                                 break
1344                         open_els.shift()
1345                 return
1346         clear_to_table_row_stopers = {
1347                 'tr': true
1348                 'template': true
1349                 'html': true
1350         }
1351         clear_stack_to_table_row_context = ->
1352                 loop
1353                         if clear_to_table_row_stopers[open_els[0].name]?
1354                                 break
1355                         open_els.shift()
1356                 return
1357         clear_afe_to_marker = ->
1358                 loop
1359                         el = afe.shift()
1360                         if el.type is TYPE_AFE_MARKER
1361                                 return
1362
1363         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1364         ins_mode_text = (t) ->
1365                 if t.type is TYPE_TEXT
1366                         insert_character t
1367                         return
1368                 if t.type is TYPE_EOF
1369                         parse_error()
1370                         if open_els[0].name is 'script'
1371                                 open_els[0].flag 'already started', true
1372                         open_els.shift()
1373                         insertion_mode = original_insertion_mode
1374                         insertion_mode t
1375                         return
1376                 if t.type is TYPE_END_TAG and t.name is 'script'
1377                         open_els.shift()
1378                         insertion_mode = original_insertion_mode
1379                         # fixfull the spec seems to assume that I'm going to run the script
1380                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1381                         return
1382                 if t.type is TYPE_END_TAG
1383                         open_els.shift()
1384                         insertion_mode = original_insertion_mode
1385                         return
1386                 console.log 'warning: end of ins_mode_text reached'
1387
1388         # the functions below implement the tokenizer stats described here:
1389         # http://www.w3.org/TR/html5/syntax.html#tokenization
1390
1391         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1392         ins_mode_in_table = (t) ->
1393                 switch t.type
1394                         when TYPE_TEXT
1395                                 if can_in_table[t.name]
1396                                         original_insertion_mode = insertion_mode
1397                                         insertion_mode = ins_mode_in_table_text
1398                                         insertion_mode t
1399                                 else
1400                                         ins_mode_in_table_else t
1401                         when TYPE_COMMENT
1402                                 tree_insert_comment t
1403                         when TYPE_DOCTYPE
1404                                 parse_error()
1405                         when TYPE_START_TAG
1406                                 switch t.name
1407                                         when 'caption'
1408                                                 clear_stack_to_table_context()
1409                                                 afe_push_marker()
1410                                                 insert_html_element t
1411                                                 insertion_mode = ins_mode_in_caption
1412                                         when 'colgroup'
1413                                                 clear_stack_to_table_context()
1414                                                 insert_html_element t
1415                                                 insertion_mode = ins_mode_in_column_group
1416                                         when 'col'
1417                                                 clear_stack_to_table_context()
1418                                                 insert_html_element new_open_tag 'colgroup'
1419                                                 insertion_mode = ins_mode_in_column_group
1420                                                 insertion_mode t
1421                                         when 'tbody', 'tfoot', 'thead'
1422                                                 clear_stack_to_table_context()
1423                                                 insert_html_element t
1424                                                 insertion_mode = ins_mode_in_table_body
1425                                         when 'td', 'th', 'tr'
1426                                                 clear_stack_to_table_context()
1427                                                 insert_html_element new_open_tag 'tbody'
1428                                                 insertion_mode = ins_mode_in_table_body
1429                                                 insertion_mode t
1430                                         when 'table'
1431                                                 parse_error()
1432                                                 if is_in_table_scope 'table'
1433                                                         loop
1434                                                                 el = open_els.shift()
1435                                                                 if el.name is 'table'
1436                                                                         break
1437                                                         reset_insertion_mode()
1438                                                         insertion_mode t
1439                                         when 'style', 'script', 'template'
1440                                                 ins_mode_in_head t
1441                                         when 'input'
1442                                                 if token_is_input_hidden t
1443                                                         ins_mode_in_table_else t
1444                                                 else
1445                                                         parse_error()
1446                                                         el = insert_html_element t
1447                                                         open_els.shift()
1448                                                         el.acknowledge_self_closing()
1449                                         when 'form'
1450                                                 parse_error()
1451                                                 if form_element_pointer?
1452                                                         return
1453                                                 if template_tag_is_open()
1454                                                         return
1455                                                 form_element_pointer = insert_html_element t
1456                                                 open_els.shift()
1457                                         else
1458                                                 ins_mode_in_table_else t
1459                         when TYPE_END_TAG
1460                                 switch t.name
1461                                         when 'table'
1462                                                 if is_in_table_scope 'table'
1463                                                         loop
1464                                                                 el = open_els.shift()
1465                                                                 if el.name is 'table'
1466                                                                         break
1467                                                         reset_insertion_mode()
1468                                                 else
1469                                                         parse_error
1470                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1471                                                 parse_error()
1472                                         when 'template'
1473                                                 ins_mode_in_head t
1474                                         else
1475                                                 ins_mode_in_table_else t
1476                         when TYPE_EOF
1477                                 ins_mode_in_body t
1478                         else
1479                                 ins_mode_in_table_else t
1480
1481
1482         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1483         ins_mode_in_table_text = (t) ->
1484                 switch t.type
1485                         when TYPE_TEXT
1486                                 switch t.text
1487                                         when "\u0000"
1488                                                 parse_error()
1489                                                 return
1490                 console.log "unimplemented ins_mode_in_table_text"
1491                 # FIXME CONTINUE
1492
1493         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1494         ins_mode_in_table_body = (t) ->
1495                 if t.type is TYPE_START_TAG and t.name is 'tr'
1496                         clear_stack_to_table_body_context()
1497                         insert_html_element t
1498                         return
1499                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1500                         parse_error()
1501                         clear_stack_to_table_body_context()
1502                         insert_html_element new_open_tag 'tr'
1503                         insertion_mode = ins_mode_in_row
1504                         insertion_mode t
1505                         return
1506                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1507                         unless is_in_table_scope t.name # fixfull check namespace
1508                                 parse_error()
1509                                 return
1510                         clear_stack_to_table_body_context()
1511                         open_els.shift()
1512                         insertion_mode = ins_mode_in_table
1513                         return
1514                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1515                         has = false
1516                         for el in open_els
1517                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1518                                         has = true
1519                                         break
1520                                 if table_scopers[el.name]
1521                                         break
1522                         if !has
1523                                 parse_error()
1524                                 return
1525                         clear_stack_to_table_body_context()
1526                         open_els.shift()
1527                         insertion_mode = ins_mode_in_table
1528                         insertion_mode t
1529                         return
1530                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1531                         parse_error()
1532                         return
1533                 # Anything else
1534                 ins_mode_in_table t
1535
1536         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1537         ins_mode_in_row = (t) ->
1538                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1539                         clear_stack_to_table_row_context()
1540                         insert_html_element t
1541                         insertion_mode = ins_mode_in_cell
1542                         afe_push_marker()
1543                         return
1544                 if t.type is TYPE_END_TAG and t.name is 'tr'
1545                         if is_in_table_scope 'tr'
1546                                 clear_stack_to_table_row_context()
1547                                 open_els.shift()
1548                                 insertion_mode = ins_mode_in_table_body
1549                         else
1550                                 parse_error()
1551                         return
1552                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1553                         if is_in_table_scope 'tr'
1554                                 clear_stack_to_table_row_context()
1555                                 open_els.shift()
1556                                 insertion_mode = ins_mode_in_table_body
1557                                 insertion_mode t
1558                         else
1559                                 parse_error()
1560                         return
1561                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1562                         if is_in_table_scope t.name # fixfull namespace
1563                                 if is_in_table_scope 'tr'
1564                                         clear_stack_to_table_row_context()
1565                                         open_els.shift()
1566                                         insertion_mode = ins_mode_in_table_body
1567                                         insertion_mode t
1568                         else
1569                                 parse_error()
1570                         return
1571                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1572                         parse_error()
1573                         return
1574                 # Anything else
1575                 ins_mode_in_table t
1576
1577         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1578         close_the_cell = ->
1579                 generate_implied_end_tags()
1580                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1581                         parse_error()
1582                 loop
1583                         el = open_els.shift()
1584                         if el.name is 'td' or el.name is 'th'
1585                                 break
1586                 clear_afe_to_marker()
1587                 insertion_mode = ins_mode_in_row
1588
1589         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1590         ins_mode_in_cell = (t) ->
1591                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1592                         if is_in_table_scope t.name
1593                                 generate_implied_end_tags()
1594                                 if open_els[0].name isnt t.name
1595                                         parse_error
1596                                 loop
1597                                         el = open_els.shift()
1598                                         if el.name is t.name
1599                                                 break
1600                                 clear_afe_to_marker()
1601                                 insertion_mode = ins_mode_in_row
1602                         else
1603                                 parse_error()
1604                         return
1605                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1606                         has = false
1607                         for el in open_els
1608                                 if el.name is 'td' or el.name is 'th'
1609                                         has = true
1610                                         break
1611                                 if table_scopers[el.name]
1612                                         break
1613                         if !has
1614                                 parse_error()
1615                                 return
1616                         close_the_cell()
1617                         insertion_mode t
1618                         return
1619                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1620                         parse_error()
1621                         return
1622                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1623                         if is_in_table_scope t.name # fixfull namespace
1624                                 close_the_cell()
1625                                 insertion_mode t
1626                         else
1627                                 parse_error()
1628                         return
1629                 # Anything Else
1630                 ins_mode_in_body t
1631
1632         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1633         tok_state_data = ->
1634                 switch c = txt.charAt(cur++)
1635                         when '&'
1636                                 return new_text_node parse_character_reference()
1637                         when '<'
1638                                 tok_state = tok_state_tag_open
1639                         when "\u0000"
1640                                 parse_error()
1641                                 return new_text_node c
1642                         when '' # EOF
1643                                 return new_eof_token()
1644                         else
1645                                 return new_text_node c
1646                 return null
1647
1648         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1649         # not needed: tok_state_character_reference_in_data = ->
1650         # just call parse_character_reference()
1651
1652         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
1653         tok_state_rcdata = ->
1654                 switch c = txt.charAt(cur++)
1655                         when '&'
1656                                 return new_text_node parse_character_reference()
1657                         when '<'
1658                                 tok_state = tok_state_rcdata_less_than_sign
1659                         when "\u0000"
1660                                 parse_error()
1661                                 return new_character_token "\ufffd"
1662                         when '' # EOF
1663                                 return new_eof_token()
1664                         else
1665                                 return new_character_token c
1666                 return null
1667
1668         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
1669         # not needed: tok_state_character_reference_in_rcdata = ->
1670         # just call parse_character_reference()
1671
1672         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
1673         tok_state_rawtext = ->
1674                 switch c = txt.charAt(cur++)
1675                         when '<'
1676                                 tok_state = tok_state_rawtext_less_than_sign
1677                         when "\u0000"
1678                                 parse_error()
1679                                 return new_character_token "\ufffd"
1680                         when '' # EOF
1681                                 return new_eof_token()
1682                         else
1683                                 return new_character_token c
1684                 return null
1685
1686         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
1687         tok_state_script_data = ->
1688                 switch c = txt.charAt(cur++)
1689                         when '<'
1690                                 tok_state = tok_state_script_data_less_than_sign
1691                         when "\u0000"
1692                                 parse_error()
1693                                 return new_character_token "\ufffd"
1694                         when '' # EOF
1695                                 return new_eof_token()
1696                         else
1697                                 return new_character_token c
1698                 return null
1699
1700         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
1701         tok_state_plaintext = ->
1702                 switch c = txt.charAt(cur++)
1703                         when "\u0000"
1704                                 parse_error()
1705                                 return new_character_token "\ufffd"
1706                         when '' # EOF
1707                                 return new_eof_token()
1708                         else
1709                                 return new_character_token c
1710                 return null
1711
1712
1713         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1714         tok_state_tag_open = ->
1715                 switch c = txt.charAt(cur++)
1716                         when '!'
1717                                 tok_state = tok_state_markup_declaration_open
1718                         when '/'
1719                                 tok_state = tok_state_end_tag_open
1720                         when '?'
1721                                 parse_error()
1722                                 tok_state = tok_state_bogus_comment
1723                         else
1724                                 if lc_alpha.indexOf(c) > -1
1725                                         tok_cur_tag = new_open_tag c
1726                                         tok_state = tok_state_tag_name
1727                                 else if uc_alpha.indexOf(c) > -1
1728                                         tok_cur_tag = new_open_tag c.toLowerCase()
1729                                         tok_state = tok_state_tag_name
1730                                 else
1731                                         parse_error()
1732                                         tok_state = tok_state_data
1733                                         cur -= 1 # we didn't parse/handle the char after <
1734                                         return new_text_node '<'
1735                 return null
1736
1737         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1738         tok_state_end_tag_open = ->
1739                 switch c = txt.charAt(cur++)
1740                         when '>'
1741                                 parse_error()
1742                                 tok_state = tok_state_data
1743                         when '' # EOF
1744                                 parse_error()
1745                                 tok_state = tok_state_data
1746                                 return new_text_node '</'
1747                         else
1748                                 if uc_alpha.indexOf(c) > -1
1749                                         tok_cur_tag = new_end_tag c.toLowerCase()
1750                                         tok_state = tok_state_tag_name
1751                                 else if lc_alpha.indexOf(c) > -1
1752                                         tok_cur_tag = new_end_tag c
1753                                         tok_state = tok_state_tag_name
1754                                 else
1755                                         parse_error()
1756                                         tok_state = tok_state_bogus_comment
1757                 return null
1758
1759         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1760         tok_state_tag_name = ->
1761                 switch c = txt.charAt(cur++)
1762                         when "\t", "\n", "\u000c", ' '
1763                                 tok_state = tok_state_before_attribute_name
1764                         when '/'
1765                                 tok_state = tok_state_self_closing_start_tag
1766                         when '>'
1767                                 tok_state = tok_state_data
1768                                 tmp = tok_cur_tag
1769                                 tok_cur_tag = null
1770                                 return tmp
1771                         when "\u0000"
1772                                 parse_error()
1773                                 tok_cur_tag.name += "\ufffd"
1774                         when '' # EOF
1775                                 parse_error()
1776                                 tok_state = tok_state_data
1777                         else
1778                                 if uc_alpha.indexOf(c) > -1
1779                                         tok_cur_tag.name += c.toLowerCase()
1780                                 else
1781                                         tok_cur_tag.name += c
1782                 return null
1783
1784         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
1785         tok_state_rcdata_less_than_sign = ->
1786                 c = txt.charAt(cur++)
1787                 if c is '/'
1788                         temporary_buffer = ''
1789                         tok_state = tok_state_rcdata_end_tag_open
1790                         return null
1791                 # Anything else
1792                 tok_state = tok_state_rcdata
1793                 cur -= 1 # reconsume the input character
1794                 return new_character_token '<'
1795
1796         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
1797         tok_state_rcdata_end_tag_open = ->
1798                 c = txt.charAt(cur++)
1799                 if uc_alpha.indexOf(c) > -1
1800                         tok_cur_tag = new_end_tag c.toLowerCase()
1801                         temporary_buffer += c
1802                         tok_state = tok_state_rcdata_end_tag_name
1803                         return null
1804                 if lc_alpha.indexOf(c) > -1
1805                         tok_cur_tag = new_end_tag c
1806                         temporary_buffer += c
1807                         tok_state = tok_state_rcdata_end_tag_name
1808                         return null
1809                 # Anything else
1810                 tok_state = tok_state_rcdata
1811                 cur -= 1 # reconsume the input character
1812                 return new_character_token "</" # fixfull separate these
1813
1814         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
1815         is_appropriate_end_tag = (t) ->
1816                 # spec says to check against "the tag name of the last start tag to
1817                 # have been emitted from this tokenizer", but this is only called from
1818                 # the various "raw" states, which I'm pretty sure all push the start
1819                 # token onto open_els. TODO: verify this after the script data states
1820                 # are implemented
1821                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
1822                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
1823
1824         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
1825         tok_state_rcdata_end_tag_name = ->
1826                 c = txt.charAt(cur++)
1827                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1828                         if is_appropriate_end_tag tok_cur_tag
1829                                 tok_state = tok_state_before_attribute_name
1830                                 return
1831                         # else fall through to "Anything else"
1832                 if c is '/'
1833                         if is_appropriate_end_tag tok_cur_tag
1834                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
1835                                 return
1836                         # else fall through to "Anything else"
1837                 if c is '>'
1838                         if is_appropriate_end_tag tok_cur_tag
1839                                 tok_state = tok_state_data
1840                                 return tok_cur_tag
1841                         # else fall through to "Anything else"
1842                 if uc_alpha.indexOf(c) > -1
1843                         tok_cur_tag.name += c.toLowerCase()
1844                         temporary_buffer += c
1845                         return null
1846                 if lc_alpha.indexOf(c) > -1
1847                         tok_cur_tag.name += c
1848                         temporary_buffer += c
1849                         return null
1850                 # Anything else
1851                 tok_state = tok_state_rcdata
1852                 cur -= 1 # reconsume the input character
1853                 return new_character_token '</' + temporary_buffer # fixfull separate these
1854
1855         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
1856         tok_state_rawtext_less_than_sign = ->
1857                 c = txt.charAt(cur++)
1858                 if c is '/'
1859                         temporary_buffer = ''
1860                         tok_state = tok_state_rawtext_end_tag_open
1861                         return null
1862                 # Anything else
1863                 tok_state = tok_state_rawtext
1864                 cur -= 1 # reconsume the input character
1865                 return new_character_token '<'
1866
1867         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
1868         tok_state_rawtext_end_tag_open = ->
1869                 c = txt.charAt(cur++)
1870                 if uc_alpha.indexOf(c) > -1
1871                         tok_cur_tag = new_end_tag c.toLowerCase()
1872                         temporary_buffer += c
1873                         tok_state = tok_state_rawtext_end_tag_name
1874                         return null
1875                 if lc_alpha.indexOf(c) > -1
1876                         tok_cur_tag = new_end_tag c
1877                         temporary_buffer += c
1878                         tok_state = tok_state_rawtext_end_tag_name
1879                         return null
1880                 # Anything else
1881                 tok_state = tok_state_rawtext
1882                 cur -= 1 # reconsume the input character
1883                 return new_character_token "</" # fixfull separate these
1884
1885         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
1886         tok_state_rawtext_end_tag_name = ->
1887                 c = txt.charAt(cur++)
1888                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1889                         if is_appropriate_end_tag tok_cur_tag
1890                                 tok_state = tok_state_before_attribute_name
1891                                 return
1892                         # else fall through to "Anything else"
1893                 if c is '/'
1894                         if is_appropriate_end_tag tok_cur_tag
1895                                 tok_state = tok_state_self_closing_start_tag
1896                                 return
1897                         # else fall through to "Anything else"
1898                 if c is '>'
1899                         if is_appropriate_end_tag tok_cur_tag
1900                                 tok_state = tok_state_data
1901                                 return tok_cur_tag
1902                         # else fall through to "Anything else"
1903                 if uc_alpha.indexOf(c) > -1
1904                         tok_cur_tag.name += c.toLowerCase()
1905                         temporary_buffer += c
1906                         return null
1907                 if lc_alpha.indexOf(c) > -1
1908                         tok_cur_tag.name += c
1909                         temporary_buffer += c
1910                         return null
1911                 # Anything else
1912                 tok_state = tok_state_rawtext
1913                 cur -= 1 # reconsume the input character
1914                 return new_character_token '</' + temporary_buffer # fixfull separate these
1915
1916         # TODO _all_ of the missing states here (17-33) are for parsing script tags
1917
1918         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1919         tok_state_before_attribute_name = ->
1920                 attr_name = null
1921                 switch c = txt.charAt(cur++)
1922                         when "\t", "\n", "\u000c", ' '
1923                                 return null
1924                         when '/'
1925                                 tok_state = tok_state_self_closing_start_tag
1926                                 return null
1927                         when '>'
1928                                 tok_state = tok_state_data
1929                                 tmp = tok_cur_tag
1930                                 tok_cur_tag = null
1931                                 return tmp
1932                         when "\u0000"
1933                                 parse_error()
1934                                 attr_name = "\ufffd"
1935                         when '"', "'", '<', '='
1936                                 parse_error()
1937                                 attr_name = c
1938                         when '' # EOF
1939                                 parse_error()
1940                                 tok_state = tok_state_data
1941                         else
1942                                 if uc_alpha.indexOf(c) > -1
1943                                         attr_name = c.toLowerCase()
1944                                 else
1945                                         attr_name = c
1946                 if attr_name?
1947                         tok_cur_tag.attrs_a.unshift [attr_name, '']
1948                         tok_state = tok_state_attribute_name
1949                 return null
1950
1951         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1952         tok_state_attribute_name = ->
1953                 switch c = txt.charAt(cur++)
1954                         when "\t", "\n", "\u000c", ' '
1955                                 tok_state = tok_state_after_attribute_name
1956                         when '/'
1957                                 tok_state = tok_state_self_closing_start_tag
1958                         when '='
1959                                 tok_state = tok_state_before_attribute_value
1960                         when '>'
1961                                 tok_state = tok_state_data
1962                                 tmp = tok_cur_tag
1963                                 tok_cur_tag = null
1964                                 return tmp
1965                         when "\u0000"
1966                                 parse_error()
1967                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
1968                         when '"', "'", '<'
1969                                 parse_error()
1970                                 tok_cur_tag.attrs_a[0][0] = c
1971                         when '' # EOF
1972                                 parse_error()
1973                                 tok_state = tok_state_data
1974                         else
1975                                 if uc_alpha.indexOf(c) > -1
1976                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
1977                                 else
1978                                         tok_cur_tag.attrs_a[0][0] += c
1979                 return null
1980
1981         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
1982         tok_state_after_attribute_name = ->
1983                 c = txt.charAt(cur++)
1984                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1985                         return
1986                 if c is '/'
1987                         tok_state = tok_state_self_closing_start_tag
1988                         return
1989                 if c is '='
1990                         tok_state = tok_state_before_attribute_value
1991                         return
1992                 if c is '>'
1993                         tok_state = tok_state_data
1994                         return
1995                 if uc_alpha.indexOf(c) > -1
1996                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
1997                         tok_state = tok_state_attribute_name
1998                         return
1999                 if c is "\u0000"
2000                         parse_error()
2001                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2002                         tok_state = tok_state_attribute_name
2003                         return
2004                 if c is '' # EOF
2005                         parse_error()
2006                         tok_state = tok_state_data
2007                         cur -= 1 # reconsume
2008                         return
2009                 if c is '"' or c is "'" or c is '<'
2010                         parse_error()
2011                         # fall through to Anything else
2012                 # Anything else
2013                 tok_cur_tag.attrs_a.unshift [c, '']
2014                 tok_state = tok_state_attribute_name
2015
2016         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2017         tok_state_before_attribute_value = ->
2018                 switch c = txt.charAt(cur++)
2019                         when "\t", "\n", "\u000c", ' '
2020                                 return null
2021                         when '"'
2022                                 tok_state = tok_state_attribute_value_double_quoted
2023                         when '&'
2024                                 tok_state = tok_state_attribute_value_unquoted
2025                                 cur -= 1
2026                         when "'"
2027                                 tok_state = tok_state_attribute_value_single_quoted
2028                         when "\u0000"
2029                                 # Parse error
2030                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2031                                 tok_state = tok_state_attribute_value_unquoted
2032                         when '>'
2033                                 # Parse error
2034                                 tok_state = tok_state_data
2035                                 tmp = tok_cur_tag
2036                                 tok_cur_tag = null
2037                                 return tmp
2038                         when '' # EOF
2039                                 parse_error()
2040                                 tok_state = tok_state_data
2041                         else
2042                                 tok_cur_tag.attrs_a[0][1] += c
2043                                 tok_state = tok_state_attribute_value_unquoted
2044                 return null
2045
2046         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2047         tok_state_attribute_value_double_quoted = ->
2048                 switch c = txt.charAt(cur++)
2049                         when '"'
2050                                 tok_state = tok_state_after_attribute_value_quoted
2051                         when '&'
2052                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2053                         when "\u0000"
2054                                 # Parse error
2055                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2056                         when '' # EOF
2057                                 parse_error()
2058                                 tok_state = tok_state_data
2059                         else
2060                                 tok_cur_tag.attrs_a[0][1] += c
2061                 return null
2062
2063         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2064         tok_state_attribute_value_single_quoted = ->
2065                 switch c = txt.charAt(cur++)
2066                         when "'"
2067                                 tok_state = tok_state_after_attribute_value_quoted
2068                         when '&'
2069                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2070                         when "\u0000"
2071                                 # Parse error
2072                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2073                         when '' # EOF
2074                                 parse_error()
2075                                 tok_state = tok_state_data
2076                         else
2077                                 tok_cur_tag.attrs_a[0][1] += c
2078                 return null
2079
2080         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2081         tok_state_attribute_value_unquoted = ->
2082                 switch c = txt.charAt(cur++)
2083                         when "\t", "\n", "\u000c", ' '
2084                                 tok_state = tok_state_before_attribute_name
2085                         when '&'
2086                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2087                         when '>'
2088                                 tok_state = tok_state_data
2089                                 tmp = tok_cur_tag
2090                                 tok_cur_tag = null
2091                                 return tmp
2092                         when "\u0000"
2093                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2094                         when '' # EOF
2095                                 parse_error()
2096                                 tok_state = tok_state_data
2097                         else
2098                                 # Parse Error if ', <, = or ` (backtick)
2099                                 tok_cur_tag.attrs_a[0][1] += c
2100                 return null
2101
2102         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2103         tok_state_after_attribute_value_quoted = ->
2104                 switch c = txt.charAt(cur++)
2105                         when "\t", "\n", "\u000c", ' '
2106                                 tok_state = tok_state_before_attribute_name
2107                         when '/'
2108                                 tok_state = tok_state_self_closing_start_tag
2109                         when '>'
2110                                 tok_state = tok_state_data
2111                                 tmp = tok_cur_tag
2112                                 tok_cur_tag = null
2113                                 return tmp
2114                         when '' # EOF
2115                                 parse_error()
2116                                 tok_state = tok_state_data
2117                         else
2118                                 # Parse Error
2119                                 tok_state = tok_state_before_attribute_name
2120                                 cur -= 1 # we didn't handle that char
2121                 return null
2122
2123         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2124         # Don't set this as a state, just call it
2125         # returns a string (NOT a text node)
2126         parse_character_reference = (allowed_char = null, in_attr = false) ->
2127                 if cur >= txt.length
2128                         return '&'
2129                 switch c = txt.charAt(cur)
2130                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2131                                 # explicitly not a parse error
2132                                 return '&'
2133                         when ';'
2134                                 # there has to be "one or more" alnums between & and ; to be a parse error
2135                                 return '&'
2136                         when '#'
2137                                 if cur + 1 >= txt.length
2138                                         return '&'
2139                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
2140                                         prefix = '#x'
2141                                         charset = hex_chars
2142                                         start = cur + 2
2143                                 else
2144                                         charset = digits
2145                                         start = cur + 1
2146                                         prefix = '#'
2147                                 i = 0
2148                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2149                                         i += 1
2150                                 if i is 0
2151                                         return '&'
2152                                 if txt.charAt(start + i) is ';'
2153                                         i += 1
2154                                 # FIXME This is supposed to generate parse errors for some chars
2155                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2156                                 if decoded?
2157                                         cur = start + i
2158                                         return decoded
2159                                 return '&'
2160                         else
2161                                 for i in [0...31]
2162                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
2163                                                 break
2164                                 if i is 0
2165                                         # exit early, because parse_error() below needs at least one alnum
2166                                         return '&'
2167                                 if txt.charAt(cur + i) is ';'
2168                                         i += 1 # include ';' terminator in value
2169                                         decoded = decode_named_char_ref txt.substr(cur, i)
2170                                         if decoded?
2171                                                 cur += i
2172                                                 return decoded
2173                                         parse_error()
2174                                         return '&'
2175                                 else
2176                                         # no ';' terminator (only legacy char refs)
2177                                         max = i
2178                                         for i in [2..max] # no prefix matches, so ok to check shortest first
2179                                                 c = legacy_char_refs[txt.substr(cur, i)]
2180                                                 if c?
2181                                                         if in_attr
2182                                                                 if txt.charAt(cur + i) is '='
2183                                                                         # "because some legacy user agents will
2184                                                                         # misinterpret the markup in those cases"
2185                                                                         parse_error()
2186                                                                         return '&'
2187                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
2188                                                                         # this makes attributes forgiving about url args
2189                                                                         return '&'
2190                                                         # ok, and besides the weird exceptions for attributes...
2191                                                         # return the matching char
2192                                                         cur += i # consume entity chars
2193                                                         parse_error() # because no terminating ";"
2194                                                         return c
2195                                         parse_error()
2196                                         return '&'
2197                 return # never reached
2198
2199         # tree constructor initialization
2200         # see comments on TYPE_TAG/etc for the structure of this data
2201         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2202         open_els = [tree]
2203         afe = [] # active formatting elements
2204         template_insertion_modes = []
2205         insertion_mode = ins_mode_in_body
2206         original_insertion_mode = insertion_mode # TODO check spec
2207         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2208         flag_frameset_ok = true
2209         flag_parsing = true
2210         flag_foster_parenting = false
2211         form_element_pointer = null
2212         temporary_buffer = null
2213
2214         # tokenizer initialization
2215         tok_state = tok_state_data
2216
2217         # proccess input
2218         while flag_parsing
2219                 t = tok_state()
2220                 if t?
2221                         insertion_mode t
2222         return tree.children
2223
2224 # everything below is tests on the above
2225 test_equals = (description, output, expected_output) ->
2226         if output is expected_output
2227                 console.log "passed." # don't say name, so smart consoles can merge all of these
2228         else
2229                 console.log "FAILED: \"#{description}\""
2230                 console.log "   Expected: #{expected_output}"
2231                 console.log "     Actual: #{output}"
2232 serialize_els = (els, shallow, show_ids) ->
2233         serialized = ''
2234         sep = ''
2235         for t in els
2236                 serialized += sep
2237                 sep = ','
2238                 serialized += t.serialize shallow, show_ids
2239         return serialized
2240 test_parser = (args) ->
2241         debug_log_reset()
2242         parse_errors = []
2243         errors_cb = (i) ->
2244                 parse_errors.push i
2245         prev_node_id = 0 # reset counter
2246         parsed = parse_html args.html, errors_cb
2247         serialized = serialize_els parsed, false, false
2248         if serialized isnt args.expected
2249                 debug_log_each (str) ->
2250                         console.log str
2251                 console.log "FAILED: \"#{args.name}\""
2252                 console.log "      Input: #{args.html}"
2253                 console.log "    Correct: #{args.expected}"
2254                 console.log "     Output: #{serialized}"
2255                 if parse_errors.length > 0
2256                         console.log " parse errs: #{JSON.stringify parse_errors}"
2257                 else
2258                         console.log "   No parse errors"
2259         else
2260                 console.log "passed \"#{args.name}\""
2261
2262 test_parser name: "empty", \
2263         html: "",
2264         expected: ''
2265 test_parser name: "just text", \
2266         html: "abc",
2267         expected: 'text:"abc"'
2268 test_parser name: "named entity", \
2269         html: "a&amp;1234",
2270         expected: 'text:"a&1234"'
2271 test_parser name: "broken named character references", \
2272         html: "1&amp2&&amp;3&aabbcc;",
2273         expected: 'text:"1&2&&3&aabbcc;"'
2274 test_parser name: "numbered entity overrides", \
2275         html: "1&#X80&#x80; &#x83",
2276         expected: 'text:"1€€ ƒ"'
2277 test_parser name: "open tag", \
2278         html: "foo<span>bar",
2279         expected: 'text:"foo",tag:"span",{},[text:"bar"]'
2280 test_parser name: "open tag with attributes", \
2281         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
2282         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
2283 test_parser name: "open tag with attributes of various quotings", \
2284         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
2285         expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
2286 test_parser name: "attribute entity exceptions dq", \
2287         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
2288         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
2289 test_parser name: "attribute entity exceptions sq", \
2290         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
2291         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
2292 test_parser name: "attribute entity exceptions uq", \
2293         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
2294         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
2295 test_parser name: "matching closing tags", \
2296         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
2297         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
2298 test_parser name: "missing closing tag inside", \
2299         html: "foo<div>bar<span>baz</div>qux",
2300         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
2301 test_parser name: "mis-matched closing tags", \
2302         html: "<span>12<div>34</span>56</div>78",
2303         expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
2304 test_parser name: "mis-matched formatting elements", \
2305         html: "12<b>34<i>56</b>78</i>90",
2306         expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
2307 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
2308         html: '<p>1<b>2<i>3</b>4</i>5</p>',
2309         expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
2310 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
2311         html: '<b>1<p>2</b>3</p>',
2312         expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
2313 test_parser name: "crazy formatting elements test", \
2314         html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
2315         # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
2316         # firefox does this:
2317         expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
2318 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
2319 test_parser name: "html5lib aaa 1", \
2320         html: '<a><p></a></p>',
2321         expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
2322 test_parser name: "html5lib aaa 2", \
2323         html: '<a>1<p>2</a>3</p>',
2324         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
2325 test_parser name: "html5lib aaa 3", \
2326         html: '<a>1<button>2</a>3</button>',
2327         expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
2328 test_parser name: "html5lib aaa 4", \
2329         html: '<a>1<b>2</a>3</b>',
2330         expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
2331 test_parser name: "html5lib aaa 5 (two divs deep)", \
2332         html: '<a>1<div>2<div>3</a>4</div>5</div>',
2333         expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
2334 test_parser name: "html5lib aaa 6 (foster parenting)", \
2335         html: '<table><a>1<p>2</a>3</p>',
2336         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
2337 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2338         html: '<b><b><a><p></a>',
2339         expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2340 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2341         html: '<b><a><b><p></a>',
2342         expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2343 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2344         html: '<a><b><b><p></a>',
2345         expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2346 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2347         html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2348         expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
2349 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2350         html: '<table><a>1<td>2</td>3</table>',
2351         expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
2352 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2353         html: '<table>A<td>B</td>C</table>',
2354         expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2355 # TODO implement svg and namespacing
2356 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2357 #       html: '<a><svg><tr><input></a>',
2358 #       expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
2359 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2360         html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2361         expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2362 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2363         html: '<div><a><b><u><i><code><div></a>',
2364         expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2365 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2366         html: '<b><b><b><b>x</b></b></b></b>y',
2367         expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
2368 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2369         html: '<p><b><b><b><b><p>x',
2370         expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
2371 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2372         html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2373         expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
2374 test_parser name: "junk after attribute close-quote", \
2375         html: '<p><b c="d", e="f">foo<p>x',
2376         expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
2377 test_parser name: "html5lib aaa02 1", \
2378         html: '<b>1<i>2<p>3</b>4',
2379         expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
2380 test_parser name: "html5lib aaa02 2", \
2381         html: '<a><div><style></style><address><a>',
2382         expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'