parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
  22 # indicate places where additional code is needed for full HTML document
  23 # parsing.
  24 #
  25 # Instead, the data structure produced by this parser is an array of Nodes.
  26
  27
  28 # stacks/lists
  29 #
  30 # the spec uses many different words to indicate which ends of lists/stacks
  31 # they are talking about (and relative movement within the lists/stacks). This
  32 # section explains. I'm implementing "lists" (afe and open_els) the same way
  33 # (both as stacks)
  34 #
  35 # stacks grow downward (current element is index=0)
  36 #
  37 # example: open_els = [a, b, c, d, e, f, g]
  38 #
  39 # "grows downwards" means it's visualized like this: (index: el, names)
  40 #
  41 #   6: g "start of the list", "topmost", "first"
  42 #   5: f
  43 #   4: e "previous" (to d), "above", "before"
  44 #   3: d   (previous/next are relative to this element)
  45 #   2: c "next", "after", "lower", "below"
  46 #   1: b
  47 #   0: a "end of the list", "current node", "bottommost", "last"
  48
  49
  50
  51 # Each node is an object of the Node class. Here are the Node types:
  52 TYPE_TAG = 0 # name, {attributes}, [children]
  53 TYPE_TEXT = 1 # "text"
  54 TYPE_COMMENT = 2
  55 TYPE_DOCTYPE = 3
  56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  58 TYPE_END_TAG = 5 # name
  59 TYPE_EOF = 6
  60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  62
  63 # namespace constants
  64 NS_HTML = 1
  65 NS_MATHML = 2
  66 NS_SVG = 3
  67
  68 g_debug_log = []
  69 debug_log_reset = ->
  70         g_debug_log = []
  71 debug_log = (str) ->
  72         g_debug_log.push str
  73 debug_log_each = (cb) ->
  74         for str in g_debug_log
  75                 cb str
  76
  77 prev_node_id = 0
  78 class Node
  79         constructor: (type, args = {}) ->
  80                 @type = type # one of the TYPE_* constants above
  81                 @name = args.name ? '' # tag name
  82                 @text = args.text ? '' # contents for text/comment nodes
  83                 @attrs = args.attrs ? {}
  84                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
  85                 @children = args.children ? []
  86                 @namespace = args.namespace ? NS_HTML
  87                 @parent = args.parent ? null
  88                 if args.id?
  89                         @id = "#{args.id}+"
  90                 else
  91                         @id = "#{++prev_node_id}"
  92         shallow_clone: -> # return a new node that's the same except without the children or parent
  93                 # WARNING this doesn't work right on open tags that are still being parsed
  94                 attrs = {}
  95                 attrs[k] = v for k, v of @attrs
  96                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
  97         acknowledge_self_closing: ->
  98                 @flag 'did_self_close', true
  99         flag: ->
 100                 # fixfull
 101         serialize: (shallow = false, show_ids = false) -> # for unit tests
 102                 ret = ''
 103                 switch @type
 104                         when TYPE_TAG
 105                                 ret += 'tag:'
 106                                 ret += JSON.stringify @name
 107                                 ret += ','
 108                                 if show_ids
 109                                         ret += "##{@id},"
 110                                 if shallow
 111                                         break
 112                                 attr_keys = []
 113                                 for k of @attrs
 114                                         attr_keys.push k
 115                                 attr_keys.sort()
 116                                 ret += '{'
 117                                 sep = ''
 118                                 for k in attr_keys
 119                                         ret += sep
 120                                         sep = ','
 121                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
 122                                 ret += '},['
 123                                 sep = ''
 124                                 for c in @children
 125                                         ret += sep
 126                                         sep = ','
 127                                         ret += c.serialize shallow, show_ids
 128                                 ret += ']'
 129                         when TYPE_TEXT
 130                                 ret += 'text:'
 131                                 ret += JSON.stringify @text
 132                         when TYPE_COMMENT
 133                                 ret += 'comment:'
 134                                 ret += JSON.stringify @text
 135                         when TYPE_DOCTYPE
 136                                 ret += 'doctype'
 137                                 # FIXME
 138                         when TYPE_AFE_MARKER
 139                                 ret += 'marker'
 140                         when TYPE_AAA_BOOKMARK
 141                                 ret += 'aaa_bookmark'
 142                         else
 143                                 ret += 'unknown:'
 144                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 145                 return ret
 146
 147 # helpers: (only take args that are normally known when parser creates nodes)
 148 new_open_tag = (name) ->
 149         return new Node TYPE_START_TAG, name: name
 150 new_end_tag = (name) ->
 151         return new Node TYPE_END_TAG, name: name
 152 new_element = (name) ->
 153         return new Node TYPE_TAG, name: name
 154 new_text_node = (txt) ->
 155         return new Node TYPE_TEXT, text: txt
 156 new_character_token = new_text_node
 157 new_comment_node = (txt) ->
 158         return new Node TYPE_COMMENT, text: txt
 159 new_eof_token = ->
 160         return new Node TYPE_EOF
 161 new_afe_marker = ->
 162         return new Node TYPE_AFE_MARKER
 163 new_aaa_bookmark = ->
 164         return new Node TYPE_AAA_BOOKMARK
 165
 166 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 167 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 168 digits = "0123456789"
 169 alnum = lc_alpha + uc_alpha + digits
 170 hex_chars = digits + "abcdefABCDEF"
 171
 172 # some SVG elements have dashes in them
 173 tag_name_chars = alnum + "-"
 174
 175 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 176 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 177 is_space = (txt) ->
 178         return txt.length is 1 and space_chars.indexOf(txt) > -1
 179 is_space_tok = (t) ->
 180         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 181
 182 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 183 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 184
 185 # These are the character references that don't need a terminating semicolon
 186 # min length: 2, max: 6, none are a prefix of any other.
 187 legacy_char_refs = {
 188         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 189         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 190         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 191         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 192         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 193         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 194         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 195         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 196         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 197         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 198         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 199         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 200         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 201         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 202         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 203         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 204         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 205         yen: '¥', yuml: 'ÿ'
 206 }
 207
 208 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 209 raw_text_elements = ['script', 'style']
 210 escapable_raw_text_elements = ['textarea', 'title']
 211 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 212 svg_elements = [
 213         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 214         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 215         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 216         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 217         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 218         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 219         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 220         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 221         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 222         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 223         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 224         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 225         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 226         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 227         'view', 'vkern'
 228 ]
 229
 230 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 231 mathml_elements = [
 232         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 233         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 234         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 235         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 236         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 237         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 238         'determinant', 'diff', 'divergence', 'divide', 'domain',
 239         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 240         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 241         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 242         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 243         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 244         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 245         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 246         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 247         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 248         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 249         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 250         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 251         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 252         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 253         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 254         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 255         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 256         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 257         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 258         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 259         'vectorproduct', 'xor'
 260 ]
 261 # foreign_elements = [svg_elements..., mathml_elements...]
 262 #normal_elements = All other allowed HTML elements are normal elements.
 263
 264 special_elements = {
 265         # HTML:
 266         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 267         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 268         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 269         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 270         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 271         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 272         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 273         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 274         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 275         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 276         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
 277         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
 278         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
 279         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
 280         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
 281         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
 282         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
 283         wbr:NS_HTML, xmp:NS_HTML,
 284
 285         # MathML:
 286         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 287         'annotation-xml':NS_MATHML,
 288
 289         # SVG:
 290         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 291 }
 292
 293 formatting_elements = {
 294          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 295          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 296          u: true
 297 }
 298
 299 foster_parenting_targets = {
 300         table: true
 301         tbody: true
 302         tfoot: true
 303         thead: true
 304         tr: true
 305 }
 306
 307 # all html I presume
 308 end_tag_implied = {
 309         dd: true
 310         dt: true
 311         li: true
 312         option: true
 313         optgroup: true
 314         p: true
 315         rb: true
 316         rp: true
 317         rt: true
 318         rtc: true
 319 }
 320
 321 el_is_special = (e) ->
 322         return special_elements[e.name] is e.namespace
 323
 324 # decode_named_char_ref()
 325 #
 326 # The list of named character references is _huge_ so ask the browser to decode
 327 # for us instead of wasting bandwidth/space on including the table here.
 328 #
 329 # Pass without the "&" but with the ";" examples:
 330 #    for "&amp" pass "amp;"
 331 #    for "&#x2032" pass "x2032;"
 332 g_dncr = {
 333         cache: {}
 334         textarea: document.createElement('textarea')
 335 }
 336 # TODO test this in IE8
 337 decode_named_char_ref = (txt) ->
 338         txt = "&#{txt}"
 339         decoded = g_dncr.cache[txt]
 340         return decoded if decoded?
 341         g_dncr.textarea.innerHTML = txt
 342         decoded = g_dncr.textarea.value
 343         return null if decoded is txt
 344         return g_dncr.cache[txt] = decoded
 345
 346 parse_html = (txt, parse_error_cb = null) ->
 347         cur = 0 # index of next char in txt to be parsed
 348         # declare doc and tokenizer variables so they're in scope below
 349         doc = null
 350         open_els = null # stack of open elements
 351         afe = null # active formatting elements
 352         template_insertion_modes = null
 353         insertion_mode = null
 354         original_insertion_mode = null
 355         tok_state = null
 356         tok_cur_tag = null # partially parsed tag
 357         flag_scripting = null
 358         flag_frameset_ok = null
 359         flag_parsing = null
 360         flag_foster_parenting = null
 361         form_element_pointer = null
 362         temporary_buffer = null
 363         pending_table_character_tokens = null
 364         head_element_pointer = null
 365
 366         parse_error = ->
 367                 if parse_error_cb?
 368                         parse_error_cb cur
 369                 else
 370                         console.log "Parse error at character #{cur} of #{txt.length}"
 371
 372         afe_push = (new_el) ->
 373                 matches = 0
 374                 for el, i in afe
 375                         if el.name is new_el.name and el.namespace is new_el.namespace
 376                                 for k, v of el.attrs
 377                                         continue unless new_el.attrs[k] is v
 378                                 for k, v of new_el.attrs
 379                                         continue unless el.attrs[k] is v
 380                                 matches += 1
 381                                 if matches is 3
 382                                         afe.splice i, 1
 383                                         break
 384                 afe.unshift new_el
 385         afe_push_marker = ->
 386                 afe.unshift new_afe_marker()
 387
 388         # the functions below implement the Tree Construction algorithm
 389         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 390
 391         # But first... the helpers
 392         template_tag_is_open = ->
 393                 for t in open_els
 394                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
 395                                 return true
 396                 return false
 397         is_in_scope_x = (tag_name, scope, namespace) ->
 398                 for t in open_els
 399                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 400                                 return true
 401                         if scope[t.name] is t.namespace
 402                                 return false
 403                 return false
 404         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 405                 for t in open_els
 406                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 407                                 return true
 408                         if scope[t.name] is t.namespace
 409                                 return false
 410                         if scope2[t.name] is t.namespace
 411                                 return false
 412                 return false
 413         standard_scopers = { # FIXME these are supposed to be namespace specific
 414                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
 415                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
 416                 template: NS_HTML, mi: NS_MATHML,
 417
 418                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
 419                 'annotation-xml': NS_MATHML,
 420
 421                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
 422         }
 423         button_scopers = button: NS_HTML
 424         li_scopers = ol: NS_HTML, ul: NS_HTML
 425         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 426         is_in_scope = (tag_name, namespace = null) ->
 427                 return is_in_scope_x tag_name, standard_scopers, namespace
 428         is_in_button_scope = (tag_name, namespace = null) ->
 429                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 430         is_in_table_scope = (tag_name, namespace = null) ->
 431                 return is_in_scope_x tag_name, table_scopers, namespace
 432         is_in_select_scope = (tag_name, namespace = null) ->
 433                 for t in open_els
 434                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 435                                 return true
 436                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
 437                                 return false
 438                 return false
 439         # this checks for a particular element, not by name
 440         el_is_in_scope = (el) ->
 441                 for t in open_els
 442                         if t is el
 443                                 return true
 444                         if standard_scopers[t.name] is t.namespace
 445                                 return false
 446                 return false
 447
 448         clear_to_table_stopers = {
 449                 'table': true
 450                 'template': true
 451                 'html': true
 452         }
 453         clear_stack_to_table_context = ->
 454                 loop
 455                         if clear_to_table_stopers[open_els[0].name]?
 456                                 break
 457                         open_els.shift()
 458                 return
 459         clear_to_table_body_stopers = {
 460                 'tbody': true
 461                 'tfoot': true
 462                 'thead': true
 463                 'template': true
 464                 'html': true
 465         }
 466         clear_stack_to_table_body_context = ->
 467                 loop
 468                         if clear_to_table_body_stopers[open_els[0].name]?
 469                                 break
 470                         open_els.shift()
 471                 return
 472         clear_to_table_row_stopers = {
 473                 'tr': true
 474                 'template': true
 475                 'html': true
 476         }
 477         clear_stack_to_table_row_context = ->
 478                 loop
 479                         if clear_to_table_row_stopers[open_els[0].name]?
 480                                 break
 481                         open_els.shift()
 482                 return
 483         clear_afe_to_marker = ->
 484                 loop
 485                         el = afe.shift()
 486                         if el.type is TYPE_AFE_MARKER
 487                                 return
 488
 489         # 8.2.3.1 ...
 490         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
 491         reset_insertion_mode = ->
 492                 # 1. Let last be false.
 493                 last = false
 494                 # 2. Let node be the last node in the stack of open elements.
 495                 node_i = 0
 496                 node = open_els[node_i]
 497                 # 3. Loop: If node is the first node in the stack of open elements,
 498                 # then set last to true, and, if the parser was originally created as
 499                 # part of the HTML fragment parsing algorithm (fragment case) set node
 500                 # to the context element.
 501                 loop
 502                         if node_i is open_els.length - 1
 503                                 last = true
 504                                 # fixfull (fragment case)
 505
 506                         # 4. If node is a select element, run these substeps:
 507                         if node.name is 'select'
 508                                 # 1. If last is true, jump to the step below labeled done.
 509                                 unless last
 510                                         # 2. Let ancestor be node.
 511                                         ancestor_i = node_i
 512                                         ancestor = node
 513                                         # 3. Loop: If ancestor is the first node in the stack of
 514                                         # open elements, jump to the step below labeled done.
 515                                         loop
 516                                                 if ancestor_i is open_els.length - 1
 517                                                         break
 518                                                 # 4. Let ancestor be the node before ancestor in the stack
 519                                                 # of open elements.
 520                                                 ancestor_i += 1
 521                                                 ancestor = open_els[ancestor_i]
 522                                                 # 5. If ancestor is a template node, jump to the step below
 523                                                 # labeled done.
 524                                                 if ancestor.name is 'template'
 525                                                         break
 526                                                 # 6. If ancestor is a table node, switch the insertion mode
 527                                                 # to "in select in table" and abort these steps.
 528                                                 if ancestor.name is 'table'
 529                                                         insertion_mode = ins_mode_in_select_in_table
 530                                                         return
 531                                                 # 7. Jump back to the step labeled loop.
 532                                 # 8. Done: Switch the insertion mode to "in select" and abort
 533                                 # these steps.
 534                                 insertion_mode = ins_mode_in_select
 535                                 return
 536                         # 5. If node is a td or th element and last is false, then switch
 537                         # the insertion mode to "in cell" and abort these steps.
 538                         if (node.name is 'td' or node.name is 'th') and last is false
 539                                 insertion_mode = ins_mode_in_cell
 540                                 return
 541                         # 6. If node is a tr element, then switch the insertion mode to "in
 542                         # row" and abort these steps.
 543                         if node.name is 'tr'
 544                                 insertion_mode = ins_mode_in_row
 545                                 return
 546                         # 7. If node is a tbody, thead, or tfoot element, then switch the
 547                         # insertion mode to "in table body" and abort these steps.
 548                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
 549                                 insertion_mode = ins_mode_in_table_body
 550                                 return
 551                         # 8. If node is a caption element, then switch the insertion mode
 552                         # to "in caption" and abort these steps.
 553                         if node.name is 'caption'
 554                                 insertion_mode = ins_mode_in_caption
 555                                 return
 556                         # 9. If node is a colgroup element, then switch the insertion mode
 557                         # to "in column group" and abort these steps.
 558                         if node.name is 'colgroup'
 559                                 insertion_mode = ins_mode_in_column_group
 560                                 return
 561                         # 10. If node is a table element, then switch the insertion mode to
 562                         # "in table" and abort these steps.
 563                         if node.name is 'table'
 564                                 insertion_mode = ins_mode_in_table
 565                                 return
 566                         # 11. If node is a template element, then switch the insertion mode
 567                         # to the current template insertion mode and abort these steps.
 568                         # fixfull (template insertion mode stack)
 569
 570                         # 12. If node is a head element and last is true, then switch the
 571                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
 572                         # these steps. (fragment case)
 573                         if node.name is 'head' and last
 574                                 insertion_mode = ins_mode_in_body
 575                                 return
 576                         # 13. If node is a head element and last is false, then switch the
 577                         # insertion mode to "in head" and abort these steps.
 578                         if node.name is 'head' and last is false
 579                                 insertion_mode = ins_mode_in_head
 580                                 return
 581                         # 14. If node is a body element, then switch the insertion mode to
 582                         # "in body" and abort these steps.
 583                         if node.name is 'body'
 584                                 insertion_mode = ins_mode_in_body
 585                                 return
 586                         # 15. If node is a frameset element, then switch the insertion mode
 587                         # to "in frameset" and abort these steps. (fragment case)
 588                         if node.name is 'frameset'
 589                                 insertion_mode = ins_mode_in_frameset
 590                                 return
 591                         # 16. If node is an html element, run these substeps:
 592                         if node.name is 'html'
 593                                 # 1. If the head element pointer is null, switch the insertion
 594                                 # mode to "before head" and abort these steps. (fragment case)
 595                                 # fixfull (fragment case)
 596
 597                                 # 2. Otherwise, the head element pointer is not null, switch
 598                                 # the insertion mode to "after head" and abort these steps.
 599                                 insertion_mode = ins_mode_in_body # FIXME fixfull
 600                                 return
 601                         # 17. If last is true, then switch the insertion mode to "in body"
 602                         # and abort these steps. (fragment case)
 603                         if last
 604                                 insertion_mode = ins_mode_in_body
 605                                 return
 606                         # 18. Let node now be the node before node in the stack of open
 607                         # elements.
 608                         node_i += 1
 609                         node = open_els[node_i]
 610                         # 19. Return to the step labeled loop.
 611
 612         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 613         # Starting from afe[0], finds the run of entries that are neither markers
 614         # nor present in open_els, then replaces each of them (highest index
 615         # first) with a shallow clone that is inserted via tree_insert_element.
 616         # Capitalized comments are the step labels used at the link above.
 617         reconstruct_active_formatting_elements = ->
 618                 # true when an afe entry needs no reconstruction
 619                 is_settled = (entry) ->
 620                         entry.type is TYPE_AFE_MARKER or entry in open_els
 621                 # nothing to do for an empty list, or when afe[0] is settled
 622                 return if afe.length is 0 or is_settled afe[0]
 623                 # Rewind: step to higher indexes while the next entry is unsettled
 624                 i = 0
 625                 while i < afe.length - 1 and not is_settled afe[i + 1]
 626                         i += 1
 627                 # Create: clone the entry at i, insert the clone, store it in afe
 628                 loop
 629                         clone = afe[i].shallow_clone()
 630                         tree_insert_element clone
 631                         afe[i] = clone
 632                         break if i is 0
 633                         # Advance: move one index closer to afe[0]
 634                         i -= 1
 635
 636         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 637         # adoption agency algorithm
 638         # overview here:
 639         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 640         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 641         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
             #
             # subject: tag name of the end tag being handled (it is compared against
             #   the .name of entries in open_els and afe)
             #
             # Works by modifying open_els, afe and the parent/children links of the
             # affected nodes in place. Every exit is a bare ``return``.
             #
             # Short names used below:
             #   fe = formatting element, fb = furthest block, ca = common ancestor
 642         adoption_agency = (subject) ->
 643                 debug_log "adoption_agency()"
 644                 debug_log "tree: #{serialize_els doc.children, false, true}"
 645                 debug_log "open_els: #{serialize_els open_els, true, true}"
 646                 debug_log "afe: #{serialize_els afe, true, true}"
                     # Shortcut: the current node already has the subject's tag name, so
                     # just pop it (and drop it from afe if it is listed there).
 647                 if open_els[0].name is subject
 648                         el = open_els[0]
 649                         open_els.shift()
 650                         # remove it from the list of active formatting elements (if found)
 651                         for t, i in afe
 652                                 if t is el
 653                                         afe.splice i, 1
 654                                         break
 655                         debug_log "aaa: starting off with subject on top of stack, exiting"
 656                         return
                     # outer counts passes of the outer loop, which gives up once 8
                     # passes have been made
 657                 outer = 0
 658                 loop
 659                         if outer >= 8
 660                                 return
 661                         outer += 1
 662                         # 5. Let formatting element be the last element in the list of
 663                         # active formatting elements that: is between the end of the list
 664                         # and the last scope marker in the list, if any, or the start of
 665                         # the list otherwise, and  has the tag name subject.
                             #
                             # The scan starts at afe[0] and stops at the first marker. After
                             # the loop fe_of_afe still holds the index at which the scan
                             # stopped; it is used further down to splice fe out of afe.
 666                         fe = null
 667                         for t, fe_of_afe in afe
 668                                 if t.type is TYPE_AFE_MARKER
 669                                         break
 670                                 if t.name is subject
 671                                         fe = t
 672                                         break
 673                         # If there is no such element, then abort these steps and instead
 674                         # act as described in the "any other end tag" entry above.
 675                         if fe is null
 676                                 debug_log "aaa: fe not found in afe"
 677                                 in_body_any_other_end_tag subject
 678                                 return
 679                         # 6. If formatting element is not in the stack of open elements,
 680                         # then this is a parse error; remove the element from the list, and
 681                         # abort these steps.
                             #
                             # When fe is found, fe_of_open_els is left holding its index in
                             # open_els (used for step 11).
 682                         in_open_els = false
 683                         for t, fe_of_open_els in open_els
 684                                 if t is fe
 685                                         in_open_els = true
 686                                         break
 687                         unless in_open_els
 688                                 debug_log "aaa: fe not found in open_els"
 689                                 parse_error()
 690                                 # "remove it from the list" must mean afe, since it's not in open_els
 691                                 afe.splice fe_of_afe, 1
 692                                 return
 693                         # 7. If formatting element is in the stack of open elements, but
 694                         # the element is not in scope, then this is a parse error; abort
 695                         # these steps.
 696                         unless el_is_in_scope fe
 697                                 debug_log "aaa: fe not in scope"
 698                                 parse_error()
 699                                 return
 700                         # 8. If formatting element is not the current node, this is a parse
 701                         # error. (But do not abort these steps.)
 702                         unless open_els[0] is fe
 703                                 parse_error()
 704                                 # continue
 705                         # 9. Let furthest block be the topmost node in the stack of open
 706                         # elements that is lower in the stack than formatting element, and
 707                         # is an element in the special category. There might not be one.
                             #
                             # The scan runs from open_els[0] up to (not including) fe and
                             # keeps the last special element it sees, i.e. the one closest
                             # to fe.
 708                         fb = null
 709                         fb_of_open_els = null
 710                         for t, i in open_els
 711                                 if t is fe
 712                                         break
 713                                 if el_is_special t
 714                                         fb = t
 715                                         fb_of_open_els = i
 716                                         # and continue, to see if there's one that's more "topmost"
 717                         # 10. If there is no furthest block, then the UA must first pop all
 718                         # the nodes from the bottom of the stack of open elements, from the
 719                         # current node up to and including formatting element, then remove
 720                         # formatting element from the list of active formatting elements,
 721                         # and finally abort these steps.
 722                         if fb is null
 723                                 debug_log "aaa: no fb"
 724                                 loop
 725                                         t = open_els.shift()
 726                                         if t is fe
 727                                                 afe.splice fe_of_afe, 1
 728                                                 return
 729                         # 11. Let common ancestor be the element immediately above
 730                         # formatting element in the stack of open elements.
 731                         ca = open_els[fe_of_open_els + 1] # common ancestor
 732
 733                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
 734                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
                             #
                             # The bookmark is a placeholder entry spliced into afe at fe's
                             # index (fe itself moves one index up).
 735                         bookmark = new_aaa_bookmark()
 736                         for t, i in afe
 737                                 if t is fe
 738                                         afe.splice i, 0, bookmark
 739                                         break
                             # node and last_node both start out as the furthest block; inner
                             # counts passes of the inner loop
 740                         node = last_node = fb
 741                         inner = 0
 742                         loop
 743                                 inner += 1
 744                                 # 3. Let node be the element immediately above node in the
 745                                 # stack of open elements, or if node is no longer in the stack
 746                                 # of open elements (e.g. because it got removed by this
 747                                 # algorithm), the element that was immediately above node in
 748                                 # the stack of open elements before node was removed.
 749                                 node_next = null
 750                                 for t, i in open_els
 751                                         if t is node
 752                                                 node_next = open_els[i + 1]
 753                                                 break
                                     # fall back to the remembered neighbour when node was not
                                     # found in open_els (or had nothing above it)
 754                                 node = node_next ? node_above
 755                                 debug_log "inner loop #{inner}"
 756                                 debug_log "tree: #{serialize_els doc.children, false, true}"
 757                                 debug_log "open_els: #{serialize_els open_els, true, true}"
 758                                 debug_log "afe: #{serialize_els afe, true, true}"
 759                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 760                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 761                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 762                                 debug_log "node: #{node.serialize true, true}"
 763                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
 764
 765                                 # 4. If node is formatting element, then go to the next step in
 766                                 # the overall algorithm.
 767                                 if node is fe
 768                                         break
 769                                 debug_log "the meat"
 770                                 # 5. If inner loop counter is greater than three and node is in
 771                                 # the list of active formatting elements, then remove node from
 772                                 # the list of active formatting elements.
                                     #
                                     # node_in_afe ends up true only when node is in afe AND was
                                     # not removed from it by this step.
 773                                 node_in_afe = false
 774                                 for t, i in afe
 775                                         if t is node
 776                                                 if inner > 3
 777                                                         afe.splice i, 1
 778                                                         debug_log "max out inner"
 779                                                 else
 780                                                         node_in_afe = true
 781                                                         debug_log "in afe"
 782                                                 break
 783                                 # 6. If node is not in the list of active formatting elements,
 784                                 # then remove node from the stack of open elements and then go
 785                                 # back to the step labeled inner loop.
 786                                 unless node_in_afe
 787                                         debug_log "not in afe"
 788                                         for t, i in open_els
 789                                                 if t is node
                                                             # remember the neighbour before node leaves the stack
 790                                                         node_above = open_els[i + 1]
 791                                                         open_els.splice i, 1
 792                                                         break
 793                                         continue
 794                                 debug_log "the bones"
 795                                 # 7. create an element for the token for which the element node
 796                                 # was created, in the HTML namespace, with common ancestor as
 797                                 # the intended parent; replace the entry for node in the list
 798                                 # of active formatting elements with an entry for the new
 799                                 # element, replace the entry for node in the stack of open
 800                                 # elements with an entry for the new element, and let node be
 801                                 # the new element.
 802                                 new_node = node.shallow_clone()
 803                                 for t, i in afe
 804                                         if t is node
 805                                                 afe[i] = new_node
 806                                                 debug_log "replaced in afe"
 807                                                 break
 808                                 for t, i in open_els
 809                                         if t is node
 810                                                 node_above = open_els[i + 1]
 811                                                 open_els[i] = new_node
 812                                                 debug_log "replaced in open_els"
 813                                                 break
 814                                 node = new_node
 815                                 # 8. If last node is furthest block, then move the
 816                                 # aforementioned bookmark to be immediately after the new node
 817                                 # in the list of active formatting elements.
 818                                 if last_node is fb
 819                                         for t, i in afe
 820                                                 if t is bookmark
 821                                                         afe.splice i, 1
 822                                                         debug_log "removed bookmark"
 823                                                         break
 824                                         for t, i in afe
 825                                                 if t is node
 826                                                         # "after" means lower
 827                                                         afe.splice i, 0, bookmark # "after as <-
 828                                                         debug_log "placed bookmark after node"
 829                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
 830                                                         break
 831                                 # 9. Insert last node into node, first removing it from its
 832                                 # previous parent node if any.
 833                                 if last_node.parent?
 834                                         debug_log "last_node has parent"
 835                                         for c, i in last_node.parent.children
 836                                                 if c is last_node
 837                                                         debug_log "removing last_node from parent"
 838                                                         last_node.parent.children.splice i, 1
 839                                                         break
 840                                 node.children.push last_node
 841                                 last_node.parent = node
 842                                 # 10. Let last node be node.
 843                                 last_node = node
 844                                 debug_log "at last"
 845                                 # 11. Return to the step labeled inner loop.
 846                         # 14. Insert whatever last node ended up being in the previous step
 847                         # at the appropriate place for inserting a node, but using common
 848                         # ancestor as the override target.
 849
 850                         # In the case where fe is immediately followed by fb:
 851                         #   * inner loop exits out early (node==fe)
 852                         #   * last_node is fb
 853                         #   * last_node is still in the tree (not a duplicate)
 854                         if last_node.parent?
 855                                 debug_log "FEFIRST? last_node has parent"
 856                                 for c, i in last_node.parent.children
 857                                         if c is last_node
 858                                                 debug_log "removing last_node from parent"
 859                                                 last_node.parent.children.splice i, 1
 860                                                 break
 861
 862                         debug_log "after aaa inner loop"
 863                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 864                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 865                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 866                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 867                         debug_log "tree: #{serialize_els doc.children, false, true}"
 868
 869                         debug_log "insert"
 870
 871
 872                         # can't use standard insert token thing, because it's already in
 873                         # open_els and must stay at its current position in open_els
                             #
                             # dest is the [node, index] pair returned by
                             # adjusted_insertion_location
 874                         dest = adjusted_insertion_location ca
 875                         dest[0].children.splice dest[1], 0, last_node
 876                         last_node.parent = dest[0]
 877
 878
 879                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 880                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 881                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 882                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 883                         debug_log "tree: #{serialize_els doc.children, false, true}"
 884
 885                         # 15. Create an element for the token for which formatting element
 886                         # was created, in the HTML namespace, with furthest block as the
 887                         # intended parent.
 888                         new_element = fe.shallow_clone() # FIXME intended parent thing
 889                         # 16. Take all of the child nodes of furthest block and append them
 890                         # to the element created in the last step.
 891                         while fb.children.length
 892                                 t = fb.children.shift()
 893                                 t.parent = new_element
 894                                 new_element.children.push t
 895                         # 17. Append that new element to furthest block.
 896                         new_element.parent = fb
 897                         fb.children.push new_element
 898                         # 18. Remove formatting element from the list of active formatting
 899                         # elements, and insert the new element into the list of active
 900                         # formatting elements at the position of the aforementioned
 901                         # bookmark.
 902                         for t, i in afe
 903                                 if t is fe
 904                                         afe.splice i, 1
 905                                         break
                             # the bookmark entry is overwritten, so it does not stay in afe
 906                         for t, i in afe
 907                                 if t is bookmark
 908                                         afe[i] = new_element
 909                                         break
 910                         # 19. Remove formatting element from the stack of open elements,
 911                         # and insert the new element into the stack of open elements
 912                         # immediately below the position of furthest block in that stack.
 913                         for t, i in open_els
 914                                 if t is fe
 915                                         open_els.splice i, 1
 916                                         break
                             # inserting at fb's index puts new_element one index closer to
                             # open_els[0] than fb
 917                         for t, i in open_els
 918                                 if t is fb
 919                                         open_els.splice i, 0, new_element
 920                                         break
 921                         # 20. Jump back to the step labeled outer loop.
 922                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
 923                         debug_log "tree: #{serialize_els doc.children, false, true}"
 924                         debug_log "open_els: #{serialize_els open_els, true, true}"
 925                         debug_log "afe: #{serialize_els afe, true, true}"
                     # NOTE: the line below is never reached: the outer loop above has no
                     # break of its own and is only ever left through a return.
 926                 debug_log "AAA DONE"
 927
 928         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 929         close_p_element = ->
 930                 # the argument names the tag that is excepted from implied end tags
 931                 generate_implied_end_tags 'p'
 932                 parse_error() unless open_els[0].name is 'p'
 933                 # pop until a p has been removed; the final remaining entry is kept
 934                 until open_els.length <= 1 # just in case
 935                         return if open_els.shift().name is 'p'
 936                 return
 937         # runs close_p_element, but only when a p element is in button scope
 938         close_p_if_in_button_scope = ->
 939                 close_p_element() if is_in_button_scope 'p'
 940
 941         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 942         # aka insert_a_character: appends t's text to a directly preceding text node,
 943         # otherwise inserts t itself at the adjusted insertion location
 944         insert_character = (t) ->
 945                 [parent, index] = adjusted_insertion_location()
 946                 siblings = parent.children
 947                 # fixfull check for Document node
 948                 if index > 0 and siblings[index - 1].type is TYPE_TEXT
 949                         siblings[index - 1].text += t.text
 950                         return
 951                 siblings.splice index, 0, t
 952
 953         # 8.2.5.1
 954         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
 955         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
             #
             # Works out where the next node should be inserted.
             #   override_target: optional node to start from instead of the current
             #     node (open_els[0])
             #   returns: [target, target_i], meaning "insert into target.children at
             #     index target_i"
 956         adjusted_insertion_location = (override_target = null) ->
 957                 # 1. If there was an override target specified, then let target be the
 958                 # override target.
 959                 if override_target?
 960                         target = override_target
 961                 else # Otherwise, let target be the current node.
 962                         target = open_els[0]
 963                 # 2. Determine the adjusted insertion location using the first matching
 964                 # steps from the following list:
 965                 #
 966                 # If foster parenting is enabled and target is a table, tbody, tfoot,
 967                 # thead, or tr element Foster parenting happens when content is
 968                 # misnested in tables.
 969                 if flag_foster_parenting and foster_parenting_targets[target.name]
 970                         loop # once. this is here so we can ``break`` to "abort these substeps"
 971                                 # 1. Let last template be the last template element in the
 972                                 # stack of open elements, if any.
 973                                 last_template = null
 974                                 last_template_i = null
 975                                 for el, i in open_els
 976                                         if el.name is 'template'
 977                                                 last_template = el
 978                                                 last_template_i = i
 979                                                 break
 980                                 # 2. Let last table be the last table element in the stack of
 981                                 # open elements, if any.
 982                                 last_table = null
 983                                 last_table_i = null
 984                                 for el, i in open_els
 985                                         if el.name is 'table'
 986                                                 last_table = el
 987                                                 last_table_i = i
 988                                                 break
 989                                 # 3. If there is a last template and either there is no last
 990                                 # table, or there is one, but last template is lower (more
 991                                 # recently added) than last table in the stack of open
 992                                 # elements, then: let adjusted insertion location be inside
 993                                 # last template's template contents, after its last child (if
 994                                 # any), and abort these substeps.
 995                                 if last_template and (last_table is null or last_template_i < last_table_i)
 996                                         target = last_template # fixfull should be its contents
 997                                         target_i = target.children.length
 998                                         break
 999                                 # 4. If there is no last table, then let adjusted insertion
1000                                 # location be inside the first element in the stack of open
1001                                 # elements (the html element), after its last child (if any),
1002                                 # and abort these substeps. (fragment case)
1003                                 if last_table is null
1004                                         # this is odd
1005                                         target = open_els[open_els.length - 1]
1006                                         target_i = target.children.length
                                             # "abort these substeps": the steps below all read
                                             # last_table, which is null on this path
                                             break
1007                                 # 5. If last table has a parent element, then let adjusted
1008                                 # insertion location be inside last table's parent element,
1009                                 # immediately before last table, and abort these substeps.
1010                                 if last_table.parent?
1011                                         for c, i in last_table.parent.children
1012                                                 if c is last_table
1013                                                         target = last_table.parent
1014                                                         target_i = i
1015                                                         break
1016                                         break
1017                                 # 6. Let previous element be the element immediately above last
1018                                 # table in the stack of open elements.
1019                                 #
1020                                 # huh? how could it not have a parent?
1021                                 previous_element = open_els[last_table_i + 1]
1022                                 # 7. Let adjusted insertion location be inside previous
1023                                 # element, after its last child (if any).
1024                                 target = previous_element
1025                                 target_i = target.children.length
1026                                 # Note: These steps are involved in part because it's possible
1027                                 # for elements, the table element in this case in particular,
1028                                 # to have been moved by a script around in the DOM, or indeed
1029                                 # removed from the DOM entirely, after the element was inserted
1030                                 # by the parser.
1031                                 break # don't really loop
1032                 else
1033                         # Otherwise Let adjusted insertion location be inside target, after
1034                         # its last child (if any).
1035                         target_i = target.children.length
1036
1037                 # 3. If the adjusted insertion location is inside a template element,
1038                 # let it instead be inside the template element's template contents,
1039                 # after its last child (if any).
1040                 # fixfull (template)
1041
1042                 # 4. Return the adjusted insertion location.
1043                 return [target, target_i]
1044
1045         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1046         # aka create_an_element_for_token
1047         token_to_element = (t, namespace, intended_parent) ->
1048                 # the token itself is altered: its type is switched here and its
1049                 # attrs_a array is emptied by the loop below
1050                 t.type = TYPE_TAG # not TYPE_START_TAG
1051                 # turn the [name, value] pairs into a hash, last pair first
1052                 attr_hash = {}
1053                 until t.attrs_a.length is 0
1054                         [attr_name, attr_value] = t.attrs_a.pop()
1055                         # TODO check what to do with duplicate attrs
1056                         attr_hash[attr_name] = attr_value
1057                 # TODO 2. If the newly created element has an xmlns attribute in the
1058                 # XMLNS namespace whose value is not exactly the same as the element's
1059                 # namespace, that is a parse error. Similarly, if the newly created
1060                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1061                 # value is not the XLink Namespace, that is a parse error.
1062                 #
1063                 # fixfull: the spec says stuff about form pointers and ownerDocument
1064                 new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attr_hash
1065
1066         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1067         # Builds an element from token, links it into the tree at the adjusted
1068         # insertion location, makes it the current node (open_els[0]) and returns it.
1069         insert_foreign_element = (token, namespace) ->
1070                 [parent, index] = adjusted_insertion_location()
1071                 created = token_to_element token, namespace, parent
1072                 # TODO skip this next step if it's broken (eg parent is document with child already)
1073                 created.parent = parent
1074                 parent.children.splice index, 0, created
1075                 open_els.unshift created
1076                 created
1077         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1078         insert_html_element = insert_foreign_element # (token, namespace) ->
1079
1080         # FIXME read implement "foster parenting" part
1081         # FIXME read spec, do this right
1082         # FIXME implement the override target thing
1083         # note: this assumes it's an open tag
1084         # FIXME what part of the spec is this?
1085         # TODO look through all callers of this, and see what they should really be doing.
1086         #   eg probably insert_html_element for tokens
1087         tree_insert_element = (el, override_target = null, namespace = null) ->
1088                 if namespace?
1089                         el.namespace = namespace
1090                 dest = adjusted_insertion_location override_target
1091                 if el.type is TYPE_START_TAG # means it's a "token"
1092                         el = token_to_element el, namespace, dest[0]
1093                 unless el.namespace?
1094                         namespace = dest.namespace
1095                 # fixfull: Document nodes sometimes can't accept more chidren
1096                 dest[0].children.splice dest[1], 0, el
1097                 el.parent = dest[0]
1098                 open_els.unshift el
1099                 return el
1100
1101         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1102         # position should be [node, index_within_children]
1103         insert_comment = (t, position = null) ->
1104                 position ?= adjusted_insertion_location()
1105                 position[0].children.splice position[1], 0, t
1106
1107         # 8.2.5.2
1108         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1109         parse_generic_raw_text = (t) ->
1110                 insert_html_element t
1111                 tok_state = tok_state_rawtext
1112                 original_insertion_mode = insertion_mode
1113                 insertion_mode = ins_mode_text
1114         parse_generic_rcdata_text = (t) ->
1115                 insert_html_element t
1116                 tok_state = tok_state_rcdata
1117                 original_insertion_mode = insertion_mode
1118                 insertion_mode = ins_mode_text
1119
1120         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1121         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1122         generate_implied_end_tags = (except = null) ->
1123                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1124                         open_els.shift()
1125
1126         # 8.2.5.4 The rules for parsing tokens in HTML content
1127         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1128
1129         # 8.2.5.4.1 The "initial" insertion mode
1130         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1131         ins_mode_initial = (t) ->
1132                 if is_space_tok t
1133                         return
1134                 if t.type is TYPE_COMMENT
1135                         # fixfull this is supposed to be "the last child of the document object"
1136                         doc.children.push t
1137                         return
1138                 if t.type is TYPE_DOCTYPE
1139                         # fixfull
1140                         t.name = 'html'
1141                         doc.children.push t
1142                         insertion_mode = ins_mode_before_html
1143                         return
1144                 # Anything else
1145                 #fixfull (iframe, quirks)
1146                 insertion_mode = ins_mode_before_html
1147                 insertion_mode t # reprocess the token
1148                 return
1149
1150         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1151         ins_mode_before_html = (t) ->
1152                 if t.type is TYPE_DOCTYPE
1153                         parse_error()
1154                         return
1155                 if t.type is TYPE_COMMENT
1156                         doc.children.push t
1157                         return
1158                 if is_space_tok t
1159                         return
1160                 if t.type is TYPE_START_TAG and t.name is 'html'
1161                         el = token_to_element t, NS_HTML, doc
1162                         open_els.unshift(el)
1163                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1164                         insertion_mode = ins_mode_before_head
1165                         return
1166                 if t.type is TYPE_END_TAG
1167                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1168                                 # fall through to "anything else"
1169                         else
1170                                 parse_error()
1171                                 return
1172                 # Anything else
1173                 html_tok = new_open_tag 'html'
1174                 el = token_to_element html_tok, NS_HTML, doc
1175                 doc.children.push el
1176                 open_els.unshift el
1177                 # ?fixfull browsing context
1178                 insertion_mode = ins_mode_before_head
1179                 insertion_mode t
1180                 return
1181
1182         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1183         ins_mode_before_head = (t) ->
1184                 if is_space_tok t
1185                         return
1186                 if t.type is TYPE_COMMENT
1187                         insert_comment t
1188                         return
1189                 if t.type is TYPE_DOCTYPE
1190                         parse_error()
1191                         return
1192                 if t.type is TYPE_START_TAG and t.name is 'html'
1193                         ins_mode_in_body t
1194                         return
1195                 if t.type is TYPE_START_TAG and t.name is 'head'
1196                         el = insert_html_element t
1197                         head_element_pointer = el
1198                         insertion_mode = ins_mode_in_head
1199                 if t.type is TYPE_END_TAG
1200                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1201                                 # fall through to Anything else below
1202                         else
1203                                 parse_error()
1204                                 return
1205                 # Anything else
1206                 head_tok = new_open_tag 'head'
1207                 el = insert_html_element head_tok
1208                 head_element_pointer = el
1209                 insertion_mode = ins_mode_in_head
1210                 insertion_mode t # reprocess current token
1211
1212         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1213         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1214                 open_els.shift() # spec says this will be a 'head' node
1215                 insertion_mode = ins_mode_after_head
1216                 insertion_mode t
1217         ins_mode_in_head = (t) ->
1218                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1219                         insert_character t
1220                         return
1221                 if t.type is TYPE_COMMENT
1222                         insert_comment t
1223                         return
1224                 if t.type is TYPE_DOCTYPE
1225                         parse_error()
1226                         return
1227                 if t.type is TYPE_START_TAG and t.name is 'html'
1228                         ins_mode_in_body t
1229                         return
1230                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1231                         el = insert_html_element t
1232                         open_els.shift()
1233                         el.acknowledge_self_closing()
1234                         return
1235                 if t.type is TYPE_START_TAG and t.name is 'meta'
1236                         el = insert_html_element t
1237                         open_els.shift()
1238                         el.acknowledge_self_closing()
1239                         # fixfull encoding stuff
1240                         return
1241                 if t.type is TYPE_START_TAG and t.name is 'title'
1242                         parse_generic_rcdata_element t
1243                         return
1244                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1245                         parse_generic_raw_text t
1246                         return
1247                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1248                         insert_html_element t
1249                         insertion_mode = in_head_noscript # FIXME implement
1250                         return
1251                 if t.type is TYPE_START_TAG and t.name is 'script'
1252                         ail = adjusted_insertion_location()
1253                         el = token_to_element t, NS_HTML, ail
1254                         el.flag_parser_inserted true # FIXME implement
1255                         # fixfull frament case
1256                         ail[0].children.splice ail[1], 0, el
1257                         open_els.unshift el
1258                         tok_state = tok_state_script_data
1259                         original_insertion_mode = insertion_mode # make sure orig... is defined
1260                         insertion_mode = ins_mode_text # FIXME implement
1261                         return
1262                 if t.type is TYPE_END_TAG and t.name is 'head'
1263                         open_els.shift() # will be a head element... spec says so
1264                         insertion_mode = ins_mode_after_head
1265                         return
1266                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1267                         ins_mode_in_head_else t
1268                         return
1269                 if t.type is TYPE_START_TAG and t.name is 'template'
1270                         insert_html_element t
1271                         afe_push_marker()
1272                         flag_frameset_ok = false
1273                         insertion_mode = ins_mode_in_template
1274                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1275                         return
1276                 if t.type is TYPE_END_TAG and t.name is 'template'
1277                         if template_tag_is_open()
1278                                 generate_implied_end_tags
1279                                 if open_els[0].name isnt 'template'
1280                                         parse_error()
1281                                 loop
1282                                         el = open_els.shift()
1283                                         if el.name is 'template'
1284                                                 break
1285                                 clear_afe_to_marker()
1286                                 template_insertion_modes.shift()
1287                                 reset_insertion_mode()
1288                         else
1289                                 parse_error()
1290                         return
1291                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1292                         parse_error()
1293                         return
1294                 ins_mode_in_head_else t
1295
1296         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1297         ins_mode_in_head_noscript = (t) ->
1298                 # FIXME ?fixfull
1299                 console.log "ins_mode_in_head_noscript unimplemented"
1300
1301         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1302         ins_mode_after_head_else = (t) ->
1303                 body_tok = new_open_tag 'body'
1304                 insert_html_element body_tok
1305                 insertion_mode = ins_mode_in_body
1306                 insertion_mode t # reprocess token
1307                 return
1308         ins_mode_after_head = (t) ->
1309                 if is_space_tok t
1310                         insert_character t
1311                         return
1312                 if t.type is TYPE_COMMENT
1313                         insert_comment t
1314                         return
1315                 if t.type is TYPE_DOCTYPE
1316                         parse_error()
1317                         return
1318                 if t.type is TYPE_START_TAG and t.name is 'html'
1319                         ins_mode_in_body t
1320                         return
1321                 if t.type is TYPE_START_TAG and t.name is 'body'
1322                         insert_html_element t
1323                         flag_frameset_ok = false
1324                         insertion_mode = ins_mode_in_body
1325                         return
1326                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1327                         insert_html_element t
1328                         insertion_mode = ins_mode_in_frameset
1329                         return
1330                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1331                         parse_error()
1332                         open_els.unshift head_element_pointer
1333                         ins_mode_in_head t
1334                         for el, i of open_els
1335                                 if el is head_element_pointer
1336                                         open_els.splice i, 1
1337                                         return
1338                         console.log "warning: 23904 couldn't find head element in open_els"
1339                         return
1340                 if t.type is TYPE_END_TAG and t.name is 'template'
1341                         ins_mode_in_head t
1342                         return
1343                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1344                         ins_mode_after_head_else t
1345                         return
1346                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1347                         parse_error()
1348                         return
1349                 # Anything else
1350                 ins_mode_after_head_else t
1351
1352         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1353         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1354                 for node, i in open_els
1355                         if node.name is name # FIXME check namespace too
1356                                 generate_implied_end_tags name # arg is exception
1357                                 parse_error() unless i is 0
1358                                 while i >= 0
1359                                         open_els.shift()
1360                                         i -= 1
1361                                 return
1362                         if special_elements[node.name]? # FIXME check namespac too
1363                                 parse_error()
1364                                 return
1365         ins_mode_in_body = (t) ->
1366                 switch t.type
1367                         when TYPE_TEXT
1368                                 switch t.text
1369                                         when "\u0000"
1370                                                 parse_error()
1371                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1372                                                 reconstruct_active_formatting_elements()
1373                                                 insert_character t
1374                                         else
1375                                                 reconstruct_active_formatting_elements()
1376                                                 insert_character t
1377                                                 flag_frameset_ok = false
1378                         when TYPE_COMMENT
1379                                 insert_comment t
1380                         when TYPE_DOCTYPE
1381                                 parse_error()
1382                         when TYPE_START_TAG
1383                                 switch t.name
1384                                         when 'html'
1385                                                 parse_error()
1386                                                 return if template_tag_is_open()
1387                                                 root_attrs = open_els[open_els.length - 1].attrs
1388                                                 for k, v of t.attrs
1389                                                         root_attrs[k] = v unless root_attrs[k]?
1390                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1391                                                 # FIXME also do this for </template> (end tag)
1392                                                 return ins_mode_in_head t
1393                                         when 'body'
1394                                                 parse_error()
1395                                                 # TODO
1396                                         when 'frameset'
1397                                                 parse_error()
1398                                                 # TODO
1399                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1400                                                 close_p_if_in_button_scope()
1401                                                 insert_html_element t
1402                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1403                                                 close_p_if_in_button_scope()
1404                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1405                                                         parse_error()
1406                                                         open_els.shift()
1407                                                 insert_html_element t
1408                                         # TODO lots more to implement here
1409                                         when 'a'
1410                                                 # If the list of active formatting elements
1411                                                 # contains an a element between the end of the list and
1412                                                 # the last marker on the list (or the start of the list
1413                                                 # if there is no marker on the list), then this is a
1414                                                 # parse error; run the adoption agency algorithm for
1415                                                 # the tag name "a", then remove that element from the
1416                                                 # list of active formatting elements and the stack of
1417                                                 # open elements if the adoption agency algorithm didn't
1418                                                 # already remove it (it might not have if the element
1419                                                 # is not in table scope).
1420                                                 found = false
1421                                                 for el in afe
1422                                                         if el.type is TYPE_AFE_MARKER
1423                                                                 break
1424                                                         if el.name is 'a'
1425                                                                 found = el
1426                                                 if found?
1427                                                         parse_error()
1428                                                         adoption_agency 'a'
1429                                                         for el, i in afe
1430                                                                 if el is found
1431                                                                         afe.splice i, 1
1432                                                         for el, i in open_els
1433                                                                 if el is found
1434                                                                         open_els.splice i, 1
1435                                                 reconstruct_active_formatting_elements()
1436                                                 el = insert_html_element t
1437                                                 afe_push el
1438                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1439                                                 reconstruct_active_formatting_elements()
1440                                                 el = insert_html_element t
1441                                                 afe_push el
1442                                         when 'table'
1443                                                 # fixfull quirksmode thing
1444                                                 close_p_if_in_button_scope()
1445                                                 insert_html_element t
1446                                                 insertion_mode = ins_mode_in_table
1447                                         # TODO lots more to implement here
1448                                         else # any other start tag
1449                                                 reconstruct_active_formatting_elements()
1450                                                 insert_html_element t
1451                         when TYPE_EOF
1452                                 ok_tags = {
1453                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1454                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1455                                 }
1456                                 for t in open_els
1457                                         unless ok_tags[t.name]?
1458                                                 parse_error()
1459                                                 break
1460                                 # TODO stack of template insertion modes thing
1461                                 flag_parsing = false # stop parsing
1462                         when TYPE_END_TAG
1463                                 switch t.name
1464                                         when 'body'
1465                                                 unless is_in_scope 'body'
1466                                                         parse_error()
1467                                                         return
1468                                                 # TODO implement parse error and move to tree_after_body
1469                                         when 'html'
1470                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1471                                                         parse_error()
1472                                                         return
1473                                                 # TODO implement parse error and move to tree_after_body, reprocess
1474                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1475                                                 unless is_in_scope t.name, NS_HTML
1476                                                         parse_error()
1477                                                         return
1478                                                 generate_implied_end_tags()
1479                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1480                                                         parse_error()
1481                                                 loop
1482                                                         el = open_els.shift()
1483                                                         if el.name is t.name and el.namespace is NS_HTML
1484                                                                 return
1485                                         # TODO lots more close tags to implement here
1486                                         when 'p'
1487                                                 unless is_in_button_scope 'p'
1488                                                         parse_error()
1489                                                         insert_html_element new_open_tag 'p'
1490                                                 close_p_element()
1491                                         # TODO lots more close tags to implement here
1492                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1493                                                 adoption_agency t.name
1494                                         # TODO lots more close tags to implement here
1495                                         else
1496                                                 in_body_any_other_end_tag t.name
1497                 return
1498
1499         ins_mode_in_table_else = (t) ->
1500                 parse_error()
1501                 flag_foster_parenting = true # FIXME
1502                 ins_mode_in_body t
1503                 flag_foster_parenting = false
1504         can_in_table = { # FIXME do this inline like everywhere else
1505                 'table': true
1506                 'tbody': true
1507                 'tfoot': true
1508                 'thead': true
1509                 'tr': true
1510         }
1511
1512         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1513         ins_mode_text = (t) ->
1514                 if t.type is TYPE_TEXT
1515                         insert_character t
1516                         return
1517                 if t.type is TYPE_EOF
1518                         parse_error()
1519                         if open_els[0].name is 'script'
1520                                 open_els[0].flag 'already started', true
1521                         open_els.shift()
1522                         insertion_mode = original_insertion_mode
1523                         insertion_mode t
1524                         return
1525                 if t.type is TYPE_END_TAG and t.name is 'script'
1526                         open_els.shift()
1527                         insertion_mode = original_insertion_mode
1528                         # fixfull the spec seems to assume that I'm going to run the script
1529                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1530                         return
1531                 if t.type is TYPE_END_TAG
1532                         open_els.shift()
1533                         insertion_mode = original_insertion_mode
1534                         return
1535                 console.log 'warning: end of ins_mode_text reached'
1536
1537         # the functions below implement the tokenizer states described here:
1538         # http://www.w3.org/TR/html5/syntax.html#tokenization
1539
1540         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1541         ins_mode_in_table = (t) ->
1542                 switch t.type
1543                         when TYPE_TEXT
1544                                 if can_in_table[t.name]
1545                                         original_insertion_mode = insertion_mode
1546                                         insertion_mode = ins_mode_in_table_text
1547                                         insertion_mode t
1548                                 else
1549                                         ins_mode_in_table_else t
1550                         when TYPE_COMMENT
1551                                 insert_comment t
1552                         when TYPE_DOCTYPE
1553                                 parse_error()
1554                         when TYPE_START_TAG
1555                                 switch t.name
1556                                         when 'caption'
1557                                                 clear_stack_to_table_context()
1558                                                 afe_push_marker()
1559                                                 insert_html_element t
1560                                                 insertion_mode = ins_mode_in_caption
1561                                         when 'colgroup'
1562                                                 clear_stack_to_table_context()
1563                                                 insert_html_element t
1564                                                 insertion_mode = ins_mode_in_column_group
1565                                         when 'col'
1566                                                 clear_stack_to_table_context()
1567                                                 insert_html_element new_open_tag 'colgroup'
1568                                                 insertion_mode = ins_mode_in_column_group
1569                                                 insertion_mode t
1570                                         when 'tbody', 'tfoot', 'thead'
1571                                                 clear_stack_to_table_context()
1572                                                 insert_html_element t
1573                                                 insertion_mode = ins_mode_in_table_body
1574                                         when 'td', 'th', 'tr'
1575                                                 clear_stack_to_table_context()
1576                                                 insert_html_element new_open_tag 'tbody'
1577                                                 insertion_mode = ins_mode_in_table_body
1578                                                 insertion_mode t
1579                                         when 'table'
1580                                                 parse_error()
1581                                                 if is_in_table_scope 'table'
1582                                                         loop
1583                                                                 el = open_els.shift()
1584                                                                 if el.name is 'table'
1585                                                                         break
1586                                                         reset_insertion_mode()
1587                                                         insertion_mode t
1588                                         when 'style', 'script', 'template'
1589                                                 ins_mode_in_head t
1590                                         when 'input'
1591                                                 if token_is_input_hidden t
1592                                                         ins_mode_in_table_else t
1593                                                 else
1594                                                         parse_error()
1595                                                         el = insert_html_element t
1596                                                         open_els.shift()
1597                                                         el.acknowledge_self_closing()
1598                                         when 'form'
1599                                                 parse_error()
1600                                                 if form_element_pointer?
1601                                                         return
1602                                                 if template_tag_is_open()
1603                                                         return
1604                                                 form_element_pointer = insert_html_element t
1605                                                 open_els.shift()
1606                                         else
1607                                                 ins_mode_in_table_else t
1608                         when TYPE_END_TAG
1609                                 switch t.name
1610                                         when 'table'
1611                                                 if is_in_table_scope 'table'
1612                                                         loop
1613                                                                 el = open_els.shift()
1614                                                                 if el.name is 'table'
1615                                                                         break
1616                                                         reset_insertion_mode()
1617                                                 else
1618                                                         parse_error
1619                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1620                                                 parse_error()
1621                                         when 'template'
1622                                                 ins_mode_in_head t
1623                                         else
1624                                                 ins_mode_in_table_else t
1625                         when TYPE_EOF
1626                                 ins_mode_in_body t
1627                         else
1628                                 ins_mode_in_table_else t
1629
1630
1631         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1632         ins_mode_in_table_text = (t) ->
1633                 if t.type is TYPE_TEXT and t.text is "\u0000"
1634                         # huh? I thought the tokenizer didn't emit these
1635                         parse_error()
1636                         return
1637                 if t.type is TYPE_TEXT
1638                         pending_table_character_tokens.push t
1639                         return
1640                 # Anything else
1641                 all_space = true
1642                 for old in pending_table_character_tokens
1643                         unless is_space_tok old
1644                                 all_space = false
1645                                 break
1646                 if all_space
1647                         for old in pending_table_character_tokens
1648                                 insert_character old
1649                 else
1650                         for old in pending_table_character_tokens
1651                                 ins_mode_table_else old
1652                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1653                 insertion_mode = original_insertion_mode
1654                 insertion_mode t
1655
1656         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1657         ins_mode_in_caption = (t) ->
1658                 if t.type is TYPE_END_TAG and t.name is 'caption'
1659                         if is_in_table_scope 'caption'
1660                                 generate_implied_end_tags()
1661                                 if open_els[0].name isnt 'caption'
1662                                         parse_error()
1663                                 loop
1664                                         el = open_els.shift()
1665                                         if el.name is 'caption'
1666                                                 break
1667                                 clear_afe_to_marker()
1668                                 insertion_mode = in_table
1669                         else
1670                                 parse_error()
1671                                 # fragment case
1672                         return
1673                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1674                         parse_error()
1675                         if is_in_table_scope 'caption'
1676                                 loop
1677                                         el = open_els.shift()
1678                                         if el.name is 'caption'
1679                                                 break
1680                                 clear_afe_to_marker()
1681                                 insertion_mode = in_table
1682                                 insertion_mode t
1683                         # else fragment case
1684                         return
1685                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1686                         parse_error()
1687                         return
1688                 # Anything else
1689                 ins_mode_in_body t
1690
1691         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1692         ins_mode_in_column_group = (t) ->
1693                 if is_space_tok t
1694                         insert_character t
1695                         return
1696                 if t.type is TYPE_COMMENT
1697                         insert_comment t
1698                         return
1699                 if t.type is TYPE_DOCTYPE
1700                         parse_error()
1701                         return
1702                 if t.type is TYPE_START_TAG and t.name is 'html'
1703                         ins_mode_in_body t
1704                         return
1705                 if t.type is TYPE_START_TAG and t.name is 'col'
1706                         el = insert_html_element t
1707                         open_els.shift()
1708                         el.acknowledge_self_closing()
1709                         return
1710                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1711                         if open_els[0].name is 'colgroup'
1712                                 open_els[0].shift()
1713                                 insertion_mode = ins_mode_in_table
1714                         else
1715                                 parse_error()
1716                         return
1717                 if t.type is TYPE_END_TAG and t.name is 'col'
1718                         parse_error()
1719                         return
1720                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1721                         ins_mode_in_head t
1722                         return
1723                 if t.type is TYPE_EOF
1724                         ins_mode_in_body t
1725                         return
1726                 # Anything else
1727                 if open_els[0].name isnt 'colgroup'
1728                         parse_error()
1729                         return
1730                 open_els.shift()
1731                 insertion_mode = ins_mode_in_table
1732                 insertion_mode t
1733                 return
1734
1735         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1736         ins_mode_in_table_body = (t) ->
1737                 if t.type is TYPE_START_TAG and t.name is 'tr'
1738                         clear_stack_to_table_body_context()
1739                         insert_html_element t
1740                         insertion_mode = ins_mode_in_row
1741                         return
1742                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1743                         parse_error()
1744                         clear_stack_to_table_body_context()
1745                         insert_html_element new_open_tag 'tr'
1746                         insertion_mode = ins_mode_in_row
1747                         insertion_mode t
1748                         return
1749                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1750                         unless is_in_table_scope t.name # fixfull check namespace
1751                                 parse_error()
1752                                 return
1753                         clear_stack_to_table_body_context()
1754                         open_els.shift()
1755                         insertion_mode = ins_mode_in_table
1756                         return
1757                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1758                         has = false
1759                         for el in open_els
1760                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1761                                         has = true
1762                                         break
1763                                 if table_scopers[el.name]
1764                                         break
1765                         if !has
1766                                 parse_error()
1767                                 return
1768                         clear_stack_to_table_body_context()
1769                         open_els.shift()
1770                         insertion_mode = ins_mode_in_table
1771                         insertion_mode t
1772                         return
1773                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1774                         parse_error()
1775                         return
1776                 # Anything else
1777                 ins_mode_in_table t
1778
1779         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1780         ins_mode_in_row = (t) ->
1781                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1782                         clear_stack_to_table_row_context()
1783                         insert_html_element t
1784                         insertion_mode = ins_mode_in_cell
1785                         afe_push_marker()
1786                         return
1787                 if t.type is TYPE_END_TAG and t.name is 'tr'
1788                         if is_in_table_scope 'tr'
1789                                 clear_stack_to_table_row_context()
1790                                 open_els.shift()
1791                                 insertion_mode = ins_mode_in_table_body
1792                         else
1793                                 parse_error()
1794                         return
1795                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1796                         if is_in_table_scope 'tr'
1797                                 clear_stack_to_table_row_context()
1798                                 open_els.shift()
1799                                 insertion_mode = ins_mode_in_table_body
1800                                 insertion_mode t
1801                         else
1802                                 parse_error()
1803                         return
1804                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1805                         if is_in_table_scope t.name # fixfull namespace
1806                                 if is_in_table_scope 'tr'
1807                                         clear_stack_to_table_row_context()
1808                                         open_els.shift()
1809                                         insertion_mode = ins_mode_in_table_body
1810                                         insertion_mode t
1811                         else
1812                                 parse_error()
1813                         return
1814                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1815                         parse_error()
1816                         return
1817                 # Anything else
1818                 ins_mode_in_table t
1819
1820         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1821         close_the_cell = ->
1822                 generate_implied_end_tags()
1823                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1824                         parse_error()
1825                 loop
1826                         el = open_els.shift()
1827                         if el.name is 'td' or el.name is 'th'
1828                                 break
1829                 clear_afe_to_marker()
1830                 insertion_mode = ins_mode_in_row
1831
1832         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1833         ins_mode_in_cell = (t) ->
1834                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1835                         if is_in_table_scope t.name
1836                                 generate_implied_end_tags()
1837                                 if open_els[0].name isnt t.name
1838                                         parse_error
1839                                 loop
1840                                         el = open_els.shift()
1841                                         if el.name is t.name
1842                                                 break
1843                                 clear_afe_to_marker()
1844                                 insertion_mode = ins_mode_in_row
1845                         else
1846                                 parse_error()
1847                         return
1848                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1849                         has = false
1850                         for el in open_els
1851                                 if el.name is 'td' or el.name is 'th'
1852                                         has = true
1853                                         break
1854                                 if table_scopers[el.name]
1855                                         break
1856                         if !has
1857                                 parse_error()
1858                                 return
1859                         close_the_cell()
1860                         insertion_mode t
1861                         return
1862                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1863                         parse_error()
1864                         return
1865                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1866                         if is_in_table_scope t.name # fixfull namespace
1867                                 close_the_cell()
1868                                 insertion_mode t
1869                         else
1870                                 parse_error()
1871                         return
1872                 # Anything Else
1873                 ins_mode_in_body t
1874
1875         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1876         ins_mode_in_select = (t) ->
1877                 if t.type is TYPE_TEXT and t.text is "\u0000"
1878                         parse_error()
1879                         return
1880                 if t.type is TYPE_TEXT
1881                         insert_character t
1882                         return
1883                 if t.type is TYPE_COMMENT
1884                         insert_comment t
1885                         return
1886                 if t.type is TYPE_DOCTYPE
1887                         parse_error()
1888                         return
1889                 if t.type is TYPE_START_TAG and t.name is 'html'
1890                         ins_mode_in_body t
1891                         return
1892                 if t.type is TYPE_START_TAG and t.name is 'option'
1893                         if open_els[0].name is 'option'
1894                                 open_els.shift()
1895                         insert_html_element t
1896                         return
1897                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1898                         if open_els[0].name is 'option'
1899                                 open_els.shift()
1900                         if open_els[0].name is 'optgroup'
1901                                 open_els.shift()
1902                         insert_html_element t
1903                         return
1904                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1905                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1906                                 open_els.shift()
1907                         if open_els[0].name is 'optgroup'
1908                                 open_els.shift()
1909                         else
1910                                 parse_error()
1911                         return
1912                 if t.type is TYPE_END_TAG and t.name is 'option'
1913                         if open_els[0].name is 'option'
1914                                 open_els.shift()
1915                         else
1916                                 parse_error()
1917                         return
1918                 if t.type is TYPE_END_TAG and t.name is 'select'
1919                         if is_in_select_scope 'select'
1920                                 loop
1921                                         el = open_els.shift()
1922                                         if el.name is 'select'
1923                                                 break
1924                                 reset_insertion_mode()
1925                         else
1926                                 parse_error()
1927                         return
1928                 if t.type is TYPE_START_TAG and t.name is 'select'
1929                         parse_error()
1930                         loop
1931                                 el = open_els.shift()
1932                                 if el.name is 'select'
1933                                         break
1934                         reset_insertion_mode()
1935                         # spec says that this is the same as </select> but it doesn't say
1936                         # to check scope first
1937                         return
1938                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1939                         parse_error()
1940                         if is_in_select_scope 'select'
1941                                 return
1942                         loop
1943                                 el = open_els.shift()
1944                                 if el.name is 'select'
1945                                         break
1946                         reset_insertion_mode()
1947                         insertion_mode t
1948                         return
1949                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1950                         ins_mode_in_head t
1951                         return
1952                 if t.type is TYPE_EOF
1953                         ins_mode_in_body t
1954                         return
1955                 # Anything else
1956                 parse_error()
1957                 return
1958
1959         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1960         ins_mode_in_select_in_table = (t) ->
1961                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1962                         parse_error()
1963                         loop
1964                                 el = open_els.shift()
1965                                 if el.name is 'select'
1966                                         break
1967                         reset_insertion_mode()
1968                         insertion_mode t
1969                         return
1970                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1971                         parse_error()
1972                         unless is_in_table_scope t.name, NS_HTML
1973                                 return
1974                         loop
1975                                 el = open_els.shift()
1976                                 if el.name is 'select'
1977                                         break
1978                         reset_insertion_mode()
1979                         insertion_mode t
1980                         return
1981                 # Anything else
1982                 ins_mode_in_select t
1983                 return
1984
1985         # CONTINUE more insertion modes!
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1999         tok_state_data = ->
2000                 switch c = txt.charAt(cur++)
2001                         when '&'
2002                                 return new_text_node parse_character_reference()
2003                         when '<'
2004                                 tok_state = tok_state_tag_open
2005                         when "\u0000"
2006                                 parse_error()
2007                                 return new_text_node c
2008                         when '' # EOF
2009                                 return new_eof_token()
2010                         else
2011                                 return new_text_node c
2012                 return null
2013
2014         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2015         # not needed: tok_state_character_reference_in_data = ->
2016         # just call parse_character_reference()
2017
2018         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2019         tok_state_rcdata = ->
2020                 switch c = txt.charAt(cur++)
2021                         when '&'
2022                                 return new_text_node parse_character_reference()
2023                         when '<'
2024                                 tok_state = tok_state_rcdata_less_than_sign
2025                         when "\u0000"
2026                                 parse_error()
2027                                 return new_character_token "\ufffd"
2028                         when '' # EOF
2029                                 return new_eof_token()
2030                         else
2031                                 return new_character_token c
2032                 return null
2033
2034         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2035         # not needed: tok_state_character_reference_in_rcdata = ->
2036         # just call parse_character_reference()
2037
2038         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2039         tok_state_rawtext = ->
2040                 switch c = txt.charAt(cur++)
2041                         when '<'
2042                                 tok_state = tok_state_rawtext_less_than_sign
2043                         when "\u0000"
2044                                 parse_error()
2045                                 return new_character_token "\ufffd"
2046                         when '' # EOF
2047                                 return new_eof_token()
2048                         else
2049                                 return new_character_token c
2050                 return null
2051
2052         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2053         tok_state_script_data = ->
2054                 switch c = txt.charAt(cur++)
2055                         when '<'
2056                                 tok_state = tok_state_script_data_less_than_sign
2057                         when "\u0000"
2058                                 parse_error()
2059                                 return new_character_token "\ufffd"
2060                         when '' # EOF
2061                                 return new_eof_token()
2062                         else
2063                                 return new_character_token c
2064                 return null
2065
2066         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2067         tok_state_plaintext = ->
2068                 switch c = txt.charAt(cur++)
2069                         when "\u0000"
2070                                 parse_error()
2071                                 return new_character_token "\ufffd"
2072                         when '' # EOF
2073                                 return new_eof_token()
2074                         else
2075                                 return new_character_token c
2076                 return null
2077
2078
2079         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2080         tok_state_tag_open = ->
2081                 switch c = txt.charAt(cur++)
2082                         when '!'
2083                                 tok_state = tok_state_markup_declaration_open
2084                         when '/'
2085                                 tok_state = tok_state_end_tag_open
2086                         when '?'
2087                                 parse_error()
2088                                 tok_state = tok_state_bogus_comment
2089                         else
2090                                 if lc_alpha.indexOf(c) > -1
2091                                         tok_cur_tag = new_open_tag c
2092                                         tok_state = tok_state_tag_name
2093                                 else if uc_alpha.indexOf(c) > -1
2094                                         tok_cur_tag = new_open_tag c.toLowerCase()
2095                                         tok_state = tok_state_tag_name
2096                                 else
2097                                         parse_error()
2098                                         tok_state = tok_state_data
2099                                         cur -= 1 # we didn't parse/handle the char after <
2100                                         return new_text_node '<'
2101                 return null
2102
2103         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2104         tok_state_end_tag_open = ->
2105                 switch c = txt.charAt(cur++)
2106                         when '>'
2107                                 parse_error()
2108                                 tok_state = tok_state_data
2109                         when '' # EOF
2110                                 parse_error()
2111                                 tok_state = tok_state_data
2112                                 return new_text_node '</'
2113                         else
2114                                 if uc_alpha.indexOf(c) > -1
2115                                         tok_cur_tag = new_end_tag c.toLowerCase()
2116                                         tok_state = tok_state_tag_name
2117                                 else if lc_alpha.indexOf(c) > -1
2118                                         tok_cur_tag = new_end_tag c
2119                                         tok_state = tok_state_tag_name
2120                                 else
2121                                         parse_error()
2122                                         tok_state = tok_state_bogus_comment
2123                 return null
2124
2125         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2126         tok_state_tag_name = ->
2127                 switch c = txt.charAt(cur++)
2128                         when "\t", "\n", "\u000c", ' '
2129                                 tok_state = tok_state_before_attribute_name
2130                         when '/'
2131                                 tok_state = tok_state_self_closing_start_tag
2132                         when '>'
2133                                 tok_state = tok_state_data
2134                                 tmp = tok_cur_tag
2135                                 tok_cur_tag = null
2136                                 return tmp
2137                         when "\u0000"
2138                                 parse_error()
2139                                 tok_cur_tag.name += "\ufffd"
2140                         when '' # EOF
2141                                 parse_error()
2142                                 tok_state = tok_state_data
2143                         else
2144                                 if uc_alpha.indexOf(c) > -1
2145                                         tok_cur_tag.name += c.toLowerCase()
2146                                 else
2147                                         tok_cur_tag.name += c
2148                 return null
2149
2150         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2151         tok_state_rcdata_less_than_sign = ->
2152                 c = txt.charAt(cur++)
2153                 if c is '/'
2154                         temporary_buffer = ''
2155                         tok_state = tok_state_rcdata_end_tag_open
2156                         return null
2157                 # Anything else
2158                 tok_state = tok_state_rcdata
2159                 cur -= 1 # reconsume the input character
2160                 return new_character_token '<'
2161
2162         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2163         tok_state_rcdata_end_tag_open = ->
2164                 c = txt.charAt(cur++)
2165                 if uc_alpha.indexOf(c) > -1
2166                         tok_cur_tag = new_end_tag c.toLowerCase()
2167                         temporary_buffer += c
2168                         tok_state = tok_state_rcdata_end_tag_name
2169                         return null
2170                 if lc_alpha.indexOf(c) > -1
2171                         tok_cur_tag = new_end_tag c
2172                         temporary_buffer += c
2173                         tok_state = tok_state_rcdata_end_tag_name
2174                         return null
2175                 # Anything else
2176                 tok_state = tok_state_rcdata
2177                 cur -= 1 # reconsume the input character
2178                 return new_character_token "</" # fixfull separate these
2179
2180         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2181         is_appropriate_end_tag = (t) ->
2182                 # spec says to check against "the tag name of the last start tag to
2183                 # have been emitted from this tokenizer", but this is only called from
2184                 # the various "raw" states, which I'm pretty sure all push the start
2185                 # token onto open_els. TODO: verify this after the script data states
2186                 # are implemented
2187                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2188                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2189
2190         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2191         tok_state_rcdata_end_tag_name = ->
2192                 c = txt.charAt(cur++)
2193                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2194                         if is_appropriate_end_tag tok_cur_tag
2195                                 tok_state = tok_state_before_attribute_name
2196                                 return
2197                         # else fall through to "Anything else"
2198                 if c is '/'
2199                         if is_appropriate_end_tag tok_cur_tag
2200                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2201                                 return
2202                         # else fall through to "Anything else"
2203                 if c is '>'
2204                         if is_appropriate_end_tag tok_cur_tag
2205                                 tok_state = tok_state_data
2206                                 return tok_cur_tag
2207                         # else fall through to "Anything else"
2208                 if uc_alpha.indexOf(c) > -1
2209                         tok_cur_tag.name += c.toLowerCase()
2210                         temporary_buffer += c
2211                         return null
2212                 if lc_alpha.indexOf(c) > -1
2213                         tok_cur_tag.name += c
2214                         temporary_buffer += c
2215                         return null
2216                 # Anything else
2217                 tok_state = tok_state_rcdata
2218                 cur -= 1 # reconsume the input character
2219                 return new_character_token '</' + temporary_buffer # fixfull separate these
2220
2221         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2222         tok_state_rawtext_less_than_sign = ->
2223                 c = txt.charAt(cur++)
2224                 if c is '/'
2225                         temporary_buffer = ''
2226                         tok_state = tok_state_rawtext_end_tag_open
2227                         return null
2228                 # Anything else
2229                 tok_state = tok_state_rawtext
2230                 cur -= 1 # reconsume the input character
2231                 return new_character_token '<'
2232
2233         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2234         tok_state_rawtext_end_tag_open = ->
2235                 c = txt.charAt(cur++)
2236                 if uc_alpha.indexOf(c) > -1
2237                         tok_cur_tag = new_end_tag c.toLowerCase()
2238                         temporary_buffer += c
2239                         tok_state = tok_state_rawtext_end_tag_name
2240                         return null
2241                 if lc_alpha.indexOf(c) > -1
2242                         tok_cur_tag = new_end_tag c
2243                         temporary_buffer += c
2244                         tok_state = tok_state_rawtext_end_tag_name
2245                         return null
2246                 # Anything else
2247                 tok_state = tok_state_rawtext
2248                 cur -= 1 # reconsume the input character
2249                 return new_character_token "</" # fixfull separate these
2250
2251         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2252         tok_state_rawtext_end_tag_name = ->
2253                 c = txt.charAt(cur++)
2254                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2255                         if is_appropriate_end_tag tok_cur_tag
2256                                 tok_state = tok_state_before_attribute_name
2257                                 return
2258                         # else fall through to "Anything else"
2259                 if c is '/'
2260                         if is_appropriate_end_tag tok_cur_tag
2261                                 tok_state = tok_state_self_closing_start_tag
2262                                 return
2263                         # else fall through to "Anything else"
2264                 if c is '>'
2265                         if is_appropriate_end_tag tok_cur_tag
2266                                 tok_state = tok_state_data
2267                                 return tok_cur_tag
2268                         # else fall through to "Anything else"
2269                 if uc_alpha.indexOf(c) > -1
2270                         tok_cur_tag.name += c.toLowerCase()
2271                         temporary_buffer += c
2272                         return null
2273                 if lc_alpha.indexOf(c) > -1
2274                         tok_cur_tag.name += c
2275                         temporary_buffer += c
2276                         return null
2277                 # Anything else
2278                 tok_state = tok_state_rawtext
2279                 cur -= 1 # reconsume the input character
2280                 return new_character_token '</' + temporary_buffer # fixfull separate these
2281
2282         # TODO _all_ of the missing states here (17-33) are for parsing script tags
2283
2284         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2285         tok_state_before_attribute_name = ->
2286                 attr_name = null
2287                 switch c = txt.charAt(cur++)
2288                         when "\t", "\n", "\u000c", ' '
2289                                 return null
2290                         when '/'
2291                                 tok_state = tok_state_self_closing_start_tag
2292                                 return null
2293                         when '>'
2294                                 tok_state = tok_state_data
2295                                 tmp = tok_cur_tag
2296                                 tok_cur_tag = null
2297                                 return tmp
2298                         when "\u0000"
2299                                 parse_error()
2300                                 attr_name = "\ufffd"
2301                         when '"', "'", '<', '='
2302                                 parse_error()
2303                                 attr_name = c
2304                         when '' # EOF
2305                                 parse_error()
2306                                 tok_state = tok_state_data
2307                         else
2308                                 if uc_alpha.indexOf(c) > -1
2309                                         attr_name = c.toLowerCase()
2310                                 else
2311                                         attr_name = c
2312                 if attr_name?
2313                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2314                         tok_state = tok_state_attribute_name
2315                 return null
2316
2317         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2318         tok_state_attribute_name = ->
2319                 switch c = txt.charAt(cur++)
2320                         when "\t", "\n", "\u000c", ' '
2321                                 tok_state = tok_state_after_attribute_name
2322                         when '/'
2323                                 tok_state = tok_state_self_closing_start_tag
2324                         when '='
2325                                 tok_state = tok_state_before_attribute_value
2326                         when '>'
2327                                 tok_state = tok_state_data
2328                                 tmp = tok_cur_tag
2329                                 tok_cur_tag = null
2330                                 return tmp
2331                         when "\u0000"
2332                                 parse_error()
2333                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2334                         when '"', "'", '<'
2335                                 parse_error()
2336                                 tok_cur_tag.attrs_a[0][0] = c
2337                         when '' # EOF
2338                                 parse_error()
2339                                 tok_state = tok_state_data
2340                         else
2341                                 if uc_alpha.indexOf(c) > -1
2342                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2343                                 else
2344                                         tok_cur_tag.attrs_a[0][0] += c
2345                 return null
2346
2347         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2348         tok_state_after_attribute_name = ->
2349                 c = txt.charAt(cur++)
2350                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2351                         return
2352                 if c is '/'
2353                         tok_state = tok_state_self_closing_start_tag
2354                         return
2355                 if c is '='
2356                         tok_state = tok_state_before_attribute_value
2357                         return
2358                 if c is '>'
2359                         tok_state = tok_state_data
2360                         return
2361                 if uc_alpha.indexOf(c) > -1
2362                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2363                         tok_state = tok_state_attribute_name
2364                         return
2365                 if c is "\u0000"
2366                         parse_error()
2367                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2368                         tok_state = tok_state_attribute_name
2369                         return
2370                 if c is '' # EOF
2371                         parse_error()
2372                         tok_state = tok_state_data
2373                         cur -= 1 # reconsume
2374                         return
2375                 if c is '"' or c is "'" or c is '<'
2376                         parse_error()
2377                         # fall through to Anything else
2378                 # Anything else
2379                 tok_cur_tag.attrs_a.unshift [c, '']
2380                 tok_state = tok_state_attribute_name
2381
2382         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2383         tok_state_before_attribute_value = ->
2384                 switch c = txt.charAt(cur++)
2385                         when "\t", "\n", "\u000c", ' '
2386                                 return null
2387                         when '"'
2388                                 tok_state = tok_state_attribute_value_double_quoted
2389                         when '&'
2390                                 tok_state = tok_state_attribute_value_unquoted
2391                                 cur -= 1
2392                         when "'"
2393                                 tok_state = tok_state_attribute_value_single_quoted
2394                         when "\u0000"
2395                                 # Parse error
2396                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2397                                 tok_state = tok_state_attribute_value_unquoted
2398                         when '>'
2399                                 # Parse error
2400                                 tok_state = tok_state_data
2401                                 tmp = tok_cur_tag
2402                                 tok_cur_tag = null
2403                                 return tmp
2404                         when '' # EOF
2405                                 parse_error()
2406                                 tok_state = tok_state_data
2407                         else
2408                                 tok_cur_tag.attrs_a[0][1] += c
2409                                 tok_state = tok_state_attribute_value_unquoted
2410                 return null
2411
2412         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2413         tok_state_attribute_value_double_quoted = ->
2414                 switch c = txt.charAt(cur++)
2415                         when '"'
2416                                 tok_state = tok_state_after_attribute_value_quoted
2417                         when '&'
2418                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2419                         when "\u0000"
2420                                 # Parse error
2421                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2422                         when '' # EOF
2423                                 parse_error()
2424                                 tok_state = tok_state_data
2425                         else
2426                                 tok_cur_tag.attrs_a[0][1] += c
2427                 return null
2428
2429         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2430         tok_state_attribute_value_single_quoted = ->
2431                 switch c = txt.charAt(cur++)
2432                         when "'"
2433                                 tok_state = tok_state_after_attribute_value_quoted
2434                         when '&'
2435                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2436                         when "\u0000"
2437                                 # Parse error
2438                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2439                         when '' # EOF
2440                                 parse_error()
2441                                 tok_state = tok_state_data
2442                         else
2443                                 tok_cur_tag.attrs_a[0][1] += c
2444                 return null
2445
2446         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2447         tok_state_attribute_value_unquoted = ->
2448                 switch c = txt.charAt(cur++)
2449                         when "\t", "\n", "\u000c", ' '
2450                                 tok_state = tok_state_before_attribute_name
2451                         when '&'
2452                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2453                         when '>'
2454                                 tok_state = tok_state_data
2455                                 tmp = tok_cur_tag
2456                                 tok_cur_tag = null
2457                                 return tmp
2458                         when "\u0000"
2459                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2460                         when '' # EOF
2461                                 parse_error()
2462                                 tok_state = tok_state_data
2463                         else
2464                                 # Parse Error if ', <, = or ` (backtick)
2465                                 tok_cur_tag.attrs_a[0][1] += c
2466                 return null
2467
2468         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2469         tok_state_after_attribute_value_quoted = ->
2470                 switch c = txt.charAt(cur++)
2471                         when "\t", "\n", "\u000c", ' '
2472                                 tok_state = tok_state_before_attribute_name
2473                         when '/'
2474                                 tok_state = tok_state_self_closing_start_tag
2475                         when '>'
2476                                 tok_state = tok_state_data
2477                                 tmp = tok_cur_tag
2478                                 tok_cur_tag = null
2479                                 return tmp
2480                         when '' # EOF
2481                                 parse_error()
2482                                 tok_state = tok_state_data
2483                         else
2484                                 # Parse Error
2485                                 tok_state = tok_state_before_attribute_name
2486                                 cur -= 1 # we didn't handle that char
2487                 return null
2488
2489         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2490         # Don't set this as a state, just call it
2491         # returns a string (NOT a text node)
2492         parse_character_reference = (allowed_char = null, in_attr = false) ->
2493                 if cur >= txt.length
2494                         return '&'
2495                 switch c = txt.charAt(cur)
2496                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2497                                 # explicitly not a parse error
2498                                 return '&'
2499                         when ';'
2500                                 # there has to be "one or more" alnums between & and ; to be a parse error
2501                                 return '&'
2502                         when '#'
2503                                 if cur + 1 >= txt.length
2504                                         return '&'
2505                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
2506                                         prefix = '#x'
2507                                         charset = hex_chars
2508                                         start = cur + 2
2509                                 else
2510                                         charset = digits
2511                                         start = cur + 1
2512                                         prefix = '#'
2513                                 i = 0
2514                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2515                                         i += 1
2516                                 if i is 0
2517                                         return '&'
2518                                 if txt.charAt(start + i) is ';'
2519                                         i += 1
2520                                 # FIXME This is supposed to generate parse errors for some chars
2521                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2522                                 if decoded?
2523                                         cur = start + i
2524                                         return decoded
2525                                 return '&'
2526                         else
2527                                 for i in [0...31]
2528                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
2529                                                 break
2530                                 if i is 0
2531                                         # exit early, because parse_error() below needs at least one alnum
2532                                         return '&'
2533                                 if txt.charAt(cur + i) is ';'
2534                                         i += 1 # include ';' terminator in value
2535                                         decoded = decode_named_char_ref txt.substr(cur, i)
2536                                         if decoded?
2537                                                 cur += i
2538                                                 return decoded
2539                                         parse_error()
2540                                         return '&'
2541                                 else
2542                                         # no ';' terminator (only legacy char refs)
2543                                         max = i
2544                                         for i in [2..max] # no prefix matches, so ok to check shortest first
2545                                                 c = legacy_char_refs[txt.substr(cur, i)]
2546                                                 if c?
2547                                                         if in_attr
2548                                                                 if txt.charAt(cur + i) is '='
2549                                                                         # "because some legacy user agents will
2550                                                                         # misinterpret the markup in those cases"
2551                                                                         parse_error()
2552                                                                         return '&'
2553                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
2554                                                                         # this makes attributes forgiving about url args
2555                                                                         return '&'
2556                                                         # ok, and besides the weird exceptions for attributes...
2557                                                         # return the matching char
2558                                                         cur += i # consume entity chars
2559                                                         parse_error() # because no terminating ";"
2560                                                         return c
2561                                         parse_error()
2562                                         return '&'
2563                 return # never reached
2564
2565         # tree constructor initialization
2566         # see comments on TYPE_TAG/etc for the structure of this data
        # Synthetic root element: it is never returned itself, only its
        # children are (see the return at the bottom).
        doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
        # stack of open elements; starts out holding just the root
        open_els = [doc]
        afe = [] # active formatting elements
        template_insertion_modes = []
        # insertion_mode holds the function that receives each token
        insertion_mode = ins_mode_initial
        original_insertion_mode = insertion_mode # TODO check spec
        flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
        flag_frameset_ok = true
        # the main loop below runs for as long as this stays true
        flag_parsing = true
        flag_foster_parenting = false
        form_element_pointer = null
        # scratch text shared by the tokenizer states (e.g. the RCDATA/RAWTEXT end tag states)
        temporary_buffer = null
        pending_table_character_tokens = []
        head_element_pointer = null

        # tokenizer initialization
        # tok_state always holds the function implementing the current tokenizer state
        tok_state = tok_state_data

        # process input
        # Each call of the current state function either returns a token, which
        # is passed to the current insertion mode, or null/undefined when there
        # is nothing to emit yet. The loop ends once flag_parsing is cleared
        # (presumably by the tree construction code on EOF; not visible here).
        while flag_parsing
                t = tok_state()
                if t?
                        insertion_mode t
        return doc.children
2591
2592 # everything below is tests on the above
test_equals = (description, output, expected_output) ->
        # Logs whether output is identical to expected_output. On a mismatch
        # the description and both values are printed.
        if output isnt expected_output
                console.log "FAILED: \"#{description}\""
                console.log "   Expected: #{expected_output}"
                console.log "     Actual: #{output}"
                return
        console.log "passed." # don't say name, so smart consoles can merge all of these
serialize_els = (els, shallow, show_ids) ->
        # Serializes every element of els via its serialize(shallow, show_ids)
        # method and joins the results with commas.
        serialized = for t in els
                '' + t.serialize(shallow, show_ids)
        return serialized.join ','
test_parser = (args) ->
        # Parses args.html and compares the serialized result against
        # args.expected wrapped in the html/head/body structure. On a mismatch
        # the debug log, input, both serializations and any parse errors are
        # printed.
        debug_log_reset()
        parse_errors = []
        errors_cb = (i) ->
                parse_errors.push i
        prev_node_id = 0 # reset counter
        parsed = parse_html args.html, errors_cb
        serialized = serialize_els parsed, false, false
        expected = 'tag:"html",{},[tag:"head",{},[],tag:"body",{},[' + args.expected + ']]'
        if serialized is expected
                console.log "passed \"#{args.name}\""
                return
        # failure: dump everything that helps with debugging
        debug_log_each (str) ->
                console.log str
        console.log "FAILED: \"#{args.name}\""
        console.log "      Input: #{args.html}"
        console.log "    Correct: #{expected}"
        console.log "     Output: #{serialized}"
        unless parse_errors.length > 0
                console.log "   No parse errors"
                return
        console.log " parse errs: #{JSON.stringify parse_errors}"
2630
# Tokenizer-level tests: plain text, character references and attributes.
# `expected` is only the body content; test_parser wraps it in the
# html/head/body structure before comparing.
test_parser name: "empty", \
        html: "",
        expected: ''
test_parser name: "just text", \
        html: "abc",
        expected: 'text:"abc"'
test_parser name: "named entity", \
        html: "a&amp;1234",
        expected: 'text:"a&1234"'
# unterminated legacy reference (&amp2), bare '&', and an unknown name (&aabbcc;)
test_parser name: "broken named character references", \
        html: "1&amp2&&amp;3&aabbcc;",
        expected: 'text:"1&2&&3&aabbcc;"'
# numeric references, with and without the ';' terminator
test_parser name: "numbered entity overrides", \
        html: "1&#X80&#x80; &#x83",
        expected: 'text:"1€€ ƒ"'
test_parser name: "open tag", \
        html: "foo<span>bar",
        expected: 'text:"foo",tag:"span",{},[text:"bar"]'
test_parser name: "open tag with attributes", \
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
        expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
# double-quoted, unquoted, single-quoted and value-less attributes in one tag
test_parser name: "open tag with attributes of various quotings", \
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
        expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
# Inside attribute values an unterminated legacy reference followed by '=' or
# an alphanumeric stays literal ("&amp=", "&ampo"); "&amp;" is still decoded.
# The same input is checked for all three quoting styles.
test_parser name: "attribute entity exceptions dq", \
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser name: "attribute entity exceptions sq", \
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser name: "attribute entity exceptions uq", \
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
# Tree construction tests: properly nested, unclosed and misnested tags.
test_parser name: "matching closing tags", \
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
        expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
test_parser name: "missing closing tag inside", \
        html: "foo<div>bar<span>baz</div>qux",
        expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
test_parser name: "mis-matched closing tags", \
        html: "<span>12<div>34</span>56</div>78",
        expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
# Misnested formatting elements: the expected trees show the still-open
# formatting element (<i>, <b>) being re-created after the misnested end tag.
test_parser name: "mis-matched formatting elements", \
        html: "12<b>34<i>56</b>78</i>90",
        expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
        html: '<p>1<b>2<i>3</b>4</i>5</p>',
        expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
        html: '<b>1<p>2</b>3</p>',
        expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
test_parser name: "crazy formatting elements test", \
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
        # firefox does this:
        expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
2687 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
# Each test_parser call below is given three things: a test name, an html input
# string, and the expected result. The expected result is written in a compact
# notation: tag:"name",{attributes},[children] for elements and text:"..." for
# text nodes, with sibling nodes separated by commas.
# NOTE(review): test_parser is defined outside this section; presumably it
# parses `html` and compares the serialized tree against `expected` -- confirm.
# NOTE(review): "aaa" in the test names presumably stands for the adoption
# agency algorithm (the source file is adoption01.dat) -- confirm.
2688 test_parser name: "html5lib aaa 1", \
2689         html: '<a><p></a></p>',
2690         expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
2691 test_parser name: "html5lib aaa 2", \
2692         html: '<a>1<p>2</a>3</p>',
2693         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
2694 test_parser name: "html5lib aaa 3", \
2695         html: '<a>1<button>2</a>3</button>',
2696         expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
2697 test_parser name: "html5lib aaa 4", \
2698         html: '<a>1<b>2</a>3</b>',
2699         expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
2700 test_parser name: "html5lib aaa 5 (two divs deep)", \
2701         html: '<a>1<div>2<div>3</a>4</div>5</div>',
2702         expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
2703 test_parser name: "html5lib aaa 6 (foster parenting)", \
2704         html: '<table><a>1<p>2</a>3</p>',
2705         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
2706 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2707         html: '<b><b><a><p></a>',
2708         expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2709 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2710         html: '<b><a><b><p></a>',
2711         expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2712 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2713         html: '<a><b><b><p></a>',
2714         expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2715 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2716         html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2717         expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
2718 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2719         html: '<table><a>1<td>2</td>3</table>',
2720         expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
2721 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2722         html: '<table>A<td>B</td>C</table>',
2723         expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2724 # TODO implement svg and namespacing
2725 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2726 #       html: '<a><svg><tr><input></a>',
2727 #       expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
# NOTE(review): in the disabled test above, svg:"input" is not followed by
# ,{},[] unlike every other node in these expected strings -- check the expected
# string when enabling this test.
2728 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2729         html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2730         expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2731 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2732         html: '<div><a><b><u><i><code><div></a>',
2733         expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2734 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2735         html: '<b><b><b><b>x</b></b></b></b>y',
2736         expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
# In the next test four <b> elements are open when the second <p> starts, but
# only three are expected to be recreated inside it.
2737 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2738         html: '<p><b><b><b><b><p>x',
2739         expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
# Same shape as aaa 17, but every <b> carries the attributes c="d" and e="f",
# written in differing orders in the input. The expected output lists them as c
# then e on every element, and again only three <b> are recreated.
2740 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2741         html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2742         expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
# The comma right after the closing quote of c="d" is expected to produce an
# attribute named "," with an empty value.
2743 test_parser name: "junk after attribute close-quote", \
2744         html: '<p><b c="d", e="f">foo<p>x',
2745         expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
# NOTE(review): the "aaa02" names suggest these two cases come from
# html5lib-tests' adoption02.dat -- confirm and cite the source file here.
# Expected strings use the notation tag:"name",{attributes},[children] for
# elements and text:"..." for text nodes.
2746 test_parser name: "html5lib aaa02 1", \
2747         html: '<b>1<i>2<p>3</b>4',
2748         expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
2749 test_parser name: "html5lib aaa02 2", \
2750         html: '<a><div><style></style><address><a>',
2751         expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
# Table parsing tests.
# NOTE(review): the "html5lib tables N" names suggest these cases come from
# html5lib-tests' tables01.dat -- confirm and cite the source file here.
# Expected strings use the notation tag:"name",{attributes},[children] for
# elements and text:"..." for text nodes.
# Tests 1-3: a cell or col directly inside <table> is expected to end up inside
# tbody/tr (for th and td) or colgroup (for col), which the input never wrote.
2752 test_parser name: "html5lib tables 1", \
2753         html: '<table><th>',
2754         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
2755 test_parser name: "html5lib tables 2", \
2756         html: '<table><td>',
2757         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
2758 test_parser name: "html5lib tables 3", \
2759         html: "<table><col foo='bar'>",
2760         expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
# Here the text after the stray </html> is expected to land before the table,
# not inside the colgroup.
2761 test_parser name: "html5lib tables 4", \
2762         html: '<table><colgroup></html>foo',
2763         expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
2764 test_parser name: "html5lib tables 5", \
2765         html: '<table></table><p>foo',
2766         expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
# The run of unmatched end tags is expected to leave no trace in the output.
2767 test_parser name: "html5lib tables 6", \
2768         html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
2769         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
# Tests 7-10: a <select> written directly inside <table> is expected to be
# placed before the table as a sibling, not inside it.
2770 test_parser name: "html5lib tables 7", \
2771         html: '<table><select><option>3</select></table>',
2772         expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
2773 test_parser name: "html5lib tables 8", \
2774         html: '<table><select><table></table></select></table>',
2775         expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
2776 test_parser name: "html5lib tables 9", \
2777         html: '<table><select></table>',
2778         expected: 'tag:"select",{},[],tag:"table",{},[]'
2779 test_parser name: "html5lib tables 10", \
2780         html: '<table><select><option>A<tr><td>B</td></tr></table>',
2781         expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
# Inside an open <td> the stray end tags are expected to be dropped, with the
# following text staying in the cell.
2782 test_parser name: "html5lib tables 11", \
2783         html: '<table><td></body></caption></col></colgroup></html>foo',
2784         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2785 test_parser name: "html5lib tables 12", \
2786         html: '<table><td>A</table>B',
2787         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
# A <caption> seen while a <tr> is open is expected to become a child of the
# table, placed after the tbody.
2788 test_parser name: "html5lib tables 13", \
2789         html: '<table><tr><caption>',
2790         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
2791 test_parser name: "html5lib tables 14", \
2792         html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
2793         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2794 test_parser name: "html5lib tables 15", \
2795         html: '<table><td><tr>',
2796         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
# A second <td> opened while a <button> is open inside the first cell is
# expected to become a sibling cell, leaving the button in the first cell.
2797 test_parser name: "html5lib tables 16", \
2798         html: '<table><td><button><td>',
2799         expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
2800 # TODO implement svg parsing
2801 #test_parser name: "html5lib tables 17", \
2802 #       html: '<table><tr><td><svg><desc><td>',
2803 #       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'