parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
  22 # indicate places where additional code is needed for full HTML document
  23 # parsing.
  24 #
  25 # Instead, the data structure produced by this parser is an array of Nodes.
  26
  27
  28 # stacks/lists
  29 #
   30 # the spec uses many different words to indicate which ends of lists/stacks
   31 # they are talking about (and relative movement within the lists/stacks). This
   32 # section explains. I'm implementing "lists" (afe and open_els) the same way
  33 # (both as stacks)
  34 #
  35 # stacks grow downward (current element is index=0)
  36 #
  37 # example: open_els = [a, b, c, d, e, f, g]
  38 #
  39 # "grows downwards" means it's visualized like this: (index: el, names)
  40 #
  41 #   6: g "start of the list", "topmost", "first"
  42 #   5: f
  43 #   4: e "previous" (to d), "above", "before"
  44 #   3: d   (previous/next are relative to this element)
  45 #   2: c "next", "after", "lower", "below"
  46 #   1: b
  47 #   0: a "end of the list", "current node", "bottommost", "last"
  48
  49
  50
   51 # Each node is an object of the Node class. Here are the Node types:
   52 TYPE_TAG = 0 # name, {attributes}, [children]
   53 TYPE_TEXT = 1 # "text"
   54 TYPE_COMMENT = 2 # "text" (Node.serialize prints the node's text for comments)
   55 TYPE_DOCTYPE = 3 # serialized as the bare word "doctype" for now
   56 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
   57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
   58 TYPE_END_TAG = 5 # name
   59 TYPE_EOF = 6
   60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
   61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
   62
   63 # namespace constants (stored in Node.namespace; NS_HTML is the default there)
   64 NS_HTML = 1
   65 NS_MATHML = 2
   66 NS_SVG = 3
  67
  68 g_debug_log = []
  69 debug_log_reset = ->
  70         g_debug_log = []
  71 debug_log = (str) ->
  72         g_debug_log.push str
  73 debug_log_each = (cb) ->
  74         for str in g_debug_log
  75                 cb str
  76
  77 prev_node_id = 0
  78 class Node
  79         constructor: (type, args = {}) ->
  80                 @type = type # one of the TYPE_* constants above
  81                 @name = args.name ? '' # tag name
  82                 @text = args.text ? '' # contents for text/comment nodes
  83                 @attrs = args.attrs ? {}
  84                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
  85                 @children = args.children ? []
  86                 @namespace = args.namespace ? NS_HTML
  87                 @parent = args.parent ? null
  88                 if args.id?
  89                         @id = "#{args.id}+"
  90                 else
  91                         @id = "#{++prev_node_id}"
  92         shallow_clone: -> # return a new node that's the same except without the children or parent
  93                 # WARNING this doesn't work right on open tags that are still being parsed
  94                 attrs = {}
  95                 attrs[k] = v for k, v of @attrs
  96                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
  97         acknowledge_self_closing: ->
  98                 @flag 'did_self_close', true
  99         flag: ->
 100                 # fixfull
 101         serialize: (shallow = false, show_ids = false) -> # for unit tests
 102                 ret = ''
 103                 switch @type
 104                         when TYPE_TAG
 105                                 ret += 'tag:'
 106                                 ret += JSON.stringify @name
 107                                 ret += ','
 108                                 if show_ids
 109                                         ret += "##{@id},"
 110                                 if shallow
 111                                         break
 112                                 attr_keys = []
 113                                 for k of @attrs
 114                                         attr_keys.push k
 115                                 attr_keys.sort()
 116                                 ret += '{'
 117                                 sep = ''
 118                                 for k in attr_keys
 119                                         ret += sep
 120                                         sep = ','
 121                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
 122                                 ret += '},['
 123                                 sep = ''
 124                                 for c in @children
 125                                         ret += sep
 126                                         sep = ','
 127                                         ret += c.serialize shallow, show_ids
 128                                 ret += ']'
 129                         when TYPE_TEXT
 130                                 ret += 'text:'
 131                                 ret += JSON.stringify @text
 132                         when TYPE_COMMENT
 133                                 ret += 'comment:'
 134                                 ret += JSON.stringify @text
 135                         when TYPE_DOCTYPE
 136                                 ret += 'doctype'
 137                                 # FIXME
 138                         when TYPE_AFE_MARKER
 139                                 ret += 'marker'
 140                         when TYPE_AAA_BOOKMARK
 141                                 ret += 'aaa_bookmark'
 142                         else
 143                                 ret += 'unknown:'
 144                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 145                 return ret
 146
 147 # helpers: (only take args that are normally known when parser creates nodes)
 148 new_open_tag = (name) ->
 149         return new Node TYPE_START_TAG, name: name
 150 new_end_tag = (name) ->
 151         return new Node TYPE_END_TAG, name: name
 152 new_element = (name) ->
 153         return new Node TYPE_TAG, name: name
 154 new_text_node = (txt) ->
 155         return new Node TYPE_TEXT, text: txt
 156 new_character_token = new_text_node
 157 new_comment_node = (txt) ->
 158         return new Node TYPE_COMMENT, text: txt
 159 new_eof_token = ->
 160         return new Node TYPE_EOF
 161 new_afe_marker = ->
 162         return new Node TYPE_AFE_MARKER
 163 new_aaa_bookmark = ->
 164         return new Node TYPE_AAA_BOOKMARK
 165
  166 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
  167 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  168 digits = "0123456789"
  169 alnum = lc_alpha + uc_alpha + digits # every ASCII letter and digit
  170 hex_chars = digits + "abcdefABCDEF" # hexadecimal digits in either case
  171
  172 # some SVG elements have dashes in them
  173 tag_name_chars = alnum + "-"
  174
  175 # http://www.w3.org/TR/html5/infrastructure.html#space-character
      # (tab, line feed, form feed, carriage return, space)
  176 space_chars = "\u0009\u000a\u000c\u000d\u0020"
  177
  178 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
      # (a superset of space_chars: also vertical tab, NEL, no-break space and the
      # other Unicode space separators listed below)
  179 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 180
 181 # These are the character references that don't need a terminating semicolon
 182 # min length: 2, max: 6, none are a prefix of any other.
 183 legacy_char_refs = {
 184         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 185         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 186         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 187         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 188         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 189         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 190         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 191         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 192         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 193         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 194         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 195         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 196         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 197         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 198         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 199         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 200         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 201         yen: '¥', yuml: 'ÿ'
 202 }
 203
 204 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 205 raw_text_elements = ['script', 'style']
 206 escapable_raw_text_elements = ['textarea', 'title']
 207 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 208 svg_elements = [
 209         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 210         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 211         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 212         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 213         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 214         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 215         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 216         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 217         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 218         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 219         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 220         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 221         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 222         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 223         'view', 'vkern'
 224 ]
 225
 226 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 227 mathml_elements = [
 228         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 229         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 230         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 231         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 232         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 233         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 234         'determinant', 'diff', 'divergence', 'divide', 'domain',
 235         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 236         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 237         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 238         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 239         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 240         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 241         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 242         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 243         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 244         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 245         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 246         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 247         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 248         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 249         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 250         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 251         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 252         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 253         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 254         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 255         'vectorproduct', 'xor'
 256 ]
 257 # foreign_elements = [svg_elements..., mathml_elements...]
 258 #normal_elements = All other allowed HTML elements are normal elements.
 259
 260 special_elements = {
 261         # HTML:
 262         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 263         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 264         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 265         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 266         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 267         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 268         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 269         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 270         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 271         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 272         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
 273         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
 274         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
 275         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
 276         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
 277         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
 278         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
 279         wbr:NS_HTML, xmp:NS_HTML,
 280
 281         # MathML:
 282         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 283         'annotation-xml':NS_MATHML,
 284
 285         # SVG:
 286         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 287 }
 288
 289 formatting_elements = {
 290          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 291          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 292          u: true
 293 }
 294
 295 foster_parenting_targets = {
 296         table: true
 297         tbody: true
 298         tfoot: true
 299         thead: true
 300         tr: true
 301 }
 302
 303 # all html I presume
 304 end_tag_implied = {
 305         dd: true
 306         dt: true
 307         li: true
 308         option: true
 309         optgroup: true
 310         p: true
 311         rb: true
 312         rp: true
 313         rt: true
 314         rtc: true
 315 }
 316
 317 el_is_special = (e) ->
 318         return special_elements[e.name] is e.namespace
 319
 320 # decode_named_char_ref()
 321 #
 322 # The list of named character references is _huge_ so ask the browser to decode
 323 # for us instead of wasting bandwidth/space on including the table here.
 324 #
 325 # Pass without the "&" but with the ";" examples:
 326 #    for "&amp" pass "amp;"
 327 #    for "&#x2032" pass "x2032;"
 328 g_dncr = {
 329         cache: {}
 330         textarea: document.createElement('textarea')
 331 }
 332 # TODO test this in IE8
 333 decode_named_char_ref = (txt) ->
 334         txt = "&#{txt}"
 335         decoded = g_dncr.cache[txt]
 336         return decoded if decoded?
 337         g_dncr.textarea.innerHTML = txt
 338         decoded = g_dncr.textarea.value
 339         return null if decoded is txt
 340         return g_dncr.cache[txt] = decoded
 341
 342 parse_html = (txt, parse_error_cb = null) ->
 343         cur = 0 # index of next char in txt to be parsed
 344         # declare tree and tokenizer variables so they're in scope below
 345         tree = null
 346         open_els = null # stack of open elements
 347         afe = null # active formatting elements
 348         template_insertion_modes = null
 349         insertion_mode = null
 350         original_insertion_mode = null
 351         tok_state = null
 352         tok_cur_tag = null # partially parsed tag
 353         flag_scripting = null
 354         flag_frameset_ok = null
 355         flag_parsing = null
 356         flag_foster_parenting = null
 357         form_element_pointer = null
 358         temporary_buffer = null
 359         pending_table_character_tokens = null
 360
 361         parse_error = ->
 362                 if parse_error_cb?
 363                         parse_error_cb cur
 364                 else
 365                         console.log "Parse error at character #{cur} of #{txt.length}"
 366
 367         afe_push = (new_el) ->
 368                 matches = 0
 369                 for el, i in afe
 370                         if el.name is new_el.name and el.namespace is new_el.namespace
 371                                 for k, v of el.attrs
 372                                         continue unless new_el.attrs[k] is v
 373                                 for k, v of new_el.attrs
 374                                         continue unless el.attrs[k] is v
 375                                 matches += 1
 376                                 if matches is 3
 377                                         afe.splice i, 1
 378                                         break
 379                 afe.unshift new_el
 380         afe_push_marker = ->
 381                 afe.unshift new_afe_marker()
 382
  383         # the functions below implement the Tree Construction algorithm
 384         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 385
 386         # But first... the helpers
 387         template_tag_is_open = ->
 388                 for t in open_els
 389                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
 390                                 return true
 391                 return false
 392         is_in_scope_x = (tag_name, scope, namespace) ->
 393                 for t in open_els
 394                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 395                                 return true
 396                         if scope[t.name] is t.namespace
 397                                 return false
 398                 return false
 399         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 400                 for t in open_els
 401                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 402                                 return true
 403                         if scope[t.name] is t.namespace
 404                                 return false
 405                         if scope2[t.name] is t.namespace
 406                                 return false
 407                 return false
  408         standard_scopers = { # FIXME these are supposed to be namespace specific
                      # Each scope table maps a tag name to a namespace constant. The scope
                      # helpers stop searching (answer "not in scope") at an open element
                      # whose name maps to that element's own namespace.
  409                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
  410                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
  411                 template: NS_HTML, mi: NS_MATHML,
  412
  413                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
  414                 'annotation-xml': NS_MATHML,
  415
  416                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
  417         }
              # extra boundary used together with standard_scopers by is_in_button_scope
  418         button_scopers = button: NS_HTML
              # presumably for a list-item scope check; no user is visible in this part of the file
  419         li_scopers = ol: NS_HTML, ul: NS_HTML
              # boundaries used on their own by is_in_table_scope
  420         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 421         is_in_scope = (tag_name, namespace = null) ->
 422                 return is_in_scope_x tag_name, standard_scopers, namespace
 423         is_in_button_scope = (tag_name, namespace = null) ->
 424                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 425         is_in_table_scope = (tag_name, namespace = null) ->
 426                 return is_in_scope_x tag_name, table_scopers, namespace
 427         is_in_select_scope = (tag_name, namespace = null) ->
 428                 for t in open_els
 429                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 430                                 return true
 431                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
 432                                 return false
 433                 return false
 434         # this checks for a particular element, not by name
 435         el_is_in_scope = (el) ->
 436                 for t in open_els
 437                         if t is el
 438                                 return true
 439                         if standard_scopers[t.name] is t.namespace
 440                                 return false
 441                 return false
 442
 443         # 8.2.3.1 ...
 444         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
  445         reset_insertion_mode = ->
                      # Pick the insertion mode that matches the current stack of open
                      # elements: walk from the current node (index 0) toward the first node
                      # (highest index) and stop at the first element name that determines a
                      # mode. Sets insertion_mode; returns nothing useful.
                      # NOTE(review): reads open_els[0].name without a guard, so this assumes
                      # open_els is non-empty -- confirm against callers.
  446                 # 1. Let last be false.
  447                 last = false
  448                 # 2. Let node be the last node in the stack of open elements.
  449                 node_i = 0
  450                 node = open_els[node_i]
  451                 # 3. Loop: If node is the first node in the stack of open elements,
  452                 # then set last to true, and, if the parser was originally created as
  453                 # part of the HTML fragment parsing algorithm (fragment case) set node
  454                 # to the context element.
  455                 loop
  456                         if node_i is open_els.length - 1
  457                                 last = true
  458                                 # fixfull (fragment case)
  459
  460                         # 4. If node is a select element, run these substeps:
  461                         if node.name is 'select'
  462                                 # 1. If last is true, jump to the step below labeled done.
  463                                 unless last
  464                                         # 2. Let ancestor be node.
  465                                         ancestor_i = node_i
  466                                         ancestor = node
  467                                         # 3. Loop: If ancestor is the first node in the stack of
  468                                         # open elements, jump to the step below labeled done.
  469                                         loop
  470                                                 if ancestor_i is open_els.length - 1
  471                                                         break
  472                                                 # 4. Let ancestor be the node before ancestor in the stack
  473                                                 # of open elements.
  474                                                 ancestor_i += 1
  475                                                 ancestor = open_els[ancestor_i]
  476                                                 # 5. If ancestor is a template node, jump to the step below
  477                                                 # labeled done.
  478                                                 if ancestor.name is 'template'
  479                                                         break
  480                                                 # 6. If ancestor is a table node, switch the insertion mode
  481                                                 # to "in select in table" and abort these steps.
  482                                                 if ancestor.name is 'table'
  483                                                         insertion_mode = ins_mode_in_select_in_table
  484                                                         return
  485                                                 # 7. Jump back to the step labeled loop.
  486                                 # 8. Done: Switch the insertion mode to "in select" and abort
  487                                 # these steps.
  488                                 insertion_mode = ins_mode_in_select
  489                                 return
  490                         # 5. If node is a td or th element and last is false, then switch
  491                         # the insertion mode to "in cell" and abort these steps.
  492                         if (node.name is 'td' or node.name is 'th') and last is false
  493                                 insertion_mode = ins_mode_in_cell
  494                                 return
  495                         # 6. If node is a tr element, then switch the insertion mode to "in
  496                         # row" and abort these steps.
  497                         if node.name is 'tr'
  498                                 insertion_mode = ins_mode_in_row
  499                                 return
  500                         # 7. If node is a tbody, thead, or tfoot element, then switch the
  501                         # insertion mode to "in table body" and abort these steps.
  502                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
  503                                 insertion_mode = ins_mode_in_table_body
  504                                 return
  505                         # 8. If node is a caption element, then switch the insertion mode
  506                         # to "in caption" and abort these steps.
  507                         if node.name is 'caption'
  508                                 insertion_mode = ins_mode_in_caption
  509                                 return
  510                         # 9. If node is a colgroup element, then switch the insertion mode
  511                         # to "in column group" and abort these steps.
  512                         if node.name is 'colgroup'
  513                                 insertion_mode = ins_mode_in_column_group
  514                                 return
  515                         # 10. If node is a table element, then switch the insertion mode to
  516                         # "in table" and abort these steps.
  517                         if node.name is 'table'
  518                                 insertion_mode = ins_mode_in_table
  519                                 return
  520                         # 11. If node is a template element, then switch the insertion mode
  521                         # to the current template insertion mode and abort these steps.
  522                         # fixfull (template insertion mode stack)
                              # (not implemented here: a template node falls through to the steps below)
  523
  524                         # 12. If node is a head element and last is true, then switch the
  525                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
  526                         # these steps. (fragment case)
  527                         if node.name is 'head' and last
  528                                 insertion_mode = ins_mode_in_body
  529                                 return
  530                         # 13. If node is a head element and last is false, then switch the
  531                         # insertion mode to "in head" and abort these steps.
  532                         if node.name is 'head' and last is false
  533                                 insertion_mode = ins_mode_in_head
  534                                 return
  535                         # 14. If node is a body element, then switch the insertion mode to
  536                         # "in body" and abort these steps.
  537                         if node.name is 'body'
  538                                 insertion_mode = ins_mode_in_body
  539                                 return
  540                         # 15. If node is a frameset element, then switch the insertion mode
  541                         # to "in frameset" and abort these steps. (fragment case)
  542                         if node.name is 'frameset'
  543                                 insertion_mode = ins_mode_in_frameset
  544                                 return
  545                         # 16. If node is an html element, run these substeps:
  546                         if node.name is 'html'
  547                                 # 1. If the head element pointer is null, switch the insertion
  548                                 # mode to "before head" and abort these steps. (fragment case)
  549                                 # fixfull (fragment case)
  550
  551                                 # 2. Otherwise, the head element pointer is not null, switch
  552                                 # the insertion mode to "after head" and abort these steps.
  553                                 insertion_mode = ins_mode_in_body # FIXME fixfull
  554                                 return
  555                         # 17. If last is true, then switch the insertion mode to "in body"
  556                         # and abort these steps. (fragment case)
  557                         if last
  558                                 insertion_mode = ins_mode_in_body
  559                                 return
  560                         # 18. Let node now be the node before node in the stack of open
  561                         # elements.
  562                         node_i += 1
  563                         node = open_els[node_i]
  564                         # 19. Return to the step labeled loop.
 564                         # 19. Return to the step labeled loop.
 565
 566         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 567         # this implementation is structured (mostly) as described at the link above.
 568         # capitalized comments are the "labels" described at the link above.
  569         reconstruct_active_formatting_elements = ->
                      # Re-open formatting elements that are still in the active formatting
                      # list (afe, index 0 = most recent) but no longer on the stack of open
                      # elements. Each such entry is shallow-cloned, inserted into the tree
                      # via tree_insert_element, and the clone replaces the entry in afe.
                      # Nothing to do when the list is empty, or when its most recent entry
                      # is a marker or is still open.
  570                 return if afe.length is 0
  571                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
  572                         return
  573                 # Rewind
                      # step i toward older entries until the oldest entry is reached or the
                      # next older entry is a marker / still open
  574                 i = 0
  575                 loop
  576                         if i is afe.length - 1
  577                                 break
  578                         i += 1
  579                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
  580                                 i -= 1 # Advance
  581                                 break
  582                 # Create
                      # from entry i back down to entry 0 (oldest first), clone and re-insert
  583                 loop
  584                         el = afe[i].shallow_clone()
  585                         tree_insert_element el
  586                         afe[i] = el
  587                         break if i is 0
  588                         i -= 1 # Advance
 589
 590         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 591         # adoption agency algorithm
 592         # overview here:
 593         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 594         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 595         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
             #
             # subject is a tag name (it is compared against the .name of entries in
             # open_els and afe). Every exit is a bare return; the work is done by
             # rewriting open_els, afe and the parent/children links of the nodes
             # involved.
             #
             # Index 0 of both open_els and afe is the current/most recent entry, so
             # "lower", "after" and "last" in the quoted spec text mean a smaller
             # index here, and "above"/"before" mean a larger one.
 596         adoption_agency = (subject) ->
 597                 debug_log "adoption_agency()"
 598                 debug_log "tree: #{serialize_els tree.children, false, true}"
 599                 debug_log "open_els: #{serialize_els open_els, true, true}"
 600                 debug_log "afe: #{serialize_els afe, true, true}"
                     # Shortcut: the current node already has the tag name subject. Pop
                     # it, drop it from afe if it is listed there, and stop.
 601                 if open_els[0].name is subject
 602                         el = open_els[0]
 603                         open_els.shift()
 604                         # remove it from the list of active formatting elements (if found)
 605                         for t, i in afe
 606                                 if t is el
 607                                         afe.splice i, 1
 608                                         break
 609                         debug_log "aaa: starting off with subject on top of stack, exiting"
 610                         return
                     # outer loop: gives up once 8 passes have been started
 611                 outer = 0
 612                 loop
 613                         if outer >= 8
 614                                 return
 615                         outer += 1
 616                         # 5. Let formatting element be the last element in the list of
 617                         # active formatting elements that: is between the end of the list
 618                         # and the last scope marker in the list, if any, or the start of
 619                         # the list otherwise, and  has the tag name subject.
                             #
                             # The scan starts at afe[0] (the end of the list), so the first
                             # hit is the "last" match; it stops at the first marker. When the
                             # loop breaks, fe_of_afe is the index of fe in afe.
 620                         fe = null
 621                         for t, fe_of_afe in afe
 622                                 if t.type is TYPE_AFE_MARKER
 623                                         break
 624                                 if t.name is subject
 625                                         fe = t
 626                                         break
 627                         # If there is no such element, then abort these steps and instead
 628                         # act as described in the "any other end tag" entry above.
 629                         if fe is null
 630                                 debug_log "aaa: fe not found in afe"
 631                                 in_body_any_other_end_tag subject
 632                                 return
 633                         # 6. If formatting element is not in the stack of open elements,
 634                         # then this is a parse error; remove the element from the list, and
 635                         # abort these steps.
                             #
                             # When fe is found, fe_of_open_els is left holding its index in
                             # open_els; that index is reused below to find the common ancestor.
 636                         in_open_els = false
 637                         for t, fe_of_open_els in open_els
 638                                 if t is fe
 639                                         in_open_els = true
 640                                         break
 641                         unless in_open_els
 642                                 debug_log "aaa: fe not found in open_els"
 643                                 parse_error()
 644                                 # "remove it from the list" must mean afe, since it's not in open_els
 645                                 afe.splice fe_of_afe, 1
 646                                 return
 647                         # 7. If formatting element is in the stack of open elements, but
 648                         # the element is not in scope, then this is a parse error; abort
 649                         # these steps.
 650                         unless el_is_in_scope fe
 651                                 debug_log "aaa: fe not in scope"
 652                                 parse_error()
 653                                 return
 654                         # 8. If formatting element is not the current node, this is a parse
 655                         # error. (But do not abort these steps.)
 656                         unless open_els[0] is fe
 657                                 parse_error()
 658                                 # continue
 659                         # 9. Let furthest block be the topmost node in the stack of open
 660                         # elements that is lower in the stack than formatting element, and
 661                         # is an element in the special category. There might not be one.
                             #
                             # The scan runs from the current node up to (not including) fe
                             # and keeps overwriting fb, so the special element closest to fe
                             # is the one that remains.
 662                         fb = null
 663                         fb_of_open_els = null
 664                         for t, i in open_els
 665                                 if t is fe
 666                                         break
 667                                 if el_is_special t
 668                                         fb = t
 669                                         fb_of_open_els = i
 670                                         # and continue, to see if there's one that's more "topmost"
 671                         # 10. If there is no furthest block, then the UA must first pop all
 672                         # the nodes from the bottom of the stack of open elements, from the
 673                         # current node up to and including formatting element, then remove
 674                         # formatting element from the list of active formatting elements,
 675                         # and finally abort these steps.
 676                         if fb is null
 677                                 debug_log "aaa: no fb"
                                     # fe was found in open_els above, so this loop ends once
                                     # fe itself has been popped
 678                                 loop
 679                                         t = open_els.shift()
 680                                         if t is fe
 681                                                 afe.splice fe_of_afe, 1
 682                                                 return
 683                         # 11. Let common ancestor be the element immediately above
 684                         # formatting element in the stack of open elements.
 685                         ca = open_els[fe_of_open_els + 1] # common ancestor
 686
 687                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
 688                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
                             #
                             # The bookmark is spliced in at fe's index, which moves fe itself
                             # up to index + 1.
 689                         bookmark = new_aaa_bookmark()
 690                         for t, i in afe
 691                                 if t is fe
 692                                         afe.splice i, 0, bookmark
 693                                         break
 694                         node = last_node = fb
 695                         inner = 0
                             # inner loop: walks from fb towards fe in open_els; it ends when
                             # node reaches fe
 696                         loop
 697                                 inner += 1
 698                                 # 3. Let node be the element immediately above node in the
 699                                 # stack of open elements, or if node is no longer in the stack
 700                                 # of open elements (e.g. because it got removed by this
 701                                 # algorithm), the element that was immediately above node in
 702                                 # the stack of open elements before node was removed.
 703                                 node_next = null
 704                                 for t, i in open_els
 705                                         if t is node
 706                                                 node_next = open_els[i + 1]
 707                                                 break
                                     # existential operator: node_above is used when the search
                                     # above produced no element (null or undefined)
 708                                 node = node_next ? node_above
 709                                 debug_log "inner loop #{inner}"
 710                                 debug_log "tree: #{serialize_els tree.children, false, true}"
 711                                 debug_log "open_els: #{serialize_els open_els, true, true}"
 712                                 debug_log "afe: #{serialize_els afe, true, true}"
 713                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 714                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 715                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 716                                 debug_log "node: #{node.serialize true, true}"
 717                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
 718
 719                                 # 4. If node is formatting element, then go to the next step in
 720                                 # the overall algorithm.
 721                                 if node is fe
 722                                         break
 723                                 debug_log "the meat"
 724                                 # 5. If inner loop counter is greater than three and node is in
 725                                 # the list of active formatting elements, then remove node from
 726                                 # the list of active formatting elements.
                                     #
                                     # node_in_afe ends up true only when node is listed in afe
                                     # and was not removed by the inner > 3 rule.
 727                                 node_in_afe = false
 728                                 for t, i in afe
 729                                         if t is node
 730                                                 if inner > 3
 731                                                         afe.splice i, 1
 732                                                         debug_log "max out inner"
 733                                                 else
 734                                                         node_in_afe = true
 735                                                         debug_log "in afe"
 736                                                 break
 737                                 # 6. If node is not in the list of active formatting elements,
 738                                 # then remove node from the stack of open elements and then go
 739                                 # back to the step labeled inner loop.
 740                                 unless node_in_afe
 741                                         debug_log "not in afe"
 742                                         for t, i in open_els
 743                                                 if t is node
                                                             # remember what was above node before it
                                                             # disappears from open_els
 744                                                         node_above = open_els[i + 1]
 745                                                         open_els.splice i, 1
 746                                                         break
 747                                         continue
 748                                 debug_log "the bones"
 749                                 # 7. create an element for the token for which the element node
 750                                 # was created, in the HTML namespace, with common ancestor as
 751                                 # the intended parent; replace the entry for node in the list
 752                                 # of active formatting elements with an entry for the new
 753                                 # element, replace the entry for node in the stack of open
 754                                 # elements with an entry for the new element, and let node be
 755                                 # the new element.
 756                                 new_node = node.shallow_clone()
 757                                 for t, i in afe
 758                                         if t is node
 759                                                 afe[i] = new_node
 760                                                 debug_log "replaced in afe"
 761                                                 break
 762                                 for t, i in open_els
 763                                         if t is node
 764                                                 node_above = open_els[i + 1]
 765                                                 open_els[i] = new_node
 766                                                 debug_log "replaced in open_els"
 767                                                 break
 768                                 node = new_node
 769                                 # 8. If last node is furthest block, then move the
 770                                 # aforementioned bookmark to be immediately after the new node
 771                                 # in the list of active formatting elements.
 772                                 if last_node is fb
 773                                         for t, i in afe
 774                                                 if t is bookmark
 775                                                         afe.splice i, 1
 776                                                         debug_log "removed bookmark"
 777                                                         break
 778                                         for t, i in afe
 779                                                 if t is node
 780                                                         # "after" means lower
 781                                                         afe.splice i, 0, bookmark # "after as <-
 782                                                         debug_log "placed bookmark after node"
 783                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
 784                                                         break
 785                                 # 9. Insert last node into node, first removing it from its
 786                                 # previous parent node if any.
 787                                 if last_node.parent?
 788                                         debug_log "last_node has parent"
 789                                         for c, i in last_node.parent.children
 790                                                 if c is last_node
 791                                                         debug_log "removing last_node from parent"
 792                                                         last_node.parent.children.splice i, 1
 793                                                         break
 794                                 node.children.push last_node
 795                                 last_node.parent = node
 796                                 # 10. Let last node be node.
 797                                 last_node = node
 798                                 debug_log "at last"
 799                                 # 11. Return to the step labeled inner loop.
 800                         # 14. Insert whatever last node ended up being in the previous step
 801                         # at the appropriate place for inserting a node, but using common
 802                         # ancestor as the override target.
 803
 804                         # JASON: In the case where fe is immediately followed by fb:
 805                         #   * inner loop exits out early (node==fe)
 806                         #   * last_node is fb
 807                         #   * last_node is still in the tree (not a duplicate)
 808                         if last_node.parent?
 809                                 debug_log "FEFIRST? last_node has parent"
 810                                 for c, i in last_node.parent.children
 811                                         if c is last_node
 812                                                 debug_log "removing last_node from parent"
 813                                                 last_node.parent.children.splice i, 1
 814                                                 break
 815
 816                         debug_log "after aaa inner loop"
 817                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 818                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 819                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 820                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 821                         debug_log "tree: #{serialize_els tree.children, false, true}"
 822
 823                         debug_log "insert"
 824
 825
 826                         # can't use standard insert token thing, because it's already in
 827                         # open_els and must stay at it's current position in open_els
                             # dest is [parent, index_within_children]
 828                         dest = adjusted_insertion_location ca
 829                         dest[0].children.splice dest[1], 0, last_node
 830                         last_node.parent = dest[0]
 831
 832
 833                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 834                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 835                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 836                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 837                         debug_log "tree: #{serialize_els tree.children, false, true}"
 838
 839                         # 15. Create an element for the token for which formatting element
 840                         # was created, in the HTML namespace, with furthest block as the
 841                         # intended parent.
 842                         new_element = fe.shallow_clone() # FIXME intended parent thing
 843                         # 16. Take all of the child nodes of furthest block and append them
 844                         # to the element created in the last step.
 845                         while fb.children.length
 846                                 t = fb.children.shift()
 847                                 t.parent = new_element
 848                                 new_element.children.push t
 849                         # 17. Append that new element to furthest block.
 850                         new_element.parent = fb
 851                         fb.children.push new_element
 852                         # 18. Remove formatting element from the list of active formatting
 853                         # elements, and insert the new element into the list of active
 854                         # formatting elements at the position of the aforementioned
 855                         # bookmark.
                             #
                             # The bookmark entry itself is overwritten by new_element, so no
                             # bookmark is left behind in afe.
 856                         for t, i in afe
 857                                 if t is fe
 858                                         afe.splice i, 1
 859                                         break
 860                         for t, i in afe
 861                                 if t is bookmark
 862                                         afe[i] = new_element
 863                                         break
 864                         # 19. Remove formatting element from the stack of open elements,
 865                         # and insert the new element into the stack of open elements
 866                         # immediately below the position of furthest block in that stack.
                             #
                             # Splicing in at fb's index pushes fb up by one, which leaves
                             # new_element at the smaller index, i.e. "below" fb.
 867                         for t, i in open_els
 868                                 if t is fe
 869                                         open_els.splice i, 1
 870                                         break
 871                         for t, i in open_els
 872                                 if t is fb
 873                                         open_els.splice i, 0, new_element
 874                                         break
 875                         # 20. Jump back to the step labeled outer loop.
 876                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
 877                         debug_log "tree: #{serialize_els tree.children, false, true}"
 878                         debug_log "open_els: #{serialize_els open_els, true, true}"
 879                         debug_log "afe: #{serialize_els afe, true, true}"
                     # NOTE: every way out of the outer loop above is a return, so the
                     # line below is not reached.
 880                 debug_log "AAA DONE"
 881
 882         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
             # Generates implied end tags (leaving p alone), reports a parse error if
             # the current node is then not a p, and pops the stack of open elements
             # until a p has been popped.
 883         close_p_element = ->
                     generate_implied_end_tags 'p' # 'p' is the tag name to leave alone
                     parse_error() unless open_els[0].name is 'p'
                     # never pops the last remaining entry (just in case)
                     while open_els.length > 1
                             popped_el = open_els.shift()
                             return if popped_el.name is 'p'
 891         close_p_if_in_button_scope = ->
                     # close the open p element, but only when one is in button scope
                     close_p_element() if is_in_button_scope 'p'
 894
 895         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 896         # aka insert_a_character = (t) ->
             # Adds the text token t at the adjusted insertion location. When the
             # child just before that location is a text node, t.text is appended to
             # it instead of inserting t as a new child.
 897         insert_character = (t) ->
                     [ins_parent, ins_index] = adjusted_insertion_location()
                     # fixfull check for Document node
                     if ins_index > 0
                             left_sibling = ins_parent.children[ins_index - 1]
                             if left_sibling.type is TYPE_TEXT
                                     # merge into the existing text node
                                     left_sibling.text += t.text
                                     return
                     ins_parent.children.splice ins_index, 0, t
 906
 907         # 8.2.5.1
 908         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
 909         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
             #
             # Works out where the next node should be inserted.
             #
             # override_target: optional element to use instead of the current node
             # (open_els[0]).
             #
             # Returns a two-element array [parent, index]: the new node belongs in
             # parent.children at position index.
 910         adjusted_insertion_location = (override_target = null) ->
 911                 # 1. If there was an override target specified, then let target be the
 912                 # override target.
 913                 if override_target?
 914                         target = override_target
 915                 else # Otherwise, let target be the current node.
 916                         target = open_els[0]
 917                 # 2. Determine the adjusted insertion location using the first matching
 918                 # steps from the following list:
 919                 #
 920                 # If foster parenting is enabled and target is a table, tbody, tfoot,
 921                 # thead, or tr element Foster parenting happens when content is
 922                 # misnested in tables.
 923                 if flag_foster_parenting and foster_parenting_targets[target.name]
 924                         loop # once. this is here so we can ``break`` to "abort these substeps"
 925                                 # 1. Let last template be the last template element in the
 926                                 # stack of open elements, if any.
                                     #
                                     # open_els[0] is the most recently added element, so the
                                     # first hit is the "last" one.
 927                                 last_template = null
 928                                 last_template_i = null
 929                                 for el, i in open_els
 930                                         if el.name is 'template'
 931                                                 last_template = el
 932                                                 last_template_i = i
 933                                                 break
 934                                 # 2. Let last table be the last table element in the stack of
 935                                 # open elements, if any.
 936                                 last_table = null
                                     last_table_i = null
 938                                 for el, i in open_els
 939                                         if el.name is 'table'
 940                                                 last_table = el
 941                                                 last_table_i = i
 942                                                 break
 943                                 # 3. If there is a last template and either there is no last
 944                                 # table, or there is one, but last template is lower (more
 945                                 # recently added) than last table in the stack of open
 946                                 # elements, then: let adjusted insertion location be inside
 947                                 # last template's template contents, after its last child (if
 948                                 # any), and abort these substeps.
 949                                 if last_template and (last_table is null or last_template_i < last_table_i)
                                             target = last_template # fixfull should be its contents
 951                                         target_i = target.children.length
 952                                         break
 953                                 # 4. If there is no last table, then let adjusted insertion
 954                                 # location be inside the first element in the stack of open
 955                                 # elements (the html element), after its last child (if any),
 956                                 # and abort these substeps. (fragment case)
 957                                 if last_table is null
 958                                         # this is odd
 959                                         target = open_els[open_els.length - 1]
 960                                         target_i = target.children.length
                                             # "abort these substeps": without this the code
                                             # below would dereference the null last_table
                                             break
 961                                 # 5. If last table has a parent element, then let adjusted
 962                                 # insertion location be inside last table's parent element,
 963                                 # immediately before last table, and abort these substeps.
 964                                 if last_table.parent?
 965                                         for c, i in last_table.parent.children
 966                                                 if c is last_table
 967                                                         target = last_table.parent
 968                                                         target_i = i
 969                                                         break
 970                                         break
 971                                 # 6. Let previous element be the element immediately above last
 972                                 # table in the stack of open elements.
 973                                 #
 974                                 # huh? how could it not have a parent?
 975                                 previous_element = open_els[last_table_i + 1]
 976                                 # 7. Let adjusted insertion location be inside previous
 977                                 # element, after its last child (if any).
 978                                 target = previous_element
 979                                 target_i = target.children.length
 980                                 # Note: These steps are involved in part because it's possible
 981                                 # for elements, the table element in this case in particular,
 982                                 # to have been moved by a script around in the DOM, or indeed
 983                                 # removed from the DOM entirely, after the element was inserted
 984                                 # by the parser.
 985                                 break # don't really loop
 986                 else
 987                         # Otherwise Let adjusted insertion location be inside target, after
 988                         # its last child (if any).
 989                         target_i = target.children.length
 990
 991                 # 3. If the adjusted insertion location is inside a template element,
 992                 # let it instead be inside the template element's template contents,
 993                 # after its last child (if any).
 994                 # fixfull (template)
 995
 996                 # 4. Return the adjusted insertion location.
 997                 return [target, target_i]
 998
 999         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1000         # aka create_an_element_for_token
             # Builds a Node of type TYPE_TAG from the start tag token t, carrying
             # over its name, the given namespace and its attributes.
             # Side effects on t: its type is changed to TYPE_TAG and its attrs_a
             # array is emptied. intended_parent is accepted but not used here.
1001         token_to_element = (t, namespace, intended_parent) ->
                     t.type = TYPE_TAG # not TYPE_START_TAG
                     # drain the [name, value] pairs into a hash, taking the last pair first
                     attr_hash = {}
                     until t.attrs_a.length is 0
                             [attr_name, attr_value] = t.attrs_a.pop()
                             attr_hash[attr_name] = attr_value # TODO check what to do with duplicate attrs

                     # TODO 2. If the newly created element has an xmlns attribute in the
                     # XMLNS namespace whose value is not exactly the same as the element's
                     # namespace, that is a parse error. Similarly, if the newly created
                     # element has an xmlns:xlink attribute in the XMLNS namespace whose
                     # value is not the XLink Namespace, that is a parse error.

                     # fixfull: the spec says stuff about form pointers and ownerDocument

                     return new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attr_hash
1019
1020         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
             # Creates an element for token in the given namespace, links it into the
             # tree at the adjusted insertion location, makes it the current node
             # (front of open_els) and returns it.
1021         insert_foreign_element = (token, namespace) ->
                     [ins_parent, ins_index] = adjusted_insertion_location()
                     created_el = token_to_element token, namespace, ins_parent
                     # TODO skip this next step if it's broken (eg ins_parent is document with child already)
                     created_el.parent = ins_parent
                     ins_parent.children.splice ins_index, 0, created_el
                     open_els.unshift created_el
                     return created_el
1031         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
             # This is the same function object as insert_foreign_element, so it takes
             # the same (token, namespace) arguments. A caller that passes only a
             # token leaves namespace undefined.
1032         insert_html_element = insert_foreign_element # (token, namespace) ->
1033
1034         # FIXME read implement "foster parenting" part
1035         # FIXME read spec, do this right
1036         # FIXME implement the override target thing
1037         # note: this assumes it's an open tag
1038         # FIXME what part of the spec is this?
1039         # TODO look through all callers of this, and see what they should really be doing.
1040         #   eg probably insert_html_element for tokens
             #
             # Inserts el at the adjusted insertion location and makes it the current
             # node (front of open_els).
             #
             # el: an element, or a start tag token (type TYPE_START_TAG), which is
             #   first converted with token_to_element.
             # override_target: passed through to adjusted_insertion_location.
             # namespace: when given, it is stored on el before insertion.
             #
             # Returns the inserted element (the converted one when el was a token).
1041         tree_insert_element = (el, override_target = null, namespace = null) ->
1042                 if namespace?
1043                         el.namespace = namespace
1044                 dest = adjusted_insertion_location override_target
1045                 if el.type is TYPE_START_TAG # means it's a "token"
1046                         el = token_to_element el, namespace, dest[0]
                     # NOTE(review): there used to be an "unless el.namespace?" branch here
                     # that did "namespace = dest.namespace". dest is the [parent, index]
                     # array, so that read was always undefined, and the namespace
                     # parameter is not used after this point, so the branch had no effect
                     # and has been dropped. If elements without a namespace are meant to
                     # inherit one from their parent, that still needs to be implemented
                     # (and checked against the spec) -- TODO confirm intent.
                     # fixfull: Document nodes sometimes can't accept more children
1050                 dest[0].children.splice dest[1], 0, el
1051                 el.parent = dest[0]
1052                 open_els.unshift el
1053                 return el
1054
1055         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1056         # position should be [node, index_within_children]
1057         insert_comment = (t, position = null) ->
                     # fall back to the adjusted insertion location when no position is given
                     unless position?
                             position = adjusted_insertion_location()
                     [comment_parent, comment_index] = position
                     comment_parent.children.splice comment_index, 0, t
1060
1061         # 8.2.5.2
1062         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1063         parse_generic_raw_text = (t) ->
1064                 insert_html_element t
1065                 tok_state = tok_state_rawtext
1066                 original_insertion_mode = insertion_mode
1067                 insertion_mode = ins_mode_text
1068         parse_generic_rcdata_text = (t) ->
1069                 insert_html_element t
1070                 tok_state = tok_state_rcdata
1071                 original_insertion_mode = insertion_mode
1072                 insertion_mode = ins_mode_text
1073
1074         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1075         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1076         generate_implied_end_tags = (except = null) ->
1077                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1078                         open_els.shift()
1079
1080         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1081         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1082                 open_els.shift() # spec says this will be a 'head' node
1083                 insertion_mode = ins_mode_after_head
1084                 insertion_mode t
1085         ins_mode_in_head = (t) ->
1086                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1087                         insert_character t
1088                         return
1089                 if t.type is TYPE_COMMENT
1090                         insert_comment t
1091                         return
1092                 if t.type is TYPE_DOCTYPE
1093                         parse_error()
1094                         return
1095                 if t.type is TYPE_START_TAG and t.name is 'html'
1096                         ins_mode_in_body t
1097                         return
1098                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1099                         el = insert_html_element t
1100                         open_els.shift()
1101                         el.acknowledge_self_closing()
1102                         return
1103                 if t.type is TYPE_START_TAG and t.name is 'meta'
1104                         el = insert_html_element t
1105                         open_els.shift()
1106                         el.acknowledge_self_closing()
1107                         # fixfull encoding stuff
1108                         return
1109                 if t.type is TYPE_START_TAG and t.name is 'title'
1110                         parse_generic_rcdata_element t
1111                         return
1112                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1113                         parse_generic_raw_text t
1114                         return
1115                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1116                         insert_html_element t
1117                         insertion_mode = in_head_noscript # FIXME implement
1118                         return
1119                 if t.type is TYPE_START_TAG and t.name is 'script'
1120                         ail = adjusted_insertion_location()
1121                         el = token_to_element t, NS_HTML, ail
1122                         el.flag_parser_inserted true # FIXME implement
1123                         # fixfull frament case
1124                         ail[0].children.splice ail[1], 0, el
1125                         open_els.unshift el
1126                         tok_state = tok_state_script_data
1127                         original_insertion_mode = insertion_mode # make sure orig... is defined
1128                         insertion_mode = ins_mode_text # FIXME implement
1129                         return
1130                 if t.type is TYPE_END_TAG and t.name is 'head'
1131                         open_els.shift() # will be a head element... spec says so
1132                         insertion_mode = ins_mode_after_head
1133                         return
1134                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1135                         ins_mode_in_head_else t
1136                         return
1137                 if t.type is TYPE_START_TAG and t.name is 'template'
1138                         insert_html_element t
1139                         afe_push_marker()
1140                         flag_frameset_ok = false
1141                         insertion_mode = ins_mode_in_template
1142                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1143                         return
1144                 if t.type is TYPE_END_TAG and t.name is 'template'
1145                         if template_tag_is_open()
1146                                 generate_implied_end_tags
1147                                 if open_els[0].name isnt 'template'
1148                                         parse_error()
1149                                 loop
1150                                         el = open_els.shift()
1151                                         if el.name is 'template'
1152                                                 break
1153                                 clear_afe_to_marker()
1154                                 template_insertion_modes.shift()
1155                                 reset_insertion_mode()
1156                         else
1157                                 parse_error()
1158                         return
1159                 if (t.type is TYPE_OPEN_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1160                         parse_error()
1161                         return
1162                 ins_mode_in_head_else t
1163
1164         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1165         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1166                 for node, i in open_els
1167                         if node.name is name # FIXME check namespace too
1168                                 generate_implied_end_tags name # arg is exception
1169                                 parse_error() unless i is 0
1170                                 while i >= 0
1171                                         open_els.shift()
1172                                         i -= 1
1173                                 return
1174                         if special_elements[node.name]? # FIXME check namespac too
1175                                 parse_error()
1176                                 return
1177         ins_mode_in_body = (t) ->
1178                 switch t.type
1179                         when TYPE_TEXT
1180                                 switch t.text
1181                                         when "\u0000"
1182                                                 parse_error()
1183                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1184                                                 reconstruct_active_formatting_elements()
1185                                                 insert_character t
1186                                         else
1187                                                 reconstruct_active_formatting_elements()
1188                                                 insert_character t
1189                                                 flag_frameset_ok = false
1190                         when TYPE_COMMENT
1191                                 insert_comment t
1192                         when TYPE_DOCTYPE
1193                                 parse_error()
1194                         when TYPE_START_TAG
1195                                 switch t.name
1196                                         when 'html'
1197                                                 parse_error()
1198                                                 return if template_tag_is_open()
1199                                                 root_attrs = open_els[open_els.length - 1].attrs
1200                                                 for k, v of t.attrs
1201                                                         root_attrs[k] = v unless root_attrs[k]?
1202                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1203                                                 # FIXME also do this for </template> (end tag)
1204                                                 return ins_mode_in_head t
1205                                         when 'body'
1206                                                 parse_error()
1207                                                 # TODO
1208                                         when 'frameset'
1209                                                 parse_error()
1210                                                 # TODO
1211                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1212                                                 close_p_if_in_button_scope()
1213                                                 insert_html_element t
1214                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1215                                                 close_p_if_in_button_scope()
1216                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1217                                                         parse_error()
1218                                                         open_els.shift()
1219                                                 insert_html_element t
1220                                         # TODO lots more to implement here
1221                                         when 'a'
1222                                                 # If the list of active formatting elements
1223                                                 # contains an a element between the end of the list and
1224                                                 # the last marker on the list (or the start of the list
1225                                                 # if there is no marker on the list), then this is a
1226                                                 # parse error; run the adoption agency algorithm for
1227                                                 # the tag name "a", then remove that element from the
1228                                                 # list of active formatting elements and the stack of
1229                                                 # open elements if the adoption agency algorithm didn't
1230                                                 # already remove it (it might not have if the element
1231                                                 # is not in table scope).
1232                                                 found = false
1233                                                 for el in afe
1234                                                         if el.type is TYPE_AFE_MARKER
1235                                                                 break
1236                                                         if el.name is 'a'
1237                                                                 found = el
1238                                                 if found?
1239                                                         parse_error()
1240                                                         adoption_agency 'a'
1241                                                         for el, i in afe
1242                                                                 if el is found
1243                                                                         afe.splice i, 1
1244                                                         for el, i in open_els
1245                                                                 if el is found
1246                                                                         open_els.splice i, 1
1247                                                 reconstruct_active_formatting_elements()
1248                                                 el = insert_html_element t
1249                                                 afe_push el
1250                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1251                                                 reconstruct_active_formatting_elements()
1252                                                 el = insert_html_element t
1253                                                 afe_push el
1254                                         when 'table'
1255                                                 # fixfull quirksmode thing
1256                                                 close_p_if_in_button_scope()
1257                                                 insert_html_element t
1258                                                 insertion_mode = ins_mode_in_table
1259                                         # TODO lots more to implement here
1260                                         else # any other start tag
1261                                                 reconstruct_active_formatting_elements()
1262                                                 insert_html_element t
1263                         when TYPE_EOF
1264                                 ok_tags = {
1265                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1266                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1267                                 }
1268                                 for t in open_els
1269                                         unless ok_tags[t.name]?
1270                                                 parse_error()
1271                                                 break
1272                                 # TODO stack of template insertion modes thing
1273                                 flag_parsing = false # stop parsing
1274                         when TYPE_END_TAG
1275                                 switch t.name
1276                                         when 'body'
1277                                                 unless is_in_scope 'body'
1278                                                         parse_error()
1279                                                         return
1280                                                 # TODO implement parse error and move to tree_after_body
1281                                         when 'html'
1282                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1283                                                         parse_error()
1284                                                         return
1285                                                 # TODO implement parse error and move to tree_after_body, reprocess
1286                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1287                                                 unless is_in_scope t.name, NS_HTML
1288                                                         parse_error()
1289                                                         return
1290                                                 generate_implied_end_tags()
1291                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1292                                                         parse_error()
1293                                                 loop
1294                                                         el = open_els.shift()
1295                                                         if el.name is t.name and el.namespace is NS_HTML
1296                                                                 return
1297                                         # TODO lots more close tags to implement here
1298                                         when 'p'
1299                                                 unless is_in_button_scope 'p'
1300                                                         parse_error()
1301                                                         insert_html_element new_open_tag 'p'
1302                                                 close_p_element()
1303                                         # TODO lots more close tags to implement here
1304                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1305                                                 adoption_agency t.name
1306                                         # TODO lots more close tags to implement here
1307                                         else
1308                                                 in_body_any_other_end_tag t.name
1309                 return
1310
        # The "anything else" steps of the in-table mode: report a parse error,
        # then process t with the in-body rules while flag_foster_parenting is set.
        ins_mode_in_table_else = (t) ->
                parse_error()
                flag_foster_parenting = true # FIXME
                ins_mode_in_body t
                # the flag only applies while the in-body rules run for this token
                flag_foster_parenting = false
1316         can_in_table = {
1317                 'table': true
1318                 'tbody': true
1319                 'tfoot': true
1320                 'thead': true
1321                 'tr': true
1322         }
1323         clear_to_table_stopers = {
1324                 'table': true
1325                 'template': true
1326                 'html': true
1327         }
1328         clear_stack_to_table_context = ->
1329                 loop
1330                         if clear_to_table_stopers[open_els[0].name]?
1331                                 break
1332                         open_els.shift()
1333                 return
1334         clear_to_table_body_stopers = {
1335                 'tbody': true
1336                 'tfoot': true
1337                 'thead': true
1338                 'template': true
1339                 'html': true
1340         }
1341         clear_stack_to_table_body_context = ->
1342                 loop
1343                         if clear_to_table_body_stopers[open_els[0].name]?
1344                                 break
1345                         open_els.shift()
1346                 return
1347         clear_to_table_row_stopers = {
1348                 'tr': true
1349                 'template': true
1350                 'html': true
1351         }
1352         clear_stack_to_table_row_context = ->
1353                 loop
1354                         if clear_to_table_row_stopers[open_els[0].name]?
1355                                 break
1356                         open_els.shift()
1357                 return
1358         clear_afe_to_marker = ->
1359                 loop
1360                         el = afe.shift()
1361                         if el.type is TYPE_AFE_MARKER
1362                                 return
1363
1364         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1365         ins_mode_text = (t) ->
1366                 if t.type is TYPE_TEXT
1367                         insert_character t
1368                         return
1369                 if t.type is TYPE_EOF
1370                         parse_error()
1371                         if open_els[0].name is 'script'
1372                                 open_els[0].flag 'already started', true
1373                         open_els.shift()
1374                         insertion_mode = original_insertion_mode
1375                         insertion_mode t
1376                         return
1377                 if t.type is TYPE_END_TAG and t.name is 'script'
1378                         open_els.shift()
1379                         insertion_mode = original_insertion_mode
1380                         # fixfull the spec seems to assume that I'm going to run the script
1381                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1382                         return
1383                 if t.type is TYPE_END_TAG
1384                         open_els.shift()
1385                         insertion_mode = original_insertion_mode
1386                         return
1387                 console.log 'warning: end of ins_mode_text reached'
1388
        # the functions below implement the tokenizer states described here:
1390         # http://www.w3.org/TR/html5/syntax.html#tokenization
1391
1392         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1393         ins_mode_in_table = (t) ->
1394                 switch t.type
1395                         when TYPE_TEXT
1396                                 if can_in_table[t.name]
1397                                         original_insertion_mode = insertion_mode
1398                                         insertion_mode = ins_mode_in_table_text
1399                                         insertion_mode t
1400                                 else
1401                                         ins_mode_in_table_else t
1402                         when TYPE_COMMENT
1403                                 insert_comment t
1404                         when TYPE_DOCTYPE
1405                                 parse_error()
1406                         when TYPE_START_TAG
1407                                 switch t.name
1408                                         when 'caption'
1409                                                 clear_stack_to_table_context()
1410                                                 afe_push_marker()
1411                                                 insert_html_element t
1412                                                 insertion_mode = ins_mode_in_caption
1413                                         when 'colgroup'
1414                                                 clear_stack_to_table_context()
1415                                                 insert_html_element t
1416                                                 insertion_mode = ins_mode_in_column_group
1417                                         when 'col'
1418                                                 clear_stack_to_table_context()
1419                                                 insert_html_element new_open_tag 'colgroup'
1420                                                 insertion_mode = ins_mode_in_column_group
1421                                                 insertion_mode t
1422                                         when 'tbody', 'tfoot', 'thead'
1423                                                 clear_stack_to_table_context()
1424                                                 insert_html_element t
1425                                                 insertion_mode = ins_mode_in_table_body
1426                                         when 'td', 'th', 'tr'
1427                                                 clear_stack_to_table_context()
1428                                                 insert_html_element new_open_tag 'tbody'
1429                                                 insertion_mode = ins_mode_in_table_body
1430                                                 insertion_mode t
1431                                         when 'table'
1432                                                 parse_error()
1433                                                 if is_in_table_scope 'table'
1434                                                         loop
1435                                                                 el = open_els.shift()
1436                                                                 if el.name is 'table'
1437                                                                         break
1438                                                         reset_insertion_mode()
1439                                                         insertion_mode t
1440                                         when 'style', 'script', 'template'
1441                                                 ins_mode_in_head t
1442                                         when 'input'
1443                                                 if token_is_input_hidden t
1444                                                         ins_mode_in_table_else t
1445                                                 else
1446                                                         parse_error()
1447                                                         el = insert_html_element t
1448                                                         open_els.shift()
1449                                                         el.acknowledge_self_closing()
1450                                         when 'form'
1451                                                 parse_error()
1452                                                 if form_element_pointer?
1453                                                         return
1454                                                 if template_tag_is_open()
1455                                                         return
1456                                                 form_element_pointer = insert_html_element t
1457                                                 open_els.shift()
1458                                         else
1459                                                 ins_mode_in_table_else t
1460                         when TYPE_END_TAG
1461                                 switch t.name
1462                                         when 'table'
1463                                                 if is_in_table_scope 'table'
1464                                                         loop
1465                                                                 el = open_els.shift()
1466                                                                 if el.name is 'table'
1467                                                                         break
1468                                                         reset_insertion_mode()
1469                                                 else
1470                                                         parse_error
1471                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1472                                                 parse_error()
1473                                         when 'template'
1474                                                 ins_mode_in_head t
1475                                         else
1476                                                 ins_mode_in_table_else t
1477                         when TYPE_EOF
1478                                 ins_mode_in_body t
1479                         else
1480                                 ins_mode_in_table_else t
1481
1482
1483         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1484         ins_mode_in_table_text = (t) ->
1485                 if t.type is TYPE_TEXT and t.text is "\u0000"
1486                         # huh? I thought the tokenizer didn't emit these
1487                         parse_error()
1488                         return
1489                 if t.type is TYPE_TEXT
1490                         pending_table_character_tokens.push t
1491                         return
1492                 # Anything else
1493                 all_space = true
1494                 for old in pending_table_character_tokens
1495                         unless space_chars.indexOf(old.text) > -1
1496                                 all_space = false
1497                                 break
1498                 if all_space
1499                         for old in pending_table_character_tokens
1500                                 insert_character old
1501                 else
1502                         for old in pending_table_character_tokens
1503                                 ins_mode_table_else old
1504                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1505                 insertion_mode = original_insertion_mode
1506                 insertion_mode t
1507
1508         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1509         ins_mode_in_caption = (t) ->
1510                 if t.type is TYPE_END_TAG and t.name is 'caption'
1511                         if is_in_table_scope 'caption'
1512                                 generate_implied_end_tags()
1513                                 if open_els[0].name isnt 'caption'
1514                                         parse_error()
1515                                 loop
1516                                         el = open_els.shift()
1517                                         if el.name is 'caption'
1518                                                 break
1519                                 clear_afe_to_marker()
1520                                 insertion_mode = in_table
1521                         else
1522                                 parse_error()
1523                                 # fragment case
1524                         return
1525                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1526                         parse_error()
1527                         if is_in_table_scope 'caption'
1528                                 loop
1529                                         el = open_els.shift()
1530                                         if el.name is 'caption'
1531                                                 break
1532                                 clear_afe_to_marker()
1533                                 insertion_mode = in_table
1534                                 insertion_mode t
1535                         # else fragment case
1536                         return
1537                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1538                         parse_error()
1539                         return
1540                 # Anything else
1541                 ins_mode_in_body t
1542
1543         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1544         ins_mode_in_column_group = (t) ->
1545                 if t.type is TYPE_TEXT and space_chars.indexOf(t.text) > -1
1546                         insert_character t
1547                         return
1548                 if t.type is TYPE_COMMENT
1549                         insert_comment t
1550                         return
1551                 if t.type is TYPE_DOCTYPE
1552                         parse_error()
1553                         return
1554                 if t.type is TYPE_START_TAG and t.name is 'html'
1555                         ins_mode_in_body t
1556                         return
1557                 if t.type is TYPE_START_TAG and t.name is 'col'
1558                         el = insert_html_element t
1559                         open_els.shift()
1560                         el.acknowledge_self_closing()
1561                         return
1562                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1563                         if open_els[0].name is 'colgroup'
1564                                 open_els[0].shift()
1565                                 insertion_mode = ins_mode_in_table
1566                         else
1567                                 parse_error()
1568                         return
1569                 if t.type is TYPE_END_TAG and t.name is 'col'
1570                         parse_error()
1571                         return
1572                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1573                         ins_mode_in_head t
1574                         return
1575                 if t.type is TYPE_EOF
1576                         ins_mode_in_body t
1577                         return
1578                 # Anything else
1579                 if open_els[0].name isnt 'colgroup'
1580                         parse_error()
1581                         return
1582                 open_els.shift()
1583                 insertion_mode = ins_mode_in_table
1584                 insertion_mode t
1585                 return
1586
1587         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1588         ins_mode_in_table_body = (t) ->
1589                 if t.type is TYPE_START_TAG and t.name is 'tr'
1590                         clear_stack_to_table_body_context()
1591                         insert_html_element t
1592                         insertion_mode = ins_mode_in_row
1593                         return
1594                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1595                         parse_error()
1596                         clear_stack_to_table_body_context()
1597                         insert_html_element new_open_tag 'tr'
1598                         insertion_mode = ins_mode_in_row
1599                         insertion_mode t
1600                         return
1601                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1602                         unless is_in_table_scope t.name # fixfull check namespace
1603                                 parse_error()
1604                                 return
1605                         clear_stack_to_table_body_context()
1606                         open_els.shift()
1607                         insertion_mode = ins_mode_in_table
1608                         return
1609                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1610                         has = false
1611                         for el in open_els
1612                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1613                                         has = true
1614                                         break
1615                                 if table_scopers[el.name]
1616                                         break
1617                         if !has
1618                                 parse_error()
1619                                 return
1620                         clear_stack_to_table_body_context()
1621                         open_els.shift()
1622                         insertion_mode = ins_mode_in_table
1623                         insertion_mode t
1624                         return
1625                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1626                         parse_error()
1627                         return
1628                 # Anything else
1629                 ins_mode_in_table t
1630
1631         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1632         ins_mode_in_row = (t) ->
1633                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1634                         clear_stack_to_table_row_context()
1635                         insert_html_element t
1636                         insertion_mode = ins_mode_in_cell
1637                         afe_push_marker()
1638                         return
1639                 if t.type is TYPE_END_TAG and t.name is 'tr'
1640                         if is_in_table_scope 'tr'
1641                                 clear_stack_to_table_row_context()
1642                                 open_els.shift()
1643                                 insertion_mode = ins_mode_in_table_body
1644                         else
1645                                 parse_error()
1646                         return
1647                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1648                         if is_in_table_scope 'tr'
1649                                 clear_stack_to_table_row_context()
1650                                 open_els.shift()
1651                                 insertion_mode = ins_mode_in_table_body
1652                                 insertion_mode t
1653                         else
1654                                 parse_error()
1655                         return
1656                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1657                         if is_in_table_scope t.name # fixfull namespace
1658                                 if is_in_table_scope 'tr'
1659                                         clear_stack_to_table_row_context()
1660                                         open_els.shift()
1661                                         insertion_mode = ins_mode_in_table_body
1662                                         insertion_mode t
1663                         else
1664                                 parse_error()
1665                         return
1666                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1667                         parse_error()
1668                         return
1669                 # Anything else
1670                 ins_mode_in_table t
1671
1672         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1673         close_the_cell = ->
1674                 generate_implied_end_tags()
1675                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1676                         parse_error()
1677                 loop
1678                         el = open_els.shift()
1679                         if el.name is 'td' or el.name is 'th'
1680                                 break
1681                 clear_afe_to_marker()
1682                 insertion_mode = ins_mode_in_row
1683
1684         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1685         ins_mode_in_cell = (t) ->
1686                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1687                         if is_in_table_scope t.name
1688                                 generate_implied_end_tags()
1689                                 if open_els[0].name isnt t.name
1690                                         parse_error
1691                                 loop
1692                                         el = open_els.shift()
1693                                         if el.name is t.name
1694                                                 break
1695                                 clear_afe_to_marker()
1696                                 insertion_mode = ins_mode_in_row
1697                         else
1698                                 parse_error()
1699                         return
1700                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1701                         has = false
1702                         for el in open_els
1703                                 if el.name is 'td' or el.name is 'th'
1704                                         has = true
1705                                         break
1706                                 if table_scopers[el.name]
1707                                         break
1708                         if !has
1709                                 parse_error()
1710                                 return
1711                         close_the_cell()
1712                         insertion_mode t
1713                         return
1714                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1715                         parse_error()
1716                         return
1717                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1718                         if is_in_table_scope t.name # fixfull namespace
1719                                 close_the_cell()
1720                                 insertion_mode t
1721                         else
1722                                 parse_error()
1723                         return
1724                 # Anything Else
1725                 ins_mode_in_body t
1726
1727         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1728         tok_state_data = ->
1729                 switch c = txt.charAt(cur++)
1730                         when '&'
1731                                 return new_text_node parse_character_reference()
1732                         when '<'
1733                                 tok_state = tok_state_tag_open
1734                         when "\u0000"
1735                                 parse_error()
1736                                 return new_text_node c
1737                         when '' # EOF
1738                                 return new_eof_token()
1739                         else
1740                                 return new_text_node c
1741                 return null
1742
1743         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1744         # not needed: tok_state_character_reference_in_data = ->
1745         # just call parse_character_reference()
1746
1747         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
1748         tok_state_rcdata = ->
1749                 switch c = txt.charAt(cur++)
1750                         when '&'
1751                                 return new_text_node parse_character_reference()
1752                         when '<'
1753                                 tok_state = tok_state_rcdata_less_than_sign
1754                         when "\u0000"
1755                                 parse_error()
1756                                 return new_character_token "\ufffd"
1757                         when '' # EOF
1758                                 return new_eof_token()
1759                         else
1760                                 return new_character_token c
1761                 return null
1762
1763         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
1764         # not needed: tok_state_character_reference_in_rcdata = ->
1765         # just call parse_character_reference()
1766
1767         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
1768         tok_state_rawtext = ->
1769                 switch c = txt.charAt(cur++)
1770                         when '<'
1771                                 tok_state = tok_state_rawtext_less_than_sign
1772                         when "\u0000"
1773                                 parse_error()
1774                                 return new_character_token "\ufffd"
1775                         when '' # EOF
1776                                 return new_eof_token()
1777                         else
1778                                 return new_character_token c
1779                 return null
1780
1781         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
1782         tok_state_script_data = ->
1783                 switch c = txt.charAt(cur++)
1784                         when '<'
1785                                 tok_state = tok_state_script_data_less_than_sign
1786                         when "\u0000"
1787                                 parse_error()
1788                                 return new_character_token "\ufffd"
1789                         when '' # EOF
1790                                 return new_eof_token()
1791                         else
1792                                 return new_character_token c
1793                 return null
1794
1795         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
1796         tok_state_plaintext = ->
1797                 switch c = txt.charAt(cur++)
1798                         when "\u0000"
1799                                 parse_error()
1800                                 return new_character_token "\ufffd"
1801                         when '' # EOF
1802                                 return new_eof_token()
1803                         else
1804                                 return new_character_token c
1805                 return null
1806
1807
1808         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1809         tok_state_tag_open = ->
1810                 switch c = txt.charAt(cur++)
1811                         when '!'
1812                                 tok_state = tok_state_markup_declaration_open
1813                         when '/'
1814                                 tok_state = tok_state_end_tag_open
1815                         when '?'
1816                                 parse_error()
1817                                 tok_state = tok_state_bogus_comment
1818                         else
1819                                 if lc_alpha.indexOf(c) > -1
1820                                         tok_cur_tag = new_open_tag c
1821                                         tok_state = tok_state_tag_name
1822                                 else if uc_alpha.indexOf(c) > -1
1823                                         tok_cur_tag = new_open_tag c.toLowerCase()
1824                                         tok_state = tok_state_tag_name
1825                                 else
1826                                         parse_error()
1827                                         tok_state = tok_state_data
1828                                         cur -= 1 # we didn't parse/handle the char after <
1829                                         return new_text_node '<'
1830                 return null
1831
1832         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1833         tok_state_end_tag_open = ->
1834                 switch c = txt.charAt(cur++)
1835                         when '>'
1836                                 parse_error()
1837                                 tok_state = tok_state_data
1838                         when '' # EOF
1839                                 parse_error()
1840                                 tok_state = tok_state_data
1841                                 return new_text_node '</'
1842                         else
1843                                 if uc_alpha.indexOf(c) > -1
1844                                         tok_cur_tag = new_end_tag c.toLowerCase()
1845                                         tok_state = tok_state_tag_name
1846                                 else if lc_alpha.indexOf(c) > -1
1847                                         tok_cur_tag = new_end_tag c
1848                                         tok_state = tok_state_tag_name
1849                                 else
1850                                         parse_error()
1851                                         tok_state = tok_state_bogus_comment
1852                 return null
1853
1854         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1855         tok_state_tag_name = ->
1856                 switch c = txt.charAt(cur++)
1857                         when "\t", "\n", "\u000c", ' '
1858                                 tok_state = tok_state_before_attribute_name
1859                         when '/'
1860                                 tok_state = tok_state_self_closing_start_tag
1861                         when '>'
1862                                 tok_state = tok_state_data
1863                                 tmp = tok_cur_tag
1864                                 tok_cur_tag = null
1865                                 return tmp
1866                         when "\u0000"
1867                                 parse_error()
1868                                 tok_cur_tag.name += "\ufffd"
1869                         when '' # EOF
1870                                 parse_error()
1871                                 tok_state = tok_state_data
1872                         else
1873                                 if uc_alpha.indexOf(c) > -1
1874                                         tok_cur_tag.name += c.toLowerCase()
1875                                 else
1876                                         tok_cur_tag.name += c
1877                 return null
1878
1879         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
1880         tok_state_rcdata_less_than_sign = ->
1881                 c = txt.charAt(cur++)
1882                 if c is '/'
1883                         temporary_buffer = ''
1884                         tok_state = tok_state_rcdata_end_tag_open
1885                         return null
1886                 # Anything else
1887                 tok_state = tok_state_rcdata
1888                 cur -= 1 # reconsume the input character
1889                 return new_character_token '<'
1890
1891         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
1892         tok_state_rcdata_end_tag_open = ->
1893                 c = txt.charAt(cur++)
1894                 if uc_alpha.indexOf(c) > -1
1895                         tok_cur_tag = new_end_tag c.toLowerCase()
1896                         temporary_buffer += c
1897                         tok_state = tok_state_rcdata_end_tag_name
1898                         return null
1899                 if lc_alpha.indexOf(c) > -1
1900                         tok_cur_tag = new_end_tag c
1901                         temporary_buffer += c
1902                         tok_state = tok_state_rcdata_end_tag_name
1903                         return null
1904                 # Anything else
1905                 tok_state = tok_state_rcdata
1906                 cur -= 1 # reconsume the input character
1907                 return new_character_token "</" # fixfull separate these
1908
1909         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
1910         is_appropriate_end_tag = (t) ->
1911                 # spec says to check against "the tag name of the last start tag to
1912                 # have been emitted from this tokenizer", but this is only called from
1913                 # the various "raw" states, which I'm pretty sure all push the start
1914                 # token onto open_els. TODO: verify this after the script data states
1915                 # are implemented
1916                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
1917                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
1918
1919         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
1920         tok_state_rcdata_end_tag_name = ->
1921                 c = txt.charAt(cur++)
1922                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1923                         if is_appropriate_end_tag tok_cur_tag
1924                                 tok_state = tok_state_before_attribute_name
1925                                 return
1926                         # else fall through to "Anything else"
1927                 if c is '/'
1928                         if is_appropriate_end_tag tok_cur_tag
1929                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
1930                                 return
1931                         # else fall through to "Anything else"
1932                 if c is '>'
1933                         if is_appropriate_end_tag tok_cur_tag
1934                                 tok_state = tok_state_data
1935                                 return tok_cur_tag
1936                         # else fall through to "Anything else"
1937                 if uc_alpha.indexOf(c) > -1
1938                         tok_cur_tag.name += c.toLowerCase()
1939                         temporary_buffer += c
1940                         return null
1941                 if lc_alpha.indexOf(c) > -1
1942                         tok_cur_tag.name += c
1943                         temporary_buffer += c
1944                         return null
1945                 # Anything else
1946                 tok_state = tok_state_rcdata
1947                 cur -= 1 # reconsume the input character
1948                 return new_character_token '</' + temporary_buffer # fixfull separate these
1949
1950         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
1951         tok_state_rawtext_less_than_sign = ->
1952                 c = txt.charAt(cur++)
1953                 if c is '/'
1954                         temporary_buffer = ''
1955                         tok_state = tok_state_rawtext_end_tag_open
1956                         return null
1957                 # Anything else
1958                 tok_state = tok_state_rawtext
1959                 cur -= 1 # reconsume the input character
1960                 return new_character_token '<'
1961
1962         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
1963         tok_state_rawtext_end_tag_open = ->
1964                 c = txt.charAt(cur++)
1965                 if uc_alpha.indexOf(c) > -1
1966                         tok_cur_tag = new_end_tag c.toLowerCase()
1967                         temporary_buffer += c
1968                         tok_state = tok_state_rawtext_end_tag_name
1969                         return null
1970                 if lc_alpha.indexOf(c) > -1
1971                         tok_cur_tag = new_end_tag c
1972                         temporary_buffer += c
1973                         tok_state = tok_state_rawtext_end_tag_name
1974                         return null
1975                 # Anything else
1976                 tok_state = tok_state_rawtext
1977                 cur -= 1 # reconsume the input character
1978                 return new_character_token "</" # fixfull separate these
1979
1980         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
1981         tok_state_rawtext_end_tag_name = ->
1982                 c = txt.charAt(cur++)
1983                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1984                         if is_appropriate_end_tag tok_cur_tag
1985                                 tok_state = tok_state_before_attribute_name
1986                                 return
1987                         # else fall through to "Anything else"
1988                 if c is '/'
1989                         if is_appropriate_end_tag tok_cur_tag
1990                                 tok_state = tok_state_self_closing_start_tag
1991                                 return
1992                         # else fall through to "Anything else"
1993                 if c is '>'
1994                         if is_appropriate_end_tag tok_cur_tag
1995                                 tok_state = tok_state_data
1996                                 return tok_cur_tag
1997                         # else fall through to "Anything else"
1998                 if uc_alpha.indexOf(c) > -1
1999                         tok_cur_tag.name += c.toLowerCase()
2000                         temporary_buffer += c
2001                         return null
2002                 if lc_alpha.indexOf(c) > -1
2003                         tok_cur_tag.name += c
2004                         temporary_buffer += c
2005                         return null
2006                 # Anything else
2007                 tok_state = tok_state_rawtext
2008                 cur -= 1 # reconsume the input character
2009                 return new_character_token '</' + temporary_buffer # fixfull separate these
2010
2011         # TODO _all_ of the missing states here (17-33) are for parsing script tags
2012
2013         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2014         tok_state_before_attribute_name = ->
2015                 attr_name = null
2016                 switch c = txt.charAt(cur++)
2017                         when "\t", "\n", "\u000c", ' '
2018                                 return null
2019                         when '/'
2020                                 tok_state = tok_state_self_closing_start_tag
2021                                 return null
2022                         when '>'
2023                                 tok_state = tok_state_data
2024                                 tmp = tok_cur_tag
2025                                 tok_cur_tag = null
2026                                 return tmp
2027                         when "\u0000"
2028                                 parse_error()
2029                                 attr_name = "\ufffd"
2030                         when '"', "'", '<', '='
2031                                 parse_error()
2032                                 attr_name = c
2033                         when '' # EOF
2034                                 parse_error()
2035                                 tok_state = tok_state_data
2036                         else
2037                                 if uc_alpha.indexOf(c) > -1
2038                                         attr_name = c.toLowerCase()
2039                                 else
2040                                         attr_name = c
2041                 if attr_name?
2042                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2043                         tok_state = tok_state_attribute_name
2044                 return null
2045
2046         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2047         tok_state_attribute_name = ->
2048                 switch c = txt.charAt(cur++)
2049                         when "\t", "\n", "\u000c", ' '
2050                                 tok_state = tok_state_after_attribute_name
2051                         when '/'
2052                                 tok_state = tok_state_self_closing_start_tag
2053                         when '='
2054                                 tok_state = tok_state_before_attribute_value
2055                         when '>'
2056                                 tok_state = tok_state_data
2057                                 tmp = tok_cur_tag
2058                                 tok_cur_tag = null
2059                                 return tmp
2060                         when "\u0000"
2061                                 parse_error()
2062                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2063                         when '"', "'", '<'
2064                                 parse_error()
2065                                 tok_cur_tag.attrs_a[0][0] = c
2066                         when '' # EOF
2067                                 parse_error()
2068                                 tok_state = tok_state_data
2069                         else
2070                                 if uc_alpha.indexOf(c) > -1
2071                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2072                                 else
2073                                         tok_cur_tag.attrs_a[0][0] += c
2074                 return null
2075
2076         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2077         tok_state_after_attribute_name = ->
2078                 c = txt.charAt(cur++)
2079                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2080                         return
2081                 if c is '/'
2082                         tok_state = tok_state_self_closing_start_tag
2083                         return
2084                 if c is '='
2085                         tok_state = tok_state_before_attribute_value
2086                         return
2087                 if c is '>'
2088                         tok_state = tok_state_data
2089                         return
2090                 if uc_alpha.indexOf(c) > -1
2091                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2092                         tok_state = tok_state_attribute_name
2093                         return
2094                 if c is "\u0000"
2095                         parse_error()
2096                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2097                         tok_state = tok_state_attribute_name
2098                         return
2099                 if c is '' # EOF
2100                         parse_error()
2101                         tok_state = tok_state_data
2102                         cur -= 1 # reconsume
2103                         return
2104                 if c is '"' or c is "'" or c is '<'
2105                         parse_error()
2106                         # fall through to Anything else
2107                 # Anything else
2108                 tok_cur_tag.attrs_a.unshift [c, '']
2109                 tok_state = tok_state_attribute_name
2110
2111         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2112         tok_state_before_attribute_value = ->
2113                 switch c = txt.charAt(cur++)
2114                         when "\t", "\n", "\u000c", ' '
2115                                 return null
2116                         when '"'
2117                                 tok_state = tok_state_attribute_value_double_quoted
2118                         when '&'
2119                                 tok_state = tok_state_attribute_value_unquoted
2120                                 cur -= 1
2121                         when "'"
2122                                 tok_state = tok_state_attribute_value_single_quoted
2123                         when "\u0000"
2124                                 # Parse error
2125                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2126                                 tok_state = tok_state_attribute_value_unquoted
2127                         when '>'
2128                                 # Parse error
2129                                 tok_state = tok_state_data
2130                                 tmp = tok_cur_tag
2131                                 tok_cur_tag = null
2132                                 return tmp
2133                         when '' # EOF
2134                                 parse_error()
2135                                 tok_state = tok_state_data
2136                         else
2137                                 tok_cur_tag.attrs_a[0][1] += c
2138                                 tok_state = tok_state_attribute_value_unquoted
2139                 return null
2140
2141         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2142         tok_state_attribute_value_double_quoted = ->
2143                 switch c = txt.charAt(cur++)
2144                         when '"'
2145                                 tok_state = tok_state_after_attribute_value_quoted
2146                         when '&'
2147                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2148                         when "\u0000"
2149                                 # Parse error
2150                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2151                         when '' # EOF
2152                                 parse_error()
2153                                 tok_state = tok_state_data
2154                         else
2155                                 tok_cur_tag.attrs_a[0][1] += c
2156                 return null
2157
2158         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2159         tok_state_attribute_value_single_quoted = ->
2160                 switch c = txt.charAt(cur++)
2161                         when "'"
2162                                 tok_state = tok_state_after_attribute_value_quoted
2163                         when '&'
2164                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2165                         when "\u0000"
2166                                 # Parse error
2167                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2168                         when '' # EOF
2169                                 parse_error()
2170                                 tok_state = tok_state_data
2171                         else
2172                                 tok_cur_tag.attrs_a[0][1] += c
2173                 return null
2174
2175         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2176         tok_state_attribute_value_unquoted = ->
2177                 switch c = txt.charAt(cur++)
2178                         when "\t", "\n", "\u000c", ' '
2179                                 tok_state = tok_state_before_attribute_name
2180                         when '&'
2181                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2182                         when '>'
2183                                 tok_state = tok_state_data
2184                                 tmp = tok_cur_tag
2185                                 tok_cur_tag = null
2186                                 return tmp
2187                         when "\u0000"
2188                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2189                         when '' # EOF
2190                                 parse_error()
2191                                 tok_state = tok_state_data
2192                         else
2193                                 # Parse Error if ', <, = or ` (backtick)
2194                                 tok_cur_tag.attrs_a[0][1] += c
2195                 return null
2196
2197         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2198         tok_state_after_attribute_value_quoted = ->
2199                 switch c = txt.charAt(cur++)
2200                         when "\t", "\n", "\u000c", ' '
2201                                 tok_state = tok_state_before_attribute_name
2202                         when '/'
2203                                 tok_state = tok_state_self_closing_start_tag
2204                         when '>'
2205                                 tok_state = tok_state_data
2206                                 tmp = tok_cur_tag
2207                                 tok_cur_tag = null
2208                                 return tmp
2209                         when '' # EOF
2210                                 parse_error()
2211                                 tok_state = tok_state_data
2212                         else
2213                                 # Parse Error
2214                                 tok_state = tok_state_before_attribute_name
2215                                 cur -= 1 # we didn't handle that char
2216                 return null
2217
2218         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2219         # Don't set this as a state, just call it
2220         # returns a string (NOT a text node)
2221         parse_character_reference = (allowed_char = null, in_attr = false) ->
2222                 if cur >= txt.length
2223                         return '&'
2224                 switch c = txt.charAt(cur)
2225                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2226                                 # explicitly not a parse error
2227                                 return '&'
2228                         when ';'
2229                                 # there has to be "one or more" alnums between & and ; to be a parse error
2230                                 return '&'
2231                         when '#'
2232                                 if cur + 1 >= txt.length
2233                                         return '&'
2234                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
2235                                         prefix = '#x'
2236                                         charset = hex_chars
2237                                         start = cur + 2
2238                                 else
2239                                         charset = digits
2240                                         start = cur + 1
2241                                         prefix = '#'
2242                                 i = 0
2243                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2244                                         i += 1
2245                                 if i is 0
2246                                         return '&'
2247                                 if txt.charAt(start + i) is ';'
2248                                         i += 1
2249                                 # FIXME This is supposed to generate parse errors for some chars
2250                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2251                                 if decoded?
2252                                         cur = start + i
2253                                         return decoded
2254                                 return '&'
2255                         else
2256                                 for i in [0...31]
2257                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
2258                                                 break
2259                                 if i is 0
2260                                         # exit early, because parse_error() below needs at least one alnum
2261                                         return '&'
2262                                 if txt.charAt(cur + i) is ';'
2263                                         i += 1 # include ';' terminator in value
2264                                         decoded = decode_named_char_ref txt.substr(cur, i)
2265                                         if decoded?
2266                                                 cur += i
2267                                                 return decoded
2268                                         parse_error()
2269                                         return '&'
2270                                 else
2271                                         # no ';' terminator (only legacy char refs)
2272                                         max = i
2273                                         for i in [2..max] # no prefix matches, so ok to check shortest first
2274                                                 c = legacy_char_refs[txt.substr(cur, i)]
2275                                                 if c?
2276                                                         if in_attr
2277                                                                 if txt.charAt(cur + i) is '='
2278                                                                         # "because some legacy user agents will
2279                                                                         # misinterpret the markup in those cases"
2280                                                                         parse_error()
2281                                                                         return '&'
2282                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
2283                                                                         # this makes attributes forgiving about url args
2284                                                                         return '&'
2285                                                         # ok, and besides the weird exceptions for attributes...
2286                                                         # return the matching char
2287                                                         cur += i # consume entity chars
2288                                                         parse_error() # because no terminating ";"
2289                                                         return c
2290                                         parse_error()
2291                                         return '&'
2292                 return # never reached
2293
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
        #
        # "tree" is a stand-in root element: it is the first open element, and only
        # its children are handed back to the caller at the end.
        tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
        open_els = [tree]
        afe = [] # active formatting elements
        template_insertion_modes = []
        insertion_mode = ins_mode_in_body
        original_insertion_mode = insertion_mode # TODO check spec
        flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
        flag_frameset_ok = true
        flag_parsing = true
        flag_foster_parenting = false
        form_element_pointer = null
        temporary_buffer = null
        pending_table_character_tokens = []

        # tokenizer initialization
        tok_state = tok_state_data

        # process input
        #
        # Each pass runs the current tokenizer state once. A state returns null
        # when it has no token yet, or a token, which is then passed to the
        # current insertion mode. The loop ends when flag_parsing becomes false
        # (presumably cleared by the tree construction code; not visible here).
        while flag_parsing
                t = tok_state()
                if t?
                        insertion_mode t
        return tree.children
2319
2320 # everything below is tests on the above
# Compare output with expected_output (strict equality) and log the result.
#
# description: label printed only when the comparison fails
# output: the value that was produced
# expected_output: the value that should have been produced
test_equals = (description, output, expected_output) ->
        unless output is expected_output
                console.log "FAILED: \"#{description}\""
                console.log "   Expected: #{expected_output}"
                return console.log "     Actual: #{output}"
        # don't say name, so smart consoles can merge all of these
        console.log "passed."
# Serialize every element of els and join the results with commas.
#
# els: list of objects that have a serialize(shallow, show_ids) method
# shallow, show_ids: passed through unchanged to each serialize() call
#
# Returns one string ('' when els is empty).
serialize_els = (els, shallow, show_ids) ->
        serialized = for t in els
                '' + t.serialize(shallow, show_ids)
        return serialized.join ','
# Run one parser test case and log whether it passed.
#
# args.name:     label for the test
# args.html:     markup handed to parse_html
# args.expected: the serialization (see serialize_els) the parse should produce
#
# On failure the debug log is printed first, followed by the input, the
# expected and actual serializations, and any parse errors that were reported.
test_parser = (args) ->
        debug_log_reset()
        prev_node_id = 0 # reset counter
        parse_errors = []
        errors_cb = (i) -> parse_errors.push i
        parsed = parse_html args.html, errors_cb
        serialized = serialize_els parsed, false, false
        if serialized is args.expected
                return console.log "passed \"#{args.name}\""
        # mismatch: show everything needed to diagnose it
        debug_log_each (str) -> console.log str
        console.log "FAILED: \"#{args.name}\""
        console.log "      Input: #{args.html}"
        console.log "    Correct: #{args.expected}"
        console.log "     Output: #{serialized}"
        if parse_errors.length > 0
                console.log " parse errs: #{JSON.stringify parse_errors}"
        else
                console.log "   No parse errors"
2357
# Basic cases: plain text, character references, attributes (including the
# attribute-only character reference exceptions) and mis-nested tags.
# Each call parses "html" and compares the serialized result with "expected".
test_parser name: "empty", \
        html: "",
        expected: ''
test_parser name: "just text", \
        html: "abc",
        expected: 'text:"abc"'
test_parser name: "named entity", \
        html: "a&amp;1234",
        expected: 'text:"a&1234"'
test_parser name: "broken named character references", \
        html: "1&amp2&&amp;3&aabbcc;",
        expected: 'text:"1&2&&3&aabbcc;"'
test_parser name: "numbered entity overrides", \
        html: "1&#X80&#x80; &#x83",
        expected: 'text:"1€€ ƒ"'
test_parser name: "open tag", \
        html: "foo<span>bar",
        expected: 'text:"foo",tag:"span",{},[text:"bar"]'
test_parser name: "open tag with attributes", \
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
        expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
test_parser name: "open tag with attributes of various quotings", \
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
        expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
# In attribute values a ";"-less "&amp" followed by "=" or an alphanumeric
# stays literal; the same input is tried with each quoting style.
test_parser name: "attribute entity exceptions dq", \
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser name: "attribute entity exceptions sq", \
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser name: "attribute entity exceptions uq", \
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser name: "matching closing tags", \
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
        expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
test_parser name: "missing closing tag inside", \
        html: "foo<div>bar<span>baz</div>qux",
        expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
test_parser name: "mis-matched closing tags", \
        html: "<span>12<div>34</span>56</div>78",
        expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
test_parser name: "mis-matched formatting elements", \
        html: "12<b>34<i>56</b>78</i>90",
        expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
        html: '<p>1<b>2<i>3</b>4</i>5</p>',
        expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
        html: '<b>1<p>2</b>3</p>',
        expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
test_parser name: "crazy formatting elements test", \
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
        # firefox does this:
        expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
# ("aaa" in the names presumably stands for the adoption agency algorithm;
# the expected strings use this file's own serialization format)
test_parser name: "html5lib aaa 1", \
        html: '<a><p></a></p>',
        expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
test_parser name: "html5lib aaa 2", \
        html: '<a>1<p>2</a>3</p>',
        expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
test_parser name: "html5lib aaa 3", \
        html: '<a>1<button>2</a>3</button>',
        expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
test_parser name: "html5lib aaa 4", \
        html: '<a>1<b>2</a>3</b>',
        expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
test_parser name: "html5lib aaa 5 (two divs deep)", \
        html: '<a>1<div>2<div>3</a>4</div>5</div>',
        expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
test_parser name: "html5lib aaa 6 (foster parenting)", \
        html: '<table><a>1<p>2</a>3</p>',
        expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
        html: '<b><b><a><p></a>',
        expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
        html: '<b><a><b><p></a>',
        expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
        html: '<a><b><b><p></a>',
        expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
        html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
        expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
        html: '<table><a>1<td>2</td>3</table>',
        expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
        html: '<table>A<td>B</td>C</table>',
        expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
# TODO implement svg and namespacing
#test_parser name: "html5lib aaa 13 (svg tr input)", \
#       html: '<a><svg><tr><input></a>',
#       expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
        html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
        expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
        html: '<div><a><b><u><i><code><div></a>',
        expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
        html: '<b><b><b><b>x</b></b></b></b>y',
        expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
        html: '<p><b><b><b><b><p>x',
        expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
# same input as aaa 17, but the attributes are written in differing orders;
# the expected output lists them in the same order on every element
test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
        html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
        expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
# the "," right after the closing quote is expected to become an attribute
# named "," with an empty value
test_parser name: "junk after attribute close-quote", \
        html: '<p><b c="d", e="f">foo<p>x',
        expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
# NOTE(review): presumably taken from adoption02.dat in the same repository - confirm
test_parser name: "html5lib aaa02 1", \
        html: '<b>1<i>2<p>3</b>4',
        expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
test_parser name: "html5lib aaa02 2", \
        html: '<a><div><style></style><address><a>',
        expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
# Table parsing cases: implied tbody/tr/colgroup, stray end tags inside
# tables, <select> inside <table>, and content moved in front of the table.
# NOTE(review): presumably taken from the html5lib tree-construction
# "tables" test data - confirm the exact file.
test_parser name: "html5lib tables 1", \
        html: '<table><th>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
test_parser name: "html5lib tables 2", \
        html: '<table><td>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
test_parser name: "html5lib tables 3", \
        html: "<table><col foo='bar'>",
        expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
test_parser name: "html5lib tables 4", \
        html: '<table><colgroup></html>foo',
        expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
test_parser name: "html5lib tables 5", \
        html: '<table></table><p>foo',
        expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
test_parser name: "html5lib tables 6", \
        html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
test_parser name: "html5lib tables 7", \
        html: '<table><select><option>3</select></table>',
        expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
test_parser name: "html5lib tables 8", \
        html: '<table><select><table></table></select></table>',
        expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
test_parser name: "html5lib tables 9", \
        html: '<table><select></table>',
        expected: 'tag:"select",{},[],tag:"table",{},[]'
test_parser name: "html5lib tables 10", \
        html: '<table><select><option>A<tr><td>B</td></tr></table>',
        expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
test_parser name: "html5lib tables 11", \
        html: '<table><td></body></caption></col></colgroup></html>foo',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
test_parser name: "html5lib tables 12", \
        html: '<table><td>A</table>B',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
test_parser name: "html5lib tables 13", \
        html: '<table><tr><caption>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
test_parser name: "html5lib tables 14", \
        html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
test_parser name: "html5lib tables 15", \
        html: '<table><td><tr>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
test_parser name: "html5lib tables 16", \
        html: '<table><td><button><td>',
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
# TODO implement svg parsing
#test_parser name: "html5lib tables 17", \
#       html: '<table><tr><td><svg><desc><td>',
#       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'