parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
  22 # indicate places where additional code is needed for full HTML document
  23 # parsing.
  24 #
  25 # Instead, the data structure produced by this parser is an array of Nodes.
  26
  27
  28 # stacks/lists
  29 #
  30 # The spec uses many different words to indicate which ends of lists/stacks
  31 # it is talking about (and relative movement within the lists/stacks). This
  32 # section explains them. I'm implementing "lists" (afe and open_els) the same
  33 # way (both as stacks).
  34 #
  35 # stacks grow downward (current element is index=0)
  36 #
  37 # example: open_els = [a, b, c, d, e, f, g]
  38 #
  39 # "grows downwards" means it's visualized like this: (index: el, names)
  40 #
  41 #   6: g "start of the list", "topmost", "first"
  42 #   5: f
  43 #   4: e "previous" (to d), "above", "before"
  44 #   3: d   (previous/next are relative to this element)
  45 #   2: c "next", "after", "lower", "below"
  46 #   1: b
  47 #   0: a "end of the list", "current node", "bottommost", "last"
  48
  49
  50
  51 # Each node is an obect of the Node class. Here are the Node types:
  52 TYPE_TAG = 0 # name, {attributes}, [children]
  53 TYPE_TEXT = 1 # "text"
  54 TYPE_COMMENT = 2
  55 TYPE_DOCTYPE = 3
  56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  58 TYPE_END_TAG = 5 # name
  59 TYPE_EOF = 6
  60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  62
  63 # namespace constants
  64 NS_HTML = 1
  65 NS_MATHML = 2
  66 NS_SVG = 3
  67
  68 g_debug_log = []
  69 debug_log_reset = ->
  70         g_debug_log = []
  71 debug_log = (str) ->
  72         g_debug_log.push str
  73 debug_log_each = (cb) ->
  74         for str in g_debug_log
  75                 cb str
  76
  77 prev_node_id = 0
  78 class Node
  79         constructor: (type, args = {}) ->
  80                 @type = type # one of the TYPE_* constants above
  81                 @name = args.name ? '' # tag name
  82                 @text = args.text ? '' # contents for text/comment nodes
  83                 @attrs = args.attrs ? {}
  84                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
  85                 @children = args.children ? []
  86                 @namespace = args.namespace ? NS_HTML
  87                 @parent = args.parent ? null
  88                 if args.id?
  89                         @id = "#{args.id}+"
  90                 else
  91                         @id = "#{++prev_node_id}"
  92         shallow_clone: -> # return a new node that's the same except without the children or parent
  93                 # WARNING this doesn't work right on open tags that are still being parsed
  94                 attrs = {}
  95                 attrs[k] = v for k, v of @attrs
  96                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
  97         acknowledge_self_closing: ->
  98                 @flag 'did_self_close', true
  99         flag: ->
 100                 # fixfull
 101         serialize: (shallow = false, show_ids = false) -> # for unit tests
 102                 ret = ''
 103                 switch @type
 104                         when TYPE_TAG
 105                                 ret += 'tag:'
 106                                 ret += JSON.stringify @name
 107                                 ret += ','
 108                                 if show_ids
 109                                         ret += "##{@id},"
 110                                 if shallow
 111                                         break
 112                                 attr_keys = []
 113                                 for k of @attrs
 114                                         attr_keys.push k
 115                                 attr_keys.sort()
 116                                 ret += '{'
 117                                 sep = ''
 118                                 for k in attr_keys
 119                                         ret += sep
 120                                         sep = ','
 121                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
 122                                 ret += '},['
 123                                 sep = ''
 124                                 for c in @children
 125                                         ret += sep
 126                                         sep = ','
 127                                         ret += c.serialize shallow, show_ids
 128                                 ret += ']'
 129                         when TYPE_TEXT
 130                                 ret += 'text:'
 131                                 ret += JSON.stringify @text
 132                         when TYPE_COMMENT
 133                                 ret += 'comment:'
 134                                 ret += JSON.stringify @text
 135                         when TYPE_DOCTYPE
 136                                 ret += 'doctype'
 137                                 # FIXME
 138                         when TYPE_AFE_MARKER
 139                                 ret += 'marker'
 140                         when TYPE_AAA_BOOKMARK
 141                                 ret += 'aaa_bookmark'
 142                         else
 143                                 ret += 'unknown:'
 144                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 145                 return ret
 146
 147 # helpers: (only take args that are normally known when parser creates nodes)
 148 new_open_tag = (name) ->
 149         return new Node TYPE_START_TAG, name: name
 150 new_end_tag = (name) ->
 151         return new Node TYPE_END_TAG, name: name
 152 new_element = (name) ->
 153         return new Node TYPE_TAG, name: name
 154 new_text_node = (txt) ->
 155         return new Node TYPE_TEXT, text: txt
 156 new_character_token = new_text_node
 157 new_comment_node = (txt) ->
 158         return new Node TYPE_COMMENT, text: txt
 159 new_eof_token = ->
 160         return new Node TYPE_EOF
 161 new_afe_marker = ->
 162         return new Node TYPE_AFE_MARKER
 163 new_aaa_bookmark = ->
 164         return new Node TYPE_AAA_BOOKMARK
 165
 166 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 167 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 168 digits = "0123456789"
 169 alnum = lc_alpha + uc_alpha + digits
 170 hex_chars = digits + "abcdefABCDEF"
 171
 172 # some SVG elements have dashes in them
 173 tag_name_chars = alnum + "-"
 174
 175 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 176 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 177
 178 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 179 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 180
 181 # These are the character references that don't need a terminating semicolon
 182 # min length: 2, max: 6, none are a prefix of any other.
 183 legacy_char_refs = {
 184         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 185         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 186         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 187         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 188         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 189         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 190         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 191         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 192         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 193         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 194         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 195         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 196         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 197         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 198         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 199         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 200         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 201         yen: '¥', yuml: 'ÿ'
 202 }
 203
 204 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 205 raw_text_elements = ['script', 'style']
 206 escapable_raw_text_elements = ['textarea', 'title']
 207 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 208 svg_elements = [
 209         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 210         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 211         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 212         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 213         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 214         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 215         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 216         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 217         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 218         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 219         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 220         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 221         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 222         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 223         'view', 'vkern'
 224 ]
 225
 226 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 227 mathml_elements = [
 228         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 229         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 230         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 231         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 232         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 233         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 234         'determinant', 'diff', 'divergence', 'divide', 'domain',
 235         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 236         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 237         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 238         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 239         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 240         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 241         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 242         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 243         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 244         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 245         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 246         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 247         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 248         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 249         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 250         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 251         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 252         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 253         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 254         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 255         'vectorproduct', 'xor'
 256 ]
 257 # foreign_elements = [svg_elements..., mathml_elements...]
 258 #normal_elements = All other allowed HTML elements are normal elements.
 259
 260 special_elements = {
 261         # HTML:
 262         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 263         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 264         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 265         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 266         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 267         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 268         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 269         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 270         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 271         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 272         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
 273         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
 274         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
 275         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
 276         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
 277         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
 278         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
 279         wbr:NS_HTML, xmp:NS_HTML,
 280
 281         # MathML:
 282         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 283         'annotation-xml':NS_MATHML,
 284
 285         # SVG:
 286         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 287 }
 288
 289 formatting_elements = {
 290          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 291          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 292          u: true
 293 }
 294
 295 foster_parenting_targets = {
 296         table: true
 297         tbody: true
 298         tfoot: true
 299         thead: true
 300         tr: true
 301 }
 302
 303 # all html I presume
 304 end_tag_implied = {
 305         dd: true
 306         dt: true
 307         li: true
 308         option: true
 309         optgroup: true
 310         p: true
 311         rb: true
 312         rp: true
 313         rt: true
 314         rtc: true
 315 }
 316
 317 el_is_special = (e) ->
 318         return special_elements[e.name]?
 319         # FIXME it should really be:
 320         #return special_elements[e.name] is e.namespace
 321
 322 # decode_named_char_ref()
 323 #
 324 # The list of named character references is _huge_ so ask the browser to decode
 325 # for us instead of wasting bandwidth/space on including the table here.
 326 #
 327 # Pass without the "&" but with the ";" examples:
 328 #    for "&amp" pass "amp;"
 329 #    for "&#x2032" pass "x2032;"
 330 g_dncr = {
 331         cache: {}
 332         textarea: document.createElement('textarea')
 333 }
 334 # TODO test this in IE8
 335 decode_named_char_ref = (txt) ->
 336         txt = "&#{txt}"
 337         decoded = g_dncr.cache[txt]
 338         return decoded if decoded?
 339         g_dncr.textarea.innerHTML = txt
 340         decoded = g_dncr.textarea.value
 341         return null if decoded is txt
 342         return g_dncr.cache[txt] = decoded
 343
 344 parse_html = (txt, parse_error_cb = null) ->
 345         cur = 0 # index of next char in txt to be parsed
 346         # declare tree and tokenizer variables so they're in scope below
 347         tree = null
 348         open_els = null # stack of open elements
 349         afe = null # active formatting elements
 350         template_insertion_modes = null
 351         insertion_mode = null
 352         original_insertion_mode = null
 353         tok_state = null
 354         tok_cur_tag = null # partially parsed tag
 355         flag_scripting = null
 356         flag_frameset_ok = null
 357         flag_parsing = null
 358         flag_foster_parenting = null
 359         form_element_pointer = null
 360         temporary_buffer = null
 361         pending_table_character_tokens = null
 362
 363         parse_error = ->
 364                 if parse_error_cb?
 365                         parse_error_cb cur
 366                 else
 367                         console.log "Parse error at character #{cur} of #{txt.length}"
 368
 369         afe_push = (new_el) ->
 370                 matches = 0
 371                 for el, i in afe
 372                         if el.name is new_el.name and el.namespace is new_el.namespace
 373                                 for k, v of el.attrs
 374                                         continue unless new_el.attrs[k] is v
 375                                 for k, v of new_el.attrs
 376                                         continue unless el.attrs[k] is v
 377                                 matches += 1
 378                                 if matches is 3
 379                                         afe.splice i, 1
 380                                         break
 381                 afe.unshift new_el
 382         afe_push_marker = ->
 383                 afe.unshift new_afe_marker()
 384
  385         # the functions below implement the Tree Construction algorithm
 386         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 387
 388         # But first... the helpers
 389         template_tag_is_open = ->
 390                 for t in open_els
 391                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
 392                                 return true
 393                 return false
 394         is_in_scope_x = (tag_name, scope, namespace) ->
 395                 for t in open_els
 396                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 397                                 return true
 398                         if scope[t.name] is t.namespace
 399                                 return false
 400                 return false
 401         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 402                 for t in open_els
 403                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 404                                 return true
 405                         if scope[t.name] is t.namespace
 406                                 return false
 407                         if scope2[t.name] is t.namespace
 408                                 return false
 409                 return false
 410         standard_scopers = { # FIXME these are supposed to be namespace specific
 411                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
 412                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
 413                 template: NS_HTML, mi: NS_MATHML,
 414
 415                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
 416                 'annotation-xml': NS_MATHML,
 417
 418                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
 419         }
 420         button_scopers = button: NS_HTML
 421         li_scopers = ol: NS_HTML, ul: NS_HTML
 422         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 423         is_in_scope = (tag_name, namespace = null) ->
 424                 return is_in_scope_x tag_name, standard_scopers, namespace
 425         is_in_button_scope = (tag_name, namespace = null) ->
 426                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 427         is_in_table_scope = (tag_name, namespace = null) ->
 428                 return is_in_scope_x tag_name, table_scopers, namespace
 429         is_in_select_scope = (tag_name, namespace = null) ->
 430                 for t in open_els
 431                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 432                                 return true
 433                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
 434                                 return false
 435                 return false
 436         # this checks for a particular element, not by name
 437         el_is_in_scope = (el) ->
 438                 for t in open_els
 439                         if t is el
 440                                 return true
 441                         if standard_scopers[t.name] is t.namespace
 442                                 return false
 443                 return false
 444
 445         # 8.2.3.1 ...
 446         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
 447         reset_insertion_mode = ->
 448                 # 1. Let last be false.
 449                 last = false
 450                 # 2. Let node be the last node in the stack of open elements.
 451                 node_i = 0
 452                 node = open_els[node_i]
 453                 # 3. Loop: If node is the first node in the stack of open elements,
 454                 # then set last to true, and, if the parser was originally created as
 455                 # part of the HTML fragment parsing algorithm (fragment case) set node
 456                 # to the context element.
 457                 loop
 458                         if node_i is open_els.length - 1
 459                                 last = true
 460                                 # fixfull (fragment case)
 461
 462                         # 4. If node is a select element, run these substeps:
 463                         if node.name is 'select'
 464                                 # 1. If last is true, jump to the step below labeled done.
 465                                 unless last
 466                                         # 2. Let ancestor be node.
 467                                         ancestor_i = node_i
 468                                         ancestor = node
 469                                         # 3. Loop: If ancestor is the first node in the stack of
 470                                         # open elements, jump to the step below labeled done.
 471                                         loop
 472                                                 if ancestor_i is open_els.length - 1
 473                                                         break
 474                                                 # 4. Let ancestor be the node before ancestor in the stack
 475                                                 # of open elements.
 476                                                 ancestor_i += 1
 477                                                 ancestor = open_els[ancestor_i]
 478                                                 # 5. If ancestor is a template node, jump to the step below
 479                                                 # labeled done.
 480                                                 if ancestor.name is 'template'
 481                                                         break
 482                                                 # 6. If ancestor is a table node, switch the insertion mode
 483                                                 # to "in select in table" and abort these steps.
 484                                                 if ancestor.name is 'table'
 485                                                         insertion_mode = ins_mode_in_select_in_table
 486                                                         return
 487                                                 # 7. Jump back to the step labeled loop.
 488                                 # 8. Done: Switch the insertion mode to "in select" and abort
 489                                 # these steps.
 490                                 insertion_mode = ins_mode_in_select
 491                                 return
 492                         # 5. If node is a td or th element and last is false, then switch
 493                         # the insertion mode to "in cell" and abort these steps.
 494                         if (node.name is 'td' or node.name is 'th') and last is false
 495                                 insertion_mode = ins_mode_in_cell
 496                                 return
 497                         # 6. If node is a tr element, then switch the insertion mode to "in
 498                         # row" and abort these steps.
 499                         if node.name is 'tr'
 500                                 insertion_mode = ins_mode_in_row
 501                                 return
 502                         # 7. If node is a tbody, thead, or tfoot element, then switch the
 503                         # insertion mode to "in table body" and abort these steps.
 504                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
 505                                 insertion_mode = ins_mode_in_table_body
 506                                 return
 507                         # 8. If node is a caption element, then switch the insertion mode
 508                         # to "in caption" and abort these steps.
 509                         if node.name is 'caption'
 510                                 insertion_mode = ins_mode_in_caption
 511                                 return
 512                         # 9. If node is a colgroup element, then switch the insertion mode
 513                         # to "in column group" and abort these steps.
 514                         if node.name is 'colgroup'
 515                                 insertion_mode = ins_mode_in_column_group
 516                                 return
 517                         # 10. If node is a table element, then switch the insertion mode to
 518                         # "in table" and abort these steps.
 519                         if node.name is 'table'
 520                                 insertion_mode = ins_mode_in_table
 521                                 return
 522                         # 11. If node is a template element, then switch the insertion mode
 523                         # to the current template insertion mode and abort these steps.
 524                         # fixfull (template insertion mode stack)
 525
 526                         # 12. If node is a head element and last is true, then switch the
 527                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
 528                         # these steps. (fragment case)
 529                         if node.name is 'head' and last
 530                                 insertion_mode = ins_mode_in_body
 531                                 return
 532                         # 13. If node is a head element and last is false, then switch the
 533                         # insertion mode to "in head" and abort these steps.
 534                         if node.name is 'head' and last is false
 535                                 insertion_mode = ins_mode_in_head
 536                                 return
 537                         # 14. If node is a body element, then switch the insertion mode to
 538                         # "in body" and abort these steps.
 539                         if node.name is 'body'
 540                                 insertion_mode = ins_mode_in_body
 541                                 return
 542                         # 15. If node is a frameset element, then switch the insertion mode
 543                         # to "in frameset" and abort these steps. (fragment case)
 544                         if node.name is 'frameset'
 545                                 insertion_mode = ins_mode_in_frameset
 546                                 return
 547                         # 16. If node is an html element, run these substeps:
 548                         if node.name is 'html'
 549                                 # 1. If the head element pointer is null, switch the insertion
 550                                 # mode to "before head" and abort these steps. (fragment case)
 551                                 # fixfull (fragment case)
 552
 553                                 # 2. Otherwise, the head element pointer is not null, switch
 554                                 # the insertion mode to "after head" and abort these steps.
 555                                 insertion_mode = ins_mode_in_body # FIXME fixfull
 556                                 return
 557                         # 17. If last is true, then switch the insertion mode to "in body"
 558                         # and abort these steps. (fragment case)
 559                         if last
 560                                 insertion_mode = ins_mode_in_body
 561                                 return
 562                         # 18. Let node now be the node before node in the stack of open
 563                         # elements.
 564                         node_i += 1
 565                         node = open_els[node_i]
 566                         # 19. Return to the step labeled loop.
 567
 568         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
             #
             # Re-opens formatting elements that are listed in afe but are no longer in
             # open_els. Starting from the newest afe entry (index 0) it walks toward
             # older entries until it meets a marker or an entry that is still open,
             # then clones and inserts each entry it walked past, oldest first, and
             # stores every clone in afe in place of the entry it was cloned from.
             #
             # Capitalized comments are the step labels used at the link above.
 571         reconstruct_active_formatting_elements = ->
                     return if afe.length is 0
                     # an entry ends the rewind when it is a marker or is still open
                     rafe_stops_rewind = (entry) ->
                             entry.type is TYPE_AFE_MARKER or entry in open_els
                     return if rafe_stops_rewind afe[0]
                     # Rewind
                     rafe_i = 0
                     while rafe_i < afe.length - 1
                             rafe_i += 1
                             if rafe_stops_rewind afe[rafe_i]
                                     rafe_i -= 1 # Advance
                                     break
                     # Create
                     loop
                             rafe_clone = afe[rafe_i].shallow_clone()
                             tree_insert_element rafe_clone
                             afe[rafe_i] = rafe_clone
                             break if rafe_i is 0
                             rafe_i -= 1 # Advance
 591
 592         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 593         # adoption agency algorithm
 594         # overview here:
 595         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 596         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 597         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
             #
             # subject is a tag name (it is compared against the .name of elements).
             # Works on the enclosing scope's open_els and afe lists, both treated as
             # stacks whose newest entry is at index 0, and re-parents nodes by editing
             # their .parent and .children directly.
             #
             # Numbered comments below quote the steps of the algorithm linked above;
             # the inner loop has its own numbering (3. to 11.). The outer loop is only
             # ever left through a return.
 598         adoption_agency = (subject) ->
 599                 debug_log "adoption_agency()"
 600                 debug_log "tree: #{serialize_els tree.children, false, true}"
 601                 debug_log "open_els: #{serialize_els open_els, true, true}"
 602                 debug_log "afe: #{serialize_els afe, true, true}"
                     # Shortcut: the current node already has the right tag name, so just
                     # pop it and drop it from afe if it is listed there.
 603                 if open_els[0].name is subject
 604                         el = open_els[0]
 605                         open_els.shift()
 606                         # remove it from the list of active formatting elements (if found)
 607                         for t, i in afe
 608                                 if t is el
 609                                         afe.splice i, 1
 610                                         break
 611                         debug_log "aaa: starting off with subject on top of stack, exiting"
 612                         return
                     # outer loop counter: the loop gives up once 8 passes have been started
 613                 outer = 0
 614                 loop
 615                         if outer >= 8
 616                                 return
 617                         outer += 1
 618                         # 5. Let formatting element be the last element in the list of
 619                         # active formatting elements that: is between the end of the list
 620                         # and the last scope marker in the list, if any, or the start of
 621                         # the list otherwise, and  has the tag name subject.
                             #
                             # Scanning upward from index 0, the first match is that element;
                             # fe_of_afe is left holding its index in afe.
 622                         fe = null
 623                         for t, fe_of_afe in afe
 624                                 if t.type is TYPE_AFE_MARKER
 625                                         break
 626                                 if t.name is subject
 627                                         fe = t
 628                                         break
 629                         # If there is no such element, then abort these steps and instead
 630                         # act as described in the "any other end tag" entry above.
 631                         if fe is null
 632                                 debug_log "aaa: fe not found in afe"
 633                                 in_body_any_other_end_tag subject
 634                                 return
 635                         # 6. If formatting element is not in the stack of open elements,
 636                         # then this is a parse error; remove the element from the list, and
 637                         # abort these steps.
                             #
                             # When fe is found, fe_of_open_els is left holding its index in
                             # open_els.
 638                         in_open_els = false
 639                         for t, fe_of_open_els in open_els
 640                                 if t is fe
 641                                         in_open_els = true
 642                                         break
 643                         unless in_open_els
 644                                 debug_log "aaa: fe not found in open_els"
 645                                 parse_error()
 646                                 # "remove it from the list" must mean afe, since it's not in open_els
 647                                 afe.splice fe_of_afe, 1
 648                                 return
 649                         # 7. If formatting element is in the stack of open elements, but
 650                         # the element is not in scope, then this is a parse error; abort
 651                         # these steps.
 652                         unless el_is_in_scope fe
 653                                 debug_log "aaa: fe not in scope"
 654                                 parse_error()
 655                                 return
 656                         # 8. If formatting element is not the current node, this is a parse
 657                         # error. (But do not abort these steps.)
 658                         unless open_els[0] is fe
 659                                 parse_error()
 660                                 # continue
 661                         # 9. Let furthest block be the topmost node in the stack of open
 662                         # elements that is lower in the stack than formatting element, and
 663                         # is an element in the special category. There might not be one.
                             #
                             # Only indexes below fe's are visited; the last special element
                             # seen before reaching fe is the one kept.
 664                         fb = null
 665                         fb_of_open_els = null
 666                         for t, i in open_els
 667                                 if t is fe
 668                                         break
 669                                 if el_is_special t
 670                                         fb = t
 671                                         fb_of_open_els = i
 672                                         # and continue, to see if there's one that's more "topmost"
 673                         # 10. If there is no furthest block, then the UA must first pop all
 674                         # the nodes from the bottom of the stack of open elements, from the
 675                         # current node up to and including formatting element, then remove
 676                         # formatting element from the list of active formatting elements,
 677                         # and finally abort these steps.
 678                         if fb is null
 679                                 debug_log "aaa: no fb"
 680                                 loop
 681                                         t = open_els.shift()
 682                                         if t is fe
 683                                                 afe.splice fe_of_afe, 1
 684                                                 return
 685                         # 11. Let common ancestor be the element immediately above
 686                         # formatting element in the stack of open elements.
 687                         ca = open_els[fe_of_open_els + 1] # common ancestor
 688
 689                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
 690                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
                             #
                             # The bookmark is spliced in at fe's index, which moves fe one
                             # index higher.
 691                         bookmark = new_aaa_bookmark()
 692                         for t, i in afe
 693                                 if t is fe
 694                                         afe.splice i, 0, bookmark
 695                                         break
 696                         node = last_node = fb
 697                         inner = 0
 698                         loop
 699                                 inner += 1
 700                                 # 3. Let node be the element immediately above node in the
 701                                 # stack of open elements, or if node is no longer in the stack
 702                                 # of open elements (e.g. because it got removed by this
 703                                 # algorithm), the element that was immediately above node in
 704                                 # the stack of open elements before node was removed.
 705                                 node_next = null
 706                                 for t, i in open_els
 707                                         if t is node
 708                                                 node_next = open_els[i + 1]
 709                                                 break
                                     # node_above is the fallback for when node was not found in open_els
 710                                 node = node_next ? node_above
 711                                 debug_log "inner loop #{inner}"
 712                                 debug_log "tree: #{serialize_els tree.children, false, true}"
 713                                 debug_log "open_els: #{serialize_els open_els, true, true}"
 714                                 debug_log "afe: #{serialize_els afe, true, true}"
 715                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 716                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 717                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 718                                 debug_log "node: #{node.serialize true, true}"
 719                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
 720
 721                                 # 4. If node is formatting element, then go to the next step in
 722                                 # the overall algorithm.
 723                                 if node is fe
 724                                         break
 725                                 debug_log "the meat"
 726                                 # 5. If inner loop counter is greater than three and node is in
 727                                 # the list of active formatting elements, then remove node from
 728                                 # the list of active formatting elements.
                                     #
                                     # node_in_afe ends up true only when node is in afe and was not
                                     # removed here, which is the condition step 6 tests.
 729                                 node_in_afe = false
 730                                 for t, i in afe
 731                                         if t is node
 732                                                 if inner > 3
 733                                                         afe.splice i, 1
 734                                                         debug_log "max out inner"
 735                                                 else
 736                                                         node_in_afe = true
 737                                                         debug_log "in afe"
 738                                                 break
 739                                 # 6. If node is not in the list of active formatting elements,
 740                                 # then remove node from the stack of open elements and then go
 741                                 # back to the step labeled inner loop.
                                     #
                                     # node_above is refreshed first so that the next pass can still
                                     # find the element that sat above the removed node.
 742                                 unless node_in_afe
 743                                         debug_log "not in afe"
 744                                         for t, i in open_els
 745                                                 if t is node
 746                                                         node_above = open_els[i + 1]
 747                                                         open_els.splice i, 1
 748                                                         break
 749                                         continue
 750                                 debug_log "the bones"
 751                                 # 7. create an element for the token for which the element node
 752                                 # was created, in the HTML namespace, with common ancestor as
 753                                 # the intended parent; replace the entry for node in the list
 754                                 # of active formatting elements with an entry for the new
 755                                 # element, replace the entry for node in the stack of open
 756                                 # elements with an entry for the new element, and let node be
 757                                 # the new element.
                                     #
                                     # The new element is made with node.shallow_clone() rather than
                                     # from the original token.
 758                                 new_node = node.shallow_clone()
 759                                 for t, i in afe
 760                                         if t is node
 761                                                 afe[i] = new_node
 762                                                 debug_log "replaced in afe"
 763                                                 break
 764                                 for t, i in open_els
 765                                         if t is node
 766                                                 node_above = open_els[i + 1]
 767                                                 open_els[i] = new_node
 768                                                 debug_log "replaced in open_els"
 769                                                 break
 770                                 node = new_node
 771                                 # 8. If last node is furthest block, then move the
 772                                 # aforementioned bookmark to be immediately after the new node
 773                                 # in the list of active formatting elements.
 774                                 if last_node is fb
 775                                         for t, i in afe
 776                                                 if t is bookmark
 777                                                         afe.splice i, 1
 778                                                         debug_log "removed bookmark"
 779                                                         break
 780                                         for t, i in afe
 781                                                 if t is node
 782                                                         # "after" means lower
 783                                                         afe.splice i, 0, bookmark # bookmark takes node's index; node moves one index higher
 784                                                         debug_log "placed bookmark after node"
 785                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
 786                                                         break
 787                                 # 9. Insert last node into node, first removing it from its
 788                                 # previous parent node if any.
 789                                 if last_node.parent?
 790                                         debug_log "last_node has parent"
 791                                         for c, i in last_node.parent.children
 792                                                 if c is last_node
 793                                                         debug_log "removing last_node from parent"
 794                                                         last_node.parent.children.splice i, 1
 795                                                         break
 796                                 node.children.push last_node
 797                                 last_node.parent = node
 798                                 # 10. Let last node be node.
 799                                 last_node = node
 800                                 debug_log "at last"
 801                                 # 11. Return to the step labeled inner loop.
 802                         # 14. Insert whatever last node ended up being in the previous step
 803                         # at the appropriate place for inserting a node, but using common
 804                         # ancestor as the override target.
 805
 806                         # JASON: In the case where fe is immediately followed by fb:
 807                         #   * inner loop exits out early (node==fe)
 808                         #   * last_node is fb
 809                         #   * last_node is still in the tree (not a duplicate)
 810                         if last_node.parent?
 811                                 debug_log "FEFIRST? last_node has parent"
 812                                 for c, i in last_node.parent.children
 813                                         if c is last_node
 814                                                 debug_log "removing last_node from parent"
 815                                                 last_node.parent.children.splice i, 1
 816                                                 break
 817
 818                         debug_log "after aaa inner loop"
 819                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 820                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 821                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 822                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 823                         debug_log "tree: #{serialize_els tree.children, false, true}"
 824
 825                         debug_log "insert"
 826
 827
 828                         # can't use the standard element-insertion helper, because last_node
 829                         # is already in open_els and must stay at its current position there
 830                         dest = adjusted_insertion_location ca
 831                         dest[0].children.splice dest[1], 0, last_node
 832                         last_node.parent = dest[0]
 833
 834
 835                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 836                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 837                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 838                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 839                         debug_log "tree: #{serialize_els tree.children, false, true}"
 840
 841                         # 15. Create an element for the token for which formatting element
 842                         # was created, in the HTML namespace, with furthest block as the
 843                         # intended parent.
 844                         new_element = fe.shallow_clone() # FIXME intended parent thing
 845                         # 16. Take all of the child nodes of furthest block and append them
 846                         # to the element created in the last step.
 847                         while fb.children.length
 848                                 t = fb.children.shift()
 849                                 t.parent = new_element
 850                                 new_element.children.push t
 851                         # 17. Append that new element to furthest block.
 852                         new_element.parent = fb
 853                         fb.children.push new_element
 854                         # 18. Remove formatting element from the list of active formatting
 855                         # elements, and insert the new element into the list of active
 856                         # formatting elements at the position of the aforementioned
 857                         # bookmark.
                             #
                             # The bookmark entry itself is overwritten, so no bookmark is left
                             # in afe afterwards.
 858                         for t, i in afe
 859                                 if t is fe
 860                                         afe.splice i, 1
 861                                         break
 862                         for t, i in afe
 863                                 if t is bookmark
 864                                         afe[i] = new_element
 865                                         break
 866                         # 19. Remove formatting element from the stack of open elements,
 867                         # and insert the new element into the stack of open elements
 868                         # immediately below the position of furthest block in that stack.
                             #
                             # Splicing at fb's index gives new_element that index and moves fb
                             # one index higher.
 869                         for t, i in open_els
 870                                 if t is fe
 871                                         open_els.splice i, 1
 872                                         break
 873                         for t, i in open_els
 874                                 if t is fb
 875                                         open_els.splice i, 0, new_element
 876                                         break
 877                         # 20. Jump back to the step labeled outer loop.
 878                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
 879                         debug_log "tree: #{serialize_els tree.children, false, true}"
 880                         debug_log "open_els: #{serialize_els open_els, true, true}"
 881                         debug_log "afe: #{serialize_els afe, true, true}"
                     # NOTE(review): this line looks unreachable, since the outer loop above
                     # has no break of its own and is only left via return.
 882                 debug_log "AAA DONE"
 883
 884         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
             #
             # Generates implied end tags (except for p), reports a parse error when
             # the current node is then not a p, and pops open_els up to and including
             # the nearest p.
 885         close_p_element = ->
                     generate_implied_end_tags 'p' # the argument names the exception
                     parse_error() unless open_els[0].name is 'p'
                     # just in case there is no p: never pop the last remaining entry
                     while open_els.length > 1
                             return if open_els.shift().name is 'p'
 893         close_p_if_in_button_scope = ->
                     # closes the p element only when is_in_button_scope reports one
                     close_p_element() if is_in_button_scope 'p'
 896
 897         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 898         # aka insert_a_character = (t) ->
             #
             # Puts text node t at the adjusted insertion location. When the node just
             # before that location is already a text node, t.text is appended to it
             # and t itself is not inserted.
 899         insert_character = (t) ->
                     [ic_parent, ic_index] = adjusted_insertion_location()
                     ic_siblings = ic_parent.children
                     # fixfull check for Document node
                     if ic_index > 0 and ic_siblings[ic_index - 1].type is TYPE_TEXT
                             ic_siblings[ic_index - 1].text += t.text
                             return
                     ic_siblings.splice ic_index, 0, t
 908
 909         # 8.2.5.1
 910         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
 911         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
             #
             # Returns [target, target_i]: the node to insert into, and the index within
             # target.children at which the new node belongs. override_target, when
             # given, is used as the starting target instead of the current node
             # (open_els[0]).
 912         adjusted_insertion_location = (override_target = null) ->
 913                 # 1. If there was an override target specified, then let target be the
 914                 # override target.
 915                 if override_target?
 916                         target = override_target
 917                 else # Otherwise, let target be the current node.
 918                         target = open_els[0]
 919                 # 2. Determine the adjusted insertion location using the first matching
 920                 # steps from the following list:
 921                 #
 922                 # If foster parenting is enabled and target is a table, tbody, tfoot,
 923                 # thead, or tr element Foster parenting happens when content is
 924                 # misnested in tables.
 925                 if flag_foster_parenting and foster_parenting_targets[target.name]
 926                         loop # once. this is here so we can ``break`` to "abort these substeps"
 927                                 # 1. Let last template be the last template element in the
 928                                 # stack of open elements, if any.
                                     # (index 0 is the newest entry, so the first match is the last one)
 929                                 last_template = null
 930                                 last_template_i = null
 931                                 for el, i in open_els
 932                                         if el.name is 'template'
 933                                                 last_template = el
 934                                                 last_template_i = i
 935                                                 break
 936                                 # 2. Let last table be the last table element in the stack of
 937                                 # open elements, if any.
 938                                 last_table = null
 939                                 last_table_i = null
 940                                 for el, i in open_els
 941                                         if el.name is 'table'
 942                                                 last_table = el
 943                                                 last_table_i = i
 944                                                 break
 945                                 # 3. If there is a last template and either there is no last
 946                                 # table, or there is one, but last template is lower (more
 947                                 # recently added) than last table in the stack of open
 948                                 # elements, then: let adjusted insertion location be inside
 949                                 # last template's template contents, after its last child (if
 950                                 # any), and abort these substeps.
 951                                 if last_template and (last_table is null or last_template_i < last_table_i)
 952                                         target = last_template # fixfull should be its contents
 953                                         target_i = target.children.length
 954                                         break
 955                                 # 4. If there is no last table, then let adjusted insertion
 956                                 # location be inside the first element in the stack of open
 957                                 # elements (the html element), after its last child (if any),
 958                                 # and abort these substeps. (fragment case)
 959                                 if last_table is null
 960                                         # this is odd
 961                                         target = open_els[open_els.length - 1]
 962                                         target_i = target.children.length
                                             # "abort these substeps": the steps below all need last_table
                                             break
 963                                 # 5. If last table has a parent element, then let adjusted
 964                                 # insertion location be inside last table's parent element,
 965                                 # immediately before last table, and abort these substeps.
 966                                 if last_table.parent?
 967                                         for c, i in last_table.parent.children
 968                                                 if c is last_table
 969                                                         target = last_table.parent
 970                                                         target_i = i
 971                                                         break
 972                                         break
 973                                 # 6. Let previous element be the element immediately above last
 974                                 # table in the stack of open elements.
 975                                 #
 976                                 # huh? how could it not have a parent?
 977                                 previous_element = open_els[last_table_i + 1]
 978                                 # 7. Let adjusted insertion location be inside previous
 979                                 # element, after its last child (if any).
 980                                 target = previous_element
 981                                 target_i = target.children.length
 982                                 # Note: These steps are involved in part because it's possible
 983                                 # for elements, the table element in this case in particular,
 984                                 # to have been moved by a script around in the DOM, or indeed
 985                                 # removed from the DOM entirely, after the element was inserted
 986                                 # by the parser.
 987                                 break # don't really loop
 988                 else
 989                         # Otherwise Let adjusted insertion location be inside target, after
 990                         # its last child (if any).
 991                         target_i = target.children.length
 992
 993                 # 3. If the adjusted insertion location is inside a template element,
 994                 # let it instead be inside the template element's template contents,
 995                 # after its last child (if any).
 996                 # fixfull (template)
 997
 998                 # 4. Return the adjusted insertion location.
 999                 return [target, target_i]
1000
1001         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1002         # aka create_an_element_for_token
             #
             # Builds a TYPE_TAG Node from token t, taking the name from t.name and
             # the attributes from t.attrs_a. Side effects on t: its type becomes
             # TYPE_TAG and its attrs_a list ends up empty. intended_parent is accepted
             # but not used.
1003         token_to_element = (t, namespace, intended_parent) ->
                     t.type = TYPE_TAG # not TYPE_START_TAG
                     # Move the [name, value] pairs out of the token and into a hash. Pairs
                     # are taken from the end of the list, so when a name occurs more than
                     # once the earliest pair is written last and its value is the one kept.
                     # TODO check what to do with duplicate attrs
                     tte_attrs = {}
                     until t.attrs_a.length is 0
                             [tte_name, tte_value] = t.attrs_a.pop()
                             tte_attrs[tte_name] = tte_value

                     # TODO 2. If the newly created element has an xmlns attribute in the
                     # XMLNS namespace whose value is not exactly the same as the element's
                     # namespace, that is a parse error. Similarly, if the newly created
                     # element has an xmlns:xlink attribute in the XMLNS namespace whose
                     # value is not the XLink Namespace, that is a parse error.

                     # fixfull: the spec says stuff about form pointers and ownerDocument

                     return new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: tte_attrs
1021
1022         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
             #
             # Creates an element for token in the given namespace, inserts it at the
             # adjusted insertion location, makes it the current node (index 0 of
             # open_els) and returns it.
1023         insert_foreign_element = (token, namespace) ->
                     [ife_parent, ife_index] = adjusted_insertion_location()
                     ife_el = token_to_element token, namespace, ife_parent
                     # TODO skip the insertion if it's not possible (eg ife_parent is a document with a child already)
                     ife_parent.children.splice ife_index, 0, ife_el
                     ife_el.parent = ife_parent
                     open_els.unshift ife_el
                     ife_el
1033         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
             # Same function under a second name; callers that pass only a token leave
             # namespace undefined.
1034         insert_html_element = insert_foreign_element # (token, namespace) ->
1035
1036         # FIXME read implement "foster parenting" part
1037         # FIXME read spec, do this right
1038         # FIXME implement the override target thing
1039         # note: this assumes it's an open tag
1040         # FIXME what part of the spec is this?
1041         # TODO look through all callers of this, and see what they should really be doing.
1042         #   eg probably insert_html_element for tokens
             #
             # Inserts el at the adjusted insertion location (computed from
             # override_target when one is given), makes it the current node (index 0 of
             # open_els) and returns the inserted element. el may be a start-tag token,
             # in which case it is first converted with token_to_element. A non-null
             # namespace argument is written onto el before anything else happens.
1043         tree_insert_element = (el, override_target = null, namespace = null) ->
1044                 if namespace?
1045                         el.namespace = namespace
1046                 dest = adjusted_insertion_location override_target
1047                 if el.type is TYPE_START_TAG # means it's a "token"
1048                         el = token_to_element el, namespace, dest[0]
                     # An element that still has no namespace takes the namespace of the node
                     # it is inserted into. dest is a [node, index] pair, so the node is
                     # dest[0]. (This used to read dest.namespace into the unused local
                     # `namespace`, which had no effect.)
                     # NOTE(review): confirm that inheriting from the parent is what callers
                     # want for HTML elements inserted inside foreign content.
1049                 unless el.namespace?
1050                         el.namespace = dest[0].namespace
1051                 # fixfull: Document nodes sometimes can't accept more children
1052                 dest[0].children.splice dest[1], 0, el
1053                 el.parent = dest[0]
1054                 open_els.unshift el
1055                 return el
1056
1057         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
             #
             # Inserts comment node t at position, which is a pair
             # [node, index_within_children]. Without a position the adjusted insertion
             # location is used.
1059         insert_comment = (t, position = null) ->
                     [cmt_parent, cmt_index] = position ? adjusted_insertion_location()
                     cmt_parent.children.splice cmt_index, 0, t
1062
1063         # 8.2.5.2
1064         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1065         parse_generic_raw_text = (t) ->
1066                 insert_html_element t
1067                 tok_state = tok_state_rawtext
1068                 original_insertion_mode = insertion_mode
1069                 insertion_mode = ins_mode_text
1070         parse_generic_rcdata_text = (t) ->
1071                 insert_html_element t
1072                 tok_state = tok_state_rcdata
1073                 original_insertion_mode = insertion_mode
1074                 insertion_mode = ins_mode_text
1075
1076         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1077         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1078         generate_implied_end_tags = (except = null) ->
1079                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1080                         open_els.shift()
1081
1082         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1083         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1084                 open_els.shift() # spec says this will be a 'head' node
1085                 insertion_mode = ins_mode_after_head
1086                 insertion_mode t
1087         ins_mode_in_head = (t) ->
1088                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1089                         insert_character t
1090                         return
1091                 if t.type is TYPE_COMMENT
1092                         insert_comment t
1093                         return
1094                 if t.type is TYPE_DOCTYPE
1095                         parse_error()
1096                         return
1097                 if t.type is TYPE_START_TAG and t.name is 'html'
1098                         ins_mode_in_body t
1099                         return
1100                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1101                         el = insert_html_element t
1102                         open_els.shift()
1103                         el.acknowledge_self_closing()
1104                         return
1105                 if t.type is TYPE_START_TAG and t.name is 'meta'
1106                         el = insert_html_element t
1107                         open_els.shift()
1108                         el.acknowledge_self_closing()
1109                         # fixfull encoding stuff
1110                         return
1111                 if t.type is TYPE_START_TAG and t.name is 'title'
1112                         parse_generic_rcdata_element t
1113                         return
1114                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1115                         parse_generic_raw_text t
1116                         return
1117                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1118                         insert_html_element t
1119                         insertion_mode = in_head_noscript # FIXME implement
1120                         return
1121                 if t.type is TYPE_START_TAG and t.name is 'script'
1122                         ail = adjusted_insertion_location()
1123                         el = token_to_element t, NS_HTML, ail
1124                         el.flag_parser_inserted true # FIXME implement
1125                         # fixfull frament case
1126                         ail[0].children.splice ail[1], 0, el
1127                         open_els.unshift el
1128                         tok_state = tok_state_script_data
1129                         original_insertion_mode = insertion_mode # make sure orig... is defined
1130                         insertion_mode = ins_mode_text # FIXME implement
1131                         return
1132                 if t.type is TYPE_END_TAG and t.name is 'head'
1133                         open_els.shift() # will be a head element... spec says so
1134                         insertion_mode = ins_mode_after_head
1135                         return
1136                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1137                         ins_mode_in_head_else t
1138                         return
1139                 if t.type is TYPE_START_TAG and t.name is 'template'
1140                         insert_html_element t
1141                         afe_push_marker()
1142                         flag_frameset_ok = false
1143                         insertion_mode = ins_mode_in_template
1144                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1145                         return
1146                 if t.type is TYPE_END_TAG and t.name is 'template'
1147                         if template_tag_is_open()
1148                                 generate_implied_end_tags
1149                                 if open_els[0].name isnt 'template'
1150                                         parse_error()
1151                                 loop
1152                                         el = open_els.shift()
1153                                         if el.name is 'template'
1154                                                 break
1155                                 clear_afe_to_marker()
1156                                 template_insertion_modes.shift()
1157                                 reset_insertion_mode()
1158                         else
1159                                 parse_error()
1160                         return
1161                 if (t.type is TYPE_OPEN_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1162                         parse_error()
1163                         return
1164                 ins_mode_in_head_else t
1165
1166         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1167         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1168                 for node, i in open_els
1169                         if node.name is name # FIXME check namespace too
1170                                 generate_implied_end_tags name # arg is exception
1171                                 parse_error() unless i is 0
1172                                 while i >= 0
1173                                         open_els.shift()
1174                                         i -= 1
1175                                 return
1176                         if special_elements[node.name]? # FIXME check namespac too
1177                                 parse_error()
1178                                 return
1179         ins_mode_in_body = (t) ->
1180                 switch t.type
1181                         when TYPE_TEXT
1182                                 switch t.text
1183                                         when "\u0000"
1184                                                 parse_error()
1185                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1186                                                 reconstruct_active_formatting_elements()
1187                                                 insert_character t
1188                                         else
1189                                                 reconstruct_active_formatting_elements()
1190                                                 insert_character t
1191                                                 flag_frameset_ok = false
1192                         when TYPE_COMMENT
1193                                 insert_comment t
1194                         when TYPE_DOCTYPE
1195                                 parse_error()
1196                         when TYPE_START_TAG
1197                                 switch t.name
1198                                         when 'html'
1199                                                 parse_error()
1200                                                 return if template_tag_is_open()
1201                                                 root_attrs = open_els[open_els.length - 1].attrs
1202                                                 for k, v of t.attrs
1203                                                         root_attrs[k] = v unless root_attrs[k]?
1204                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1205                                                 # FIXME also do this for </template> (end tag)
1206                                                 return ins_mode_in_head t
1207                                         when 'body'
1208                                                 parse_error()
1209                                                 # TODO
1210                                         when 'frameset'
1211                                                 parse_error()
1212                                                 # TODO
1213                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1214                                                 close_p_if_in_button_scope()
1215                                                 insert_html_element t
1216                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1217                                                 close_p_if_in_button_scope()
1218                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1219                                                         parse_error()
1220                                                         open_els.shift()
1221                                                 insert_html_element t
1222                                         # TODO lots more to implement here
1223                                         when 'a'
1224                                                 # If the list of active formatting elements
1225                                                 # contains an a element between the end of the list and
1226                                                 # the last marker on the list (or the start of the list
1227                                                 # if there is no marker on the list), then this is a
1228                                                 # parse error; run the adoption agency algorithm for
1229                                                 # the tag name "a", then remove that element from the
1230                                                 # list of active formatting elements and the stack of
1231                                                 # open elements if the adoption agency algorithm didn't
1232                                                 # already remove it (it might not have if the element
1233                                                 # is not in table scope).
1234                                                 found = false
1235                                                 for el in afe
1236                                                         if el.type is TYPE_AFE_MARKER
1237                                                                 break
1238                                                         if el.name is 'a'
1239                                                                 found = el
1240                                                 if found?
1241                                                         parse_error()
1242                                                         adoption_agency 'a'
1243                                                         for el, i in afe
1244                                                                 if el is found
1245                                                                         afe.splice i, 1
1246                                                         for el, i in open_els
1247                                                                 if el is found
1248                                                                         open_els.splice i, 1
1249                                                 reconstruct_active_formatting_elements()
1250                                                 el = insert_html_element t
1251                                                 afe_push el
1252                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1253                                                 reconstruct_active_formatting_elements()
1254                                                 el = insert_html_element t
1255                                                 afe_push el
1256                                         when 'table'
1257                                                 # fixfull quirksmode thing
1258                                                 close_p_if_in_button_scope()
1259                                                 insert_html_element t
1260                                                 insertion_mode = ins_mode_in_table
1261                                         # TODO lots more to implement here
1262                                         else # any other start tag
1263                                                 reconstruct_active_formatting_elements()
1264                                                 insert_html_element t
1265                         when TYPE_EOF
1266                                 ok_tags = {
1267                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1268                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1269                                 }
1270                                 for t in open_els
1271                                         unless ok_tags[t.name]?
1272                                                 parse_error()
1273                                                 break
1274                                 # TODO stack of template insertion modes thing
1275                                 flag_parsing = false # stop parsing
1276                         when TYPE_END_TAG
1277                                 switch t.name
1278                                         when 'body'
1279                                                 unless is_in_scope 'body'
1280                                                         parse_error()
1281                                                         return
1282                                                 # TODO implement parse error and move to tree_after_body
1283                                         when 'html'
1284                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1285                                                         parse_error()
1286                                                         return
1287                                                 # TODO implement parse error and move to tree_after_body, reprocess
1288                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1289                                                 unless is_in_scope t.name, NS_HTML
1290                                                         parse_error()
1291                                                         return
1292                                                 generate_implied_end_tags()
1293                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1294                                                         parse_error()
1295                                                 loop
1296                                                         el = open_els.shift()
1297                                                         if el.name is t.name and el.namespace is NS_HTML
1298                                                                 return
1299                                         # TODO lots more close tags to implement here
1300                                         when 'p'
1301                                                 unless is_in_button_scope 'p'
1302                                                         parse_error()
1303                                                         insert_html_element new_open_tag 'p'
1304                                                 close_p_element()
1305                                         # TODO lots more close tags to implement here
1306                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1307                                                 adoption_agency t.name
1308                                         # TODO lots more close tags to implement here
1309                                         else
1310                                                 in_body_any_other_end_tag t.name
1311                 return
1312
1313         ins_mode_in_table_else = (t) ->
1314                 parse_error()
1315                 flag_foster_parenting = true # FIXME
1316                 ins_mode_in_body t
1317                 flag_foster_parenting = false
1318         can_in_table = {
1319                 'table': true
1320                 'tbody': true
1321                 'tfoot': true
1322                 'thead': true
1323                 'tr': true
1324         }
1325         clear_to_table_stopers = {
1326                 'table': true
1327                 'template': true
1328                 'html': true
1329         }
1330         clear_stack_to_table_context = ->
1331                 loop
1332                         if clear_to_table_stopers[open_els[0].name]?
1333                                 break
1334                         open_els.shift()
1335                 return
1336         clear_to_table_body_stopers = {
1337                 'tbody': true
1338                 'tfoot': true
1339                 'thead': true
1340                 'template': true
1341                 'html': true
1342         }
1343         clear_stack_to_table_body_context = ->
1344                 loop
1345                         if clear_to_table_body_stopers[open_els[0].name]?
1346                                 break
1347                         open_els.shift()
1348                 return
1349         clear_to_table_row_stopers = {
1350                 'tr': true
1351                 'template': true
1352                 'html': true
1353         }
1354         clear_stack_to_table_row_context = ->
1355                 loop
1356                         if clear_to_table_row_stopers[open_els[0].name]?
1357                                 break
1358                         open_els.shift()
1359                 return
1360         clear_afe_to_marker = ->
1361                 loop
1362                         el = afe.shift()
1363                         if el.type is TYPE_AFE_MARKER
1364                                 return
1365
1366         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1367         ins_mode_text = (t) ->
1368                 if t.type is TYPE_TEXT
1369                         insert_character t
1370                         return
1371                 if t.type is TYPE_EOF
1372                         parse_error()
1373                         if open_els[0].name is 'script'
1374                                 open_els[0].flag 'already started', true
1375                         open_els.shift()
1376                         insertion_mode = original_insertion_mode
1377                         insertion_mode t
1378                         return
1379                 if t.type is TYPE_END_TAG and t.name is 'script'
1380                         open_els.shift()
1381                         insertion_mode = original_insertion_mode
1382                         # fixfull the spec seems to assume that I'm going to run the script
1383                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1384                         return
1385                 if t.type is TYPE_END_TAG
1386                         open_els.shift()
1387                         insertion_mode = original_insertion_mode
1388                         return
1389                 console.log 'warning: end of ins_mode_text reached'
1390
        # the functions below implement the tokenizer states described here:
1392         # http://www.w3.org/TR/html5/syntax.html#tokenization
1393
1394         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1395         ins_mode_in_table = (t) ->
1396                 switch t.type
1397                         when TYPE_TEXT
1398                                 if can_in_table[t.name]
1399                                         original_insertion_mode = insertion_mode
1400                                         insertion_mode = ins_mode_in_table_text
1401                                         insertion_mode t
1402                                 else
1403                                         ins_mode_in_table_else t
1404                         when TYPE_COMMENT
1405                                 insert_comment t
1406                         when TYPE_DOCTYPE
1407                                 parse_error()
1408                         when TYPE_START_TAG
1409                                 switch t.name
1410                                         when 'caption'
1411                                                 clear_stack_to_table_context()
1412                                                 afe_push_marker()
1413                                                 insert_html_element t
1414                                                 insertion_mode = ins_mode_in_caption
1415                                         when 'colgroup'
1416                                                 clear_stack_to_table_context()
1417                                                 insert_html_element t
1418                                                 insertion_mode = ins_mode_in_column_group
1419                                         when 'col'
1420                                                 clear_stack_to_table_context()
1421                                                 insert_html_element new_open_tag 'colgroup'
1422                                                 insertion_mode = ins_mode_in_column_group
1423                                                 insertion_mode t
1424                                         when 'tbody', 'tfoot', 'thead'
1425                                                 clear_stack_to_table_context()
1426                                                 insert_html_element t
1427                                                 insertion_mode = ins_mode_in_table_body
1428                                         when 'td', 'th', 'tr'
1429                                                 clear_stack_to_table_context()
1430                                                 insert_html_element new_open_tag 'tbody'
1431                                                 insertion_mode = ins_mode_in_table_body
1432                                                 insertion_mode t
1433                                         when 'table'
1434                                                 parse_error()
1435                                                 if is_in_table_scope 'table'
1436                                                         loop
1437                                                                 el = open_els.shift()
1438                                                                 if el.name is 'table'
1439                                                                         break
1440                                                         reset_insertion_mode()
1441                                                         insertion_mode t
1442                                         when 'style', 'script', 'template'
1443                                                 ins_mode_in_head t
1444                                         when 'input'
1445                                                 if token_is_input_hidden t
1446                                                         ins_mode_in_table_else t
1447                                                 else
1448                                                         parse_error()
1449                                                         el = insert_html_element t
1450                                                         open_els.shift()
1451                                                         el.acknowledge_self_closing()
1452                                         when 'form'
1453                                                 parse_error()
1454                                                 if form_element_pointer?
1455                                                         return
1456                                                 if template_tag_is_open()
1457                                                         return
1458                                                 form_element_pointer = insert_html_element t
1459                                                 open_els.shift()
1460                                         else
1461                                                 ins_mode_in_table_else t
1462                         when TYPE_END_TAG
1463                                 switch t.name
1464                                         when 'table'
1465                                                 if is_in_table_scope 'table'
1466                                                         loop
1467                                                                 el = open_els.shift()
1468                                                                 if el.name is 'table'
1469                                                                         break
1470                                                         reset_insertion_mode()
1471                                                 else
1472                                                         parse_error
1473                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1474                                                 parse_error()
1475                                         when 'template'
1476                                                 ins_mode_in_head t
1477                                         else
1478                                                 ins_mode_in_table_else t
1479                         when TYPE_EOF
1480                                 ins_mode_in_body t
1481                         else
1482                                 ins_mode_in_table_else t
1483
1484
1485         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1486         ins_mode_in_table_text = (t) ->
1487                 if t.type is TYPE_TEXT and t.text is "\u0000"
1488                         # huh? I thought the tokenizer didn't emit these
1489                         parse_error()
1490                         return
1491                 if t.type is TYPE_TEXT
1492                         pending_table_character_tokens.push t
1493                         return
1494                 # Anything else
1495                 all_space = true
1496                 for old in pending_table_character_tokens
1497                         unless space_chars.indexOf(old.text) > -1
1498                                 all_space = false
1499                                 break
1500                 if all_space
1501                         for old in pending_table_character_tokens
1502                                 insert_character old
1503                 else
1504                         for old in pending_table_character_tokens
1505                                 ins_mode_table_else old
1506                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1507                 insertion_mode = original_insertion_mode
1508                 insertion_mode t
1509
1510         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1511         ins_mode_in_caption = (t) ->
1512                 if t.type is TYPE_END_TAG and t.name is 'caption'
1513                         if is_in_table_scope 'caption'
1514                                 generate_implied_end_tags()
1515                                 if open_els[0].name isnt 'caption'
1516                                         parse_error()
1517                                 loop
1518                                         el = open_els.shift()
1519                                         if el.name is 'caption'
1520                                                 break
1521                                 clear_afe_to_marker()
1522                                 insertion_mode = in_table
1523                         else
1524                                 parse_error()
1525                                 # fragment case
1526                         return
1527                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1528                         parse_error()
1529                         if is_in_table_scope 'caption'
1530                                 loop
1531                                         el = open_els.shift()
1532                                         if el.name is 'caption'
1533                                                 break
1534                                 clear_afe_to_marker()
1535                                 insertion_mode = in_table
1536                                 insertion_mode t
1537                         # else fragment case
1538                         return
1539                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1540                         parse_error()
1541                         return
1542                 # Anything else
1543                 ins_mode_in_body t
1544
1545         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1546         ins_mode_in_column_group = (t) -> # tree construction: process token t in the "in column group" insertion mode
1547                 if t.type is TYPE_TEXT and space_chars.indexOf(t.text) > -1
1548                         insert_character t
1549                         return
1550                 if t.type is TYPE_COMMENT
1551                         insert_comment t
1552                         return
1553                 if t.type is TYPE_DOCTYPE
1554                         parse_error() # doctype tokens are reported and dropped
1555                         return
1556                 if t.type is TYPE_START_TAG and t.name is 'html'
1557                         ins_mode_in_body t
1558                         return
1559                 if t.type is TYPE_START_TAG and t.name is 'col'
1560                         el = insert_html_element t
1561                         open_els.shift() # drop the current node again (presumably the <col> just inserted)
1562                         el.acknowledge_self_closing() # NOTE(review): the spec acknowledges the flag on the token; confirm el (not t) is intended
1563                         return
1564                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1565                         if open_els[0].name is 'colgroup'
1566                                 open_els.shift() # pop the current node: shift() belongs to the stack, not to the element
1567                                 insertion_mode = ins_mode_in_table
1568                         else
1569                                 parse_error()
1570                         return
1571                 if t.type is TYPE_END_TAG and t.name is 'col'
1572                         parse_error()
1573                         return
1574                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1575                         ins_mode_in_head t
1576                         return
1577                 if t.type is TYPE_EOF
1578                         ins_mode_in_body t
1579                         return
1580                 # Anything else: close the colgroup (when it is the current node) and reprocess t "in table"
1581                 if open_els[0].name isnt 'colgroup'
1582                         parse_error()
1583                         return
1584                 open_els.shift()
1585                 insertion_mode = ins_mode_in_table
1586                 insertion_mode t
1587                 return
1588
1589         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1590         ins_mode_in_table_body = (t) -> # tree construction: process token t in the "in table body" insertion mode
1591                 if t.name is 'tr' and t.type is TYPE_START_TAG
1592                         clear_stack_to_table_body_context()
1593                         insert_html_element t
1594                         insertion_mode = ins_mode_in_row
1595                         return
1596                 if t.type is TYPE_START_TAG and t.name in ['th', 'td']
1597                         parse_error() # cell with no row: insert a <tr> first, then reprocess t "in row"
1598                         clear_stack_to_table_body_context()
1599                         insert_html_element new_open_tag 'tr'
1600                         insertion_mode = ins_mode_in_row
1601                         insertion_mode t
1602                         return
1603                 if t.type is TYPE_END_TAG and t.name in ['tbody', 'tfoot', 'thead']
1604                         if not is_in_table_scope t.name # fixfull check namespace
1605                                 parse_error()
1606                                 return
1607                         clear_stack_to_table_body_context()
1608                         open_els.shift()
1609                         insertion_mode = ins_mode_in_table
1610                         return
1611                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead']) or (t.type is TYPE_END_TAG and t.name is 'table')
1612                         # walk the stack for a tbody/tfoot/thead, giving up at the first table_scopers entry
1613                         has = false
1614                         for el in open_els
1615                                 if el.name in ['tbody', 'tfoot', 'thead']
1616                                         has = true
1617                                         break
1618                                 break if table_scopers[el.name]
1619                         unless has
1620                                 parse_error()
1621                                 return
1622                         clear_stack_to_table_body_context()
1623                         open_els.shift()
1624                         insertion_mode = ins_mode_in_table
1625                         insertion_mode t
1626                         return
1627                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr']
1628                         parse_error()
1629                         return
1630                 # Anything else: defer to the "in table" rules
1631                 ins_mode_in_table t
1632
1633         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1634         ins_mode_in_row = (t) -> # tree construction: process token t in the "in row" insertion mode
1635                 if t.type is TYPE_START_TAG and t.name in ['th', 'td']
1636                         clear_stack_to_table_row_context()
1637                         insert_html_element t
1638                         insertion_mode = ins_mode_in_cell
1639                         afe_push_marker()
1640                         return
1641                 if t.name is 'tr' and t.type is TYPE_END_TAG
1642                         if not is_in_table_scope 'tr'
1643                                 parse_error()
1644                         else
1645                                 clear_stack_to_table_row_context()
1646                                 open_els.shift()
1647                                 insertion_mode = ins_mode_in_table_body
1648                         return
1649                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr']) or (t.type is TYPE_END_TAG and t.name is 'table')
1650                         if not is_in_table_scope 'tr'
1651                                 parse_error()
1652                         else
1653                                 clear_stack_to_table_row_context()
1654                                 open_els.shift()
1655                                 insertion_mode = ins_mode_in_table_body
1656                                 insertion_mode t
1657                         return
1658                 if t.type is TYPE_END_TAG and t.name in ['tbody', 'tfoot', 'thead']
1659                         if not is_in_table_scope t.name # fixfull namespace
1660                                 parse_error()
1661                         else if is_in_table_scope 'tr'
1662                                 clear_stack_to_table_row_context()
1663                                 open_els.shift()
1664                                 insertion_mode = ins_mode_in_table_body
1665                                 insertion_mode t
1666                         # (section in scope but no tr in scope: the token is silently ignored)
1667                         return
1668                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th']
1669                         parse_error()
1670                         return
1671                 # Anything else: defer to the "in table" rules
1672                 ins_mode_in_table t
1673
1674         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1675         close_the_cell = -> # pops the stack through the nearest td/th, clears afe to its marker, switches to "in row"
1676                 generate_implied_end_tags()
1677                 unless open_els[0].name is 'td' or open_els[0].name is 'th' # compare element *names* on both sides
1678                         parse_error()
1679                 loop
1680                         el = open_els.shift()
1681                         if el.name is 'td' or el.name is 'th'
1682                                 break
1683                 clear_afe_to_marker()
1684                 insertion_mode = ins_mode_in_row
1685
1686         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1687         ins_mode_in_cell = (t) -> # tree construction: process token t in the "in cell" insertion mode
1688                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1689                         if is_in_table_scope t.name
1690                                 generate_implied_end_tags()
1691                                 if open_els[0].name isnt t.name
1692                                         parse_error() # must be invoked; a bare identifier is a no-op in CoffeeScript
1693                                 loop # pop up to and including the matching cell
1694                                         el = open_els.shift()
1695                                         if el.name is t.name
1696                                                 break
1697                                 clear_afe_to_marker()
1698                                 insertion_mode = ins_mode_in_row
1699                         else
1700                                 parse_error()
1701                         return
1702                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1703                         has = false # becomes true when a td/th is found before any table_scopers entry
1704                         for el in open_els
1705                                 if el.name is 'td' or el.name is 'th'
1706                                         has = true
1707                                         break
1708                                 if table_scopers[el.name]
1709                                         break
1710                         if !has
1711                                 parse_error()
1712                                 return
1713                         close_the_cell()
1714                         insertion_mode t # close_the_cell switched the mode; reprocess t there
1715                         return
1716                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1717                         parse_error()
1718                         return
1719                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1720                         if is_in_table_scope t.name # fixfull namespace
1721                                 close_the_cell()
1722                                 insertion_mode t
1723                         else
1724                                 parse_error()
1725                         return
1726                 # Anything Else: defer to the "in body" rules
1727                 ins_mode_in_body t
1728
1729         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1730         tok_state_data = -> # tokenizer "data" state: consumes one char, returns a token or null
1731                 c = txt.charAt(cur++)
1732                 if c is '&'
1733                         return new_text_node parse_character_reference()
1734                 if c is '<'
1735                         tok_state = tok_state_tag_open
1736                         return null
1737                 if c is '' # EOF
1738                         return new_eof_token()
1739                 # U+0000 is reported as a parse error but is still passed through unchanged
1740                 if c is "\u0000"
1741                         parse_error()
1742                 # every remaining character (NUL included) becomes a text node
1743                 return new_text_node c
1744
1745         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1746         # not needed: tok_state_character_reference_in_data = ->
1747         # just call parse_character_reference()
1748
1749         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
1750         tok_state_rcdata = -> # tokenizer "RCDATA" state: consumes one char, returns a token or null
1751                 c = txt.charAt(cur++)
1752                 return new_eof_token() if c is '' # EOF
1753                 if c is '&'
1754                         return new_text_node parse_character_reference()
1755                 if c is '<'
1756                         tok_state = tok_state_rcdata_less_than_sign
1757                         return null
1758                 if c is "\u0000"
1759                         parse_error()
1760                         return new_character_token "\ufffd"
1761                 # Anything else
1762                 # is emitted unchanged as a character token
1763                 return new_character_token c
1764
1765         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
1766         # not needed: tok_state_character_reference_in_rcdata = ->
1767         # just call parse_character_reference()
1768
1769         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
1770         tok_state_rawtext = -> # tokenizer "RAWTEXT" state: consumes one char, returns a token or null
1771                 c = txt.charAt(cur++)
1772                 return new_eof_token() if c is '' # EOF
1773                 if c is '<'
1774                         tok_state = tok_state_rawtext_less_than_sign
1775                         return null
1776                 if c is "\u0000"
1777                         parse_error()
1778                         return new_character_token "\ufffd"
1779                 # Anything else
1780                 # is emitted unchanged as a character token
1781                 return new_character_token c
1782
1783         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
1784         tok_state_script_data = -> # tokenizer "script data" state: consumes one char, returns a token or null
1785                 c = txt.charAt(cur++)
1786                 if c is '' # EOF
1787                         return new_eof_token()
1788                 if c is "\u0000"
1789                         parse_error()
1790                         return new_character_token "\ufffd"
1791                 # '<' only changes state; nothing is emitted for it here
1792                 if c is '<'
1793                         tok_state = tok_state_script_data_less_than_sign
1794                         return null
1795                 return new_character_token c
1796
1797         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
1798         tok_state_plaintext = -> # tokenizer "PLAINTEXT" state: every char becomes a character token until EOF
1799                 c = txt.charAt(cur++)
1800                 if c is '' # EOF
1801                         return new_eof_token()
1802                 if c is "\u0000"
1803                         parse_error()
1804                         return new_character_token "\ufffd"
1805                 # Anything else
1806                 # is emitted unchanged
1807                 return new_character_token c
1808
1809
1810         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1811         tok_state_tag_open = -> # tokenizer "tag open" state (just after '<'): returns a text node for a stray '<', else null
1812                 switch c = txt.charAt(cur++)
1813                         when '!'
1814                                 tok_state = tok_state_markup_declaration_open
1815                         when '/'
1816                                 tok_state = tok_state_end_tag_open
1817                         when '?'
1818                                 parse_error()
1819                                 tok_state = tok_state_bogus_comment
1820                         else
1821                                 if c isnt '' and lc_alpha.indexOf(c) > -1 # c is '' at EOF; String indexOf('') is 0, so EOF must be excluded (assumes lc_alpha/uc_alpha are strings)
1822                                         tok_cur_tag = new_open_tag c
1823                                         tok_state = tok_state_tag_name
1824                                 else if c isnt '' and uc_alpha.indexOf(c) > -1
1825                                         tok_cur_tag = new_open_tag c.toLowerCase()
1826                                         tok_state = tok_state_tag_name
1827                                 else
1828                                         parse_error() # not a tag after all (or EOF): the '<' is emitted as text
1829                                         tok_state = tok_state_data
1830                                         cur -= 1 # we didn't parse/handle the char after <
1831                                         return new_text_node '<'
1832                 return null
1833
1834         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1835         tok_state_end_tag_open = -> # tokenizer "end tag open" state (just after "</"): returns a text node at EOF, else null
1836                 c = txt.charAt(cur++)
1837                 if c is '' # EOF
1838                         parse_error()
1839                         tok_state = tok_state_data
1840                         return new_text_node '</'
1841                 if c is '>'
1842                         parse_error()
1843                         tok_state = tok_state_data
1844                 else if uc_alpha.indexOf(c) > -1
1845                         tok_cur_tag = new_end_tag c.toLowerCase()
1846                         tok_state = tok_state_tag_name
1847                 else if lc_alpha.indexOf(c) > -1
1848                         tok_cur_tag = new_end_tag c
1849                         tok_state = tok_state_tag_name
1850                 else
1851                         # neither '>' nor a letter
1852                         parse_error()
1853                         tok_state = tok_state_bogus_comment
1854                 return null
1855
1856         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1857         tok_state_tag_name = -> # tokenizer "tag name" state: extends tok_cur_tag.name; returns the tag on '>', else null
1858                 switch c = txt.charAt(cur++)
1859                         when '>'
1860                                 # tag finished: hand it to the caller and clear the in-progress slot
1861                                 tmp = tok_cur_tag
1862                                 tok_cur_tag = null
1863                                 tok_state = tok_state_data
1864                                 return tmp
1865                         when '/'
1866                                 tok_state = tok_state_self_closing_start_tag
1867                         when "\t", "\n", "\u000c", ' '
1868                                 tok_state = tok_state_before_attribute_name
1869                         when '' # EOF
1870                                 parse_error()
1871                                 tok_state = tok_state_data
1872                         when "\u0000"
1873                                 parse_error()
1874                                 tok_cur_tag.name += "\ufffd"
1875                         else
1876                                 # any other character extends the name;
1877                                 # characters found in uc_alpha are lower-cased first
1878                                 tok_cur_tag.name += if uc_alpha.indexOf(c) > -1 then c.toLowerCase() else c
1879                 return null
1880
1881         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
1882         tok_state_rcdata_less_than_sign = -> # tokenizer state after '<' inside RCDATA
1883                 c = txt.charAt(cur++)
1884                 unless c is '/'
1885                         # Anything else: the '<' was plain text
1886                         cur -= 1 # reconsume the input character
1887                         tok_state = tok_state_rcdata
1888                         return new_character_token '<'
1889                 temporary_buffer = ''
1890                 tok_state = tok_state_rcdata_end_tag_open
1891                 return null
1892
1893         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
1894         tok_state_rcdata_end_tag_open = -> # tokenizer state after "</" inside RCDATA: starts an end tag on a letter, else emits "</"
1895                 c = txt.charAt(cur++)
1896                 if c isnt '' and uc_alpha.indexOf(c) > -1 # c is '' at EOF; String indexOf('') is 0, so EOF must be excluded (assumes uc_alpha/lc_alpha are strings)
1897                         tok_cur_tag = new_end_tag c.toLowerCase()
1898                         temporary_buffer += c
1899                         tok_state = tok_state_rcdata_end_tag_name
1900                         return null
1901                 if c isnt '' and lc_alpha.indexOf(c) > -1
1902                         tok_cur_tag = new_end_tag c
1903                         temporary_buffer += c
1904                         tok_state = tok_state_rcdata_end_tag_name
1905                         return null
1906                 # Anything else (including EOF)
1907                 tok_state = tok_state_rcdata
1908                 cur -= 1 # reconsume the input character
1909                 return new_character_token "</" # fixfull separate these
1910
1911         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
1912         is_appropriate_end_tag = (t) ->
1913                 # The spec compares against the name of the last start tag emitted by
1914                 # the tokenizer. This is only called from the "raw" states, which I'm
1915                 # pretty sure all push that start tag onto open_els, so the current
1916                 # node is checked instead. TODO: verify once script data states exist
1917                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
1918                 return false unless t.type is TYPE_END_TAG
1919                 return t.name is open_els[0].name
1920
1921         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
1922         tok_state_rcdata_end_tag_name = -> # tokenizer state inside a possible end tag in RCDATA: returns the end tag, "</..." text, or null
1923                 c = txt.charAt(cur++)
1924                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1925                         if is_appropriate_end_tag tok_cur_tag
1926                                 tok_state = tok_state_before_attribute_name
1927                                 return null
1928                         # else fall through to "Anything else"
1929                 if c is '/'
1930                         if is_appropriate_end_tag tok_cur_tag
1931                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
1932                                 return null
1933                         # else fall through to "Anything else"
1934                 if c is '>'
1935                         if is_appropriate_end_tag tok_cur_tag
1936                                 tok_state = tok_state_data
1937                                 return tok_cur_tag
1938                         # else fall through to "Anything else"
1939                 if c isnt '' and uc_alpha.indexOf(c) > -1 # c is '' at EOF; String indexOf('') is 0, so an unguarded test would loop here forever (assumes uc_alpha/lc_alpha are strings)
1940                         tok_cur_tag.name += c.toLowerCase()
1941                         temporary_buffer += c
1942                         return null
1943                 if c isnt '' and lc_alpha.indexOf(c) > -1
1944                         tok_cur_tag.name += c
1945                         temporary_buffer += c
1946                         return null
1947                 # Anything else (including EOF): not an end tag, give the buffered text back
1948                 tok_state = tok_state_rcdata
1949                 cur -= 1 # reconsume the input character
1950                 return new_character_token '</' + temporary_buffer # fixfull separate these
1951
1952         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
1953         tok_state_rawtext_less_than_sign = -> # tokenizer state after '<' inside RAWTEXT
1954                 c = txt.charAt(cur++)
1955                 unless c is '/'
1956                         # Anything else: the '<' was plain text
1957                         cur -= 1 # reconsume the input character
1958                         tok_state = tok_state_rawtext
1959                         return new_character_token '<'
1960                 temporary_buffer = ''
1961                 tok_state = tok_state_rawtext_end_tag_open
1962                 return null
1963
1964         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
1965         tok_state_rawtext_end_tag_open = -> # tokenizer state after "</" inside RAWTEXT: starts an end tag on a letter, else emits "</"
1966                 c = txt.charAt(cur++)
1967                 if c isnt '' and uc_alpha.indexOf(c) > -1 # c is '' at EOF; String indexOf('') is 0, so EOF must be excluded (assumes uc_alpha/lc_alpha are strings)
1968                         tok_cur_tag = new_end_tag c.toLowerCase()
1969                         temporary_buffer += c
1970                         tok_state = tok_state_rawtext_end_tag_name
1971                         return null
1972                 if c isnt '' and lc_alpha.indexOf(c) > -1
1973                         tok_cur_tag = new_end_tag c
1974                         temporary_buffer += c
1975                         tok_state = tok_state_rawtext_end_tag_name
1976                         return null
1977                 # Anything else (including EOF)
1978                 tok_state = tok_state_rawtext
1979                 cur -= 1 # reconsume the input character
1980                 return new_character_token "</" # fixfull separate these
1981
1982         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
1983         tok_state_rawtext_end_tag_name = -> # tokenizer state inside a possible end tag in RAWTEXT: returns the end tag, "</..." text, or null
1984                 c = txt.charAt(cur++)
1985                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1986                         if is_appropriate_end_tag tok_cur_tag
1987                                 tok_state = tok_state_before_attribute_name
1988                                 return null
1989                         # else fall through to "Anything else"
1990                 if c is '/'
1991                         if is_appropriate_end_tag tok_cur_tag
1992                                 tok_state = tok_state_self_closing_start_tag
1993                                 return null
1994                         # else fall through to "Anything else"
1995                 if c is '>'
1996                         if is_appropriate_end_tag tok_cur_tag
1997                                 tok_state = tok_state_data
1998                                 return tok_cur_tag
1999                         # else fall through to "Anything else"
2000                 if c isnt '' and uc_alpha.indexOf(c) > -1 # c is '' at EOF; String indexOf('') is 0, so an unguarded test would loop here forever (assumes uc_alpha/lc_alpha are strings)
2001                         tok_cur_tag.name += c.toLowerCase()
2002                         temporary_buffer += c
2003                         return null
2004                 if c isnt '' and lc_alpha.indexOf(c) > -1
2005                         tok_cur_tag.name += c
2006                         temporary_buffer += c
2007                         return null
2008                 # Anything else (including EOF): not an end tag, give the buffered text back
2009                 tok_state = tok_state_rawtext
2010                 cur -= 1 # reconsume the input character
2011                 return new_character_token '</' + temporary_buffer # fixfull separate these
2012
2013         # TODO _all_ of the missing states here (17-33) are for parsing script tags
2014
2015         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2016         tok_state_before_attribute_name = -> # tokenizer "before attribute name" state: returns the tag on '>', else null
2017                 attr_name = null
2018                 switch c = txt.charAt(cur++)
2019                         when "\t", "\n", "\u000c", ' '
2020                                 return null
2021                         when '/'
2022                                 tok_state = tok_state_self_closing_start_tag
2023                                 return null
2024                         when '>'
2025                                 tmp = tok_cur_tag
2026                                 tok_cur_tag = null
2027                                 tok_state = tok_state_data
2028                                 return tmp
2029                         when '' # EOF
2030                                 parse_error()
2031                                 tok_state = tok_state_data
2032                         when "\u0000"
2033                                 parse_error()
2034                                 attr_name = "\ufffd"
2035                         when '"', "'", '<', '='
2036                                 parse_error()
2037                                 attr_name = c
2038                         else
2039                                 # characters found in uc_alpha are lower-cased, the rest kept as-is
2040                                 attr_name = if uc_alpha.indexOf(c) > -1 then c.toLowerCase() else c
2041                 # only the branches that picked a first character start a new attribute;
2042                 # it is put at the front of attrs_a with an empty value
2043                 if attr_name?
2044                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2045                         tok_state = tok_state_attribute_name
2046                 return null
2047
2048         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2049         tok_state_attribute_name = -> # tokenizer "attribute name" state: appends to the name of attrs_a[0]; returns the tag on '>', else null
2050                 switch c = txt.charAt(cur++)
2051                         when "\t", "\n", "\u000c", ' '
2052                                 tok_state = tok_state_after_attribute_name
2053                         when '/'
2054                                 tok_state = tok_state_self_closing_start_tag
2055                         when '='
2056                                 tok_state = tok_state_before_attribute_value
2057                         when '>'
2058                                 tok_state = tok_state_data
2059                                 tmp = tok_cur_tag
2060                                 tok_cur_tag = null
2061                                 return tmp
2062                         when "\u0000"
2063                                 parse_error()
2064                                 tok_cur_tag.attrs_a[0][0] += "\ufffd" # append: the name built so far must be kept
2065                         when '"', "'", '<'
2066                                 parse_error()
2067                                 tok_cur_tag.attrs_a[0][0] += c # append, same as any other character
2068                         when '' # EOF
2069                                 parse_error()
2070                                 tok_state = tok_state_data
2071                         else
2072                                 if uc_alpha.indexOf(c) > -1
2073                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase() # append the lower-cased letter
2074                                 else
2075                                         tok_cur_tag.attrs_a[0][0] += c
2076                 return null
2077
2078         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2079         tok_state_after_attribute_name = -> # tokenizer "after attribute name" state: returns the tag on '>', else null
2080                 c = txt.charAt(cur++)
2081                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2082                         return null
2083                 if c is '/'
2084                         tok_state = tok_state_self_closing_start_tag
2085                         return null
2086                 if c is '='
2087                         tok_state = tok_state_before_attribute_value
2088                         return null
2089                 if c is '>'
2090                         tok_state = tok_state_data
2091                         tmp = tok_cur_tag # emit the finished tag, as the other tag states do on '>'
2092                         tok_cur_tag = null
2093                         return tmp
2094                 if c is '' # EOF -- tested before the letter check because indexOf('') on a string is 0
2095                         parse_error()
2096                         tok_state = tok_state_data
2097                         cur -= 1 # reconsume
2098                         return null
2099                 if c is "\u0000"
2100                         parse_error()
2101                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2102                         tok_state = tok_state_attribute_name
2103                         return null
2104                 if uc_alpha.indexOf(c) > -1
2105                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2106                         tok_state = tok_state_attribute_name
2107                         return null
2108                 parse_error() if c is '"' or c is "'" or c is '<' # reported, then handled like anything else
2109                 tok_cur_tag.attrs_a.unshift [c, ''] # Anything else: c starts a new attribute name
2110                 tok_state = tok_state_attribute_name
2111                 return null # explicit: otherwise the assignment above would be returned implicitly
2112
2113         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2114         tok_state_before_attribute_value = -> # tokenizer "before attribute value" state: returns the tag on '>', else null
2115                 switch c = txt.charAt(cur++)
2116                         when "\t", "\n", "\u000c", ' ' then return null # whitespace before the value is skipped
2117                         when '"'
2118                                 tok_state = tok_state_attribute_value_double_quoted
2119                         when '&'
2120                                 tok_state = tok_state_attribute_value_unquoted
2121                                 cur -= 1 # reconsume the '&' in the unquoted state
2122                         when "'"
2123                                 tok_state = tok_state_attribute_value_single_quoted
2124                         when "\u0000"
2125                                 parse_error()
2126                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2127                                 tok_state = tok_state_attribute_value_unquoted
2128                         when '>'
2129                                 parse_error() # missing attribute value; the tag is still emitted
2130                                 tok_state = tok_state_data
2131                                 tmp = tok_cur_tag
2132                                 tok_cur_tag = null
2133                                 return tmp
2134                         when '' # EOF
2135                                 parse_error()
2136                                 tok_state = tok_state_data
2137                         else
2138                                 parse_error() if c is '<' or c is '=' or c is '`' # parse errors per spec, then treated as anything else
2139                                 tok_cur_tag.attrs_a[0][1] += c
2140                                 tok_state = tok_state_attribute_value_unquoted
2141                 return null
2142
2143         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2144         tok_state_attribute_value_double_quoted = ->
2145                 switch c = txt.charAt(cur++)
2146                         when '"'
2147                                 tok_state = tok_state_after_attribute_value_quoted
2148                         when '&'
2149                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2150                         when "\u0000"
2151                                 # Parse error
2152                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2153                         when '' # EOF
2154                                 parse_error()
2155                                 tok_state = tok_state_data
2156                         else
2157                                 tok_cur_tag.attrs_a[0][1] += c
2158                 return null
2159
2160         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2161         tok_state_attribute_value_single_quoted = ->
2162                 switch c = txt.charAt(cur++)
2163                         when "'"
2164                                 tok_state = tok_state_after_attribute_value_quoted
2165                         when '&'
2166                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2167                         when "\u0000"
2168                                 # Parse error
2169                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2170                         when '' # EOF
2171                                 parse_error()
2172                                 tok_state = tok_state_data
2173                         else
2174                                 tok_cur_tag.attrs_a[0][1] += c
2175                 return null
2176
2177         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2178         tok_state_attribute_value_unquoted = ->
2179                 switch c = txt.charAt(cur++)
2180                         when "\t", "\n", "\u000c", ' '
2181                                 tok_state = tok_state_before_attribute_name
2182                         when '&'
2183                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2184                         when '>'
2185                                 tok_state = tok_state_data
2186                                 tmp = tok_cur_tag
2187                                 tok_cur_tag = null
2188                                 return tmp
2189                         when "\u0000"
2190                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2191                         when '' # EOF
2192                                 parse_error()
2193                                 tok_state = tok_state_data
2194                         else
2195                                 # Parse Error if ', <, = or ` (backtick)
2196                                 tok_cur_tag.attrs_a[0][1] += c
2197                 return null
2198
2199         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2200         tok_state_after_attribute_value_quoted = ->
2201                 switch c = txt.charAt(cur++)
2202                         when "\t", "\n", "\u000c", ' '
2203                                 tok_state = tok_state_before_attribute_name
2204                         when '/'
2205                                 tok_state = tok_state_self_closing_start_tag
2206                         when '>'
2207                                 tok_state = tok_state_data
2208                                 tmp = tok_cur_tag
2209                                 tok_cur_tag = null
2210                                 return tmp
2211                         when '' # EOF
2212                                 parse_error()
2213                                 tok_state = tok_state_data
2214                         else
2215                                 # Parse Error
2216                                 tok_state = tok_state_before_attribute_name
2217                                 cur -= 1 # we didn't handle that char
2218                 return null
2219
2220         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2221         # Don't set this as a state, just call it
2222         # returns a string (NOT a text node)
2223         parse_character_reference = (allowed_char = null, in_attr = false) ->
2224                 if cur >= txt.length
2225                         return '&'
2226                 switch c = txt.charAt(cur)
2227                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2228                                 # explicitly not a parse error
2229                                 return '&'
2230                         when ';'
2231                                 # there has to be "one or more" alnums between & and ; to be a parse error
2232                                 return '&'
2233                         when '#'
2234                                 if cur + 1 >= txt.length
2235                                         return '&'
2236                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
2237                                         prefix = '#x'
2238                                         charset = hex_chars
2239                                         start = cur + 2
2240                                 else
2241                                         charset = digits
2242                                         start = cur + 1
2243                                         prefix = '#'
2244                                 i = 0
2245                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2246                                         i += 1
2247                                 if i is 0
2248                                         return '&'
2249                                 if txt.charAt(start + i) is ';'
2250                                         i += 1
2251                                 # FIXME This is supposed to generate parse errors for some chars
2252                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2253                                 if decoded?
2254                                         cur = start + i
2255                                         return decoded
2256                                 return '&'
2257                         else
2258                                 for i in [0...31]
2259                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
2260                                                 break
2261                                 if i is 0
2262                                         # exit early, because parse_error() below needs at least one alnum
2263                                         return '&'
2264                                 if txt.charAt(cur + i) is ';'
2265                                         i += 1 # include ';' terminator in value
2266                                         decoded = decode_named_char_ref txt.substr(cur, i)
2267                                         if decoded?
2268                                                 cur += i
2269                                                 return decoded
2270                                         parse_error()
2271                                         return '&'
2272                                 else
2273                                         # no ';' terminator (only legacy char refs)
2274                                         max = i
2275                                         for i in [2..max] # no prefix matches, so ok to check shortest first
2276                                                 c = legacy_char_refs[txt.substr(cur, i)]
2277                                                 if c?
2278                                                         if in_attr
2279                                                                 if txt.charAt(cur + i) is '='
2280                                                                         # "because some legacy user agents will
2281                                                                         # misinterpret the markup in those cases"
2282                                                                         parse_error()
2283                                                                         return '&'
2284                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
2285                                                                         # this makes attributes forgiving about url args
2286                                                                         return '&'
2287                                                         # ok, and besides the weird exceptions for attributes...
2288                                                         # return the matching char
2289                                                         cur += i # consume entity chars
2290                                                         parse_error() # because no terminating ";"
2291                                                         return c
2292                                         parse_error()
2293                                         return '&'
2294                 return # never reached
2295
2296         # tree constructor initialization
2297         # see comments on TYPE_TAG/etc for the structure of this data
2298         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2299         open_els = [tree]
2300         afe = [] # active formatting elements
2301         template_insertion_modes = []
2302         insertion_mode = ins_mode_in_body
2303         original_insertion_mode = insertion_mode # TODO check spec
2304         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2305         flag_frameset_ok = true
2306         flag_parsing = true
2307         flag_foster_parenting = false
2308         form_element_pointer = null
2309         temporary_buffer = null
2310         pending_table_character_tokens = []
2311
2312         # tokenizer initialization
2313         tok_state = tok_state_data
2314
2315         # proccess input
2316         while flag_parsing
2317                 t = tok_state()
2318                 if t?
2319                         insertion_mode t
2320         return tree.children
2321
2322 # everything below is tests on the above
test_equals = (description, output, expected_output) ->
        # Report on the console whether output is strictly equal to
        # expected_output; on mismatch, print the description and both values.
        unless output is expected_output
                console.log "FAILED: \"#{description}\""
                console.log "   Expected: #{expected_output}"
                console.log "     Actual: #{output}"
                return
        # don't say name, so smart consoles can merge all of these
        console.log "passed."
serialize_els = (els, shallow, show_ids) ->
        # Serialize every element of els (by calling its serialize method with
        # shallow and show_ids) and join the results with commas.
        # Returns '' for an empty list.
        return ("#{t.serialize shallow, show_ids}" for t in els).join ','
test_parser = (args) ->
        # Run one parser test case.
        #
        # args.name:     label printed with the result
        # args.html:     input handed to parse_html
        # args.expected: expected serialization of the parse result
        #
        # Prints "passed" on success. On failure prints the buffered debug
        # log, the input, the expected and actual serializations and the
        # parse errors that were reported (if any).
        debug_log_reset()
        parse_errors = []
        errors_cb = (i) ->
                parse_errors.push i
        prev_node_id = 0 # reset counter
        parsed = parse_html args.html, errors_cb
        serialized = serialize_els parsed, false, false
        if serialized is args.expected
                console.log "passed \"#{args.name}\""
                return
        # failure: dump everything that helps with debugging
        debug_log_each (str) -> console.log str
        console.log "FAILED: \"#{args.name}\""
        console.log "      Input: #{args.html}"
        console.log "    Correct: #{args.expected}"
        console.log "     Output: #{serialized}"
        if parse_errors.length is 0
                console.log "   No parse errors"
        else
                console.log " parse errs: #{JSON.stringify parse_errors}"
        return
2359
# text and character references
test_parser
        name: "empty"
        html: ""
        expected: ''
test_parser
        name: "just text"
        html: "abc"
        expected: 'text:"abc"'
test_parser
        name: "named entity"
        html: "a&amp;1234"
        expected: 'text:"a&1234"'
test_parser
        name: "broken named character references"
        html: "1&amp2&&amp;3&aabbcc;"
        expected: 'text:"1&2&&3&aabbcc;"'
test_parser
        name: "numbered entity overrides"
        html: "1&#X80&#x80; &#x83"
        expected: 'text:"1€€ ƒ"'
# tags and attributes
test_parser
        name: "open tag"
        html: "foo<span>bar"
        expected: 'text:"foo",tag:"span",{},[text:"bar"]'
test_parser
        name: "open tag with attributes"
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar"
        expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
test_parser
        name: "open tag with attributes of various quotings"
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar"
        expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
test_parser
        name: "attribute entity exceptions dq"
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser
        name: "attribute entity exceptions sq"
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
test_parser
        name: "attribute entity exceptions uq"
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
# closing tags, matched and mis-matched
test_parser
        name: "matching closing tags"
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar"
        expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
test_parser
        name: "missing closing tag inside"
        html: "foo<div>bar<span>baz</div>qux"
        expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
test_parser
        name: "mis-matched closing tags"
        html: "<span>12<div>34</span>56</div>78"
        expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
test_parser
        name: "mis-matched formatting elements"
        html: "12<b>34<i>56</b>78</i>90"
        expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
test_parser
        name: "8.2.8.1 Misnested tags: <b><i></b></i>"
        html: '<p>1<b>2<i>3</b>4</i>5</p>'
        expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
test_parser
        name: "8.2.8.2 Misnested tags: <b><p></b></p>"
        html: '<b>1<p>2</b>3</p>'
        expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
test_parser
        name: "crazy formatting elements test"
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>"
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
        # firefox does this:
        expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
2416 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
test_parser
        name: "html5lib aaa 1"
        html: '<a><p></a></p>'
        expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
test_parser
        name: "html5lib aaa 2"
        html: '<a>1<p>2</a>3</p>'
        expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
test_parser
        name: "html5lib aaa 3"
        html: '<a>1<button>2</a>3</button>'
        expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
test_parser
        name: "html5lib aaa 4"
        html: '<a>1<b>2</a>3</b>'
        expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
test_parser
        name: "html5lib aaa 5 (two divs deep)"
        html: '<a>1<div>2<div>3</a>4</div>5</div>'
        expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
test_parser
        name: "html5lib aaa 6 (foster parenting)"
        html: '<table><a>1<p>2</a>3</p>'
        expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
test_parser
        name: "html5lib aaa 7 (aaa, eof) 1"
        html: '<b><b><a><p></a>'
        expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
test_parser
        name: "html5lib aaa 8 (aaa, eof) 2"
        html: '<b><a><b><p></a>'
        expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
test_parser
        name: "html5lib aaa 9 (aaa, eof) 3"
        html: '<a><b><b><p></a>'
        expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
test_parser
        name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)"
        html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>'
        expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
test_parser
        name: "html5lib aaa 11 (table with foster parenting, formatting el and td)"
        html: '<table><a>1<td>2</td>3</table>'
        expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
test_parser
        name: "html5lib aaa 12 (table with foster parenting, split text)"
        html: '<table>A<td>B</td>C</table>'
        expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
# TODO implement svg and namespacing
#test_parser name: "html5lib aaa 13 (svg tr input)", \
#       html: '<a><svg><tr><input></a>',
#       expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
test_parser
        name: "html5lib aaa 14 (deep ?outer aaa)"
        html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>'
        expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
test_parser
        name: "html5lib aaa 15 (deep ?inner aaa)"
        html: '<div><a><b><u><i><code><div></a>'
        expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
test_parser
        name: "html5lib aaa 16 (correctly nested 4b)"
        html: '<b><b><b><b>x</b></b></b></b>y'
        expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
test_parser
        name: "html5lib aaa 17 (formatting, implied /p, noah's ark)"
        html: '<p><b><b><b><b><p>x'
        expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
test_parser
        name: "variation on html5lib aaa 17 (with attributes in various orders)"
        html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x'
        expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
test_parser
        name: "junk after attribute close-quote"
        html: '<p><b c="d", e="f">foo<p>x'
        expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
test_parser
        name: "html5lib aaa02 1"
        html: '<b>1<i>2<p>3</b>4'
        expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
test_parser
        name: "html5lib aaa02 2"
        html: '<a><div><style></style><address><a>'
        expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
# table parsing tests
test_parser
        name: "html5lib tables 1"
        html: '<table><th>'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
test_parser
        name: "html5lib tables 2"
        html: '<table><td>'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
test_parser
        name: "html5lib tables 3"
        html: "<table><col foo='bar'>"
        expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
test_parser
        name: "html5lib tables 4"
        html: '<table><colgroup></html>foo'
        expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
test_parser
        name: "html5lib tables 5"
        html: '<table></table><p>foo'
        expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
test_parser
        name: "html5lib tables 6"
        html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
test_parser
        name: "html5lib tables 7"
        html: '<table><select><option>3</select></table>'
        expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
test_parser
        name: "html5lib tables 8"
        html: '<table><select><table></table></select></table>'
        expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
test_parser
        name: "html5lib tables 9"
        html: '<table><select></table>'
        expected: 'tag:"select",{},[],tag:"table",{},[]'
test_parser
        name: "html5lib tables 10"
        html: '<table><select><option>A<tr><td>B</td></tr></table>'
        expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
test_parser
        name: "html5lib tables 11"
        html: '<table><td></body></caption></col></colgroup></html>foo'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
test_parser
        name: "html5lib tables 12"
        html: '<table><td>A</table>B'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
test_parser
        name: "html5lib tables 13"
        html: '<table><tr><caption>'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
test_parser
        name: "html5lib tables 14"
        html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
test_parser
        name: "html5lib tables 15"
        html: '<table><td><tr>'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
test_parser
        name: "html5lib tables 16"
        html: '<table><td><button><td>'
        expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
2529 # TODO implement svg parsing
2530 #test_parser name: "html5lib tables 17", \
2531 #       html: '<table><tr><td><svg><desc><td>',
2532 #       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'