parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
  22 # indicate places where additional code is needed for full HTML document
  23 # parsing.
  24 #
  25 # Instead, the data structure produced by this parser is an array of Nodes.
  26
  27
  28 # stacks/lists
  29 #
  30 # the spec uses a many different words do indicate which ends of lists/stacks
  31 # they are talking about (and relative movement within the lists/stacks). This
  32 # section splains. I'm implementing "lists" (afe and open_els) the same way
  33 # (both as stacks)
  34 #
  35 # stacks grow downward (current element is index=0)
  36 #
  37 # example: open_els = [a, b, c, d, e, f, g]
  38 #
  39 # "grows downwards" means it's visualized like this: (index: el, names)
  40 #
  41 #   6: g "start of the list", "topmost", "first"
  42 #   5: f
  43 #   4: e "previous" (to d), "above", "before"
  44 #   3: d   (previous/next are relative to this element)
  45 #   2: c "next", "after", "lower", "below"
  46 #   1: b
  47 #   0: a "end of the list", "current node", "bottommost", "last"
  48
  49
  50
  51 # Each node is an obect of the Node class. Here are the Node types:
  52 TYPE_TAG = 0 # name, {attributes}, [children]
  53 TYPE_TEXT = 1 # "text"
  54 TYPE_COMMENT = 2
  55 TYPE_DOCTYPE = 3
  56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  58 TYPE_END_TAG = 5 # name
  59 TYPE_EOF = 6
  60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  62
  63 # namespace constants
  64 NS_HTML = 1
  65 NS_MATHML = 2
  66 NS_SVG = 3
  67
  68 g_debug_log = []
  69 debug_log_reset = ->
  70         g_debug_log = []
  71 debug_log = (str) ->
  72         g_debug_log.push str
  73 debug_log_each = (cb) ->
  74         for str in g_debug_log
  75                 cb str
  76
  77 prev_node_id = 0
  78 class Node
  79         constructor: (type, args = {}) ->
  80                 @type = type # one of the TYPE_* constants above
  81                 @name = args.name ? '' # tag name
  82                 @text = args.text ? '' # contents for text/comment nodes
  83                 @attrs = args.attrs ? {}
  84                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
  85                 @children = args.children ? []
  86                 @namespace = args.namespace ? NS_HTML
  87                 @parent = args.parent ? null
  88                 if args.id?
  89                         @id = "#{args.id}+"
  90                 else
  91                         @id = "#{++prev_node_id}"
  92         shallow_clone: -> # return a new node that's the same except without the children or parent
  93                 # WARNING this doesn't work right on open tags that are still being parsed
  94                 attrs = {}
  95                 attrs[k] = v for k, v of @attrs
  96                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
  97         serialize: (shallow = false, show_ids = false) -> # for unit tests
  98                 ret = ''
  99                 switch @type
 100                         when TYPE_TAG
 101                                 ret += 'tag:'
 102                                 ret += JSON.stringify @name
 103                                 ret += ','
 104                                 if show_ids
 105                                         ret += "##{@id},"
 106                                 if shallow
 107                                         break
 108                                 ret += JSON.stringify @attrs
 109                                 ret += ',['
 110                                 sep = ''
 111                                 for c in @children
 112                                         ret += sep
 113                                         sep = ','
 114                                         ret += c.serialize shallow, show_ids
 115                                 ret += ']'
 116                         when TYPE_TEXT
 117                                 ret += 'text:'
 118                                 ret += JSON.stringify @text
 119                         when TYPE_COMMENT
 120                                 ret += 'comment:'
 121                                 ret += JSON.stringify @text
 122                         when TYPE_DOCTYPE
 123                                 ret += 'doctype'
 124                                 # FIXME
 125                         when TYPE_AFE_MARKER
 126                                 ret += 'marker'
 127                         when TYPE_AAA_BOOKMARK
 128                                 ret += 'aaa_bookmark'
 129                         else
 130                                 ret += 'unknown:'
 131                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 132                 return ret
 133
 134 # helpers: (only take args that are normally known when parser creates nodes)
 135 new_open_tag = (name) ->
 136         return new Node TYPE_START_TAG, name: name
 137 new_end_tag = (name) ->
 138         return new Node TYPE_END_TAG, name: name
 139 new_element = (name) ->
 140         return new Node TYPE_TAG, name: name
 141 new_text_node = (txt) ->
 142         return new Node TYPE_TEXT, text: txt
 143 new_comment_node = (txt) ->
 144         return new Node TYPE_COMMENT, text: txt
 145 new_eof_token = ->
 146         return new Node TYPE_EOF
 147 new_afe_marker = ->
 148         return new Node TYPE_AFE_MARKER
 149 new_aaa_bookmark = ->
 150         return new Node TYPE_AAA_BOOKMARK
 151
 152 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 153 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
 154 digits = "0123456789"
 155 alnum = lc_alpha + uc_alpha + digits
 156 hex_chars = digits + "abcdefABCDEF"
 157
 158 # some SVG elements have dashes in them
 159 tag_name_chars = alnum + "-"
 160
 161 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 162 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 163
 164 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 165 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 166
 167 # These are the character references that don't need a terminating semicolon
 168 # min length: 2, max: 6, none are a prefix of any other.
 169 legacy_char_refs = {
 170         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 171         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 172         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 173         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 174         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 175         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 176         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 177         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 178         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 179         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 180         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 181         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 182         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 183         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 184         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 185         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 186         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 187         yen: '¥', yuml: 'ÿ'
 188 }
 189
 190 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 191 raw_text_elements = ['script', 'style']
 192 escapable_raw_text_elements = ['textarea', 'title']
 193 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 194 svg_elements = [
 195         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 196         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 197         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 198         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 199         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 200         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 201         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 202         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 203         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 204         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 205         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 206         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 207         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 208         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 209         'view', 'vkern'
 210 ]
 211
 212 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 213 mathml_elements = [
 214         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 215         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 216         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 217         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 218         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 219         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 220         'determinant', 'diff', 'divergence', 'divide', 'domain',
 221         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 222         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 223         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 224         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 225         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 226         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 227         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 228         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 229         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 230         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 231         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 232         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 233         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 234         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 235         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 236         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 237         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 238         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 239         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 240         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 241         'vectorproduct', 'xor'
 242 ]
 243 # foreign_elements = [svg_elements..., mathml_elements...]
 244 #normal_elements = All other allowed HTML elements are normal elements.
 245
 246 special_elements = {
 247         # HTML:
 248         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 249         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 250         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 251         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 252         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 253         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 254         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 255         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 256         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 257         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 258         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
 259         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
 260         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
 261         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
 262         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
 263         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
 264         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
 265         wbr:NS_HTML, xmp:NS_HTML,
 266
 267         # MathML:
 268         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 269         'annotation-xml':NS_MATHML,
 270
 271         # SVG:
 272         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 273 }
 274
 275 formatting_elements = {
 276          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 277          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 278          u: true
 279 }
 280
 281 foster_parenting_targets = {
 282         table: true
 283         tbody: true
 284         tfoot: true
 285         thead: true
 286         tr: true
 287 }
 288
 289 # all html I presume
 290 end_tag_implied = {
 291         dd: true
 292         dt: true
 293         li: true
 294         option: true
 295         optgroup: true
 296         p: true
 297         rb: true
 298         rp: true
 299         rt: true
 300         rtc: true
 301 }
 302
 303 el_is_special = (e) ->
 304         return special_elements[e.name]?
 305         # FIXME it should really be:
 306         #return special_elements[e.name] is e.namespace
 307
 308 # decode_named_char_ref()
 309 #
 310 # The list of named character references is _huge_ so ask the browser to decode
 311 # for us instead of wasting bandwidth/space on including the table here.
 312 #
 313 # Pass without the "&" but with the ";" examples:
 314 #    for "&amp" pass "amp;"
 315 #    for "&#x2032" pass "x2032;"
 316 g_dncr = {
 317         cache: {}
 318         textarea: document.createElement('textarea')
 319 }
 320 # TODO test this in IE8
 321 decode_named_char_ref = (txt) ->
 322         txt = "&#{txt}"
 323         decoded = g_dncr.cache[txt]
 324         return decoded if decoded?
 325         g_dncr.textarea.innerHTML = txt
 326         decoded = g_dncr.textarea.value
 327         return null if decoded is txt
 328         return g_dncr.cache[txt] = decoded
 329
 330 parse_html = (txt, parse_error_cb = null) ->
 331         cur = 0 # index of next char in txt to be parsed
 332         # declare tree and tokenizer variables so they're in scope below
 333         tree = null
 334         open_els = [] # stack of open elements
 335         insertion_mode = null
 336         tok_state = null
 337         tok_cur_tag = null # partially parsed tag
 338         flag_frameset_ok = null
 339         flag_parsing = null
 340         flag_foster_parenting = null
 341         form_element_pointer = null
 342         afe = [] # active formatting elements
 343
 344         parse_error = ->
 345                 if parse_error_cb?
 346                         parse_error_cb cur
 347                 else
 348                         console.log "Parse error at character #{cur} of #{txt.length}"
 349
 350
 351         # the functions below impliment the Tree Contstruction algorithm
 352         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 353
 354         # But first... the helpers
 355         template_tag_is_open = ->
 356                 for t in open_els
 357                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
 358                                 return true
 359                 return false
 360         is_in_scope_x = (tag_name, scope, namespace) ->
 361                 for t in open_els
 362                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 363                                 return true
 364                         if scope[t.name] is t.namespace
 365                                 return false
 366                 return false
 367         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 368                 for t in open_els
 369                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 370                                 return true
 371                         if scope[t.name] is t.namespace
 372                                 return false
 373                         if scope2[t.name] is t.namespace
 374                                 return false
 375                 return false
 376         standard_scopers = { # FIXME these are supposed to be namespace specific
 377                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
 378                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
 379                 template: NS_HTML, mi: NS_MATHML,
 380
 381                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
 382                 'annotation-xml': NS_MATHML,
 383
 384                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
 385         }
 386         button_scopers = button: NS_HTML
 387         li_scopers = ol: NS_HTML, ul: NS_HTML
 388         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 389         is_in_scope = (tag_name, namespace = null) ->
 390                 return is_in_scope_x tag_name, standard_scopers, namespace
 391         is_in_button_scope = (tag_name, namespace = null) ->
 392                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 393         is_in_table_scope = (tag_name, namespace = null) ->
 394                 return is_in_scope_x tag_name, table_scopers, namespace
 395         is_in_select_scope = (tag_name, namespace = null) ->
 396                 for t in open_els
 397                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 398                                 return true
 399                         if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
 400                                 return false
 401                 return false
 402         # this checks for a particular element, not by name
 403         el_is_in_scope = (el) ->
 404                 for t in open_els
 405                         if t is el
 406                                 return true
 407                         if standard_scopers[t.name] is t.namespace
 408                                 return false
 409                 return false
 410
 411         # 8.2.3.1 ...
 412         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
 413         reset_insertion_mode = ->
 414                 # 1. Let last be false.
 415                 last = false
 416                 # 2. Let node be the last node in the stack of open elements.
 417                 node_i = 0
 418                 node = open_els[node_i]
 419                 # 3. Loop: If node is the first node in the stack of open elements,
 420                 # then set last to true, and, if the parser was originally created as
 421                 # part of the HTML fragment parsing algorithm (fragment case) set node
 422                 # to the context element.
 423                 loop
 424                         if node_i is open_els.length - 1
 425                                 last = true
 426                                 # fixfull (fragment case)
 427
 428                         # 4. If node is a select element, run these substeps:
 429                         if node.name is 'select'
 430                                 # 1. If last is true, jump to the step below labeled done.
 431                                 unless last
 432                                         # 2. Let ancestor be node.
 433                                         ancestor_i = node_i
 434                                         ancestor = node
 435                                         # 3. Loop: If ancestor is the first node in the stack of
 436                                         # open elements, jump to the step below labeled done.
 437                                         loop
 438                                                 if ancestor_i is open_els.length - 1
 439                                                         break
 440                                                 # 4. Let ancestor be the node before ancestor in the stack
 441                                                 # of open elements.
 442                                                 ancestor_i += 1
 443                                                 ancestor = open_els[ancestor_i]
 444                                                 # 5. If ancestor is a template node, jump to the step below
 445                                                 # labeled done.
 446                                                 if ancestor.name is 'template'
 447                                                         break
 448                                                 # 6. If ancestor is a table node, switch the insertion mode
 449                                                 # to "in select in table" and abort these steps.
 450                                                 if ancestor.name is 'table'
 451                                                         insertion_mode = ins_mode_in_select_in_table
 452                                                         return
 453                                                 # 7. Jump back to the step labeled loop.
 454                                 # 8. Done: Switch the insertion mode to "in select" and abort
 455                                 # these steps.
 456                                 insertion_mode = ins_mode_in_select
 457                                 return
 458                         # 5. If node is a td or th element and last is false, then switch
 459                         # the insertion mode to "in cell" and abort these steps.
 460                         if (node.name is 'td' or node.name is 'th') and last is false
 461                                 insertion_mode = ins_mode_in_cell
 462                                 return
 463                         # 6. If node is a tr element, then switch the insertion mode to "in
 464                         # row" and abort these steps.
 465                         if node.name is 'tr'
 466                                 insertion_mode = ins_mode_in_row
 467                                 return
 468                         # 7. If node is a tbody, thead, or tfoot element, then switch the
 469                         # insertion mode to "in table body" and abort these steps.
 470                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
 471                                 insertion_mode = ins_mode_in_table_body
 472                                 return
 473                         # 8. If node is a caption element, then switch the insertion mode
 474                         # to "in caption" and abort these steps.
 475                         if node.name is 'caption'
 476                                 insertion_mode = ins_mode_in_caption
 477                                 return
 478                         # 9. If node is a colgroup element, then switch the insertion mode
 479                         # to "in column group" and abort these steps.
 480                         if node.name is 'colgroup'
 481                                 insertion_mode = ins_mode_in_column_group
 482                                 return
 483                         # 10. If node is a table element, then switch the insertion mode to
 484                         # "in table" and abort these steps.
 485                         if node.name is 'table'
 486                                 insertion_mode = ins_mode_in_table
 487                                 return
 488                         # 11. If node is a template element, then switch the insertion mode
 489                         # to the current template insertion mode and abort these steps.
 490                         # fixfull (template insertion mode stack)
 491
 492                         # 12. If node is a head element and last is true, then switch the
 493                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
 494                         # these steps. (fragment case)
 495                         if node.name is 'head' and last
 496                                 insertion_mode = ins_mode_in_body
 497                                 return
 498                         # 13. If node is a head element and last is false, then switch the
 499                         # insertion mode to "in head" and abort these steps.
 500                         if node.name is 'head' and last is false
 501                                 insertion_mode = ins_mode_in_head
 502                                 return
 503                         # 14. If node is a body element, then switch the insertion mode to
 504                         # "in body" and abort these steps.
 505                         if node.name is 'body'
 506                                 insertion_mode = ins_mode_in_body
 507                                 return
 508                         # 15. If node is a frameset element, then switch the insertion mode
 509                         # to "in frameset" and abort these steps. (fragment case)
 510                         if node.name is 'frameset'
 511                                 insertion_mode = ins_mode_in_frameset
 512                                 return
 513                         # 16. If node is an html element, run these substeps:
 514                         if node.name is 'html'
 515                                 # 1. If the head element pointer is null, switch the insertion
 516                                 # mode to "before head" and abort these steps. (fragment case)
 517                                 # fixfull (fragment case)
 518
 519                                 # 2. Otherwise, the head element pointer is not null, switch
 520                                 # the insertion mode to "after head" and abort these steps.
 521                                 insertion_mode = ins_mode_in_body # FIXME fixfull
 522                                 return
 523                         # 17. If last is true, then switch the insertion mode to "in body"
 524                         # and abort these steps. (fragment case)
 525                         if last
 526                                 insertion_mode = ins_mode_in_body
 527                                 return
 528                         # 18. Let node now be the node before node in the stack of open
 529                         # elements.
 530                         node_i += 1
 531                         node = open_els[node_i]
 532                         # 19. Return to the step labeled loop.
 533
 534         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 535         # this implementation is structured (mostly) as described at the link above.
 536         # capitalized comments are the "labels" described at the link above.
 537         reconstruct_active_formatting_elements = ->
 538                 return if afe.length is 0
 539                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
 540                         return
 541                 # Rewind
 542                 i = 0
 543                 loop
 544                         if i is afe.length - 1
 545                                 break
 546                         i += 1
 547                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
 548                                 i -= 1 # Advance
 549                                 break
 550                 # Create
 551                 loop
 552                         el = afe[i].shallow_clone()
 553                         tree_insert_element el
 554                         afe[i] = el
 555                         break if i is 0
 556                         i -= 1
 557
 558         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 559         # adoption agency algorithm
 560         # overview here:
 561         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 562         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 563         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
 564         adoption_agency = (subject) ->
 565                 if open_els[0].name is subject
 566                         el = open_els[0]
 567                         open_els.shift()
 568                         # remove it from the list of active formatting elements (if found)
 569                         for t, i in afe
 570                                 if t is el
 571                                         afe.splice i, 1
 572                                         break
 573                         return
 574                 outer = 0
 575                 loop
 576                         if outer >= 8
 577                                 return
 578                         outer += 1
 579                         # 5. Let formatting element be the last element in the list of
 580                         # active formatting elements that: is between the end of the list
 581                         # and the last scope marker in the list, if any, or the start of
 582                         # the list otherwise, and  has the tag name subject.
 583                         fe = null
 584                         for t, fe_of_afe in afe
 585                                 if t.type is TYPE_AFE_MARKER
 586                                         break
 587                                 if t.name is subject
 588                                         fe = t
 589                                         break
 590                         # If there is no such element, then abort these steps and instead
 591                         # act as described in the "any other end tag" entry above.
 592                         if fe is null
 593                                 in_body_any_other_end_tag subject
 594                                 return
 595                         # 6. If formatting element is not in the stack of open elements,
 596                         # then this is a parse error; remove the element from the list, and
 597                         # abort these steps.
 598                         in_open_els = false
 599                         for t, fe_of_open_els in open_els
 600                                 if t is fe
 601                                         in_open_els = true
 602                                         break
 603                         unless in_open_els
 604                                 parse_error()
 605                                 # "remove it from the list" must mean afe, since it's not in open_els
 606                                 afe.splice fe_of_afe, 1
 607                                 return
 608                         # 7. If formatting element is in the stack of open elements, but
 609                         # the element is not in scope, then this is a parse error; abort
 610                         # these steps.
 611                         unless el_is_in_scope fe
 612                                 parse_error()
 613                                 return
 614                         # 8. If formatting element is not the current node, this is a parse
 615                         # error. (But do not abort these steps.)
 616                         unless open_els[0] is fe
 617                                 parse_error()
 618                                 # continue
 619                         # 9. Let furthest block be the topmost node in the stack of open
 620                         # elements that is lower in the stack than formatting element, and
 621                         # is an element in the special category. There might not be one.
 622                         fb = null
 623                         fb_of_open_els = null
 624                         for t, i in open_els
 625                                 if t is fe
 626                                         break
 627                                 if el_is_special t
 628                                         fb = t
 629                                         fb_of_open_els = i
 630                                         # and continue, to see if there's one that's more "topmost"
 631                         # 10. If there is no furthest block, then the UA must first pop all
 632                         # the nodes from the bottom of the stack of open elements, from the
 633                         # current node up to and including formatting element, then remove
 634                         # formatting element from the list of active formatting elements,
 635                         # and finally abort these steps.
 636                         if fb is null
 637                                 loop
 638                                         t = open_els.shift()
 639                                         if t is fe
 640                                                 afe.splice fe_of_afe, 1
 641                                                 return
 642                         # 11. Let common ancestor be the element immediately above
 643                         # formatting element in the stack of open elements.
 644                         ca = open_els[fe_of_open_els + 1] # common ancestor
 645
 646                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
 647                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
 648                         bookmark = new_aaa_bookmark()
 649                         for t, i in afe
 650                                 if t is fe
 651                                         afe.splice i, 0, bookmark
 652                                         break
 653                         node = last_node = fb
 654                         inner = 0
 655                         loop
 656                                 inner += 1
 657                                 # 3. Let node be the element immediately above node in the
 658                                 # stack of open elements, or if node is no longer in the stack
 659                                 # of open elements (e.g. because it got removed by this
 660                                 # algorithm), the element that was immediately above node in
 661                                 # the stack of open elements before node was removed.
 662                                 node_next = null
 663                                 for t, i in open_els
 664                                         if t is node
 665                                                 node_next = open_els[i + 1]
 666                                                 break
 667                                 node = node_next ? node_above
 668                                 debug_log "inner loop #{inner}"
 669                                 debug_log "open_els: #{serialize_els open_els, true, true}"
 670                                 debug_log "tree: #{serialize_els tree.children, false, true}"
 671                                 debug_log "afe: #{serialize_els afe, true, true}"
 672                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 673                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 674                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 675                                 debug_log "node: #{node.serialize true, true}"
 676                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
 677
 678                                 # 4. If node is formatting element, then go to the next step in
 679                                 # the overall algorithm.
 680                                 if node is fe
 681                                         break
 682                                 debug_log "the meat"
 683                                 # 5. If inner loop counter is greater than three and node is in
 684                                 # the list of active formatting elements, then remove node from
 685                                 # the list of active formatting elements.
 686                                 node_in_afe = false
 687                                 for t, i in afe
 688                                         if t is node
 689                                                 if inner > 3
 690                                                         afe.splice i, 1
 691                                                         debug_log "max out inner"
 692                                                 else
 693                                                         node_in_afe = true
 694                                                         debug_log "in afe"
 695                                                 break
 696                                 # 6. If node is not in the list of active formatting elements,
 697                                 # then remove node from the stack of open elements and then go
 698                                 # back to the step labeled inner loop.
 699                                 unless node_in_afe
 700                                         debug_log "not in afe"
 701                                         for t, i in open_els
 702                                                 if t is node
 703                                                         node_above = open_els[i + 1]
 704                                                         open_els.splice i, 1
 705                                                         break
 706                                         continue
 707                                 debug_log "the bones"
 708                                 # 7. create an element for the token for which the element node
 709                                 # was created, in the HTML namespace, with common ancestor as
 710                                 # the intended parent; replace the entry for node in the list
 711                                 # of active formatting elements with an entry for the new
 712                                 # element, replace the entry for node in the stack of open
 713                                 # elements with an entry for the new element, and let node be
 714                                 # the new element.
 715                                 new_node = node.shallow_clone()
 716                                 for t, i in afe
 717                                         if t is node
 718                                                 afe[i] = new_node
 719                                                 debug_log "replaced in afe"
 720                                                 break
 721                                 for t, i in open_els
 722                                         if t is node
 723                                                 node_above = open_els[i + 1]
 724                                                 open_els[i] = new_node
 725                                                 debug_log "replaced in open_els"
 726                                                 break
 727                                 node = new_node
 728                                 # 8. If last node is furthest block, then move the
 729                                 # aforementioned bookmark to be immediately after the new node
 730                                 # in the list of active formatting elements.
 731                                 if last_node is fb
 732                                         for t, i in afe
 733                                                 if t is bookmark
 734                                                         afe.splice i, 1
 735                                                         debug_log "removed bookmark"
 736                                                         break
 737                                         for t, i in afe
 738                                                 if t is node
 739                                                         # "after" means lower
 740                                                         afe.splice i, 0, bookmark # "after as <-
 741                                                         debug_log "placed bookmark after node"
 742                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
 743                                                         break
 744                                 # 9. Insert last node into node, first removing it from its
 745                                 # previous parent node if any.
 746                                 if last_node.parent?
 747                                         debug_log "last_node has parent"
 748                                         for c, i in last_node.parent.children
 749                                                 if c is last_node
 750                                                         debug_log "removing last_node from parent"
 751                                                         last_node.parent.children.splice i, 1
 752                                                         break
 753                                 node.children.push last_node
 754                                 last_node.parent = node
 755                                 # 10. Let last node be node.
 756                                 last_node = node
 757                                 debug_log "at last"
 758                                 # 11. Return to the step labeled inner loop.
 759                         # 14. Insert whatever last node ended up being in the previous step
 760                         # at the appropriate place for inserting a node, but using common
 761                         # ancestor as the override target.
 762
 763                         # JASON: In the case where fe is immediately followed by fb:
 764                         #   * inner loop exits out early (node==fe)
 765                         #   * last_node is fb
 766                         #   * last_node is still in the tree (not a duplicate)
 767                         if last_node.parent?
 768                                 debug_log "FEFIRST? last_node has parent"
 769                                 for c, i in last_node.parent.children
 770                                         if c is last_node
 771                                                 debug_log "removing last_node from parent"
 772                                                 last_node.parent.children.splice i, 1
 773                                                 break
 774
 775                         debug_log "after aaa inner loop"
 776                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 777                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 778                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 779                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 780                         debug_log "tree: #{serialize_els tree.children, false, true}"
 781
 782                         debug_log "insert"
 783
 784
 785                         # can't use standard insert token thing, because it's already in
 786                         # open_els and must stay at it's current position in open_els
 787                         dest = adjusted_insertion_location ca
 788                         dest[0].children.splice dest[1], 0, last_node
 789                         last_node.parent = dest[0]
 790
 791
 792                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 793                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 794                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 795                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 796                         debug_log "tree: #{serialize_els tree.children, false, true}"
 797
 798                         # 15. Create an element for the token for which formatting element
 799                         # was created, in the HTML namespace, with furthest block as the
 800                         # intended parent.
 801                         new_element = fe.shallow_clone() # FIXME intended parent thing
 802                         # 16. Take all of the child nodes of furthest block and append them
 803                         # to the element created in the last step.
 804                         while fb.children.length
 805                                 t = fb.children.shift()
 806                                 t.parent = new_element
 807                                 new_element.children.push t
 808                         # 17. Append that new element to furthest block.
 809                         new_element.parent = fb
 810                         fb.children.push new_element
 811                         # 18. Remove formatting element from the list of active formatting
 812                         # elements, and insert the new element into the list of active
 813                         # formatting elements at the position of the aforementioned
 814                         # bookmark.
 815                         for t, i in afe
 816                                 if t is fe
 817                                         afe.splice i, 1
 818                                         break
 819                         for t, i in afe
 820                                 if t is bookmark
 821                                         afe[i] = new_element
 822                                         break
 823                         # 19. Remove formatting element from the stack of open elements,
 824                         # and insert the new element into the stack of open elements
 825                         # immediately below the position of furthest block in that stack.
 826                         for t, i in open_els
 827                                 if t is fe
 828                                         open_els.splice i, 1
 829                                         break
 830                         for t, i in open_els
 831                                 if t is fb
 832                                         open_els.splice i, 0, new_element
 833                                         break
 834                         # 20. Jump back to the step labeled outer loop.
 835                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
 836                         debug_log "tree: #{serialize_els tree.children, false, true}"
 837                         debug_log "open_els: #{serialize_els open_els, true, true}"
 838                         debug_log "afe: #{serialize_els afe, true, true}"
 839                 debug_log "AAA DONE"
 840
 841         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 842         # FIXME test this (particularly emplied end tags)
 843         close_p_element = ->
 844                 generate_implied_end_tags 'p' # arg is exception
 845                 if open_els[0].name isnt 'p'
 846                         parse_error()
 847                 while open_els.length > 1 # just in case
 848                         t = open_els.shift()
 849                         if t.name is 'p'
 850                                 return
 851         close_p_if_in_button_scope = ->
 852                 if is_in_button_scope 'p'
 853                         close_a_p_element()
 854
 855         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 856         tree_insert_text = (t) ->
 857                 dest = adjusted_insertion_location()
 858                 if dest[1] > 0
 859                         prev = dest[0].children[dest[1] - 1]
 860                         if prev.type is TYPE_TEXT
 861                                 prev.text += t.text
 862                                 return
 863                 dest[0].children.splice dest[1], 0, t
 864
 865         # 8.2.5.1
 866         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
 867         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
 868         adjusted_insertion_location = (override_target = null) ->
 869                 # 1. If there was an override target specified, then let target be the
 870                 # override target.
 871                 if override_target?
 872                         target = override_target
 873                 else # Otherwise, let target be the current node.
 874                         target = open_els[0]
 875                 # 2. Determine the adjusted insertion location using the first matching
 876                 # steps from the following list:
 877                 #
 878                 # If foster parenting is enabled and target is a table, tbody, tfoot,
 879                 # thead, or tr element Foster parenting happens when content is
 880                 # misnested in tables.
 881                 if flag_foster_parenting and foster_parenting_targets[target.name]
 882                         loop # once. this is here so we can ``break`` to "abort these substeps"
 883                                 # 1. Let last template be the last template element in the
 884                                 # stack of open elements, if any.
 885                                 last_template = null
 886                                 last_template_i = null
 887                                 for el, i in open_els
 888                                         if el.name is 'template'
 889                                                 last_template = el
 890                                                 last_template_i = i
 891                                                 break
 892                                 # 2. Let last table be the last table element in the stack of
 893                                 # open elements, if any.
 894                                 last_table = null
 895                                 last_table_i
 896                                 for el, i in open_els
 897                                         if el.name is 'table'
 898                                                 last_table = el
 899                                                 last_table_i = i
 900                                                 break
 901                                 # 3. If there is a last template and either there is no last
 902                                 # table, or there is one, but last template is lower (more
 903                                 # recently added) than last table in the stack of open
 904                                 # elements, then: let adjusted insertion location be inside
 905                                 # last template's template contents, after its last child (if
 906                                 # any), and abort these substeps.
 907                                 if last_template and (last_table is null or last_template_i < last_table_i)
 908                                         target = template # fixfull should be it's contents
 909                                         target_i = target.children.length
 910                                         break
 911                                 # 4. If there is no last table, then let adjusted insertion
 912                                 # location be inside the first element in the stack of open
 913                                 # elements (the html element), after its last child (if any),
 914                                 # and abort these substeps. (fragment case)
 915                                 if last_table is null
 916                                         # this is odd
 917                                         target = open_els[open_els.length - 1]
 918                                         target_i = target.children.length
 919                                 # 5. If last table has a parent element, then let adjusted
 920                                 # insertion location be inside last table's parent element,
 921                                 # immediately before last table, and abort these substeps.
 922                                 if last_table.parent?
 923                                         for c, i in last_table.parent.children
 924                                                 if c is last_table
 925                                                         target = last_table.parent
 926                                                         target_i = i
 927                                                         break
 928                                         break
 929                                 # 6. Let previous element be the element immediately above last
 930                                 # table in the stack of open elements.
 931                                 #
 932                                 # huh? how could it not have a parent?
 933                                 previous_element = open_els[last_table_i + 1]
 934                                 # 7. Let adjusted insertion location be inside previous
 935                                 # element, after its last child (if any).
 936                                 target = previous_element
 937                                 target_i = target.children.length
 938                                 # Note: These steps are involved in part because it's possible
 939                                 # for elements, the table element in this case in particular,
 940                                 # to have been moved by a script around in the DOM, or indeed
 941                                 # removed from the DOM entirely, after the element was inserted
 942                                 # by the parser.
 943                                 break # don't really loop
 944                 else
 945                         # Otherwise Let adjusted insertion location be inside target, after
 946                         # its last child (if any).
 947                         target_i = target.children.length
 948
 949                 # 3. If the adjusted insertion location is inside a template element,
 950                 # let it instead be inside the template element's template contents,
 951                 # after its last child (if any).
 952                 # fixfull (template)
 953
 954                 # 4. Return the adjusted insertion location.
 955                 return [target, target_i]
 956
 957         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
 958         # aka create_an_element_for_token
 959         token_to_element = (t, namespace, intended_parent) ->
 960                 t.type = TYPE_TAG # not TYPE_START_TAG
 961                 # convert attributes into a hash
 962                 attrs = {}
 963                 while t.attrs_a.length
 964                         a = t.attrs_a.pop()
 965                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
 966                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
 967
 968                 # TODO 2. If the newly created element has an xmlns attribute in the
 969                 # XMLNS namespace whose value is not exactly the same as the element's
 970                 # namespace, that is a parse error. Similarly, if the newly created
 971                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
 972                 # value is not the XLink Namespace, that is a parse error.
 973
 974                 # fixfull: the spec says stuff about form pointers and ownerDocument
 975
 976                 return el
 977
 978         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
 979         insert_foreign_element = (token, namespace) ->
 980                 ail = adjusted_insertion_location()
 981                 ail_el = ail[0]
 982                 ail_i = ail[1]
 983                 el = token_to_element token, namespace, ail_el
 984                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
 985                 el.parent = ail_el
 986                 ail_el.children.splice ail_i, 0, el
 987                 open_els.unshift el
 988                 return el
 989         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
 990         insert_html_element = insert_foreign_element # (token, namespace) ->
 991
 992         # FIXME read implement "foster parenting" part
 993         # FIXME read spec, do this right
 994         # FIXME implement the override target thing
 995         # note: this assumes it's an open tag
 996         # FIXME what part of the spec is this?
 997         # TODO look through all callers of this, and see what they should really be doing.
 998         #   eg probably insert_html_element for tokens
 999         tree_insert_element = (el, override_target = null, namespace = null) ->
1000                 if namespace?
1001                         el.namespace = namespace
1002                 dest = adjusted_insertion_location override_target
1003                 if el.type is TYPE_START_TAG # means it's a "token"
1004                         el = token_to_element el, namespace, dest[0]
1005                 unless el.namespace?
1006                         namespace = dest.namespace
1007                 # fixfull: Document nodes sometimes can't accept more chidren
1008                 dest[0].children.splice dest[1], 0, el
1009                 el.parent = dest[0]
1010                 open_els.unshift el
1011                 return el
1012
1013         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1014         # position should be [node, index_within_children]
1015         tree_insert_a_comment = (t, position = null) ->
1016                 position ?= adjusted_insertion_location()
1017                 position[0].children.splice position[1], 0, t
1018
1019         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1020         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1021         generate_implied_end_tags = (except = null) ->
1022                 while end_tag_implied[open_els[0]] and open_els[0].name isnt except
1023                         open_els.shift()
1024
1025         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1026         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1027                 for node, i in open_els
1028                         if node.name is name # FIXME check namespace too
1029                                 generate_implied_end_tags name # arg is exception
1030                                 parse_error() unless i is 0
1031                                 while i >= 0
1032                                         open_els.shift()
1033                                         i -= 1
1034                                 return
1035                         if special_elements[node.name]? # FIXME check namespac too
1036                                 parse_error()
1037                                 return
1038         ins_mode_in_body = (t) ->
1039                 switch t.type
1040                         when TYPE_TEXT
1041                                 switch t.text
1042                                         when "\u0000"
1043                                                 parse_error()
1044                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1045                                                 reconstruct_active_formatting_elements()
1046                                                 tree_insert_text t
1047                                         else
1048                                                 reconstruct_active_formatting_elements()
1049                                                 tree_insert_text t
1050                                                 flag_frameset_ok = false
1051                         when TYPE_COMMENT
1052                                 tree_insert_a_comment t
1053                         when TYPE_DOCTYPE
1054                                 parse_error()
1055                         when TYPE_START_TAG
1056                                 switch t.name
1057                                         when 'html'
1058                                                 parse_error()
1059                                                 return if template_tag_is_open()
1060                                                 root_attrs = open_els[open_els.length - 1].attrs
1061                                                 for k, v of t.attrs
1062                                                         root_attrs[k] = v unless root_attrs[k]?
1063                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1064                                                 # FIXME also do this for </template> (end tag)
1065                                                 return tree_in_head t
1066                                         when 'body'
1067                                                 parse_error()
1068                                                 # TODO
1069                                         when 'frameset'
1070                                                 parse_error()
1071                                                 # TODO
1072                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1073                                                 close_p_if_in_button_scope()
1074                                                 insert_html_element t
1075                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1076                                                 close_p_if_in_button_scope()
1077                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1078                                                         parse_error()
1079                                                         open_els.shift()
1080                                                 insert_html_element t
1081                                         # TODO lots more to implement here
1082                                         when 'a'
1083                                                 # If the list of active formatting elements
1084                                                 # contains an a element between the end of the list and
1085                                                 # the last marker on the list (or the start of the list
1086                                                 # if there is no marker on the list), then this is a
1087                                                 # parse error; run the adoption agency algorithm for
1088                                                 # the tag name "a", then remove that element from the
1089                                                 # list of active formatting elements and the stack of
1090                                                 # open elements if the adoption agency algorithm didn't
1091                                                 # already remove it (it might not have if the element
1092                                                 # is not in table scope).
1093                                                 found = false
1094                                                 for el in afe
1095                                                         if el.type is TYPE_AFE_MARKER
1096                                                                 break
1097                                                         if el.name is 'a'
1098                                                                 found = el
1099                                                 if found?
1100                                                         parse_error()
1101                                                         adoption_agency 'a'
1102                                                         for el, i in afe
1103                                                                 if el is found
1104                                                                         afe.splice i, 1
1105                                                         for el, i in open_els
1106                                                                 if el is found
1107                                                                         open_els.splice i, 1
1108                                                 reconstruct_active_formatting_elements()
1109                                                 el = tree_insert_element t
1110                                                 afe.unshift el
1111                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1112                                                 reconstruct_active_formatting_elements()
1113                                                 el = tree_insert_element t
1114                                                 afe.unshift el
1115                                         when 'table'
1116                                                 # fixfull quirksmode thing
1117                                                 close_p_if_in_button_scope()
1118                                                 insert_html_element t
1119                                                 insertion_mode = ins_mode_in_table
1120                                         # TODO lots more to implement here
1121                                         else # any other start tag
1122                                                 reconstruct_active_formatting_elements()
1123                                                 tree_insert_element t
1124                         when TYPE_EOF
1125                                 ok_tags = {
1126                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1127                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1128                                 }
1129                                 for t in open_els
1130                                         unless ok_tags[t.name]?
1131                                                 parse_error()
1132                                                 break
1133                                 # TODO stack of template insertion modes thing
1134                                 flag_parsing = false # stop parsing
1135                         when TYPE_END_TAG
1136                                 switch t.name
1137                                         when 'body'
1138                                                 unless is_in_scope 'body'
1139                                                         parse_error()
1140                                                         return
1141                                                 # TODO implement parse error and move to tree_after_body
1142                                         when 'html'
1143                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1144                                                         parse_error()
1145                                                         return
1146                                                 # TODO implement parse error and move to tree_after_body, reprocess
1147                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1148                                                 unless is_in_scope t.name, NS_HTML
1149                                                         parse_error()
1150                                                         return
1151                                                 generate_implied_end_tags()
1152                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1153                                                         parse_error()
1154                                                 loop
1155                                                         el = open_els.shift()
1156                                                         if el.name is t.name and el.namespace is NS_HTML
1157                                                                 return
1158                                         # TODO lots more close tags to implement here
1159                                         when 'p'
1160                                                 unless is_in_button_scope 'p'
1161                                                         parse_error()
1162                                                         insert_html_element new_open_tag 'p'
1163                                                         close_p_element()
1164                                         # TODO lots more close tags to implement here
1165                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1166                                                 adoption_agency t.name
1167                                         # TODO lots more close tags to implement here
1168                                         else
1169                                                 in_body_any_other_end_tag t.name
1170                 return
1171
1172         ins_mode_in_table_else = (t) ->
1173                 parse_error()
1174                 flag_foster_parenting = true # FIXME
1175                 ins_mode_in_body t
1176                 flag_foster_parenting = false
1177         can_in_table = {
1178                 'table': true
1179                 'tbody': true
1180                 'tfoot': true
1181                 'thead': true
1182                 'tr': true
1183         }
1184         clear_to_table_stopers = {
1185                 'table': true
1186                 'template': true
1187                 'html': true
1188         }
1189         clear_stack_to_table_context = ->
1190                 loop
1191                         if clear_to_table_stopers[open_els[0].name]?
1192                                 break
1193                         open_els.shift()
1194                 return
1195         clear_to_table_body_stopers = {
1196                 'tbody': true
1197                 'tfoot': true
1198                 'thead': true
1199                 'template': true
1200                 'html': true
1201         }
1202         clear_stack_to_table_body_context = ->
1203                 loop
1204                         if clear_to_table_body_stopers[open_els[0].name]?
1205                                 break
1206                         open_els.shift()
1207                 return
1208         clear_to_table_row_stopers = {
1209                 'tr': true
1210                 'template': true
1211                 'html': true
1212         }
1213         clear_stack_to_table_row_context = ->
1214                 loop
1215                         if clear_to_table_row_stopers[open_els[0].name]?
1216                                 break
1217                         open_els.shift()
1218                 return
1219         clear_afe_to_marker = ->
1220                 loop
1221                         el = afe.shift()
1222                         if el.type is TYPE_AFE_MARKER
1223                                 return
1224         ins_mode_in_table = (t) ->
1225                 switch t.type
1226                         when TYPE_TEXT
1227                                 if can_in_table[t.name]
1228                                         original_insertion_mode = insertion_mode
1229                                         insertion_mode = ins_mode_in_table_text
1230                                         insertion_mode t
1231                                 else
1232                                         ins_mode_in_table_else t
1233                         when TYPE_COMMENT
1234                                 tree_insert_a_comment t
1235                         when TYPE_DOCTYPE
1236                                 parse_error()
1237                         when TYPE_START_TAG
1238                                 switch t.name
1239                                         when 'caption'
1240                                                 clear_stack_to_table_context()
1241                                                 afe.unshift new_afe_marker()
1242                                                 insert_html_element t
1243                                                 insertion_mode = ins_mode_in_caption
1244                                         when 'colgroup'
1245                                                 clear_stack_to_table_context()
1246                                                 insert_html_element t
1247                                                 insertion_mode = ins_mode_in_column_group
1248                                         when 'col'
1249                                                 clear_stack_to_table_context()
1250                                                 insert_html_element new_open_tag 'colgroup'
1251                                                 insertion_mode = ins_mode_in_column_group
1252                                                 insertion_mode t
1253                                         when 'tbody', 'tfoot', 'thead'
1254                                                 clear_stack_to_table_context()
1255                                                 insert_html_element t
1256                                                 insertion_mode = ins_mode_in_table_body
1257                                         when 'td', 'th', 'tr'
1258                                                 clear_stack_to_table_context()
1259                                                 insert_html_element new_open_tag 'tbody'
1260                                                 insertion_mode = ins_mode_in_table_body
1261                                                 insertion_mode t
1262                                         when 'table'
1263                                                 parse_error()
1264                                                 if is_in_table_scope 'table'
1265                                                         loop
1266                                                                 el = open_els.shift()
1267                                                                 if el.name is 'table'
1268                                                                         break
1269                                                         reset_insertion_mode()
1270                                                         insertion_mode t
1271                                         when 'style', 'script', 'template'
1272                                                 ins_mode_in_head t
1273                                         when 'input'
1274                                                 if token_is_input_hidden t
1275                                                         ins_mode_in_table_else t
1276                                                 else
1277                                                         parse_error()
1278                                                         insert_html_element t
1279                                                         open_els.shift()
1280                                                         # fixfull acknowledge sef-closing flag
1281                                         when 'form'
1282                                                 parse_error()
1283                                                 if form_element_pointer?
1284                                                         return
1285                                                 if template_tag_is_open()
1286                                                         return
1287                                                 form_element_pointer = insert_html_element t
1288                                                 open_els.shift()
1289                                         else
1290                                                 ins_mode_in_table_else t
1291                         when TYPE_END_TAG
1292                                 switch t.name
1293                                         when 'table'
1294                                                 if is_in_table_scope 'table'
1295                                                         loop
1296                                                                 el = open_els.shift()
1297                                                                 if el.name is 'table'
1298                                                                         break
1299                                                         reset_insertion_mode()
1300                                                 else
1301                                                         parse_error
1302                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1303                                                 parse_error()
1304                                         when 'template'
1305                                                 ins_mode_in_head t
1306                                         else
1307                                                 ins_mode_in_table_else t
1308                         when TYPE_EOF
1309                                 ins_mode_in_body t
1310                         else
1311                                 ins_mode_in_table_else t
1312
1313
1314         ins_mode_in_table_text = (t) ->
1315                 switch t.type
1316                         when TYPE_TEXT
1317                                 switch t.text
1318                                         when "\u0000"
1319                                                 parse_error()
1320                                                 return
1321                 console.log "unimplemented ins_mode_in_table_text"
1322                 # FIXME CONTINUE
1323
1324         ins_mode_in_table_body = (t) ->
1325                 if t.type is TYPE_START_TAG and t.name is 'tr'
1326                         clear_stack_to_table_body_context()
1327                         insert_html_element t
1328                         return
1329                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1330                         parse_error()
1331                         clear_stack_to_table_body_context()
1332                         insert_html_element new_open_tag 'tr'
1333                         insertion_mode = ins_mode_in_row
1334                         insertion_mode t
1335                         return
1336                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1337                         unless is_in_table_scope t.name # fixfull check namespace
1338                                 parse_error()
1339                                 return
1340                         clear_stack_to_table_body_context()
1341                         open_els.shift()
1342                         insertion_mode = ins_mode_in_table
1343                         return
1344                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1345                         has = false
1346                         for el in open_els
1347                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1348                                         has = true
1349                                         break
1350                                 if table_scopers[el.name]
1351                                         break
1352                         if !has
1353                                 parse_error()
1354                                 return
1355                         clear_stack_to_table_body_context()
1356                         open_els.shift()
1357                         insertion_mode = ins_mode_in_table
1358                         insertion_mode t
1359                         return
1360                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1361                         parse_error()
1362                         return
1363                 # Anything else
1364                 ins_mode_in_table t
1365
1366         ins_mode_in_row = (t) ->
1367                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1368                         clear_stack_to_table_row_context()
1369                         insert_html_element t
1370                         insertion_mode = ins_mode_in_cell
1371                         afe.unshift new_afe_marker()
1372                         return
1373                 if t.type is TYPE_END_TAG and t.name is 'tr'
1374                         if is_in_table_scope 'tr'
1375                                 clear_stack_to_table_row_context()
1376                                 open_els.shift()
1377                                 insertion_mode = ins_mode_in_table_body
1378                         else
1379                                 parse_error()
1380                         return
1381                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1382                         if is_in_table_scope 'tr'
1383                                 clear_stack_to_table_row_context()
1384                                 open_els.shift()
1385                                 insertion_mode = ins_mode_in_table_body
1386                                 insertion_mode t
1387                         else
1388                                 parse_error()
1389                         return
1390                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1391                         if is_in_table_scope t.name # fixfull namespace
1392                                 if is_in_table_scope 'tr'
1393                                         clear_stack_to_table_row_context()
1394                                         open_els.shift()
1395                                         insertion_mode = ins_mode_in_table_body
1396                                         insertion_mode t
1397                         else
1398                                 parse_error()
1399                         return
1400                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1401                         parse_error()
1402                         return
1403                 # Anything else
1404                 ins_mode_in_table t
1405
1406         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1407         close_the_cell = ->
1408                 generate_implied_end_tags()
1409                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1410                         parse_error()
1411                 loop
1412                         el = open_els.shift()
1413                         if el.name is 'td' or el.name is 'th'
1414                                 break
1415                 clear_afe_to_marker()
1416                 insertion_mode = ins_mode_in_row
1417
1418         # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1419         ins_mode_in_cell = (t) ->
1420                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1421                         if is_in_table_scope t.name
1422                                 generate_implied_end_tags()
1423                                 if open_els[0].name isnt t.name
1424                                         parse_error
1425                                 loop
1426                                         el = open_els.shift()
1427                                         if el.name is t.name
1428                                                 break
1429                                 clear_afe_to_marker()
1430                                 insertion_mode = ins_mode_in_row
1431                         else
1432                                 parse_error()
1433                         return
1434                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1435                         has = false
1436                         for el in open_els
1437                                 if el.name is 'td' or el.name is 'th'
1438                                         has = true
1439                                         break
1440                                 if table_scopers[el.name]
1441                                         break
1442                         if !has
1443                                 parse_error()
1444                                 return
1445                         close_the_cell()
1446                         insertion_mode t
1447                         return
1448                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1449                         parse_error()
1450                         return
1451                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1452                         if is_in_table_scope t.name # fixfull namespace
1453                                 close_the_cell()
1454                                 insertion_mode t
1455                         else
1456                                 parse_error()
1457                         return
1458                 # Anything Else
1459                 ins_mode_in_body t
1460
1461
1462         # the functions below implement the tokenizer stats described here:
1463         # http://www.w3.org/TR/html5/syntax.html#tokenization
1464
1465         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1466         tok_state_data = ->
1467                 switch c = txt.charAt(cur++)
1468                         when '&'
1469                                 return new_text_node tokenize_character_reference()
1470                         when '<'
1471                                 tok_state = tok_state_tag_open
1472                         when "\u0000"
1473                                 parse_error()
1474                                 return new_text_node c
1475                         when '' # EOF
1476                                 return new_eof_token()
1477                         else
1478                                 return new_text_node c
1479                 return null
1480
1481         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1482         # not needed: tok_state_character_reference_in_data = ->
1483         # just call tok_state_character_reference_in_data()
1484
1485         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1486         tok_state_tag_open = ->
1487                 switch c = txt.charAt(cur++)
1488                         when '!'
1489                                 tok_state = tok_state_markup_declaration_open
1490                         when '/'
1491                                 tok_state = tok_state_end_tag_open
1492                         when '?'
1493                                 parse_error()
1494                                 tok_state = tok_state_bogus_comment
1495                         else
1496                                 if lc_alpha.indexOf(c) > -1
1497                                         tok_cur_tag = new_open_tag c
1498                                         tok_state = tok_state_tag_name
1499                                 else if uc_alpha.indexOf(c) > -1
1500                                         tok_cur_tag = new_open_tag c.toLowerCase()
1501                                         tok_state = tok_state_tag_name
1502                                 else
1503                                         parse_error()
1504                                         tok_state = tok_state_data
1505                                         cur -= 1 # we didn't parse/handle the char after <
1506                                         return new_text_node '<'
1507                 return null
1508
1509         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1510         tok_state_end_tag_open = ->
1511                 switch c = txt.charAt(cur++)
1512                         when '>'
1513                                 parse_error()
1514                                 tok_state = tok_state_data
1515                         when '' # EOF
1516                                 parse_error()
1517                                 tok_state = tok_state_data
1518                                 return new_text_node '</'
1519                         else
1520                                 if uc_alpha.indexOf(c) > -1
1521                                         tok_cur_tag = new_end_tag c.toLowerCase()
1522                                         tok_state = tok_state_tag_name
1523                                 else if lc_alpha.indexOf(c) > -1
1524                                         tok_cur_tag = new_end_tag c
1525                                         tok_state = tok_state_tag_name
1526                                 else
1527                                         parse_error()
1528                                         tok_state = tok_state_bogus_comment
1529                 return null
1530
1531         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1532         tok_state_tag_name = ->
1533                 switch c = txt.charAt(cur++)
1534                         when "\t", "\n", "\u000c", ' '
1535                                 tok_state = tok_state_before_attribute_name
1536                         when '/'
1537                                 tok_state = tok_state_self_closing_start_tag
1538                         when '>'
1539                                 tok_state = tok_state_data
1540                                 tmp = tok_cur_tag
1541                                 tok_cur_tag = null
1542                                 return tmp
1543                         when "\u0000"
1544                                 parse_error()
1545                                 tok_cur_tag.name += "\ufffd"
1546                         when '' # EOF
1547                                 parse_error()
1548                                 tok_state = tok_state_data
1549                         else
1550                                 if uc_alpha.indexOf(c) > -1
1551                                         tok_cur_tag.name += c.toLowerCase()
1552                                 else
1553                                         tok_cur_tag.name += c
1554                 return null
1555
1556         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1557         tok_state_before_attribute_name = ->
1558                 attr_name = null
1559                 switch c = txt.charAt(cur++)
1560                         when "\t", "\n", "\u000c", ' '
1561                                 return null
1562                         when '/'
1563                                 tok_state = tok_state_self_closing_start_tag
1564                                 return null
1565                         when '>'
1566                                 tok_state = tok_state_data
1567                                 tmp = tok_cur_tag
1568                                 tok_cur_tag = null
1569                                 return tmp
1570                         when "\u0000"
1571                                 parse_error()
1572                                 attr_name = "\ufffd"
1573                         when '"', "'", '<', '='
1574                                 parse_error()
1575                                 attr_name = c
1576                         when '' # EOF
1577                                 parse_error()
1578                                 tok_state = tok_state_data
1579                         else
1580                                 if uc_alpha.indexOf(c) > -1
1581                                         attr_name = c.toLowerCase()
1582                                 else
1583                                         attr_name = c
1584                 if attr_name?
1585                         tok_cur_tag.attrs_a.unshift [attr_name, '']
1586                         tok_state = tok_state_attribute_name
1587                 return null
1588
1589         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1590         tok_state_attribute_name = ->
1591                 switch c = txt.charAt(cur++)
1592                         when "\t", "\n", "\u000c", ' '
1593                                 tok_state = tok_state_after_attribute_name
1594                         when '/'
1595                                 tok_state = tok_state_self_closing_start_tag
1596                         when '='
1597                                 tok_state = tok_state_before_attribute_value
1598                         when '>'
1599                                 tok_state = tok_state_data
1600                                 tmp = tok_cur_tag
1601                                 tok_cur_tag = null
1602                                 return tmp
1603                         when "\u0000"
1604                                 parse_error()
1605                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
1606                         when '"', "'", '<'
1607                                 parse_error()
1608                                 tok_cur_tag.attrs_a[0][0] = c
1609                         when '' # EOF
1610                                 parse_error()
1611                                 tok_state = tok_state_data
1612                         else
1613                                 if uc_alpha.indexOf(c) > -1
1614                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
1615                                 else
1616                                         tok_cur_tag.attrs_a[0][0] += c
1617                 return null
1618
1619         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
1620         tok_state_before_attribute_value = ->
1621                 switch c = txt.charAt(cur++)
1622                         when "\t", "\n", "\u000c", ' '
1623                                 return null
1624                         when '"'
1625                                 tok_state = tok_state_attribute_value_double_quoted
1626                         when '&'
1627                                 tok_state = tok_state_attribute_value_unquoted
1628                                 cur -= 1
1629                         when "'"
1630                                 tok_state = tok_state_attribute_value_single_quoted
1631                         when "\u0000"
1632                                 # Parse error
1633                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1634                                 tok_state = tok_state_attribute_value_unquoted
1635                         when '>'
1636                                 # Parse error
1637                                 tok_state = tok_state_data
1638                                 tmp = tok_cur_tag
1639                                 tok_cur_tag = null
1640                                 return tmp
1641                         when '' # EOF
1642                                 parse_error()
1643                                 tok_state = tok_state_data
1644                         else
1645                                 tok_cur_tag.attrs_a[0][1] += c
1646                                 tok_state = tok_state_attribute_value_unquoted
1647                 return null
1648
1649         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
1650         tok_state_attribute_value_double_quoted = ->
1651                 switch c = txt.charAt(cur++)
1652                         when '"'
1653                                 tok_state = tok_state_after_attribute_value_quoted
1654                         when '&'
1655                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
1656                         when "\u0000"
1657                                 # Parse error
1658                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1659                         when '' # EOF
1660                                 parse_error()
1661                                 tok_state = tok_state_data
1662                         else
1663                                 tok_cur_tag.attrs_a[0][1] += c
1664                 return null
1665
1666         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
1667         tok_state_attribute_value_single_quoted = ->
1668                 switch c = txt.charAt(cur++)
1669                         when "'"
1670                                 tok_state = tok_state_after_attribute_value_quoted
1671                         when '&'
1672                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
1673                         when "\u0000"
1674                                 # Parse error
1675                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1676                         when '' # EOF
1677                                 parse_error()
1678                                 tok_state = tok_state_data
1679                         else
1680                                 tok_cur_tag.attrs_a[0][1] += c
1681                 return null
1682
1683         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
1684         tok_state_attribute_value_unquoted = ->
1685                 switch c = txt.charAt(cur++)
1686                         when "\t", "\n", "\u000c", ' '
1687                                 tok_state = tok_state_before_attribute_name
1688                         when '&'
1689                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
1690                         when '>'
1691                                 tok_state = tok_state_data
1692                                 tmp = tok_cur_tag
1693                                 tok_cur_tag = null
1694                                 return tmp
1695                         when "\u0000"
1696                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1697                         when '' # EOF
1698                                 parse_error()
1699                                 tok_state = tok_state_data
1700                         else
1701                                 # Parse Error if ', <, = or ` (backtick)
1702                                 tok_cur_tag.attrs_a[0][1] += c
1703                 return null
1704
1705         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
1706         tok_state_after_attribute_value_quoted = ->
1707                 switch c = txt.charAt(cur++)
1708                         when "\t", "\n", "\u000c", ' '
1709                                 tok_state = tok_state_before_attribute_name
1710                         when '/'
1711                                 tok_state = tok_state_self_closing_start_tag
1712                         when '>'
1713                                 tok_state = tok_state_data
1714                                 tmp = tok_cur_tag
1715                                 tok_cur_tag = null
1716                                 return tmp
1717                         when '' # EOF
1718                                 parse_error()
1719                                 tok_state = tok_state_data
1720                         else
1721                                 # Parse Error
1722                                 tok_state = tok_state_before_attribute_name
1723                                 cur -= 1 # we didn't handle that char
1724                 return null
1725
1726         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1727         # Don't set this as a state, just call it
1728         # returns a string (NOT a text node)
1729         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1730                 if cur >= txt.length
1731                         return '&'
1732                 switch c = txt.charAt(cur)
1733                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1734                                 # explicitly not a parse error
1735                                 return '&'
1736                         when ';'
1737                                 # there has to be "one or more" alnums between & and ; to be a parse error
1738                                 return '&'
1739                         when '#'
1740                                 if cur + 1 >= txt.length
1741                                         return '&'
1742                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
1743                                         prefix = '#x'
1744                                         charset = hex_chars
1745                                         start = cur + 2
1746                                 else
1747                                         charset = digits
1748                                         start = cur + 1
1749                                         prefix = '#'
1750                                 i = 0
1751                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1752                                         i += 1
1753                                 if i is 0
1754                                         return '&'
1755                                 if txt.charAt(start + i) is ';'
1756                                         i += 1
1757                                 # FIXME This is supposed to generate parse errors for some chars
1758                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1759                                 if decoded?
1760                                         cur = start + i
1761                                         return decoded
1762                                 return '&'
1763                         else
1764                                 for i in [0...31]
1765                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
1766                                                 break
1767                                 if i is 0
1768                                         # exit early, because parse_error() below needs at least one alnum
1769                                         return '&'
1770                                 if txt.charAt(cur + i) is ';'
1771                                         i += 1 # include ';' terminator in value
1772                                         decoded = decode_named_char_ref txt.substr(cur, i)
1773                                         if decoded?
1774                                                 cur += i
1775                                                 return decoded
1776                                         parse_error()
1777                                         return '&'
1778                                 else
1779                                         # no ';' terminator (only legacy char refs)
1780                                         max = i
1781                                         for i in [2..max] # no prefix matches, so ok to check shortest first
1782                                                 c = legacy_char_refs[txt.substr(cur, i)]
1783                                                 if c?
1784                                                         if in_attr
1785                                                                 if txt.charAt(cur + i) is '='
1786                                                                         # "because some legacy user agents will
1787                                                                         # misinterpret the markup in those cases"
1788                                                                         parse_error()
1789                                                                         return '&'
1790                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
1791                                                                         # this makes attributes forgiving about url args
1792                                                                         return '&'
1793                                                         # ok, and besides the weird exceptions for attributes...
1794                                                         # return the matching char
1795                                                         cur += i # consume entity chars
1796                                                         parse_error() # because no terminating ";"
1797                                                         return c
1798                                         parse_error()
1799                                         return '&'
1800                 return # never reached
1801
1802         # tree constructor initialization
1803         # see comments on TYPE_TAG/etc for the structure of this data
1804         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
1805         open_els = [tree]
1806         insertion_mode = ins_mode_in_body
1807         flag_frameset_ok = true
1808         flag_parsing = true
1809         flag_foster_parenting = false
1810         form_element_pointer = null
1811         afe = [] # active formatting elements
1812
1813         # tokenizer initialization
1814         tok_state = tok_state_data
1815
1816         # proccess input
1817         while flag_parsing
1818                 t = tok_state()
1819                 if t?
1820                         insertion_mode t
1821         return tree.children
1822
1823 # everything below is tests on the above
1824 test_equals = (description, output, expected_output) ->
1825         if output is expected_output
1826                 console.log "passed." # don't say name, so smart consoles can merge all of these
1827         else
1828                 console.log "FAILED: \"#{description}\""
1829                 console.log "   Expected: #{expected_output}"
1830                 console.log "     Actual: #{output}"
1831 serialize_els = (els, shallow, show_ids) ->
1832         serialized = ''
1833         sep = ''
1834         for t in els
1835                 serialized += sep
1836                 sep = ','
1837                 serialized += t.serialize shallow, show_ids
1838         return serialized
1839 test_parser = (args) ->
1840         debug_log_reset()
1841         parse_errors = []
1842         errors_cb = (i) ->
1843                 parse_errors.push i
1844         prev_node_id = 0 # reset counter
1845         parsed = parse_html args.html, errors_cb
1846         serialized = serialize_els parsed, false, false
1847         if serialized isnt args.expected # or parse_errors.length isnt args.errors
1848                 debug_log_each (str) ->
1849                         console.log str
1850                 console.log "FAILED: \"#{args.name}\""
1851         else
1852                 console.log "passed \"#{args.name}\""
1853         if serialized isnt args.expected
1854                 console.log "      Input: #{args.html}"
1855                 console.log "    Correct: #{args.expected}"
1856                 console.log "     Output: #{serialized}"
1857                 if parse_errors.length isnt args.errors
1858                         console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
1859
1860 test_parser name: "empty", \
1861         html: "",
1862         expected: '',
1863         errors: 0
1864 test_parser name: "just text", \
1865         html: "abc",
1866         expected: 'text:"abc"',
1867         errors: 0
1868 test_parser name: "named entity", \
1869         html: "a&amp;1234",
1870         expected: 'text:"a&1234"',
1871         errors: 0
1872 test_parser name: "broken named character references", \
1873         html: "1&amp2&&amp;3&aabbcc;",
1874         expected: 'text:"1&2&&3&aabbcc;"',
1875         errors: 2
1876 test_parser name: "numbered entity overrides", \
1877         html: "1&#X80&#x80; &#x83",
1878         expected: 'text:"1€€ ƒ"',
1879         errors: 0
1880 test_parser name: "open tag", \
1881         html: "foo<span>bar",
1882         expected: 'text:"foo",tag:"span",{},[text:"bar"]',
1883         errors: 1 # no close tag
1884 test_parser name: "open tag with attributes", \
1885         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1886         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
1887         errors: 1 # no close tag
1888 test_parser name: "open tag with attributes of various quotings", \
1889         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1890         expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
1891         errors: 1 # no close tag
1892 test_parser name: "attribute entity exceptions dq", \
1893         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
1894         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
1895         errors: 2 # no close tag, &amp= in attr
1896 test_parser name: "attribute entity exceptions sq", \
1897         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
1898         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
1899         errors: 2 # no close tag, &amp= in attr
1900 test_parser name: "attribute entity exceptions uq", \
1901         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
1902         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
1903         errors: 2 # no close tag, &amp= in attr
1904 test_parser name: "matching closing tags", \
1905         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1906         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
1907         errors: 0
1908 test_parser name: "missing closing tag inside", \
1909         html: "foo<div>bar<span>baz</div>qux",
1910         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
1911         errors: 1 # close tag mismatch
1912 test_parser name: "mis-matched closing tags", \
1913         html: "<span>12<div>34</span>56</div>78",
1914         expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
1915         errors: 2 # misplaced </span>, no </span> at the end
1916 test_parser name: "mis-matched formatting elements", \
1917         html: "12<b>34<i>56</b>78</i>90",
1918         expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
1919         errors: 1 # no idea how many their should be
1920 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
1921         html: '<p>1<b>2<i>3</b>4</i>5</p>',
1922         expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]',
1923         errors: 1
1924 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
1925         html: '<b>1<p>2</b>3</p>',
1926         expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]',
1927         errors: 1
1928 test_parser name: "crazy formatting elements test", \
1929         html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1930         # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1931         # firefox does this:
1932         expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
1933         errors: 6 # no idea how many there should be
1934 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
1935 test_parser name: "html5lib aaa 1", \
1936         html: '<a><p></a></p>',
1937         expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
1938         errors: 2
1939 test_parser name: "html5lib aaa 2", \
1940         html: '<a>1<p>2</a>3</p>',
1941         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
1942         errors: 2
1943 test_parser name: "html5lib aaa 3", \
1944         html: '<a>1<button>2</a>3</button>',
1945         expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
1946         errors: 2
1947 test_parser name: "html5lib aaa 4", \
1948         html: '<a>1<b>2</a>3</b>',
1949         expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
1950         errors: 2
1951 test_parser name: "html5lib aaa 5 (two divs deep)", \
1952         html: '<a>1<div>2<div>3</a>4</div>5</div>',
1953         expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]',
1954         errors: 3
1955 test_parser name: "html5lib aaa 6 (foster parenting)", \
1956         html: '<table><a>1<p>2</a>3</p>',
1957         expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]',
1958         errors: 10
1959 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
1960         html: '<table><a>1<td>2</td>3</table>',
1961         expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]',
1962         errors: 10