parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
  22 # indicate places where additional code is needed for full HTML document
  23 # parsing.
  24 #
  25 # Instead, the data structure produced by this parser is an array of nodes.
  26 #
  27 # Each node is an object of the Node class. Here are the Node types:
  28 TYPE_TAG = 0 # name, {attributes}, [children]
  29 TYPE_TEXT = 1 # "text"
  30 TYPE_COMMENT = 2 # comment body is kept in the node's text field (see Node.serialize)
  31 TYPE_DOCTYPE = 3 # Node.serialize emits only the word 'doctype' for these so far
  32 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
  33 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  34 TYPE_END_TAG = 5 # name
  35 TYPE_EOF = 6
  36 TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  37 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  38
  39 # namespace constants
     # (stored in Node's namespace field; the Node constructor defaults to NS_HTML)
  40 NS_HTML = 1
  41 NS_MATHML = 2
  42 NS_SVG = 3
  43
  44 class Node
             # A tokenizer token or a tree node; which one is determined by @type.
             #
             # constructor args (all optional, passed as one object):
             #   name, text, attrs, attrs_a, children, namespace, parent
             # Missing values fall back to '' / {} / [] / NS_HTML / null as shown below.
  45         constructor: (type, args = {}) ->
  46                 @type = type # one of the TYPE_* constants above
  47                 @name = args.name ? '' # tag name
  48                 @text = args.text ? '' # contents for text/comment nodes
  49                 @attrs = args.attrs ? {}
                     # attrs in progress, TYPE_OPEN_TAG only.
                     # The field is called attrs_a but was only read from the key "attr_k",
                     # so a caller passing attrs_a was silently ignored. Accept the matching
                     # key first and keep the old key as a fallback.
  50                 @attrs_a = args.attrs_a ? args.attr_k ? []
  51                 @children = args.children ? []
  52                 @namespace = args.namespace ? NS_HTML
  53                 @parent = args.parent ? null
  54         shallow_clone: -> # return a new node that's the same except without the children or parent
  55                 # WARNING this doesn't work right on open tags that are still being parsed
                     # (attrs_a is not copied; only the finished attrs map is)
  56                 attrs = {}
  57                 attrs[k] = v for k, v of @attrs
  58                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
  59         serialize: -> # for unit tests
                     # Returns a compact string form of this node; TYPE_TAG nodes
                     # recurse into their children.
  60                 ret = ''
  61                 switch @type
  62                         when TYPE_TAG
  63                                 ret += 'tag:'
  64                                 ret += JSON.stringify @name
  65                                 ret += ','
  66                                 ret += JSON.stringify @attrs
  67                                 ret += ',['
  68                                 sep = ''
  69                                 for c in @children
  70                                         ret += sep
  71                                         sep = ','
  72                                         ret += c.serialize()
  73                                 ret += ']'
  74                         when TYPE_TEXT
  75                                 ret += 'text:'
  76                                 ret += JSON.stringify @text
  77                         when TYPE_COMMENT
  78                                 ret += 'comment:'
  79                                 ret += JSON.stringify @text
  80                         when TYPE_DOCTYPE
  81                                 ret += 'doctype'
  82                                 # FIXME
  83                         else
                                     # token-only types (open/end tag, EOF, marker, bookmark) land here
  84                                 ret += 'unknown:'
  85                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
  86                 return ret
  87
  88 # helpers: (only take args that are normally known when parser creates nodes)
  89 new_open_tag = (name) ->
             # Wrap a tag name in a fresh TYPE_OPEN_TAG token.
  90         new Node(TYPE_OPEN_TAG, {name: name})
  91 new_end_tag = (name) ->
             # Wrap a tag name in a fresh TYPE_END_TAG token.
  92         new Node(TYPE_END_TAG, {name: name})
  93 new_text_node = (txt) ->
             # Create a TYPE_TEXT node whose text field is txt.
  94         new Node(TYPE_TEXT, {text: txt})
  95 new_comment_node = (txt) ->
             # Create a TYPE_COMMENT node whose text field is txt.
  96         new Node(TYPE_COMMENT, {text: txt})
  97 new_eof_token = ->
             # Create the TYPE_EOF token (all other Node fields keep their defaults).
  98         new Node(TYPE_EOF)
  99 new_aaa_bookmark = ->
             # Create a TYPE_AAA_BOOKMARK placeholder, as used by adoption_agency().
 100         new Node(TYPE_AAA_BOOKMARK)
 101
 102 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
     # Both alphabets previously ended in "...wxqz" / "...WXQZ": 'y'/'Y' were
     # missing and 'q'/'Q' appeared twice, so every character class derived
     # from them (alnum, tag_name_chars) lacked 'y'.
 103 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 104 digits = "0123456789"
 105 alnum = lc_alpha + uc_alpha + digits
 106 hex_chars = digits + "abcdefABCDEF"
 107
 108 # some SVG elements have dashes in them
 109 tag_name_chars = alnum + "-"
 110
 111 # http://www.w3.org/TR/html5/infrastructure.html#space-character
     # (tab, line feed, form feed, carriage return, space)
 112 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 113
 114 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
     # Superset of space_chars: also contains \u000b, \u0085, \u00a0 and the
     # other Unicode whitespace code points listed below.
 115 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 116
 117 # These are the character references that don't need a terminating semicolon
 118 # min length: 2, max: 6, none are a prefix of any other.
 119 legacy_char_refs = {
             # Maps a reference name (without "&" or ";") to its replacement text.
             # Characters that are invisible in an editor (nbsp, shy) are written as
             # \u escapes so they cannot be lost or mangled by copy/paste.
 120         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 121         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 122         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 123         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 124         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 125         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 126         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 127         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 128         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 129         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 130         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 131         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 132         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 133         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 134         shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 135         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 136         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 137         yen: '¥', yuml: 'ÿ'
 138 }
 139
 140 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
     # Tag-name lists grouped by category (the category is taken from the
     # variable name). None of the three lists is referenced in the portion of
     # the file visible here — presumably the tokenizer/tree builder further
     # down consults them; verify against their users.
 141 raw_text_elements = ['script', 'style']
 142 escapable_raw_text_elements = ['textarea', 'title']
 143 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 144 svg_elements = [
             # Element names are case-sensitive as written (e.g. 'clipPath').
             # Names such as 'a', 'script', 'style' and 'title' also exist in HTML;
             # the list itself carries no namespace information.
 145         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 146         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 147         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 148         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 149         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 150         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 151         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 152         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 153         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 154         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 155         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 156         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 157         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 158         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 159         'view', 'vkern'
 160 ]
 161
 162 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 163 mathml_elements = [
             # Flat list of MathML element names in roughly alphabetical order.
             # NOTE(review): 'mi' is listed twice below; harmless for membership
             # tests, but confirm no other name was intended for the second entry.
 164         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 165         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 166         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 167         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 168         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 169         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 170         'determinant', 'diff', 'divergence', 'divide', 'domain',
 171         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 172         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 173         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 174         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 175         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 176         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 177         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 178         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 179         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 180         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 181         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 182         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 183         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 184         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 185         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 186         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 187         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 188         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 189         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 190         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 191         'vectorproduct', 'xor'
 192 ]
 193 # foreign_elements = [svg_elements..., mathml_elements...]
 194 #normal_elements = All other allowed HTML elements are normal elements.
 195
 196 special_elements = {
             # Maps a tag name to the single namespace (NS_* constant) in which that
             # name counts as "special"; el_is_special() reads this table.
 197         # HTML:
 198         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 199         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 200         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 201         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 202         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 203         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 204         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 205         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 206         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 207         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 208         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
 209         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
 210         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
 211         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
 212         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
 213         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
 214         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
 215         wbr:NS_HTML, xmp:NS_HTML,
 216
 217         # MathML:
 218         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 219         'annotation-xml':NS_MATHML,
 220
 221         # SVG:
             # NOTE(review): `title` is also listed under HTML above. One key can
             # hold only one value, so the two entries cannot both take effect —
             # confirm which one the compiler keeps and whether a name -> set of
             # namespaces layout is needed.
 222         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 223 }
 224
 225 formatting_elements = {
              # Set of tag names (name -> true). The table is not referenced in the
              # portion of the file visible here — presumably the tree builder uses
              # it to decide which tags go on the active formatting elements list;
              # verify against its users.
 226          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 227          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 228          u: true
 229 }
 230
 231 el_is_special = (e) ->
             # Returns true when node `e` (needs .name and .namespace) is in the
             # "special" category, i.e. its tag name is registered in
             # special_elements for the namespace the node lives in.
             #
             # The table must be indexed by the tag name; indexing it with the node
             # object itself (as was done before) always yields undefined, so no
             # element was ever reported as special.
             #
             # special_elements lists `title` for both HTML and SVG, but a plain
             # object can hold only one value per key, so HTML <title> is checked
             # explicitly here.
 232         return true if e.name is 'title' and e.namespace is NS_HTML
             return special_elements[e.name] is e.namespace
 233
 234 # decode_named_char_ref()
 235 #
 236 # The list of named character references is _huge_ so ask the browser to decode
 237 # for us instead of wasting bandwidth/space on including the table here.
 238 #
 239 # Pass without the "&" but with the ";" examples:
 240 #    for "&amp" pass "amp;"
 241 #    for "&#x2032" pass "x2032;"
 242 g_dncr = {
             # Shared state for decode_named_char_ref():
             #   cache    - maps "&name;" strings to their already-decoded text
             #   textarea - scratch element; its innerHTML is set and its value read
             #              back to get the decoded text
             # Note: document.createElement runs as soon as this file is loaded.
 243         cache: {}
 244         textarea: document.createElement('textarea')
 245 }
 246 # TODO test this in IE8
 247 decode_named_char_ref = (txt) ->
             # txt: reference without the leading "&" but with the trailing ";".
             # Returns the decoded text, or null when the text read back from the
             # textarea is identical to what was written to it.
             # Successful decodes are memoized in g_dncr.cache; failures are not.
 248         entity = "&" + txt
 249         known = g_dncr.cache[entity]
 250         if known?
                     return known
 251         scratch = g_dncr.textarea
             scratch.innerHTML = entity
 252         result = scratch.value
 253         if result is entity
                     return null
 254         g_dncr.cache[entity] = result
             return result
 255
 256 parse_html = (txt, parse_error_cb = null) ->
 257         cur = 0 # index of next char in txt to be parsed
 258         # declare tree and tokenizer variables so they're in scope below
 259         tree = null
 260         open_els = [] # stack of open elements
 261         tree_state = null
 262         tok_state = null
 263         tok_cur_tag = null # partially parsed tag
 264         flag_frameset_ok = null
 265         flag_parsing = null
 266         flag_foster_parenting = null
 267         afe = [] # active formatting elements
 268
 269         parse_error = ->
                     # Report a parse error at the current input offset (cur): hand it
                     # to the caller-supplied callback if there is one, otherwise log it.
 270                 unless parse_error_cb?
 271                         console.log "Parse error at character #{cur} of #{txt.length}"
 272                         return
 273                 parse_error_cb cur
 274
 275
 276         # the functions below implement the Tree Construction algorithm
 277         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 278
 279         # But first... the helpers
 280         template_tag_is_open = ->
                     # True when some entry of open_els is a TYPE_TAG node named 'template'.
 281                 stack_pos = 0
 282                 while stack_pos < open_els.length
 283                         open_el = open_els[stack_pos]
                             return true if open_el.type is TYPE_TAG and open_el.name is 'template'
                             stack_pos += 1
 284                 return false
 285         is_in_scope_x = (tag_name, scope) ->
                     # Walk open_els from index 0 upward. Finding tag_name first means
                     # "in scope"; hitting a name that is a key of `scope` first (or
                     # running out of elements) means "not in scope".
 286                 for open_el in open_els
 287                         return true if open_el.name is tag_name
 289                         return false if open_el.name of scope
 291                 false
 292         is_in_scope_x_y = (tag_name, scope, scope2) ->
                     # Same walk as is_in_scope_x, but a name found in either of the two
                     # scope tables ends the search with "not in scope".
 293                 for open_el in open_els
 294                         return true if open_el.name is tag_name
 296                         return false if (open_el.name of scope) or (open_el.name of scope2)
 300                 false
 301         standard_scopers = { # FIXME these are supposed to be namespace specific
                     # Tag names that end a scope search (used via `name of table`).
                     # 'title' previously had no value; every entry now maps to true.
 302                 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
 303                 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
 304                 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
 305                 'foreignObject': true, 'desc': true, 'title': true
 306         }
             # additional boundaries layered on top of standard_scopers (see is_in_button_scope)
 307         button_scopers = button: true
 308         li_scopers = ol: true, ul: true
             # table scope uses its own, smaller boundary set (see is_in_table_scope)
 309         table_scopers = html: true, table: true, template: true
 310         is_in_scope = (tag_name) ->
                     # Scope test bounded by the standard_scopers table.
 311                 is_in_scope_x(tag_name, standard_scopers)
 312         is_in_button_scope = (tag_name) ->
                     # Scope test bounded by standard_scopers plus button_scopers.
 313                 is_in_scope_x_y(tag_name, standard_scopers, button_scopers)
 314         is_in_table_scope = (tag_name) ->
                     # Scope test bounded by the table_scopers table.
 315                 is_in_scope_x(tag_name, table_scopers)
 316         is_in_select_scope = (tag_name) ->
                     # Inverted scope test: every element EXCEPT optgroup/option ends the
                     # search, so tag_name must be reached through those two only.
 317                 for open_el in open_els
 318                         return true if open_el.name is tag_name
 320                         return false unless open_el.name in ['optgroup', 'option']
 322                 false
 323         # this checks for a particular element, not by name
 324         el_is_in_scope = (el) ->
                     # Identity-based variant of is_in_scope: looks for this exact node
                     # in open_els, stopping at the first standard_scopers boundary.
 325                 for open_el in open_els
 326                         return true if open_el is el
 328                         return false if open_el.name of standard_scopers
 330                 false
 331
 332         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 333         # this implementation is structured (mostly) as described at the link above.
 334         # capitalized comments are the "labels" described at the link above.
 335         reconstruct_active_formatting_elements = ->
                     # Re-open formatting elements that are listed in afe but are no
                     # longer on the stack of open elements. Each such entry is cloned,
                     # inserted with tree_insert_element, and replaced in afe by its
                     # clone, working from the entry found by "Rewind" back down to afe[0].
 336                 return unless afe.length > 0
                     # an entry needs no work when it is a marker or is still open
                     entry_is_settled = (entry) ->
                             entry.type is TYPE_MARKER or entry in open_els
 337                 return if entry_is_settled afe[0]
 339                 # Rewind
 340                 afe_pos = 0
 341                 while afe_pos isnt afe.length - 1
 344                         afe_pos += 1
 345                         if entry_is_settled afe[afe_pos]
 346                                 afe_pos -= 1 # Advance
 347                                 break
 348                 # Create
 349                 loop
 350                         rebuilt_el = afe[afe_pos].shallow_clone()
 351                         tree_insert_element rebuilt_el
 352                         afe[afe_pos] = rebuilt_el
 353                         break if afe_pos is 0
 354                         afe_pos -= 1
 355
 356         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 357         # adoption agency algorithm
 358         adoption_agency = (subject) ->
                     # Handles an end tag named `subject` for a formatting element by
                     # re-parenting mis-nested content. Works on the enclosing parser
                     # state: open_els (index 0 is the current node) and afe.
                     #
                     # Abbreviations used below:
                     #   fe = formatting element, fb = furthest block, ca = common ancestor
                     #
                     # Returns nothing; gives up after 8 passes of the outer loop.
                     #
                     # Arrays are always walked with `for x, i in arr`. (`for x, i of arr`
                     # yields key/value pairs, so identity comparisons against the loop
                     # variable never match.)
 359                 if open_els[0].name is subject
 360                         el = open_els[0]
 361                         open_els.shift()
 362                         # remove it from the list of active formatting elements (if found)
 363                         for t, i in afe
 364                                 if t is el
 365                                         afe.splice i, 1
 366                                         break
 367                         return
 368                 outer = 0
 369                 loop
 370                         if outer >= 8
 371                                 return
 372                         outer += 1
                             # find the formatting element: first afe entry named subject
                             # before any marker; fe_index is its position in afe
 373                         fe = null
 374                         for t, fe_index in afe
 375                                 if t.type is TYPE_MARKER
 376                                         break
 377                                 if t.name is subject
 378                                         fe = t
 379                                         break
 380                         if fe is null
 381                                 in_body_any_other_end_tag subject
 382                                 return
                             # locate fe on the stack of open elements, remembering where
 383                         in_open_els = false
 384                         for t, fe_stack_index in open_els
 385                                 if t is fe
 386                                         in_open_els = true
 387                                         break
 388                         unless in_open_els
 389                                 parse_error()
 390                                 # "remove it from the list" must mean afe, since it's not in open_els
 391                                 afe.splice fe_index, 1
 392                                 return
 393                         unless el_is_in_scope fe
 394                                 parse_error()
 395                                 return
 396                         unless open_els[0] is fe
 397                                 parse_error()
 398                                 # continue
                             # furthest block: the special element nearest to fe among the
                             # elements between the current node and fe
 399                         fb = null
 400                         fb_index = null
 401                         for t, i in open_els
 402                                 if t is fe
 403                                         break
 404                                 if el_is_special t
 405                                         fb = t
 406                                         fb_index = i
 407                         if fb is null
                                     # no furthest block: pop up to and including fe, drop fe from afe
 408                                 loop
 409                                         t = open_els.shift()
 410                                         if t is fe
 411                                                 afe.splice fe_index, 1
 412                                                 return
                             # the common ancestor sits right above fe on the stack of open
                             # elements, so it must be looked up with fe's open_els index
                             # (fe_index is an index into afe, not open_els)
 413                         ca = open_els[fe_stack_index + 1] # common ancestor
 414                         node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
 415                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
 416                         bookmark = new_aaa_bookmark()
 417                         for t, i in afe
 418                                 if t is fe
 419                                         afe.splice i, 0, bookmark
                                             # stop here: fe has just moved to i + 1, so continuing
                                             # would match it again and insert another bookmark
                                             break
 420                         node = last_node = fb
 421                         inner = 0
 422                         loop
 423                                 inner += 1
 424                                 node_next = null
 425                                 for t, i in open_els
 426                                         if t is node
 427                                                 node_next = open_els[i + 1]
 428                                                 break
 429                                 node = node_next ? node_above
 430                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
 431                                 if node is fe
 432                                         break
 433                                 node_in_afe = false
 434                                 for t, i in afe
 435                                         if t is node
 436                                                 if inner > 3
 437                                                         afe.splice i, 1
 438                                                 else
 439                                                         node_in_afe = true
 440                                                 break
 441                                 unless node_in_afe
 442                                         for t, i in open_els
 443                                                 if t is node
 444                                                         node_above = open_els[i + 1]
 445                                                         open_els.splice i, 1
 446                                                         break
 447                                         continue
 448                                 # 7. Create an element for the token for which the element node
 449                                 # was created, in the HTML namespace, with common ancestor as
 450                                 # the intended parent; replace the entry for node in the list
 451                                 # of active formatting elements with an entry for the new
 452                                 # element, replace the entry for node in the stack of open
 453                                 # elements with an entry for the new element, and let node be
 454                                 # the new element.
 455                                 new_node = node.shallow_clone()
 456                                 for t, i in afe
 457                                         if t is node
 458                                                 afe[i] = new_node
 459                                                 break
 460                                 for t, i in open_els
 461                                         if t is node
 462                                                 open_els[i] = new_node
 463                                                 break
 464                                 node = new_node
 465                                 # 8. If last node is furthest block, then move the
 466                                 # aforementioned bookmark to be immediately after the new node
 467                                 # in the list of active formatting elements.
 468                                 if last_node is fb
 469                                         for t, i in afe
 470                                                 if t is bookmark
 471                                                         afe.splice i, 1
                                                             break
 472                                         for t, i in afe
 473                                                 if t is node
 474                                                         # Re-insert the SAME bookmark object (step 18
                                                             # searches for it by identity). Inserting at i
                                                             # puts it on the lower-index side of node —
                                                             # assumes afe is kept newest-first, as
                                                             # reconstruct_active_formatting_elements treats it.
 475                                                         afe.splice i, 0, bookmark
                                                             break
 476                                 # 9. Insert last node into node, first removing it from its
 477                                 # previous parent node if any.
 478                                 if last_node.parent?
 479                                         for c, i in last_node.parent.children
 480                                                 if c is last_node
 481                                                         last_node.parent.children.splice i, 1
                                                             break
 482                                 node.children.push last_node
 483                                 last_node.parent = node
 484                                 # 10. Let last node be node.
 485                                 last_node = node
 486                                 # 11. Return to the step labeled inner loop.
 487                         # 14. Insert whatever last node ended up being in the previous step
 488                         # at the appropriate place for inserting a node, but using common
 489                         # ancestor as the override target.
 490                         tree_insert_element last_node, ca
 491                         # 15. Create an element for the token for which formatting element
 492                         # was created, in the HTML namespace, with furthest block as the
 493                         # intended parent.
 494                         new_element = fe.shallow_clone()
 495                         # 16. Take all of the child nodes of furthest block and append them
 496                         # to the element created in the last step.
 497                         while fb.children.length
 498                                 t = fb.children.shift()
 499                                 t.parent = new_element
 500                                 new_element.children.push t
 501                         # 17. Append that new element to furthest block.
 502                         new_element.parent = fb
 503                         fb.children.push new_element
 504                         # 18. Remove formatting element from the list of active formatting
 505                         # elements, and insert the new element into the list of active
 506                         # formatting elements at the position of the aforementioned
 507                         # bookmark.
 508                         for t, i in afe
 509                                 if t is fe
 510                                         afe.splice i, 1
 511                                         break
 512                         for t, i in afe
 513                                 if t is bookmark
                                             # the bookmark is replaced by the new element from
                                             # step 15, not by the inner loop's `node`
 514                                         afe[i] = new_element
 515                                         break
 516                         # 19. Remove formatting element from the stack of open elements,
 517                         # and insert the new element into the stack of open elements
 518                         # immediately below the position of furthest block in that stack.
 519                         for t, i in open_els
 520                                 if t is fe
 521                                         open_els.splice i, 1
 522                                         break
 523                         for t, i in open_els
 524                                 if t is fb
                                             # index 0 is the current node, so "below fb" is the
                                             # slot fb occupies now; fb itself shifts to i + 1
 525                                         open_els.splice i, 0, new_element
 526                                         break
 527                         # 20. Jump back to the step labeled outer loop.
 527                         # 20. Jump back to the step labeled outer loop.
 528
 529         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 530         # FIXME implement this
 531         close_p_if_in_button_scope = ->
                     # Partial implementation (see FIXME above): closes a <p> only when it
                     # is the current node. The current node lives at open_els[0], so it
                     # has to be removed with shift(); pop() would drop the element at the
                     # other end of the stack and leave the <p> open.
                     # The ?. guard keeps an empty stack from throwing.
 532                 if open_els[0]?.name is 'p'
 533                         open_els.shift()
 534                 return
 535                 #p = find_button_scope 'p'
 536                 #if p?
 537                         # TODO generate_implied_end_tags except for p tags
 538                         # TODO parse_error unless open_els[0].name is 'p'
 539                         # TODO pop stack until 'p' popped
 540
 541         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 542         tree_insert_text = (t) ->
                     # Insert text node t at the adjusted insertion location. When the
                     # node just before that location is already a text node, t's text
                     # is appended to it instead of adding a new child.
 543                 [target_parent, insert_at] = adjusted_insertion_location()
                     siblings = target_parent.children
 544                 if insert_at > 0 and siblings[insert_at - 1].type is TYPE_TEXT
 547                         siblings[insert_at - 1].text += t.text
 548                         return
 549                 siblings.splice insert_at, 0, t
 550
 551         # 8.2.5.1
 552         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
 553         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
 554         adjusted_insertion_location = (override_target = null) ->
 555                 # 1. If there was an override target specified, then let target be the
 556                 # override target.
 557                 if override_target?
 558                         target = override_target
 559                 else # Otherwise, let target be the current node.
 560                         target = open_els[0]
 561                 # 2. Determine the adjusted insertion location using the first matching
 562                 # steps from the following list:
 563                 #
 564                 # If foster parenting is enabled and target is a table, tbody, tfoot,
 565                 # thead, or tr element Foster parenting happens when content is
 566                 # misnested in tables.
 567                 if flag_foster_parenting and target.name in foster_parenting_targets
 568                         console.log "foster parenting isn't implemented yet" # TODO
 569                         # 1. Let last template be the last template element in the stack of
 570                         # open elements, if any.
 571                         # 2. Let last table be the last table element in the stack of open
 572                         # elements, if any.
 573
 574                         # 3. If there is a last template and either there is no last table,
 575                         # or there is one, but last template is lower (more recently added)
 576                         # than last table in the stack of open elements, then: let adjusted
 577                         # insertion location be inside last template's template contents,
 578                         # after its last child (if any), and abort these substeps.
 579
 580                         # 4. If there is no last table, then let adjusted insertion
 581                         # location be inside the first element in the stack of open
 582                         # elements (the html element), after its last child (if any), and
 583                         # abort these substeps. (fragment case)
 584
 585                         # 5. If last table has a parent element, then let adjusted
 586                         # insertion location be inside last table's parent element,
 587                         # immediately before last table, and abort these substeps.
 588
 589                         # 6. Let previous element be the element immediately above last
 590                         # table in the stack of open elements.
 591
 592                         # 7. Let adjusted insertion location be inside previous element,
 593                         # after its last child (if any).
 594
 595                         # Note: These steps are involved in part because it's possible for
 596                         # elements, the table element in this case in particular, to have
 597                         # been moved by a script around in the DOM, or indeed removed from
 598                         # the DOM entirely, after the element was inserted by the parser.
 599                 else
 600                         # Otherwise Let adjusted insertion location be inside target, after
 601                         # its last child (if any).
 602                         target_i = target.children.length
 603
 604                 # 3. If the adjusted insertion location is inside a template element,
 605                 # let it instead be inside the template element's template contents,
 606                 # after its last child (if any). TODO
 607
 608                 # 4. Return the adjusted insertion location.
 609                 return [target, target_i]
 610
 611         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
 612         # aka create_an_element_for_token
 613         token_to_element = (t, namespace, intended_parent) ->
 614                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
 615                 # convert attributes into a hash
 616                 attrs = {}
 617                 while t.attrs_a.length
 618                         a = t.attrs_a.pop()
 619                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
 620                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
 621
 622                 # TODO 2. If the newly created element has an xmlns attribute in the
 623                 # XMLNS namespace whose value is not exactly the same as the element's
 624                 # namespace, that is a parse error. Similarly, if the newly created
 625                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
 626                 # value is not the XLink Namespace, that is a parse error.
 627
 628                 # fixfull: the spec says stuff about form pointers and ownerDocument
 629
 630                 return el
 631
        # FIXME implement the "foster parenting" part
 633         # FIXME read spec, do this right
 634         # FIXME implement the override target thing
 635         # note: this assumes it's an open tag
 636         # TODO tree_insert_html_element = (t, ...
 637         tree_insert_element = (el, override_target = null, namespace = null) ->
 638                 dest = adjusted_insertion_location override_target
 639                 if el.type is TYPE_OPEN_TAG # means it's a "token"
 640                         el = token_to_element el, namespace, dest[0]
 641                 # fixfull: Document nodes sometimes can't accept more chidren
 642                 dest[0].children.splice dest[1], 0, el
 643                 el.parent = dest[0]
 644                 open_els.unshift el
 645                 return el
 646
 647         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 648         tree_insert_a_comment = (t) ->
 649                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 650                 open_els[0].children.push t
 651
 652         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
 653         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
 654                 for node, i in open_els
 655                         if node.name is name
 656                                 # FIXME generate implied end tags except those with name==name
 657                                 parse_error() unless i is 0
 658                                 while i > 0
 659                                         open_els.shift()
 660                                         i -= 1
 661                                 open_els.shift()
 662                                 return
 663                         if special_elements[node.name]?
 664                                 parse_error()
 665                                 return
 666         tree_in_body = (t) ->
 667                 switch t.type
 668                         when TYPE_TEXT
 669                                 switch t.text
 670                                         when "\u0000"
 671                                                 parse_error()
 672                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
 673                                                 reconstruct_active_formatting_elements()
 674                                                 tree_insert_text t
 675                                         else
 676                                                 reconstruct_active_formatting_elements()
 677                                                 tree_insert_text t
 678                                                 flag_frameset_ok = false
 679                         when TYPE_COMMENT
 680                                 tree_insert_a_comment t
 681                         when TYPE_DOCTYPE
 682                                 parse_error()
 683                         when TYPE_OPEN_TAG
 684                                 switch t.name
 685                                         when 'html'
 686                                                 parse_error()
 687                                                 return if template_tag_is_open()
 688                                                 root_attrs = open_els[open_els.length - 1].children
 689                                                 for k, v of t.attrs
 690                                                         root_attrs[k] = v unless root_attrs[k]?
 691                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
 692                                                 # FIXME also do this for </template> (end tag)
 693                                                 return tree_in_head t
 694                                         when 'body'
 695                                                 parse_error()
 696                                                 # TODO
 697                                         when 'frameset'
 698                                                 parse_error()
 699                                                 # TODO
 700                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
 701                                                 close_p_if_in_button_scope()
 702                                                 tree_insert_element t
 703                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
 704                                                 close_p_if_in_button_scope()
 705                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 706                                                         parse_error()
 707                                                         open_els.shift()
 708                                                 tree_insert_element t
 709                                         # TODO lots more to implement here
 710                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
 711                                                 reconstruct_active_formatting_elements()
 712                                                 el = tree_insert_element t
 713                                                 afe.push el
 714                                         # TODO lots more to implement here
 715                                         else # any other start tag
 716                                                 reconstruct_active_formatting_elements()
 717                                                 tree_insert_element t
 718                         when TYPE_EOF
 719                                 ok_tags = {
 720                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
 721                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
 722                                 }
 723                                 for t in open_els
 724                                         unless ok_tags[t.name]?
 725                                                 parse_error()
 726                                                 break
 727                                 # TODO stack of template insertion modes thing
 728                                 flag_parsing = false # stop parsing
 729                         when TYPE_END_TAG
 730                                 switch t.name
 731                                         when 'body'
 732                                                 unless is_in_scope 'body'
 733                                                         parse_error()
 734                                                         return
 735                                                 # TODO implement parse error and move to tree_after_body
 736                                         when 'html'
 737                                                 unless is_in_scope 'body' # weird, but it's what the spec says
 738                                                         parse_error()
 739                                                         return
 740                                                 # TODO implement parse error and move to tree_after_body, reprocess
 741                                         # TODO lots more close tags to implement here
 742                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
 743                                                 adoption_agency t.name
 744                                         # TODO lots more close tags to implement here
 745                                         else
 746                                                 in_body_any_other_end_tag t.name
 747                 return
 748
 749
        # the functions below implement the tokenizer states described here:
 751         # http://www.w3.org/TR/html5/syntax.html#tokenization
 752
 753         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 754         tok_state_data = ->
 755                 switch c = txt.charAt(cur++)
 756                         when '&'
 757                                 return new_text_node tokenize_character_reference()
 758                         when '<'
 759                                 tok_state = tok_state_tag_open
 760                         when "\u0000"
 761                                 parse_error()
 762                                 return new_text_node c
 763                         when '' # EOF
 764                                 return new_eof_token()
 765                         else
 766                                 return new_text_node c
 767                 return null
 768
 769         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
        # not needed: tok_state_character_reference_in_data = ->
        # just call tokenize_character_reference()
 772
 773         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 774         tok_state_tag_open = ->
 775                 switch c = txt.charAt(cur++)
 776                         when '!'
 777                                 tok_state = tok_state_markup_declaration_open
 778                         when '/'
 779                                 tok_state = tok_state_end_tag_open
 780                         when '?'
 781                                 parse_error()
 782                                 tok_state = tok_state_bogus_comment
 783                         else
 784                                 if lc_alpha.indexOf(c) > -1
 785                                         tok_cur_tag = new_open_tag c
 786                                         tok_state = tok_state_tag_name
 787                                 else if uc_alpha.indexOf(c) > -1
 788                                         tok_cur_tag = new_open_tag c.toLowerCase()
 789                                         tok_state = tok_state_tag_name
 790                                 else
 791                                         parse_error()
 792                                         tok_state = tok_state_data
 793                                         cur -= 1 # we didn't parse/handle the char after <
 794                                         return new_text_node '<'
 795                 return null
 796
 797         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
 798         tok_state_end_tag_open = ->
 799                 switch c = txt.charAt(cur++)
 800                         when '>'
 801                                 parse_error()
 802                                 tok_state = tok_state_data
 803                         when '' # EOF
 804                                 parse_error()
 805                                 tok_state = tok_state_data
 806                                 return new_text_node '</'
 807                         else
 808                                 if uc_alpha.indexOf(c) > -1
 809                                         tok_cur_tag = new_end_tag c.toLowerCase()
 810                                         tok_state = tok_state_tag_name
 811                                 else if lc_alpha.indexOf(c) > -1
 812                                         tok_cur_tag = new_end_tag c
 813                                         tok_state = tok_state_tag_name
 814                                 else
 815                                         parse_error()
 816                                         tok_state = tok_state_bogus_comment
 817                 return null
 818
 819         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 820         tok_state_tag_name = ->
 821                 switch c = txt.charAt(cur++)
 822                         when "\t", "\n", "\u000c", ' '
 823                                 tok_state = tok_state_before_attribute_name
 824                         when '/'
 825                                 tok_state = tok_state_self_closing_start_tag
 826                         when '>'
 827                                 tok_state = tok_state_data
 828                                 tmp = tok_cur_tag
 829                                 tok_cur_tag = null
 830                                 return tmp
 831                         when "\u0000"
 832                                 parse_error()
 833                                 tok_cur_tag.name += "\ufffd"
 834                         when '' # EOF
 835                                 parse_error()
 836                                 tok_state = tok_state_data
 837                         else
 838                                 if uc_alpha.indexOf(c) > -1
 839                                         tok_cur_tag.name += c.toLowerCase()
 840                                 else
 841                                         tok_cur_tag.name += c
 842                 return null
 843
 844         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 845         tok_state_before_attribute_name = ->
 846                 attr_name = null
 847                 switch c = txt.charAt(cur++)
 848                         when "\t", "\n", "\u000c", ' '
 849                                 return null
 850                         when '/'
 851                                 tok_state = tok_state_self_closing_start_tag
 852                                 return null
 853                         when '>'
 854                                 tok_state = tok_state_data
 855                                 tmp = tok_cur_tag
 856                                 tok_cur_tag = null
 857                                 return tmp
 858                         when "\u0000"
 859                                 parse_error()
 860                                 attr_name = "\ufffd"
 861                         when '"', "'", '<', '='
 862                                 parse_error()
 863                                 attr_name = c
 864                         when '' # EOF
 865                                 parse_error()
 866                                 tok_state = tok_state_data
 867                         else
 868                                 if uc_alpha.indexOf(c) > -1
 869                                         attr_name = c.toLowerCase()
 870                                 else
 871                                         attr_name = c
 872                 if attr_name?
 873                         tok_cur_tag.attrs_a.unshift [attr_name, '']
 874                         tok_state = tok_state_attribute_name
 875                 return null
 876
 877         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
 878         tok_state_attribute_name = ->
 879                 switch c = txt.charAt(cur++)
 880                         when "\t", "\n", "\u000c", ' '
 881                                 tok_state = tok_state_after_attribute_name
 882                         when '/'
 883                                 tok_state = tok_state_self_closing_start_tag
 884                         when '='
 885                                 tok_state = tok_state_before_attribute_value
 886                         when '>'
 887                                 tok_state = tok_state_data
 888                                 tmp = tok_cur_tag
 889                                 tok_cur_tag = null
 890                                 return tmp
 891                         when "\u0000"
 892                                 parse_error()
 893                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
 894                         when '"', "'", '<'
 895                                 parse_error()
 896                                 tok_cur_tag.attrs_a[0][0] = c
 897                         when '' # EOF
 898                                 parse_error()
 899                                 tok_state = tok_state_data
 900                         else
 901                                 if uc_alpha.indexOf(c) > -1
 902                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
 903                                 else
 904                                         tok_cur_tag.attrs_a[0][0] += c
 905                 return null
 906
 907         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 908         tok_state_before_attribute_value = ->
 909                 switch c = txt.charAt(cur++)
 910                         when "\t", "\n", "\u000c", ' '
 911                                 return null
 912                         when '"'
 913                                 tok_state = tok_state_attribute_value_double_quoted
 914                         when '&'
 915                                 tok_state = tok_state_attribute_value_unquoted
 916                                 cur -= 1
 917                         when "'"
 918                                 tok_state = tok_state_attribute_value_single_quoted
 919                         when "\u0000"
 920                                 # Parse error
 921                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 922                                 tok_state = tok_state_attribute_value_unquoted
 923                         when '>'
 924                                 # Parse error
 925                                 tok_state = tok_state_data
 926                                 tmp = tok_cur_tag
 927                                 tok_cur_tag = null
 928                                 return tmp
 929                         when '' # EOF
 930                                 parse_error()
 931                                 tok_state = tok_state_data
 932                         else
 933                                 tok_cur_tag.attrs_a[0][1] += c
 934                                 tok_state = tok_state_attribute_value_unquoted
 935                 return null
 936
 937         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
 938         tok_state_attribute_value_double_quoted = ->
 939                 switch c = txt.charAt(cur++)
 940                         when '"'
 941                                 tok_state = tok_state_after_attribute_value_quoted
 942                         when '&'
 943                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
 944                         when "\u0000"
 945                                 # Parse error
 946                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 947                         when '' # EOF
 948                                 parse_error()
 949                                 tok_state = tok_state_data
 950                         else
 951                                 tok_cur_tag.attrs_a[0][1] += c
 952                 return null
 953
 954         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
 955         tok_state_attribute_value_single_quoted = ->
 956                 switch c = txt.charAt(cur++)
 957                         when "'"
 958                                 tok_state = tok_state_after_attribute_value_quoted
 959                         when '&'
 960                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
 961                         when "\u0000"
 962                                 # Parse error
 963                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 964                         when '' # EOF
 965                                 parse_error()
 966                                 tok_state = tok_state_data
 967                         else
 968                                 tok_cur_tag.attrs_a[0][1] += c
 969                 return null
 970
 971         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
 972         tok_state_attribute_value_unquoted = ->
 973                 switch c = txt.charAt(cur++)
 974                         when "\t", "\n", "\u000c", ' '
 975                                 tok_state = tok_state_before_attribute_name
 976                         when '&'
 977                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
 978                         when '>'
 979                                 tok_state = tok_state_data
 980                                 tmp = tok_cur_tag
 981                                 tok_cur_tag = null
 982                                 return tmp
 983                         when "\u0000"
 984                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 985                         when '' # EOF
 986                                 parse_error()
 987                                 tok_state = tok_state_data
 988                         else
 989                                 # Parse Error if ', <, = or ` (backtick)
 990                                 tok_cur_tag.attrs_a[0][1] += c
 991                 return null
 992
 993         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
 994         tok_state_after_attribute_value_quoted = ->
 995                 switch c = txt.charAt(cur++)
 996                         when "\t", "\n", "\u000c", ' '
 997                                 tok_state = tok_state_before_attribute_name
 998                         when '/'
 999                                 tok_state = tok_state_self_closing_start_tag
1000                         when '>'
1001                                 tok_state = tok_state_data
1002                                 tmp = tok_cur_tag
1003                                 tok_cur_tag = null
1004                                 return tmp
1005                         when '' # EOF
1006                                 parse_error()
1007                                 tok_state = tok_state_data
1008                         else
1009                                 # Parse Error
1010                                 tok_state = tok_state_before_attribute_name
1011                                 cur -= 1 # we didn't handle that char
1012                 return null
1013
1014         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1015         # Don't set this as a state, just call it
1016         # returns a string (NOT a text node)
1017         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1018                 if cur >= txt.length
1019                         return '&'
1020                 switch c = txt.charAt(cur)
1021                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1022                                 # explicitly not a parse error
1023                                 return '&'
1024                         when ';'
1025                                 # there has to be "one or more" alnums between & and ; to be a parse error
1026                                 return '&'
1027                         when '#'
1028                                 if cur + 1 >= txt.length
1029                                         return '&'
1030                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
1031                                         prefix = '#x'
1032                                         charset = hex_chars
1033                                         start = cur + 2
1034                                 else
1035                                         charset = digits
1036                                         start = cur + 1
1037                                         prefix = '#'
1038                                 i = 0
1039                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1040                                         i += 1
1041                                 if i is 0
1042                                         return '&'
1043                                 if txt.charAt(start + i) is ';'
1044                                         i += 1
1045                                 # FIXME This is supposed to generate parse errors for some chars
1046                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1047                                 if decoded?
1048                                         cur = start + i
1049                                         return decoded
1050                                 return '&'
1051                         else
1052                                 for i in [0...31]
1053                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
1054                                                 break
1055                                 if i is 0
1056                                         # exit early, because parse_error() below needs at least one alnum
1057                                         return '&'
1058                                 if txt.charAt(cur + i) is ';'
1059                                         i += 1 # include ';' terminator in value
1060                                         decoded = decode_named_char_ref txt.substr(cur, i)
1061                                         if decoded?
1062                                                 cur += i
1063                                                 return decoded
1064                                         parse_error()
1065                                         return '&'
1066                                 else
1067                                         # no ';' terminator (only legacy char refs)
1068                                         max = i
1069                                         for i in [2..max] # no prefix matches, so ok to check shortest first
1070                                                 c = legacy_char_refs[txt.substr(cur, i)]
1071                                                 if c?
1072                                                         if in_attr
1073                                                                 if txt.charAt(cur + i) is '='
1074                                                                         # "because some legacy user agents will
1075                                                                         # misinterpret the markup in those cases"
1076                                                                         parse_error()
1077                                                                         return '&'
1078                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
1079                                                                         # this makes attributes forgiving about url args
1080                                                                         return '&'
1081                                                         # ok, and besides the weird exceptions for attributes...
1082                                                         # return the matching char
1083                                                         cur += i # consume entity chars
1084                                                         parse_error() # because no terminating ";"
1085                                                         return c
1086                                         parse_error()
1087                                         return '&'
1088                 return # never reached
1089
1090         # tree constructor initialization
1091         # see comments on TYPE_TAG/etc for the structure of this data
1092         tree = new Node TYPE_TAG, name: 'html'
1093         open_els = [tree]
1094         tree_state = tree_in_body
1095         flag_frameset_ok = true
1096         flag_parsing = true
1097         flag_foster_parenting = false
1098         afe = [] # active formatting elements
1099
1100         # tokenizer initialization
1101         tok_state = tok_state_data
1102
1103         # proccess input
1104         while flag_parsing
1105                 t = tok_state()
1106                 if t?
1107                         tree_state t
1108         return tree.children
1109
1110 # everything below is tests on the above
# Minimal assertion helper for the self-tests in this file: strictly compares
# `output` with `expected_output` and reports the outcome on the console.
#   description      label printed only when the comparison fails
#   output           value that was actually produced
#   expected_output  value that should have been produced
test_equals = (description, output, expected_output) ->
        # The success message deliberately leaves out the description, so consoles
        # that collapse identical consecutive lines can merge all the passes.
        return console.log("passed.") if output is expected_output
        console.log "FAILED: \"#{description}\""
        console.log "   Expected: #{expected_output}"
        console.log "     Actual: #{output}"
# Run one parser test case and report the outcome on the console.
#
# `args` is an object with these keys:
#   name      label used in the pass/fail line
#   html      markup handed to parse_html
#   expected  comma-separated serialization the returned nodes should produce
#   errors    number of parse errors the parser is expected to report
#
# A case passes only when both the serialized output and the number of
# reported parse errors match; on failure, each mismatching aspect is detailed.
test_parser = (args) ->
        # collect everything the parser passes to its error callback
        reported_errors = []
        top_nodes = parse_html args.html, (err) -> reported_errors.push err

        # serialize the returned nodes, separated by commas
        rendered = (node.serialize() for node in top_nodes).join ','

        markup_ok = rendered is args.expected
        count_ok = reported_errors.length is args.errors

        if markup_ok and count_ok
                console.log "passed \"#{args.name}\""
        else
                console.log "FAILED: \"#{args.name}\""
        unless markup_ok
                console.log "      Input: #{args.html}"
                console.log "    Correct: #{args.expected}"
                console.log "     Output: #{rendered}"
        unless count_ok
                console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify reported_errors}"
1139
# Parser regression cases, run in the order listed. Each entry is passed as-is
# to test_parser: `html` is the input, `expected` the serialized result, and
# `errors` the number of parse errors that should be reported.
parser_regression_cases = [
        {
                name: "empty"
                html: ""
                expected: ''
                errors: 0
        }
        {
                name: "just text"
                html: "abc"
                expected: 'text:"abc"'
                errors: 0
        }
        {
                name: "named entity"
                html: "a&amp;1234"
                expected: 'text:"a&1234"'
                errors: 0
        }
        {
                name: "broken named character references"
                html: "1&amp2&&amp;3&aabbcc;"
                expected: 'text:"1&2&&3&aabbcc;"'
                errors: 2
        }
        {
                name: "numbered entity overrides"
                html: "1&#X80&#x80; &#x83"
                expected: 'text:"1€€ ƒ"'
                errors: 0
        }
        {
                name: "open tag"
                html: "foo<span>bar"
                expected: 'text:"foo",tag:"span",{},[text:"bar"]'
                errors: 1 # the span is never closed
        }
        {
                name: "open tag with attributes"
                html: "foo<span style=\"foo: bar\" title=\"hi\">bar"
                expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
                errors: 1 # the span is never closed
        }
        {
                name: "open tag with attributes of various quotings"
                html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar"
                expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
                errors: 1 # the span is never closed
        }
        {
                name: "attribute entity exceptions dq"
                html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar"
                expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
                errors: 2 # missing close tag, plus "&amp=" inside an attribute
        }
        {
                name: "attribute entity exceptions sq"
                html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar"
                expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
                errors: 2 # missing close tag, plus "&amp=" inside an attribute
        }
        {
                name: "attribute entity exceptions uq"
                html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar"
                expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
                errors: 2 # missing close tag, plus "&amp=" inside an attribute
        }
        {
                name: "matching closing tags"
                html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar"
                expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
                errors: 0
        }
        {
                name: "missing closing tag inside"
                html: "foo<div>bar<span>baz</div>qux"
                expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
                errors: 1 # close tag mismatch
        }
        {
                name: "mis-matched closing tags"
                html: "<span>12<div>34</span>56</div>78"
                expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
                errors: 2 # misplaced </span>, and no </span> at the end
        }
        {
                name: "mis-matched formatting elements"
                html: "12<b>34<i>56</b>78</i>90"
                expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
                errors: 1 # unsure how many there should be
        }
        {
                name: "crazy formatting elements test"
                html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>"
                # Browsers disagree on this input. Chrome produces:
                # 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
                # The expectation below is what Firefox produces:
                expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
                errors: 6 # unsure how many there should be
        }
]
for regression_case in parser_regression_cases
        test_parser regression_case