parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body.
  22 #
  23 # Instead, the data structure produced by this parser is an array of nodes.
  24 #
  25 # Each node is an object of the Node class. Here are the Node types:
  26 TYPE_TAG = 0 # name, {attributes}, [children]
  27 TYPE_TEXT = 1 # "text"
  28 TYPE_COMMENT = 2
  29 TYPE_DOCTYPE = 3
  30 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  31 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  32 TYPE_END_TAG = 5 # name
  33 TYPE_EOF = 6
  34 TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  35 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  36
  37 # namespace constants
  38 NS_HTML = 1
  39 NS_MATHML = 2
  40 NS_SVG = 3
  41
  42 class Node
  43         constructor: (type, args = {}) ->
  44                 @type = type # one of the TYPE_* constants above
  45                 @name = args.name ? '' # tag name
  46                 @text = args.text ? '' # contents for text/comment nodes
  47                 @attrs = args.attrs ? {}
  48                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
  49                 @children = args.children ? []
  50                 @namespace = args.namespace ? NS_HTML
  51                 @parent = args.parent ? null
  52         shallow_clone: -> # return a new node that's the same except without the children or parent
  53                 # WARNING this doesn't work right on open tags that are still being parsed
  54                 attrs = {}
  55                 attrs[k] = v for k, v of @attrs
  56                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
  57         serialize: -> # for unit tests
  58                 ret = ''
  59                 switch @type
  60                         when TYPE_TAG
  61                                 ret += 'tag:'
  62                                 ret += JSON.stringify @name
  63                                 ret += ','
  64                                 ret += JSON.stringify @attrs
  65                                 ret += ','
  66                                 sep = '['
  67                                 for c in @children
  68                                         ret += sep
  69                                         sep = ','
  70                                         ret += c.serialize()
  71                                 ret += ']'
  72                         when TYPE_TEXT
  73                                 ret += 'text:'
  74                                 ret += JSON.stringify @text
  75                         when TYPE_COMMENT
  76                                 ret += 'comment:'
  77                                 ret += JSON.stringify @text
  78                         when TYPE_DOCTYPE
  79                                 ret += 'doctype'
  80                                 # FIXME
  81                         else
  82                                 ret += 'unknown:'
  83                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
  84                 return ret
  85
  86 # helpers: (only take args that are normally known when parser creates nodes)
  87 new_open_tag = (name) ->
  88         return new Node TYPE_OPEN_TAG, name: name
  89 new_end_tag = (name) ->
  90         return new Node TYPE_END_TAG, name: name
  91 new_text_node = (txt) ->
  92         return new Node TYPE_TEXT, text: txt
  93 new_comment_node = (txt) ->
  94         return new Node TYPE_COMMENT, text: txt
  95 new_eof_token = ->
  96         return new Node TYPE_EOF
  97 new_aaa_bookmark = ->
  98         return new Node TYPE_AAA_BOOKMARK
  99
 100 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 101 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
 102 digits = "0123456789"
 103 alnum = lc_alpha + uc_alpha + digits
 104 hex_chars = digits + "abcdefABCDEF"
 105
 106 # some SVG elements have dashes in them
 107 tag_name_chars = alnum + "-"
 108
 109 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 110 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 111
 112 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 113 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 114
 115 # These are the character references that don't need a terminating semicolon
 116 # min length: 2, max: 6, none are a prefix of any other.
 117 legacy_char_refs = {
 118         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 119         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 120         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 121         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 122         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 123         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 124         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 125         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 126         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 127         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 128         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 129         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 130         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 131         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 132         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 133         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 134         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 135         yen: '¥', yuml: 'ÿ'
 136 }
 137
 138 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 139 raw_text_elements = ['script', 'style']
 140 escapable_raw_text_elements = ['textarea', 'title']
 141 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 142 svg_elements = [
 143         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 144         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 145         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 146         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 147         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 148         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 149         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 150         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 151         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 152         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 153         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 154         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 155         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 156         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 157         'view', 'vkern'
 158 ]
 159
 160 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 161 mathml_elements = [
 162         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 163         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 164         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 165         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 166         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 167         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 168         'determinant', 'diff', 'divergence', 'divide', 'domain',
 169         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 170         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 171         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 172         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 173         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 174         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 175         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 176         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 177         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 178         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 179         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 180         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 181         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 182         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 183         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 184         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 185         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 186         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 187         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 188         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 189         'vectorproduct', 'xor'
 190 ]
 191 # foreign_elements = [svg_elements..., mathml_elements...]
 192 #normal_elements = All other allowed HTML elements are normal elements.
 193
 194 special_elements = {
 195         # HTML:
 196         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 197         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 198         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 199         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 200         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 201         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 202         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 203         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 204         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 205         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 206         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
 207         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
 208         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
 209         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
 210         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
 211         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
 212         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
 213         wbr:NS_HTML, xmp:NS_HTML,
 214
 215         # MathML:
 216         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 217         'annotation-xml':NS_MATHML,
 218
 219         # SVG:
 220         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 221 }
 222
 223 formatting_elements = {
 224          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 225          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 226          u: true
 227 }
 228
 229 el_is_special = (e) ->
 230         return special_elements[e] is e.namespace
 231
 232 # decode_named_char_ref()
 233 #
 234 # The list of named character references is _huge_ so ask the browser to decode
 235 # for us instead of wasting bandwidth/space on including the table here.
 236 #
 237 # Pass without the "&" but with the ";" examples:
 238 #    for "&amp" pass "amp;"
 239 #    for "&#x2032" pass "x2032;"
 240 g_dncr = {
 241         cache: {}
 242         textarea: document.createElement('textarea')
 243 }
 244 # TODO test this in IE8
 245 decode_named_char_ref = (txt) ->
 246         txt = "&#{txt}"
 247         decoded = g_dncr.cache[txt]
 248         return decoded if decoded?
 249         g_dncr.textarea.innerHTML = txt
 250         decoded = g_dncr.textarea.value
 251         return null if decoded is txt
 252         return g_dncr.cache[txt] = decoded
 253
 254 parse_html = (txt, parse_error_cb = null) ->
 255         cur = 0 # index of next char in txt to be parsed
 256         # declare tree and tokenizer variables so they're in scope below
 257         tree = null
 258         open_els = [] # stack of open elements
 259         tree_state = null
 260         tok_state = null
 261         tok_cur_tag = null # partially parsed tag
 262         flag_frameset_ok = null
 263         flag_parsing = null
 264         afe = [] # active formatting elements
 265
 266         parse_error = ->
 267                 if parse_error_cb?
 268                         parse_error_cb cur
 269                 else
 270                         console.log "Parse error at character #{cur} of #{txt.length}"
 271
 272
 273         # the functions below implement the Tree Construction algorithm
 274         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 275
 276         # But first... the helpers
 277         template_tag_is_open = ->
 278                 for t in open_els
 279                         if t.type is TYPE_TAG and t.name is 'template'
 280                                 return true
 281                 return false
 282         is_in_scope_x = (tag_name, scope) ->
 283                 for t in open_els
 284                         if t.name is tag_name
 285                                 return true
 286                         if t.name of scope
 287                                 return false
 288                 return false
 289         is_in_scope_x_y = (tag_name, scope, scope2) ->
 290                 for t in open_els
 291                         if t.name is tag_name
 292                                 return true
 293                         if t.name of scope
 294                                 return false
 295                         if t.name of scope2
 296                                 return false
 297                 return false
 298         standard_scopers = { # FIXME these are supposed to be namespace specific
 299                 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
 300                 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
 301                 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
 302                 'foreignObject': true, 'desc': true, 'title'
 303         }
 304         button_scopers = button: true
 305         li_scopers = ol: true, ul: true
 306         table_scopers = html: true, table: true, template: true
 307         is_in_scope = (tag_name) ->
 308                 return is_in_scope_x tag_name, standard_scopers
 309         is_in_button_scope = (tag_name) ->
 310                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers
 311         is_in_table_scope = (tag_name) ->
 312                 return is_in_scope_x tag_name, table_scopers
 313         is_in_select_scope = (tag_name) ->
 314                 for t in open_els
 315                         if t.name is tag_name
 316                                 return true
 317                         if t.name isnt 'optgroup' and t.name isnt 'option'
 318                                 return false
 319                 return false
 320         # this checks for a particular element, not by name
 321         el_is_in_scope = (el) ->
 322                 for t in open_els
 323                         if t is el
 324                                 return true
 325                         if t.name of standard_scopers
 326                                 return false
 327                 return false
 328
 329         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 330         # this implementation is structured (mostly) as described at the link above.
 331         # capitalized comments are the "labels" described at the link above.
 332         reconstruct_active_formatting_elements = ->
 333                 return if afe.length is 0
 334                 if afe[0].type is TYPE_MARKER or afe[0] in open_els
 335                         return
 336                 # Rewind
 337                 i = 0
 338                 loop
 339                         if i is afe.length - 1
 340                                 break
 341                         i += 1
 342                         if afe[i].type is TYPE_MARKER or afe[i] in open_els
 343                                 i -= 1 # Advance
 344                                 break
 345                 # Create
 346                 loop
 347                         el = afe[i].shallow_clone()
 348                         tree_insert_tag el
 349                         afe[i] = el
 350                         break if i is 0
 351                         i -= 1
 352
 353         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 354         # adoption agency algorithm
 355         adoption_agency = (subject) ->
 356                 if open_els[0].name is subject
 357                         el = open_els[0]
 358                         open_els.shift()
 359                         # remove it from the list of active formatting elements (if found)
 360                         for t, i in afe
 361                                 if t is el
 362                                         afe.splice i, 1
 363                                         break
 364                         return
 365                 outer = 0
 366                 loop
 367                         if outer >= 8
 368                                 return
 369                         outer += 1
 370                         fe = null
 371                         for t, fe_index in afe
 372                                 if t.type is TYPE_MARKER
 373                                         break
 374                                 if t.name is subject
 375                                         fe = t
 376                                         break
 377                         if fe is null
 378                                 in_body_any_other_end_tag subject
 379                                 return
 380                         in_open_els = false
 381                         for t in open_els
 382                                 if t is fe
 383                                         in_open_els = true
 384                                         break
 385                         unless in_open_els
 386                                 parse_error()
 387                                 # "remove it from the list" must mean afe, since it's not in open_els
 388                                 afe.splice fe_index, 1
 389                                 return
 390                         unless el_is_in_scope fe
 391                                 parse_error()
 392                                 return
 393                         unless open_els[0] is fe
 394                                 parse_error()
 395                                 # continue
 396                         fb = null
 397                         fb_index
 398                         for t, i in open_els
 399                                 if t is fe
 400                                         break
 401                                 if el_is_special t
 402                                         fb = t
 403                                         fb_index = i
 404                         if fb is null
 405                                 loop
 406                                         t = open_els.shift()
 407                                         if t is fe
 408                                                 afe.splice fe_index, 1
 409                                                 return
 410                         ca = open_els[fe_index + 1] # common ancestor
 411                         node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
 412                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
 413                         bookmark = new_aaa_bookmark()
 414                         for t, i in afe
 415                                 if t is fe
 416                                         afe.splice i, 0, bookmark
 417                         node = last_node = fb
 418                         inner = 0
 419                         loop
 420                                 inner += 1
 421                                 node_next = null
 422                                 for t, i in open_els
 423                                         if t is node
 424                                                 node_next = open_els[i + 1]
 425                                                 break
 426                                 node = node_next ? node_above
 427                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
 428                                 if node is fe
 429                                         break
 430                                 node_in_afe = false
 431                                 for t, i of afe
 432                                         if t is node
 433                                                 if inner > 3
 434                                                         afe.splice i, 1
 435                                                 else
 436                                                         node_in_afe = true
 437                                                 break
 438                                 unless node_in_afe
 439                                         for t, i in open_els
 440                                                 if t is node
 441                                                         node_above = open_els[i + 1]
 442                                                         open_els.splice i, 1
 443                                                         break
 444                                         continue
 445                                 # 7. reate an element for the token for which the element node
 446                                 # was created, in the HTML namespace, with common ancestor as
 447                                 # the intended parent; replace the entry for node in the list
 448                                 # of active formatting elements with an entry for the new
 449                                 # element, replace the entry for node in the stack of open
 450                                 # elements with an entry for the new element, and let node be
 451                                 # the new element.
 452                                 new_node = node.shallow_clone()
 453                                 for t, i in afe
 454                                         if t is node
 455                                                 afe[i] = new_node
 456                                                 break
 457                                 for t, i in open_els
 458                                         if t is node
 459                                                 open_els[i] = new_node
 460                                                 break
 461                                 node = new_node
 462                                 # 8. If last node is furthest block, then move the
 463                                 # aforementioned bookmark to be immediately after the new node
 464                                 # in the list of active formatting elements.
 465                                 if last_node is fb
 466                                         for t, i in afe
 467                                                 if t is bookmark
 468                                                         afe.splice i, 1
 469                                         for t, i in afe
 470                                                 if t is node
 471                                                         # TODO test: position i gets you "after"?
 472                                                         afe.splice i, 0, new_aaa_bookmark()
 473                                 # 9. Insert last node into node, first removing it from its
 474                                 # previous parent node if any.
 475                                 if last_node.parent?
 476                                         for c, i of last_node.parent.children
 477                                                 if c is last_node
 478                                                         last_node.parent.children.splice i, 1
 479                                 node.children.push last_node
 480                                 last_node.parent = node
 481                                 # 10. Let last node be node.
 482                                 last_node = node
 483                                 # 11. Return to the step labeled inner loop.
 484                         # 14. Insert whatever last node ended up being in the previous step
 485                         # at the appropriate place for inserting a node, but using common
 486                         # ancestor as the override target.
 487                         tree_insert_tag last_node, ca
 488                         # 15. Create an element for the token for which formatting element
 489                         # was created, in the HTML namespace, with furthest block as the
 490                         # intended parent.
 491                         new_element = fe.shallow_clone()
 492                         # 16. Take all of the child nodes of furthest block and append them
 493                         # to the element created in the last step.
 494                         while fb.children.length
 495                                 t = fb.children.shift()
 496                                 t.parent = new_element
 497                                 new_element.children.push t
 498                         # 17. Append that new element to furthest block.
 499                         new_element.parent = fb
 500                         fb.children.push new_element
 501                         # 18. Remove formatting element from the list of active formatting
 502                         # elements, and insert the new element into the list of active
 503                         # formatting elements at the position of the aforementioned
 504                         # bookmark.
 505                         for t, i in afe
 506                                 if t is fe
 507                                         afe.splice i, 1
 508                                         break
 509                         for t, i in afe
 510                                 if t is bookmark
 511                                         afe[i] = node
 512                                         break
 513                         # 19. Remove formatting element from the stack of open elements,
 514                         # and insert the new element into the stack of open elements
 515                         # immediately below the position of furthest block in that stack.
 516                         for t, i of open_els
 517                                 if t is fe
 518                                         open_els.splice i, 1
 519                                         break
 520                         for t, i of open_els
 521                                 if t is fb
 522                                         open_els.splice i, 0, new_element
 523                                         break
 524                         # 20. Jump back to the step labeled outer loop.
 525
 526         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 527         # FIXME implement this
 528         close_p_if_in_button_scope = ->
 529                 if open_els[0].name is 'p'
 530                         open_els.pop()
 531                 return
 532                 #p = find_button_scope 'p'
 533                 #if p?
 534                         # TODO generate_implied_end_tags except for p tags
 535                         # TODO parse_error unless open_els[0].name is 'p'
 536                         # TODO pop stack until 'p' popped
 537
 538         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 539         tree_insert_a_character = (t) ->
 540                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 541                 dest = open_els[0].children
 542                 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
 543                         dest[dest.length - 1].text += t.text
 544                 else
 545                         dest.push t
 546
 547         # FIXME read spec, do this right
 548         # FIXME implement the override target thing
 549         # note: this assumes it's an open tag
 550         tree_insert_tag = (t, override_target = null) ->
 551                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
 552                 # convert attributes into a hash
 553                 while t.attrs_a.length
 554                         a = t.attrs_a.pop()
 555                         t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
 556                 if t.parent?
 557                         for c, i of t.parent.children
 558                                 if c is t
 559                                         t.parent.children.splice i, 1
 560                 # FIXME spec says to do something to figure out what parent should be
 561                 parent = open_els[0]
 562                 open_els.unshift t
 563                 parent.children.push t
 564                 t.parent = parent
 565
 566         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 567         tree_insert_a_comment = (t) ->
 568                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 569                 open_els[0].children.push t
 570
 571         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
 572         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
 573                 for node, i in open_els
 574                         if node.name is name
 575                                 # FIXME generate implied end tags except those with name==name
 576                                 parse_error() unless i is 0
 577                                 while i > 0
 578                                         open_els.shift()
 579                                         i -= 1
 580                                 open_els.shift()
 581                                 return
 582                         if special_elements[node.name]?
 583                                 parse_error()
 584                                 return
 585         tree_in_body = (t) ->
 586                 switch t.type
 587                         when TYPE_TEXT
 588                                 switch t.text
 589                                         when "\u0000"
 590                                                 parse_error()
 591                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
 592                                                 reconstruct_active_formatting_elements()
 593                                                 tree_insert_a_character t
 594                                         else
 595                                                 reconstruct_active_formatting_elements()
 596                                                 tree_insert_a_character t
 597                                                 flag_frameset_ok = false
 598                         when TYPE_COMMENT
 599                                 tree_insert_a_comment t
 600                         when TYPE_DOCTYPE
 601                                 parse_error()
 602                         when TYPE_OPEN_TAG
 603                                 switch t.name
 604                                         when 'html'
 605                                                 parse_error()
 606                                                 return if template_tag_is_open()
 607                                                 root_attrs = open_els[open_els.length - 1].children
 608                                                 for k, v of t.attrs
 609                                                         root_attrs[k] = v unless root_attrs[k]?
 610                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
 611                                                 # FIXME also do this for </template> (end tag)
 612                                                 return tree_in_head t
 613                                         when 'body'
 614                                                 parse_error()
 615                                                 # TODO
 616                                         when 'frameset'
 617                                                 parse_error()
 618                                                 # TODO
 619                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
 620                                                 close_p_if_in_button_scope()
 621                                                 tree_insert_tag t
 622                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
 623                                                 close_p_if_in_button_scope()
 624                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 625                                                         parse_error()
 626                                                         open_els.shift()
 627                                                 tree_insert_tag t
 628                                         # TODO lots more to implement here
 629                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
 630                                                 reconstruct_active_formatting_elements()
 631                                                 tree_insert_tag t
 632                                                 afe.push t
 633                                         # TODO lots more to implement here
 634                                         else # any other start tag
 635                                                 reconstruct_active_formatting_elements()
 636                                                 tree_insert_tag t
 637                         when TYPE_EOF
 638                                 ok_tags = {
 639                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
 640                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
 641                                 }
 642                                 for t in open_els
 643                                         unless ok_tags[t.name]?
 644                                                 parse_error()
 645                                                 break
 646                                 # TODO stack of template insertion modes thing
 647                                 flag_parsing = false # stop parsing
 648                         when TYPE_END_TAG
 649                                 switch t.name
 650                                         when 'body'
 651                                                 unless is_in_scope 'body'
 652                                                         parse_error()
 653                                                         return
 654                                                 # TODO implement parse error and move to tree_after_body
 655                                         when 'html'
 656                                                 unless is_in_scope 'body' # weird, but it's what the spec says
 657                                                         parse_error()
 658                                                         return
 659                                                 # TODO implement parse error and move to tree_after_body, reprocess
 660                                         # TODO lots more close tags to implement here
 661                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
 662                                                 adoption_agency t.name
 663                                         # TODO lots more close tags to implement here
 664                                         else
 665                                                 in_body_any_other_end_tag t.name
 666                 return
 667
 668
        # the functions below implement the tokenizer states described here:
 670         # http://www.w3.org/TR/html5/syntax.html#tokenization
 671
 672         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 673         tok_state_data = ->
 674                 switch c = txt.charAt(cur++)
 675                         when '&'
 676                                 return new_text_node tokenize_character_reference()
 677                         when '<'
 678                                 tok_state = tok_state_tag_open
 679                         when "\u0000"
 680                                 parse_error()
 681                                 return new_text_node c
 682                         when '' # EOF
 683                                 return new_eof_token()
 684                         else
 685                                 return new_text_node c
 686                 return null
 687
 688         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
        # not needed: tok_state_character_reference_in_data = ->
        # just call tokenize_character_reference() directly
 691
 692         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 693         tok_state_tag_open = ->
 694                 switch c = txt.charAt(cur++)
 695                         when '!'
 696                                 tok_state = tok_state_markup_declaration_open
 697                         when '/'
 698                                 tok_state = tok_state_end_tag_open
 699                         when '?'
 700                                 parse_error()
 701                                 tok_state = tok_state_bogus_comment
 702                         else
 703                                 if lc_alpha.indexOf(c) > -1
 704                                         tok_cur_tag = new_open_tag c
 705                                         tok_state = tok_state_tag_name
 706                                 else if uc_alpha.indexOf(c) > -1
 707                                         tok_cur_tag = new_open_tag c.toLowerCase()
 708                                         tok_state = tok_state_tag_name
 709                                 else
 710                                         parse_error()
 711                                         tok_state = tok_state_data
 712                                         cur -= 1 # we didn't parse/handle the char after <
 713                                         return new_text_node '<'
 714                 return null
 715
 716         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
 717         tok_state_end_tag_open = ->
 718                 switch c = txt.charAt(cur++)
 719                         when '>'
 720                                 parse_error()
 721                                 tok_state = tok_state_data
 722                         when '' # EOF
 723                                 parse_error()
 724                                 tok_state = tok_state_data
 725                                 return new_text_node '</'
 726                         else
 727                                 if uc_alpha.indexOf(c) > -1
 728                                         tok_cur_tag = new_end_tag c.toLowerCase()
 729                                         tok_state = tok_state_tag_name
 730                                 else if lc_alpha.indexOf(c) > -1
 731                                         tok_cur_tag = new_end_tag c
 732                                         tok_state = tok_state_tag_name
 733                                 else
 734                                         parse_error()
 735                                         tok_state = tok_state_bogus_comment
 736                 return null
 737
 738         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 739         tok_state_tag_name = ->
 740                 switch c = txt.charAt(cur++)
 741                         when "\t", "\n", "\u000c", ' '
 742                                 tok_state = tok_state_before_attribute_name
 743                         when '/'
 744                                 tok_state = tok_state_self_closing_start_tag
 745                         when '>'
 746                                 tok_state = tok_state_data
 747                                 tmp = tok_cur_tag
 748                                 tok_cur_tag = null
 749                                 return tmp
 750                         when "\u0000"
 751                                 parse_error()
 752                                 tok_cur_tag.name += "\ufffd"
 753                         when '' # EOF
 754                                 parse_error()
 755                                 tok_state = tok_state_data
 756                         else
 757                                 if uc_alpha.indexOf(c) > -1
 758                                         tok_cur_tag.name += c.toLowerCase()
 759                                 else
 760                                         tok_cur_tag.name += c
 761                 return null
 762
 763         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 764         tok_state_before_attribute_name = ->
 765                 attr_name = null
 766                 switch c = txt.charAt(cur++)
 767                         when "\t", "\n", "\u000c", ' '
 768                                 return null
 769                         when '/'
 770                                 tok_state = tok_state_self_closing_start_tag
 771                                 return null
 772                         when '>'
 773                                 tok_state = tok_state_data
 774                                 tmp = tok_cur_tag
 775                                 tok_cur_tag = null
 776                                 return tmp
 777                         when "\u0000"
 778                                 parse_error()
 779                                 attr_name = "\ufffd"
 780                         when '"', "'", '<', '='
 781                                 parse_error()
 782                                 attr_name = c
 783                         when '' # EOF
 784                                 parse_error()
 785                                 tok_state = tok_state_data
 786                         else
 787                                 if uc_alpha.indexOf(c) > -1
 788                                         attr_name = c.toLowerCase()
 789                                 else
 790                                         attr_name = c
 791                 if attr_name?
 792                         tok_cur_tag.attrs_a.unshift [attr_name, '']
 793                         tok_state = tok_state_attribute_name
 794                 return null
 795
 796         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
 797         tok_state_attribute_name = ->
 798                 switch c = txt.charAt(cur++)
 799                         when "\t", "\n", "\u000c", ' '
 800                                 tok_state = tok_state_after_attribute_name
 801                         when '/'
 802                                 tok_state = tok_state_self_closing_start_tag
 803                         when '='
 804                                 tok_state = tok_state_before_attribute_value
 805                         when '>'
 806                                 tok_state = tok_state_data
 807                                 tmp = tok_cur_tag
 808                                 tok_cur_tag = null
 809                                 return tmp
 810                         when "\u0000"
 811                                 parse_error()
 812                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
 813                         when '"', "'", '<'
 814                                 parse_error()
 815                                 tok_cur_tag.attrs_a[0][0] = c
 816                         when '' # EOF
 817                                 parse_error()
 818                                 tok_state = tok_state_data
 819                         else
 820                                 if uc_alpha.indexOf(c) > -1
 821                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
 822                                 else
 823                                         tok_cur_tag.attrs_a[0][0] += c
 824                 return null
 825
 826         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 827         tok_state_before_attribute_value = ->
 828                 switch c = txt.charAt(cur++)
 829                         when "\t", "\n", "\u000c", ' '
 830                                 return null
 831                         when '"'
 832                                 tok_state = tok_state_attribute_value_double_quoted
 833                         when '&'
 834                                 tok_state = tok_state_attribute_value_unquoted
 835                                 cur -= 1
 836                         when "'"
 837                                 tok_state = tok_state_attribute_value_single_quoted
 838                         when "\u0000"
 839                                 # Parse error
 840                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 841                                 tok_state = tok_state_attribute_value_unquoted
 842                         when '>'
 843                                 # Parse error
 844                                 tok_state = tok_state_data
 845                                 tmp = tok_cur_tag
 846                                 tok_cur_tag = null
 847                                 return tmp
 848                         when '' # EOF
 849                                 parse_error()
 850                                 tok_state = tok_state_data
 851                         else
 852                                 tok_cur_tag.attrs_a[0][1] += c
 853                                 tok_state = tok_state_attribute_value_unquoted
 854                 return null
 855
 856         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
 857         tok_state_attribute_value_double_quoted = ->
 858                 switch c = txt.charAt(cur++)
 859                         when '"'
 860                                 tok_state = tok_state_after_attribute_value_quoted
 861                         when '&'
 862                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
 863                         when "\u0000"
 864                                 # Parse error
 865                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 866                         when '' # EOF
 867                                 parse_error()
 868                                 tok_state = tok_state_data
 869                         else
 870                                 tok_cur_tag.attrs_a[0][1] += c
 871                 return null
 872
 873         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
 874         tok_state_attribute_value_single_quoted = ->
 875                 switch c = txt.charAt(cur++)
 876                         when "'"
 877                                 tok_state = tok_state_after_attribute_value_quoted
 878                         when '&'
 879                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
 880                         when "\u0000"
 881                                 # Parse error
 882                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 883                         when '' # EOF
 884                                 parse_error()
 885                                 tok_state = tok_state_data
 886                         else
 887                                 tok_cur_tag.attrs_a[0][1] += c
 888                 return null
 889
 890         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
 891         tok_state_attribute_value_unquoted = ->
 892                 switch c = txt.charAt(cur++)
 893                         when "\t", "\n", "\u000c", ' '
 894                                 tok_state = tok_state_before_attribute_name
 895                         when '&'
 896                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
 897                         when '>'
 898                                 tok_state = tok_state_data
 899                                 tmp = tok_cur_tag
 900                                 tok_cur_tag = null
 901                                 return tmp
 902                         when "\u0000"
 903                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 904                         when '' # EOF
 905                                 parse_error()
 906                                 tok_state = tok_state_data
 907                         else
 908                                 # Parse Error if ', <, = or ` (backtick)
 909                                 tok_cur_tag.attrs_a[0][1] += c
 910                 return null
 911
 912         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
 913         tok_state_after_attribute_value_quoted = ->
 914                 switch c = txt.charAt(cur++)
 915                         when "\t", "\n", "\u000c", ' '
 916                                 tok_state = tok_state_before_attribute_name
 917                         when '/'
 918                                 tok_state = tok_state_self_closing_start_tag
 919                         when '>'
 920                                 tok_state = tok_state_data
 921                                 tmp = tok_cur_tag
 922                                 tok_cur_tag = null
 923                                 return tmp
 924                         when '' # EOF
 925                                 parse_error()
 926                                 tok_state = tok_state_data
 927                         else
 928                                 # Parse Error
 929                                 tok_state = tok_state_before_attribute_name
 930                                 cur -= 1 # we didn't handle that char
 931                 return null
 932
 933         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 934         # Don't set this as a state, just call it
 935         # returns a string (NOT a text node)
 936         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
 937                 if cur >= txt.length
 938                         return '&'
 939                 switch c = txt.charAt(cur)
 940                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
 941                                 # explicitly not a parse error
 942                                 return '&'
 943                         when ';'
 944                                 # there has to be "one or more" alnums between & and ; to be a parse error
 945                                 return '&'
 946                         when '#'
 947                                 if cur + 1 >= txt.length
 948                                         return '&'
 949                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
 950                                         prefix = '#x'
 951                                         charset = hex_chars
 952                                         start = cur + 2
 953                                 else
 954                                         charset = digits
 955                                         start = cur + 1
 956                                         prefix = '#'
 957                                 i = 0
 958                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 959                                         i += 1
 960                                 if i is 0
 961                                         return '&'
 962                                 if txt.charAt(start + i) is ';'
 963                                         i += 1
 964                                 # FIXME This is supposed to generate parse errors for some chars
 965                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 966                                 if decoded?
 967                                         cur = start + i
 968                                         return decoded
 969                                 return '&'
 970                         else
 971                                 for i in [0...31]
 972                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
 973                                                 break
 974                                 if i is 0
 975                                         # exit early, because parse_error() below needs at least one alnum
 976                                         return '&'
 977                                 if txt.charAt(cur + i) is ';'
 978                                         i += 1 # include ';' terminator in value
 979                                         decoded = decode_named_char_ref txt.substr(cur, i)
 980                                         if decoded?
 981                                                 cur += i
 982                                                 return decoded
 983                                         parse_error()
 984                                         return '&'
 985                                 else
 986                                         # no ';' terminator (only legacy char refs)
 987                                         max = i
 988                                         for i in [2..max] # no prefix matches, so ok to check shortest first
 989                                                 c = legacy_char_refs[txt.substr(cur, i)]
 990                                                 if c?
 991                                                         if in_attr
 992                                                                 if txt.charAt(cur + i) is '='
 993                                                                         # "because some legacy user agents will
 994                                                                         # misinterpret the markup in those cases"
 995                                                                         parse_error()
 996                                                                         return '&'
 997                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
 998                                                                         # this makes attributes forgiving about url args
 999                                                                         return '&'
1000                                                         # ok, and besides the weird exceptions for attributes...
1001                                                         # return the matching char
1002                                                         cur += i # consume entity chars
1003                                                         parse_error() # because no terminating ";"
1004                                                         return c
1005                                         parse_error()
1006                                         return '&'
1007                 return # never reached
1008
1009         # tree constructor initialization
1010         # see comments on TYPE_TAG/etc for the structure of this data
1011         tree = new Node TYPE_TAG, name: 'html'
1012         open_els = [tree]
1013         tree_state = tree_in_body
1014         flag_frameset_ok = true
1015         flag_parsing = true
1016         afe = [] # active formatting elements
1017
1018         # tokenizer initialization
1019         tok_state = tok_state_data
1020
1021         # proccess input
1022         while flag_parsing
1023                 t = tok_state()
1024                 if t?
1025                         tree_state t
1026         return tree.children
1027
1028 # everything below is tests on the above
test_equals = (description, output, expected_output) ->
        # Logs "passed." when output is identical (is / ===) to expected_output,
        # otherwise logs the description together with both values.
        if output isnt expected_output
                console.log "FAILED: \"#{description}\""
                console.log "   Expected: #{expected_output}"
                console.log "     Actual: #{output}"
        else
                # don't say name, so smart consoles can merge all of these
                console.log "passed."
test_parser = (args) ->
        # Runs parse_html on args.html and compares the result with args.expected
        # (comma-joined serialization of the returned nodes) and args.errors
        # (number of times the error callback was invoked). Logs the verdict,
        # plus details for whichever of the two comparisons failed.
        parse_errors = []
        errors_cb = (i) ->
                parse_errors.push i
        parsed = parse_html args.html, errors_cb
        serialized = ("#{node.serialize()}" for node in parsed).join ','
        output_ok = serialized is args.expected
        errors_ok = parse_errors.length is args.errors
        if output_ok and errors_ok
                console.log "passed \"#{args.name}\""
        else
                console.log "FAILED: \"#{args.name}\""
        unless output_ok
                console.log "      Input: #{args.html}"
                console.log "    Correct: #{args.expected}"
                console.log "     Output: #{serialized}"
        unless errors_ok
                console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
1057
# Parser test cases. Each one gives the input html, the expected serialization
# of the resulting nodes and the expected number of parse errors.
test_parser {
        name: "empty"
        html: ""
        expected: ''
        errors: 0
}
test_parser {
        name: "just text"
        html: "abc"
        expected: 'text:"abc"'
        errors: 0
}
test_parser {
        name: "named entity"
        html: "a&amp;1234"
        expected: 'text:"a&1234"'
        errors: 0
}
test_parser {
        name: "broken named character references"
        html: "1&amp2&&amp;3&aabbcc;"
        expected: 'text:"1&2&&3&aabbcc;"'
        errors: 2
}
test_parser {
        name: "numbered entity overrides"
        html: "1&#X80&#x80; &#x83"
        expected: 'text:"1€€ ƒ"'
        errors: 0
}
test_parser {
        name: "open tag"
        html: "foo<span>bar"
        expected: 'text:"foo",tag:"span",{},[text:"bar"]'
        errors: 1 # no close tag
}
test_parser {
        name: "open tag with attributes"
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar"
        expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
        errors: 1 # no close tag
}
test_parser {
        name: "open tag with attributes of various quotings"
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar"
        expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
        errors: 1 # no close tag
}
test_parser {
        name: "attribute entity exceptions dq"
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
        errors: 2 # no close tag, &amp= in attr
}
test_parser {
        name: "attribute entity exceptions sq"
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
        errors: 2 # no close tag, &amp= in attr
}
test_parser {
        name: "attribute entity exceptions uq"
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
        errors: 2 # no close tag, &amp= in attr
}
test_parser {
        name: "matching closing tags"
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar"
        expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
        errors: 0
}
test_parser {
        name: "missing closing tag inside"
        html: "foo<div>bar<span>baz</div>qux"
        expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
        errors: 1 # close tag mismatch
}
test_parser {
        name: "mis-matched closing tags"
        html: "<span>12<div>34</span>56</div>78"
        expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
        errors: 2 # misplaced </span>, no </span> at the end
}
test_parser {
        name: "mis-matched formatting elements"
        html: "12<b>34<i>56</b>78</i>90"
        expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
        errors: 1 # no idea how many there should be
}
test_parser {
        name: "crazy formatting elements test"
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>"
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
        # firefox does this:
        expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
        errors: 6 # no idea how many there should be
}