1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
# Node-type tags: element 0 of every node array is one of these integers; the
# comment after each constant lists the remaining elements of that node array.
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
33 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
# ASCII alphabets used to classify characters while tokenizing.
# BUGFIX: both alphabets previously ended "...wxqz"/"...WXQZ" — the letter
# "y" was missing and "q" appeared twice, so tag names, attribute names and
# character references containing "y"/"Y" were misclassified.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# digits is defined nearby (0-9, as confirmed by hex_chars below)
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
45 # http://www.w3.org/TR/html5/infrastructure.html#space-character
# the five HTML5 "space characters": tab, LF, FF, CR, space
46 space_chars = "\u0009\u000a\u000c\u000d\u0020"
48 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
# broader Unicode whitespace set (adds VT, NEL, NBSP, the typographic spaces,
# line/paragraph separators, narrow/medium spaces and ideographic space)
49 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
51 # These are the character references that don't need a terminating semicolon
52 # min length: 2, max: 6, none are a prefix of any other.
# maps reference name (the text right after "&") -> decoded character
54 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
55 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
56 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
57 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
58 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
59 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
60 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
61 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
62 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
63 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
64 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
65 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
66 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
67 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
# NOTE(review): shy should map to U+00AD (soft hyphen), which renders as
# invisible — confirm the character below was not lost in an edit
68 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
69 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
70 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# element-category tables used to decide how tags nest / self-close
74 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
75 raw_text_elements = ['script', 'style']
76 escapable_raw_text_elements = ['textarea', 'title']
77 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
79 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
80 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
81 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
82 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
83 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
84 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
85 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
86 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
87 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
88 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
89 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
90 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
91 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
92 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
96 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
98 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
99 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
100 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
101 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
102 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
103 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
104 'determinant', 'diff', 'divergence', 'divide', 'domain',
105 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
106 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
107 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
108 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
109 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
110 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
111 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
# NOTE(review): 'mi' appears twice on the next line — harmless, likely a typo
112 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
113 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
114 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
115 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
116 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
117 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
118 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
119 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
120 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
121 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
122 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
123 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
124 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
125 'vectorproduct', 'xor'
127 # foreign_elements = [svg_elements..., mathml_elements...]
128 #normal_elements = All other allowed HTML elements are normal elements.
131 # decode_named_char_ref()
133 # The list of named character references is _huge_ so ask the browser to decode
134 # for us instead of wasting bandwidth/space on including the table here.
136 # Pass the reference name without the leading "&" but with the trailing ";". Examples:
137 # for "&" pass "amp;"
138 # for "′" pass "x2032;"
# shared scratch state for decode_named_char_ref: a cache plus a <textarea>
# element whose innerHTML/value round-trip does the actual decoding
141 textarea: document.createElement('textarea')
143 # TODO test this in IE8
# Decode a named character reference by letting the browser do it: assign the
# reference into the scratch textarea via innerHTML and read back .value.
# txt: the reference without the leading "&" but including the ";"
# Returns the decoded string, or null if the browser left it unchanged
# (i.e. it is not a valid reference). Results are memoized in g_dncr.cache.
144 decode_named_char_ref = (txt) ->
146 decoded = g_dncr.cache[txt]
147 return decoded if decoded?
148 g_dncr.textarea.innerHTML = txt
149 decoded = g_dncr.textarea.value
150 return null if decoded is txt
151 return g_dncr.cache[txt] = decoded
# Parse an HTML fragment `txt` into the node-array structure described at the
# top of this file (an array of TYPE_* nodes, as the tests below demonstrate).
# Runs a tokenizer state machine and a simplified tree-construction pass;
# doctypes and html/head/body wrappers are deliberately not handled.
153 parse_html = (txt) ->
154 cur = 0 # index of next char in txt to be parsed
155 # declare tree and tokenizer variables so they're in scope below
157 tree_append_point = null
160 tok_cur_tag = null # partially parsed tag
163 # the functions below implement the tokenizer states described here:
164 # http://www.w3.org/TR/html5/syntax.html#tokenization
166 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# data state: "&" and "<" switch the tokenizer into the character-reference /
# tag-open states (per the spec); any other character is emitted as a text token
168 switch c = txt.charAt(cur++)
170 tok_state = tok_state_character_reference_in_data
172 tok_state = tok_state_tag_open
175 return [TYPE_TEXT, c]
177 return [TYPE_TEXT, c]
180 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
181 # & just got consumed
# Try to parse a character reference starting at `cur`. Handles numeric
# ("#..." / "#x...") references, named references terminated by ";", and the
# semicolonless legacy references in legacy_char_refs. On any failure the
# consumed "&" is emitted back as literal text and we return to the data state.
182 tok_state_character_reference_in_data = ->
183 tok_state = tok_state_data
185 return [TYPE_TEXT, '&']
186 switch c = txt.charAt(cur)
188 return [TYPE_TEXT, '&']
190 if cur + 1 >= txt.length
191 return [TYPE_TEXT, '&']
# "x"/"X" after "#" selects the hexadecimal digit charset
192 if txt.charAt(cur + 1).toLowerCase() is 'x'
# scan as many chars of the chosen charset as possible
201 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
204 return [TYPE_TEXT, '&']
205 if txt.charAt(start + i) is ';'
207 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
210 return [TYPE_TEXT, decoded]
211 return [TYPE_TEXT, '&']
# named reference: scan alphanumerics, then require the ";" terminator
214 if alnum.indexOf(txt.charAt(cur + i)) is -1
217 return [TYPE_TEXT, '&']
218 if txt.charAt(cur + i) is ';'
219 i += 1 # include ';' terminator in value
220 decoded = decode_named_char_ref txt.substr(cur, i)
223 return [TYPE_TEXT, decoded]
224 return [TYPE_TEXT, '&']
226 # no ';' terminator (only legacy char refs)
228 return [TYPE_TEXT, '&']
229 # FIXME: if we're inside an attribute:
230 # 1. don't parse refs that are followed by =
231 # 2. don't parse refs that are followed by alnum
233 for i in [2..max] # no prefix matches, so ok to check shortest first
234 c = legacy_char_refs[txt.substr(cur, i)]
236 cur += i # consume entity chars
237 return [TYPE_TEXT, c]
240 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# "<" was just consumed. A letter starts a new TYPE_OPEN_TAG token (uppercase
# tag names are lower-cased); anything unrecognized backs up one char and
# emits "<" as literal text.
241 tok_state_tag_open = ->
242 switch c = txt.charAt(cur++)
244 tok_state = tok_state_markup_declaration_open
246 tok_state = tok_state_end_tag_open
249 tok_state = tok_state_bogus_comment
251 if lc_alpha.indexOf(c) > -1
# [type, name, attributes (reverse order), children]
252 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
253 tok_state = tok_state_tag_name
254 else if uc_alpha.indexOf(c) > -1
255 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
256 tok_state = tok_state_tag_name
259 tok_state = tok_state_data
260 cur -= 1 # we didn't parse/handle the char after <
261 return [TYPE_TEXT, '<']
264 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Accumulates characters into the tag name (tok_cur_tag[1]): whitespace moves
# on to attribute parsing, uppercase letters are lower-cased, and U+FFFD is
# substituted for invalid input per the spec.
265 tok_state_tag_name = ->
266 switch c = txt.charAt(cur++)
267 when "\t", "\n", "\u000c", ' '
268 tok_state = tok_state_before_attribute_name
270 tok_state = tok_state_self_closing_start_tag
272 tok_state = tok_state_data
278 tok_cur_tag[1] += "\ufffd"
280 if uc_alpha.indexOf(c) > -1
281 tok_cur_tag[1] += c.toLowerCase()
286 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Skips whitespace, then starts a fresh [name, ''] attribute pair. The pair is
# unshifted onto tok_cur_tag[2], so that array holds attributes in reverse
# order (see the TYPE_OPEN_TAG comment) and index 0 is always the current one.
287 tok_state_before_attribute_name = ->
289 switch c = txt.charAt(cur++)
290 when "\t", "\n", "\u000c", ' '
293 tok_state = tok_state_self_closing_start_tag
296 tok_state = tok_state_data
303 when '"', "'", '<', '='
307 if uc_alpha.indexOf(c) > -1
308 attr_name = c.toLowerCase()
312 tok_cur_tag[2].unshift [attr_name, '']
313 tok_state = tok_state_attribute_name
316 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Accumulates the current attribute's name in tok_cur_tag[2][0][0] (the
# most-recently-started attribute); uppercase is lower-cased and U+FFFD is
# substituted for invalid input.
317 tok_state_attribute_name = ->
318 switch c = txt.charAt(cur++)
319 when "\t", "\n", "\u000c", ' '
320 tok_state = tok_state_after_attribute_name
322 tok_state = tok_state_self_closing_start_tag
324 tok_state = tok_state_before_attribute_value
326 tok_state = tok_state_data
332 tok_cur_tag[2][0][0] += "\ufffd"
334 if uc_alpha.indexOf(c) > -1
335 tok_cur_tag[2][0][0] += c.toLowerCase()
337 # Parse error if ", ' or <
338 tok_cur_tag[2][0][0] += c
341 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Decides how the attribute value is quoted and dispatches to the matching
# double-quoted / single-quoted / unquoted value state; the value accumulates
# in tok_cur_tag[2][0][1].
342 tok_state_before_attribute_value = ->
343 switch c = txt.charAt(cur++)
344 when "\t", "\n", "\u000c", ' '
347 tok_state = tok_state_attribute_value_double_quoted
349 tok_state = tok_state_attribute_value_unquoted
352 tok_state = tok_state_attribute_value_single_quoted
355 tok_cur_tag[2][0][1] += "\ufffd"
356 tok_state = tok_state_attribute_value_unquoted
359 tok_state = tok_state_data
# any other character begins an unquoted value
364 tok_cur_tag[2][0][1] += c
365 tok_state = tok_state_attribute_value_unquoted
368 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Accumulates value characters into tok_cur_tag[2][0][1] until the closing
# double quote; character references inside the value get their own state.
369 tok_state_attribute_value_double_quoted = ->
370 switch c = txt.charAt(cur++)
372 tok_state = tok_state_after_attribute_value_quoted
374 tok_state = tok_state_character_reference_in_attribute_value
375 tok_char_ref_addl_allowed = '"' # FIXME
378 tok_cur_tag[2][0][1] += "\ufffd"
380 tok_cur_tag[2][0][1] += c
383 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Same as the double-quoted state but terminated by a single quote.
384 tok_state_attribute_value_single_quoted = ->
385 switch c = txt.charAt(cur++)
387 tok_state = tok_state_after_attribute_value_quoted
389 tok_state = tok_state_character_reference_in_attribute_value
390 tok_char_ref_addl_allowed = "'" # FIXME
393 tok_cur_tag[2][0][1] += "\ufffd"
395 tok_cur_tag[2][0][1] += c
398 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Accumulates an unquoted value into tok_cur_tag[2][0][1]; whitespace ends the
# value and returns to before-attribute-name.
399 tok_state_attribute_value_unquoted = ->
400 switch c = txt.charAt(cur++)
401 when "\t", "\n", "\u000c", ' '
402 tok_state = tok_state_before_attribute_name
404 tok_state = tok_state_character_reference_in_attribute_value
405 tok_char_ref_addl_allowed = '>' # FIXME
407 tok_state = tok_state_data
412 tok_cur_tag[2][0][1] += "\ufffd"
414 # Parse Error if ', <, = or ` (backtick)
415 tok_cur_tag[2][0][1] += c
418 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# A quoted value just ended: whitespace starts another attribute; anything
# unrecognized is pushed back and re-handled in before-attribute-name.
419 tok_state_after_attribute_value_quoted = ->
420 switch c = txt.charAt(cur++)
421 when "\t", "\n", "\u000c", ' '
422 tok_state = tok_state_before_attribute_name
424 tok_state = tok_state_self_closing_start_tag
426 tok_state = tok_state_data
432 tok_state = tok_state_before_attribute_name
433 cur -= 1 # we didn't handle that char
436 # the functions below implement the Tree Construction algorithm here:
437 # http://www.w3.org/TR/html5/syntax.html#tree-construction
438 # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
# Append token t at the current insertion point: consecutive text tokens are
# merged into one TYPE_TEXT node; other tokens are pushed as-is, and when a
# tag opens, the insertion point moves into its children array (t[3]).
440 if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
441 tree_append_point[tree_append_point.length - 1][1] += t[1]
443 tree_append_point.push t
444 if t[0] is TYPE_OPEN_TAG
451 tree_append_point = t[3]
453 # tree constructor initialization
454 tree = [] # see comments on TYPE_TAG/etc for the structure of this data
455 tree_append_point = tree
456 tree_state = tree_append
458 # tokenizer initialization
459 tok_state = tok_state_data
# main loop: run the current tokenizer state until the input is exhausted
462 while cur < txt.length
# everything below is tests on the above

# Run fn(args...) and compare the result with expected_output, logging a
# pass/fail line to the console.
# description: human-readable test name
# fn: function under test; args...: its arguments (CoffeeScript splat)
# expected_output: the exact value fn is expected to return
# BUGFIX: restored the `else` branch — as written, the FAILED line was not
# guarded, so it would have been logged unconditionally.
test_equals = (description, fn, args..., expected_output) ->
	output = fn.apply this, args
	if output is expected_output
		console.log "passed: #{description}."
	else
		console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
# Serialize parse_html's tree for `html` as a JSON string (used by the tests).
html_to_json = (html) ->
	JSON.stringify(parse_html(html))
# smoke tests: each line checks parse_html's JSON-serialized output against an
# expected tree (see the TYPE_* constants: 0 = tag node, 1 = text node)
478 test_equals "empty", html_to_json, "", '[]'
479 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
480 test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
481 test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
482 test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
483 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
484 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
485 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'