1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
23 # Instead, the data structure produced by this parser is an array of nodes.
25 # Each node is an array. The first element in the array is an integer (one of
26 # the TYPE_* constants below) followed by the appropriate fields for that type
27 # (shown below in the comments after the TYPE_* definition.)
29 TYPE_TAG = 0 # name, {attributes}, [children]
30 TYPE_TEXT = 1 # "text"
# NOTE(review): type codes 2 and 3 are defined on lines not visible in this
# listing; the tokenizer-only type jumps to 4 below.
33 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
# Character-class strings used by the tokenizer's indexOf() membership tests.
# BUGFIX: both alphabets previously ended in "...wxqz" — the letter 'y' was
# missing (and 'q' duplicated), so tag and attribute names containing 'y' or
# 'Y' (e.g. <style>, <body>, <symbol>) were not recognized as name characters
# by the tokenizer states below.
37 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
38 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
40 alnum = lc_alpha + uc_alpha + digits
41 hex_chars = digits + "abcdefABCDEF"
43 # some SVG elements have dashes in them
44 tag_name_chars = alnum + "-"
46 # http://www.w3.org/TR/html5/infrastructure.html#space-character
47 space_chars = "\u0009\u000a\u000c\u000d\u0020"
49 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
# Superset of space_chars: adds vertical tab, NEL, NBSP and the Unicode
# space/line/paragraph separator characters.
50 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
52 # These are the character references that don't need a terminating semicolon
53 # min length: 2, max: 6, none are a prefix of any other.
55 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
56 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
57 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
58 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
59 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
60 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
61 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
62 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
63 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
64 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
65 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
66 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
67 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
68 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
69 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
70 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
71 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element categories per the HTML5 syntax spec: void elements never take an
# end tag; raw text elements treat their contents literally; escapable raw
# text elements are literal except for character references.
75 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
76 raw_text_elements = ['script', 'style']
77 escapable_raw_text_elements = ['textarea', 'title']
78 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
80 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
81 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
82 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
83 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
84 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
85 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
86 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
87 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
88 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
89 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
90 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
91 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
92 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
93 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
97 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
99 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
100 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
101 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
102 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
103 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
104 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
105 'determinant', 'diff', 'divergence', 'divide', 'domain',
106 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
107 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
108 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
109 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
110 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
111 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
112 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
# BUGFIX: 'mi' appeared twice on this row; the duplicate is removed (harmless
# for membership tests, but clearly unintended in this sorted list).
113 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
114 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
115 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
116 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
117 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
118 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
119 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
120 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
121 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
122 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
123 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
124 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
125 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
126 'vectorproduct', 'xor'
128 # foreign_elements = [svg_elements..., mathml_elements...]
129 #normal_elements = All other allowed HTML elements are normal elements.
132 # decode_named_char_ref()
134 # The list of named character references is _huge_ so ask the browser to decode
135 # for us instead of wasting bandwidth/space on including the table here.
137 # Pass without the "&" but with the ";" examples:
138 # for "&" pass "amp;"
139 # for "′" pass "x2032;"
142 textarea: document.createElement('textarea')
144 # TODO test this in IE8
# Decode one named character reference (passed per the convention documented
# above, e.g. "amp;") by letting the browser's innerHTML parser perform the
# table lookup. Results are memoized in g_dncr.cache. Returns the decoded
# string, or null when the browser left the text unchanged (i.e. it was not
# a valid named reference).
145 decode_named_char_ref = (txt) ->
147 decoded = g_dncr.cache[txt]
148 return decoded if decoded?
149 g_dncr.textarea.innerHTML = txt
150 decoded = g_dncr.textarea.value
151 return null if decoded is txt
152 return g_dncr.cache[txt] = decoded
# Entry point: tokenize and tree-construct an HTML snippet, returning an
# array of nodes (see the TYPE_* comments at the top of the file).
154 parse_html = (txt) ->
155 cur = 0 # index of next char in txt to be parsed
156 # declare tree and tokenizer variables so they're in scope below
158 tree_append_point = null
161 tok_cur_tag = null # partially parsed tag
# Parse errors are non-fatal: log the position and keep going, matching the
# HTML5 spec's recovery behavior. (The parse_error definition line itself is
# not visible in this listing.)
164 console.log "Parse error at character #{cur} of #{txt.length}"
166 # the functions below implement the tokenizer states described here:
167 # http://www.w3.org/TR/html5/syntax.html#tokenization
169 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Data state: '&' is handled inline via tokenize_character_reference(), '<'
# switches to the tag-open state, anything else is emitted as a text token.
# NOTE(review): the function's own definition line and some switch branch
# lines are missing from this listing.
171 switch c = txt.charAt(cur++)
173 return [TYPE_TEXT, tokenize_character_reference()]
175 tok_state = tok_state_tag_open
178 return [TYPE_TEXT, c]
182 return [TYPE_TEXT, c]
185 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
186 # not needed: tok_state_character_reference_in_data = ->
187 # just call tokenize_character_reference() directly (see the data state above)
189 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tag-open state: we have just consumed '<'. An ASCII letter begins an open
# tag (uppercase is lowercased per spec); otherwise '<' is re-emitted as
# literal text and the offending character is reparsed in the data state.
# NOTE(review): the branch characters for the first three cases are on lines
# missing from this listing.
190 tok_state_tag_open = ->
191 switch c = txt.charAt(cur++)
193 tok_state = tok_state_markup_declaration_open
195 tok_state = tok_state_end_tag_open
198 tok_state = tok_state_bogus_comment
200 if lc_alpha.indexOf(c) > -1
201 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
202 tok_state = tok_state_tag_name
203 else if uc_alpha.indexOf(c) > -1
204 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
205 tok_state = tok_state_tag_name
208 tok_state = tok_state_data
209 cur -= 1 # we didn't parse/handle the char after <
210 return [TYPE_TEXT, '<']
213 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tag-name state: whitespace ends the name and moves to attribute parsing;
# NUL is replaced with U+FFFD; uppercase letters are appended lowercased.
# NOTE(review): the catch-all append branch is on a line missing from this
# listing.
214 tok_state_tag_name = ->
215 switch c = txt.charAt(cur++)
216 when "\t", "\n", "\u000c", ' '
217 tok_state = tok_state_before_attribute_name
219 tok_state = tok_state_self_closing_start_tag
221 tok_state = tok_state_data
227 tok_cur_tag[1] += "\ufffd"
230 tok_state = tok_state_data
232 if uc_alpha.indexOf(c) > -1
233 tok_cur_tag[1] += c.toLowerCase()
238 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Before-attribute-name state: skip whitespace, handle '/' and '>', then
# start a new attribute. New attributes are unshifted onto tok_cur_tag[2],
# so index 0 is always the attribute currently being parsed.
239 tok_state_before_attribute_name = ->
241 switch c = txt.charAt(cur++)
242 when "\t", "\n", "\u000c", ' '
245 tok_state = tok_state_self_closing_start_tag
248 tok_state = tok_state_data
255 when '"', "'", '<', '='
260 tok_state = tok_state_data
262 if uc_alpha.indexOf(c) > -1
263 attr_name = c.toLowerCase()
267 tok_cur_tag[2].unshift [attr_name, '']
268 tok_state = tok_state_attribute_name
271 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Attribute-name state. tok_cur_tag[2][0][0] is the name of the attribute
# currently being built (attributes are unshifted, so index 0 is the newest).
# BUGFIX: per spec 8.2.4.35 every consuming branch *appends* to the current
# attribute's name, but three branches below used '=' (overwriting the name
# built so far — e.g. an uppercase attribute like STYLE collapsed to its
# last character) while the final catch-all correctly used '+='. All four
# now use '+='.
272 tok_state_attribute_name = ->
273 switch c = txt.charAt(cur++)
274 when "\t", "\n", "\u000c", ' '
275 tok_state = tok_state_after_attribute_name
277 tok_state = tok_state_self_closing_start_tag
279 tok_state = tok_state_before_attribute_value
281 tok_state = tok_state_data
287 tok_cur_tag[2][0][0] += "\ufffd"
290 tok_cur_tag[2][0][0] += c
293 tok_state = tok_state_data
295 if uc_alpha.indexOf(c) > -1
296 tok_cur_tag[2][0][0] += c.toLowerCase()
298 tok_cur_tag[2][0][0] += c
301 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Before-attribute-value state: dispatch on the quoting style of the value;
# an unquoted value starts immediately with the current character, and a NUL
# becomes U+FFFD at the start of an unquoted value.
302 tok_state_before_attribute_value = ->
303 switch c = txt.charAt(cur++)
304 when "\t", "\n", "\u000c", ' '
307 tok_state = tok_state_attribute_value_double_quoted
309 tok_state = tok_state_attribute_value_unquoted
312 tok_state = tok_state_attribute_value_single_quoted
315 tok_cur_tag[2][0][1] += "\ufffd"
316 tok_state = tok_state_attribute_value_unquoted
319 tok_state = tok_state_data
325 tok_state = tok_state_data
327 tok_cur_tag[2][0][1] += c
328 tok_state = tok_state_attribute_value_unquoted
331 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Double-quoted attribute value: '"' ends the value, '&' starts a character
# reference (with '"' as the extra allowed character, in-attribute mode),
# NUL becomes U+FFFD, everything else is appended verbatim.
332 tok_state_attribute_value_double_quoted = ->
333 switch c = txt.charAt(cur++)
335 tok_state = tok_state_after_attribute_value_quoted
337 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
340 tok_cur_tag[2][0][1] += "\ufffd"
343 tok_state = tok_state_data
345 tok_cur_tag[2][0][1] += c
348 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Single-quoted attribute value: mirror of the double-quoted state with "'"
# as the terminator / extra allowed character.
349 tok_state_attribute_value_single_quoted = ->
350 switch c = txt.charAt(cur++)
352 tok_state = tok_state_after_attribute_value_quoted
354 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
357 tok_cur_tag[2][0][1] += "\ufffd"
360 tok_state = tok_state_data
362 tok_cur_tag[2][0][1] += c
365 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Unquoted attribute value: whitespace ends the value, '&' starts a
# character reference (with '>' as the extra allowed character), NUL becomes
# U+FFFD; everything else (including the spec's parse-error characters) is
# appended verbatim.
366 tok_state_attribute_value_unquoted = ->
367 switch c = txt.charAt(cur++)
368 when "\t", "\n", "\u000c", ' '
369 tok_state = tok_state_before_attribute_name
371 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
373 tok_state = tok_state_data
378 tok_cur_tag[2][0][1] += "\ufffd"
381 tok_state = tok_state_data
383 # Parse Error if ', <, = or ` (backtick)
384 tok_cur_tag[2][0][1] += c
387 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After a quoted attribute value: whitespace, '/' and '>' proceed normally;
# any other character is a parse error and is reparsed in the
# before-attribute-name state (cur is rewound by one).
388 tok_state_after_attribute_value_quoted = ->
389 switch c = txt.charAt(cur++)
390 when "\t", "\n", "\u000c", ' '
391 tok_state = tok_state_before_attribute_name
393 tok_state = tok_state_self_closing_start_tag
395 tok_state = tok_state_data
401 tok_state = tok_state_data
404 tok_state = tok_state_before_attribute_name
405 cur -= 1 # we didn't handle that char
408 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
409 # Don't set this as a state, just call it
410 # returns a string (NOT a text node)
# Consumes a character reference; the caller has already consumed the '&'.
# Handles numeric ("#..."/"#x...") references, semicolon-terminated named
# references (via decode_named_char_ref), and the legacy references that
# need no semicolon — including the attribute-mode exceptions ('=' or an
# alnum right after a legacy match suppresses decoding when in_attr is set,
# which is what the url-args tests at the bottom of the file exercise).
# NOTE(review): many lines of this routine are missing from this listing;
# the fragments below should not be read as the complete algorithm.
411 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
414 switch c = txt.charAt(cur)
415 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
416 # explicitly not a parse error
419 # there has to be "one or more" alnums between & and ; to be a parse error
422 if cur + 1 >= txt.length
424 if txt.charAt(cur + 1).toLowerCase() is 'x'
433 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
437 if txt.charAt(start + i) is ';'
439 # FIXME This is supposed to generate parse errors for some chars
440 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
447 if alnum.indexOf(txt.charAt(cur + i)) is -1
450 # exit early, because parse_error() below needs at least one alnum
452 if txt.charAt(cur + i) is ';'
453 i += 1 # include ';' terminator in value
454 decoded = decode_named_char_ref txt.substr(cur, i)
461 # no ';' terminator (only legacy char refs)
463 for i in [2..max] # no prefix matches, so ok to check shortest first
464 c = legacy_char_refs[txt.substr(cur, i)]
467 if txt.charAt(cur + i) is '='
468 # "because some legacy user agents will
469 # misinterpret the markup in those cases"
472 if alnum.indexOf(txt.charAt(cur + i)) > -1
473 # this makes attributes forgiving about url args
475 # ok, and besides the weird exceptions for attributes...
476 # return the matching char
477 cur += i # consume entity chars
478 parse_error() # because no terminating ";"
482 return # never reached
484 # the functions below implement the Tree Construction algorithm here:
485 # http://www.w3.org/TR/html5/syntax.html#tree-construction
486 # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
# Text tokens are merged into a preceding text node when possible; open tags
# push a new TYPE_TAG node and descend into its children array.
# NOTE(review): the surrounding function definitions and the tokenizer main
# loop are on lines missing from this listing.
490 if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
491 tree_append_point[tree_append_point.length - 1][1] += t[1]
493 tree_append_point.push t
496 # convert attributes into a hash
502 tree_append_point.push t
503 tree_append_point = t[3]
504 # TODO implement stack of open elements
505 # TODO implement formatting elements thing
508 # TODO implement close tags
509 # TODO implement self-closing tags
511 console.log "UNIMPLEMENTED tag type: #{t[0]}"
513 # tree constructor initialization
514 tree = [] # see comments on TYPE_TAG/etc for the structure of this data
515 tree_append_point = tree
516 tree_state = tree_append
518 # tokenizer initialization
519 tok_state = tok_state_data
528 return # never reached
530 # everything below is tests on the above
# Minimal test harness: applies fn to args and compares the result with the
# expected string using 'is' (exact equality), logging pass/fail to console.
# NOTE(review): the 'else' line between the pass and fail branches is not
# visible in this listing.
531 test_equals = (description, fn, args..., expected_output) ->
532 output = fn.apply this, args
533 if output is expected_output
534 console.log "passed: #{description}."
536 console.log "FAILED: #{description}..."
537 console.log " Expected: #{expected_output}"
538 console.log " Actual: #{output}"
# Helper: parse then serialize, so expected values can be written as JSON
# strings in the test cases below.
539 html_to_json = (html) ->
540 return JSON.stringify parse_html html
541 test_equals "empty", html_to_json, "", '[]'
542 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
543 test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
544 test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
545 test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
546 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
547 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
548 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
549 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
550 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
551 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'