parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body.
  22 #
  23 # Instead, the data structure produced by this parser is an array of nodes.
  24 #
  25 # Each node is an array. The first element in the array is an integer (one of
  26 # the TYPE_* constants below) followed by the appropriate fields for that type
  27 # (shown below in the comments after the TYPE_* definition.)
  28
  29 TYPE_TAG = 0 # name, {attributes}, [children]
  30 TYPE_TEXT = 1 # "text"
  31 TYPE_WHITESPACE = 2
  32 TYPE_COMMENT = 3
  33 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  35 TYPE_CLOSE_TAG = 5 # name
  36 TYPE_EOF = 6
  37
  38 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
  39 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
  40 digits = "0123456789"
  41 alnum = lc_alpha + uc_alpha + digits
  42 hex_chars = digits + "abcdefABCDEF"
  43
  44 # some SVG elements have dashes in them
  45 tag_name_chars = alnum + "-"
  46
  47 # http://www.w3.org/TR/html5/infrastructure.html#space-character
  48 space_chars = "\u0009\u000a\u000c\u000d\u0020"
  49
  50 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
  51 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
  52
  53 # These are the character references that don't need a terminating semicolon
  54 # min length: 2, max: 6, none are a prefix of any other.
  55 legacy_char_refs = {
  56         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
  57         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
  58         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
  59         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
  60         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
  61         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
  62         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
  63         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
  64         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
  65         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
  66         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
  67         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
  68         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
  69         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
  70         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
  71         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
  72         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
  73         yen: '¥', yuml: 'ÿ'
  74 }
  75
  76 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
  77 raw_text_elements = ['script', 'style']
  78 escapable_raw_text_elements = ['textarea', 'title']
  79 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
  80 svg_elements = [
  81         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
  82         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
  83         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
  84         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
  85         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
  86         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
  87         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
  88         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
  89         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
  90         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
  91         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
  92         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
  93         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
  94         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
  95         'view', 'vkern'
  96 ]
  97
  98 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
  99 mathml_elements = [
 100         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 101         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 102         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 103         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 104         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 105         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 106         'determinant', 'diff', 'divergence', 'divide', 'domain',
 107         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 108         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 109         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 110         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 111         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 112         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 113         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 114         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 115         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 116         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 117         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 118         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 119         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 120         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 121         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 122         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 123         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 124         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 125         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 126         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 127         'vectorproduct', 'xor'
 128 ]
 129 # foreign_elements = [svg_elements..., mathml_elements...]
 130 #normal_elements = All other allowed HTML elements are normal elements.
 131
 132
 133 # decode_named_char_ref()
 134 #
 135 # The list of named character references is _huge_ so ask the browser to decode
 136 # for us instead of wasting bandwidth/space on including the table here.
 137 #
 138 # Pass without the "&" but with the ";" examples:
 139 #    for "&amp" pass "amp;"
 140 #    for "&#x2032" pass "x2032;"
 141 g_dncr = {
 142         cache: {}
 143         textarea: document.createElement('textarea')
 144 }
 145 # TODO test this in IE8
 146 decode_named_char_ref = (txt) ->
 147         txt = "&#{txt}"
 148         decoded = g_dncr.cache[txt]
 149         return decoded if decoded?
 150         g_dncr.textarea.innerHTML = txt
 151         decoded = g_dncr.textarea.value
 152         return null if decoded is txt
 153         return g_dncr.cache[txt] = decoded
 154
 155 parse_html = (txt) ->
 156         cur = 0 # index of next char in txt to be parsed
 157         # declare tree and tokenizer variables so they're in scope below
 158         tree = null
 159         open_tags = [] # stack of open elements
 160         tree_state = null
 161         tok_state = null
 162         tok_cur_tag = null # partially parsed tag
 163
 164         parse_error = ->
 165                 console.log "Parse error at character #{cur} of #{txt.length}"
 166
 167         # the functions below implement the tokenizer stats described here:
 168         # http://www.w3.org/TR/html5/syntax.html#tokenization
 169
 170         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 171         tok_state_data = ->
 172                 switch c = txt.charAt(cur++)
 173                         when '&'
 174                                 return [TYPE_TEXT, tokenize_character_reference()]
 175                         when '<'
 176                                 tok_state = tok_state_tag_open
 177                         when "\u0000"
 178                                 parse_error()
 179                                 return [TYPE_TEXT, c]
 180                         when '' # EOF
 181                                 return [TYPE_EOF]
 182                         else
 183                                 return [TYPE_TEXT, c]
 184                 return null
 185
 186         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
 187         # not needed: tok_state_character_reference_in_data = ->
 188         # just call tok_state_character_reference_in_data()
 189
 190         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 191         tok_state_tag_open = ->
 192                 switch c = txt.charAt(cur++)
 193                         when '!'
 194                                 tok_state = tok_state_markup_declaration_open
 195                         when '/'
 196                                 tok_state = tok_state_end_tag_open
 197                         when '?'
 198                                 parse_error()
 199                                 tok_state = tok_state_bogus_comment
 200                         else
 201                                 if lc_alpha.indexOf(c) > -1
 202                                         tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
 203                                         tok_state = tok_state_tag_name
 204                                 else if uc_alpha.indexOf(c) > -1
 205                                         tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
 206                                         tok_state = tok_state_tag_name
 207                                 else
 208                                         parse_error()
 209                                         tok_state = tok_state_data
 210                                         cur -= 1 # we didn't parse/handle the char after <
 211                                         return [TYPE_TEXT, '<']
 212                 return null
 213
 214         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
 215         tok_state_end_tag_open = ->
 216                 switch c = txt.charAt(cur++)
 217                         when '>'
 218                                 parse_error()
 219                                 tok_state = tok_state_data
 220                         when '' # EOF
 221                                 parse_error()
 222                                 tok_state = tok_state_data
 223                                 return [TYPE_TEXT, '</']
 224                         else
 225                                 if uc_alpha.indexOf(c) > -1
 226                                         tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []]
 227                                         tok_state = tok_state_tag_name
 228                                 else if lc_alpha.indexOf(c) > -1
 229                                         tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
 230                                         tok_state = tok_state_tag_name
 231                                 else
 232                                         parse_error()
 233                                         tok_state = tok_state_bogus_comment
 234                 return null
 235
 236         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 237         tok_state_tag_name = ->
 238                 switch c = txt.charAt(cur++)
 239                         when "\t", "\n", "\u000c", ' '
 240                                 tok_state = tok_state_before_attribute_name
 241                         when '/'
 242                                 tok_state = tok_state_self_closing_start_tag
 243                         when '>'
 244                                 tok_state = tok_state_data
 245                                 tmp = tok_cur_tag
 246                                 tok_cur_tag = null
 247                                 return tmp
 248                         when "\u0000"
 249                                 parse_error()
 250                                 tok_cur_tag[1] += "\ufffd"
 251                         when '' # EOF
 252                                 parse_error()
 253                                 tok_state = tok_state_data
 254                         else
 255                                 if uc_alpha.indexOf(c) > -1
 256                                         tok_cur_tag[1] += c.toLowerCase()
 257                                 else
 258                                         tok_cur_tag[1] += c
 259                 return null
 260
 261         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 262         tok_state_before_attribute_name = ->
 263                 attr_name = null
 264                 switch c = txt.charAt(cur++)
 265                         when "\t", "\n", "\u000c", ' '
 266                                 return null
 267                         when '/'
 268                                 tok_state = tok_state_self_closing_start_tag
 269                                 return null
 270                         when '>'
 271                                 tok_state = tok_state_data
 272                                 tmp = tok_cur_tag
 273                                 tok_cur_tag = null
 274                                 return tmp
 275                         when "\u0000"
 276                                 parse_error()
 277                                 attr_name = "\ufffd"
 278                         when '"', "'", '<', '='
 279                                 parse_error()
 280                                 attr_name = c
 281                         when '' # EOF
 282                                 parse_error()
 283                                 tok_state = tok_state_data
 284                         else
 285                                 if uc_alpha.indexOf(c) > -1
 286                                         attr_name = c.toLowerCase()
 287                                 else
 288                                         attr_name = c
 289                 if attr_name?
 290                         tok_cur_tag[2].unshift [attr_name, '']
 291                         tok_state = tok_state_attribute_name
 292                 return null
 293
 294         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
 295         tok_state_attribute_name = ->
 296                 switch c = txt.charAt(cur++)
 297                         when "\t", "\n", "\u000c", ' '
 298                                 tok_state = tok_state_after_attribute_name
 299                         when '/'
 300                                 tok_state = tok_state_self_closing_start_tag
 301                         when '='
 302                                 tok_state = tok_state_before_attribute_value
 303                         when '>'
 304                                 tok_state = tok_state_data
 305                                 tmp = tok_cur_tag
 306                                 tok_cur_tag = null
 307                                 return tmp
 308                         when "\u0000"
 309                                 parse_error()
 310                                 tok_cur_tag[2][0][0] = "\ufffd"
 311                         when '"', "'", '<'
 312                                 parse_error()
 313                                 tok_cur_tag[2][0][0] = c
 314                         when '' # EOF
 315                                 parse_error()
 316                                 tok_state = tok_state_data
 317                         else
 318                                 if uc_alpha.indexOf(c) > -1
 319                                         tok_cur_tag[2][0][0] = c.toLowerCase()
 320                                 else
 321                                         tok_cur_tag[2][0][0] += c
 322                 return null
 323
 324         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 325         tok_state_before_attribute_value = ->
 326                 switch c = txt.charAt(cur++)
 327                         when "\t", "\n", "\u000c", ' '
 328                                 return null
 329                         when '"'
 330                                 tok_state = tok_state_attribute_value_double_quoted
 331                         when '&'
 332                                 tok_state = tok_state_attribute_value_unquoted
 333                                 cur -= 1
 334                         when "'"
 335                                 tok_state = tok_state_attribute_value_single_quoted
 336                         when "\u0000"
 337                                 # Parse error
 338                                 tok_cur_tag[2][0][1] += "\ufffd"
 339                                 tok_state = tok_state_attribute_value_unquoted
 340                         when '>'
 341                                 # Parse error
 342                                 tok_state = tok_state_data
 343                                 tmp = tok_cur_tag
 344                                 tok_cur_tag = null
 345                                 return tmp
 346                         when '' # EOF
 347                                 parse_error()
 348                                 tok_state = tok_state_data
 349                         else
 350                                 tok_cur_tag[2][0][1] += c
 351                                 tok_state = tok_state_attribute_value_unquoted
 352                 return null
 353
 354         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
 355         tok_state_attribute_value_double_quoted = ->
 356                 switch c = txt.charAt(cur++)
 357                         when '"'
 358                                 tok_state = tok_state_after_attribute_value_quoted
 359                         when '&'
 360                                 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
 361                         when "\u0000"
 362                                 # Parse error
 363                                 tok_cur_tag[2][0][1] += "\ufffd"
 364                         when '' # EOF
 365                                 parse_error()
 366                                 tok_state = tok_state_data
 367                         else
 368                                 tok_cur_tag[2][0][1] += c
 369                 return null
 370
 371         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
 372         tok_state_attribute_value_single_quoted = ->
 373                 switch c = txt.charAt(cur++)
 374                         when "'"
 375                                 tok_state = tok_state_after_attribute_value_quoted
 376                         when '&'
 377                                 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
 378                         when "\u0000"
 379                                 # Parse error
 380                                 tok_cur_tag[2][0][1] += "\ufffd"
 381                         when '' # EOF
 382                                 parse_error()
 383                                 tok_state = tok_state_data
 384                         else
 385                                 tok_cur_tag[2][0][1] += c
 386                 return null
 387
 388         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
 389         tok_state_attribute_value_unquoted = ->
 390                 switch c = txt.charAt(cur++)
 391                         when "\t", "\n", "\u000c", ' '
 392                                 tok_state = tok_state_before_attribute_name
 393                         when '&'
 394                                 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
 395                         when '>'
 396                                 tok_state = tok_state_data
 397                                 tmp = tok_cur_tag
 398                                 tok_cur_tag = null
 399                                 return tmp
 400                         when "\u0000"
 401                                 tok_cur_tag[2][0][1] += "\ufffd"
 402                         when '' # EOF
 403                                 parse_error()
 404                                 tok_state = tok_state_data
 405                         else
 406                                 # Parse Error if ', <, = or ` (backtick)
 407                                 tok_cur_tag[2][0][1] += c
 408                 return null
 409
 410         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
 411         tok_state_after_attribute_value_quoted = ->
 412                 switch c = txt.charAt(cur++)
 413                         when "\t", "\n", "\u000c", ' '
 414                                 tok_state = tok_state_before_attribute_name
 415                         when '/'
 416                                 tok_state = tok_state_self_closing_start_tag
 417                         when '>'
 418                                 tok_state = tok_state_data
 419                                 tmp = tok_cur_tag
 420                                 tok_cur_tag = null
 421                                 return tmp
 422                         when '' # EOF
 423                                 parse_error()
 424                                 tok_state = tok_state_data
 425                         else
 426                                 # Parse Error
 427                                 tok_state = tok_state_before_attribute_name
 428                                 cur -= 1 # we didn't handle that char
 429                 return null
 430
 431         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 432         # Don't set this as a state, just call it
 433         # returns a string (NOT a text node)
 434         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
 435                 if cur >= txt.length
 436                         return '&'
 437                 switch c = txt.charAt(cur)
 438                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
 439                                 # explicitly not a parse error
 440                                 return '&'
 441                         when ';'
 442                                 # there has to be "one or more" alnums between & and ; to be a parse error
 443                                 return '&'
 444                         when '#'
 445                                 if cur + 1 >= txt.length
 446                                         return '&'
 447                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
 448                                         prefix = '#x'
 449                                         charset = hex_chars
 450                                         start = cur + 2
 451                                 else
 452                                         charset = digits
 453                                         start = cur + 1
 454                                         prefix = '#'
 455                                 i = 0
 456                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 457                                         i += 1
 458                                 if i is 0
 459                                         return '&'
 460                                 if txt.charAt(start + i) is ';'
 461                                         i += 1
 462                                 # FIXME This is supposed to generate parse errors for some chars
 463                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 464                                 if decoded?
 465                                         cur = start + i
 466                                         return decoded
 467                                 return '&'
 468                         else
 469                                 for i in [0...31]
 470                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
 471                                                 break
 472                                 if i is 0
 473                                         # exit early, because parse_error() below needs at least one alnum
 474                                         return '&'
 475                                 if txt.charAt(cur + i) is ';'
 476                                         i += 1 # include ';' terminator in value
 477                                         decoded = decode_named_char_ref txt.substr(cur, i)
 478                                         if decoded?
 479                                                 cur += i
 480                                                 return decoded
 481                                         parse_error()
 482                                         return '&'
 483                                 else
 484                                         # no ';' terminator (only legacy char refs)
 485                                         max = i
 486                                         for i in [2..max] # no prefix matches, so ok to check shortest first
 487                                                 c = legacy_char_refs[txt.substr(cur, i)]
 488                                                 if c?
 489                                                         if in_attr
 490                                                                 if txt.charAt(cur + i) is '='
 491                                                                         # "because some legacy user agents will
 492                                                                         # misinterpret the markup in those cases"
 493                                                                         parse_error()
 494                                                                         return '&'
 495                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
 496                                                                         # this makes attributes forgiving about url args
 497                                                                         return '&'
 498                                                         # ok, and besides the weird exceptions for attributes...
 499                                                         # return the matching char
 500                                                         cur += i # consume entity chars
 501                                                         parse_error() # because no terminating ";"
 502                                                         return c
 503                                         parse_error()
 504                                         return '&'
 505                 return # never reached
 506
 507         # the functions below impliment the Tree Contstruction algorithm here:
 508         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 509         # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
 510         tree_append = (t) ->
 511                 switch t[0]
 512                         when TYPE_TEXT
 513                                 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
 514                                         open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
 515                                 else
 516                                         open_tags[0][3].push t
 517                         when TYPE_OPEN_TAG
 518                                 t[0] = TYPE_TAG
 519                                 # convert attributes into a hash
 520                                 attrs = {}
 521                                 while t[2].length
 522                                         a = t[2].pop()
 523                                         attrs[a[0]] = a[1]
 524                                 t[2] = attrs
 525                                 # FIXME probs have to auto-close things first
 526                                 open_tags[0][3].push t
 527                                 open_tags.unshift t
 528                                 # TODO implement formatting elements thing
 529                         when TYPE_CLOSE_TAG
 530                                 # FIXME this is just a hack for now
 531                                 if open_tags.length < 2
 532                                         parse_error()
 533                                         return
 534                                 if open_tags[0][1] isnt t[1]
 535                                         parse_error()
 536                                         # fall through and close something anyway
 537                                 open_tags.shift()
 538                         when TYPE_EOF
 539                                 return
 540                         # TODO implement close tags
 541                         # TODO implement self-closing tags
 542                         else
 543                                 console.log "UNIMPLEMENTED tag type: #{t[0]}"
 544                 return
 545
 546         # tree constructor initialization
 547         # see comments on TYPE_TAG/etc for the structure of this data
 548         tree = [TYPE_TAG, 'html', {}, []]
 549         open_tags = [tree]
 550         tree_state = tree_append
 551
 552         # tokenizer initialization
 553         tok_state = tok_state_data
 554
 555         # proccess input
 556         loop
 557                 t = tok_state()
 558                 if t?
 559                         tree_state t
 560                         if t[0] is TYPE_EOF
 561                                 return tree[3]
 562         return # never reached
 563
 564 # everything below is tests on the above
 565 test_equals = (description, fn, args..., expected_output) ->
 566         output = fn.apply this, args
 567         if output is expected_output
 568                 console.log "passed: #{description}."
 569         else
 570                 console.log "FAILED: #{description}..."
 571                 console.log "   Expected: #{expected_output}"
 572                 console.log "     Actual: #{output}"
 573 html_to_json = (html) ->
 574         return JSON.stringify parse_html html
 575 test_equals "empty", html_to_json, "", '[]'
 576 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
 577 test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
 578 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
 579 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
 580 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
 581 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
 582 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
 583 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 584 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 585 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 586 test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'