parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body.
  22 #
  23 # Instead, the data structure produced by this parser is an array of nodes.
  24 #
  25 # Each node is an array. The first element in the array is an integer (one of
  26 # the TYPE_* constants below) followed by the appropriate fields for that type
  27 # (shown below in the comments after the TYPE_* definition.)
  28
  29 TYPE_TAG = 0 # name, {attributes}, [children]
  30 TYPE_TEXT = 1 # "text"
  31 TYPE_COMMENT = 2
  32 TYPE_DOCTYPE = 3
  33 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  35 TYPE_END_TAG = 5 # name
  36 TYPE_EOF = 6
  37
  38 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
  39 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
  40 digits = "0123456789"
  41 alnum = lc_alpha + uc_alpha + digits
  42 hex_chars = digits + "abcdefABCDEF"
  43 scopers = { # FIXME these are supposed to be namespace specific
  44         'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
  45         'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
  46         'foreignObject', 'desc', 'title'
  47 }
  48
  49 # some SVG elements have dashes in them
  50 tag_name_chars = alnum + "-"
  51
  52 # http://www.w3.org/TR/html5/infrastructure.html#space-character
  53 space_chars = "\u0009\u000a\u000c\u000d\u0020"
  54
  55 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
  56 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
  57
  58 # These are the character references that don't need a terminating semicolon
  59 # min length: 2, max: 6, none are a prefix of any other.
  60 legacy_char_refs = {
  61         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
  62         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
  63         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
  64         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
  65         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
  66         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
  67         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
  68         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
  69         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
  70         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
  71         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
  72         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
  73         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
  74         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
  75         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
  76         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
  77         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
  78         yen: '¥', yuml: 'ÿ'
  79 }
  80
  81 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
  82 raw_text_elements = ['script', 'style']
  83 escapable_raw_text_elements = ['textarea', 'title']
  84 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
  85 svg_elements = [
  86         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
  87         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
  88         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
  89         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
  90         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
  91         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
  92         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
  93         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
  94         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
  95         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
  96         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
  97         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
  98         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
  99         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 100         'view', 'vkern'
 101 ]
 102
 103 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 104 mathml_elements = [
 105         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 106         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 107         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 108         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 109         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 110         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 111         'determinant', 'diff', 'divergence', 'divide', 'domain',
 112         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 113         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 114         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 115         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 116         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 117         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 118         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 119         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 120         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 121         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 122         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 123         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 124         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 125         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 126         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 127         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 128         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 129         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 130         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 131         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 132         'vectorproduct', 'xor'
 133 ]
 134 # foreign_elements = [svg_elements..., mathml_elements...]
 135 #normal_elements = All other allowed HTML elements are normal elements.
 136
 137 special_elements = {
 138         # from HTML:
 139         address: true, applet: true, area: true, article: true, aside: true,
 140         base: true, basefont: true, bgsound: true, blockquote: true, body: true,
 141         br: true, button: true, caption: true, center: true, col: true,
 142         colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
 143         dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
 144         footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
 145         h3: true, h4: true, h5: true, h6: true, head: true, header: true,
 146         hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
 147         isindex: true, li: true, link: true, listing: true, main: true,
 148         marquee: true, meta: true, nav: true, noembed: true, noframes: true,
 149         noscript: true, object: true, ol: true, p: true, param: true,
 150         plaintext: true, pre: true, script: true, section: true, select: true,
 151         source: true, style: true, summary: true, table: true, tbody: true,
 152         td: true, template: true, textarea: true, tfoot: true, th: true,
 153         thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
 154         xmp: true,
 155
 156         # from MathML:
 157         mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
 158
 159         # from SVG:
 160         foreignObject: true, desc: true, title: true
 161 }
 162
 163 formatting_elements = {
 164          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 165          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 166          u: true
 167 }
 168
 169
 170 # decode_named_char_ref()
 171 #
 172 # The list of named character references is _huge_ so ask the browser to decode
 173 # for us instead of wasting bandwidth/space on including the table here.
 174 #
 175 # Pass without the "&" but with the ";" examples:
 176 #    for "&amp" pass "amp;"
 177 #    for "&#x2032" pass "x2032;"
 178 g_dncr = {
 179         cache: {}
 180         textarea: document.createElement('textarea')
 181 }
 182 # TODO test this in IE8
 183 decode_named_char_ref = (txt) ->
 184         txt = "&#{txt}"
 185         decoded = g_dncr.cache[txt]
 186         return decoded if decoded?
 187         g_dncr.textarea.innerHTML = txt
 188         decoded = g_dncr.textarea.value
 189         return null if decoded is txt
 190         return g_dncr.cache[txt] = decoded
 191
 192 parse_html = (txt) ->
 193         cur = 0 # index of next char in txt to be parsed
 194         # declare tree and tokenizer variables so they're in scope below
 195         tree = null
 196         open_tags = [] # stack of open elements
 197         tree_state = null
 198         tok_state = null
 199         tok_cur_tag = null # partially parsed tag
 200         flag_frameset_ok = null
 201         flag_parsing = null
 202
 203         parse_error = ->
 204                 console.log "Parse error at character #{cur} of #{txt.length}"
 205
 206
 207         # the functions below impliment the Tree Contstruction algorithm
 208         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 209
 210         # But first... the helpers
 211         template_tag_is_open = ->
 212                 for t of open_tags
 213                         if t[0] is TYPE_TAG and t[1] is 'template'
 214                                 return true
 215                 return false
 216         is_in_scope = (tag_name) ->
 217                 for t of open_tags
 218                         if t[0] is TYPE_TAG and t[1] is tag_name
 219                                 return true
 220                         # FIXME bail if in scopers
 221                 return false
 222
 223         reconstruct_active_formatting_elements = ->
 224                 # FIXME implement this
 225
 226         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 227         # FIXME implement this
 228         close_p_if_in_button_scope = ->
 229                 if open_tags[0][1] is 'p' # FIXME
 230                         open_tags.pop()
 231                 return
 232                 #p = find_button_scope 'p'
 233                 #if p?
 234                         # TODO generate_implied_end_tags except for p tags
 235                         # TODO parse_error unless open_tags[0][1] is 'p'
 236                         # TODO pop stack until 'p' popped
 237
 238
 239
 240         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 241         tree_insert_a_character = (t) ->
 242                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 243                 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
 244                         open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
 245                 else
 246                         open_tags[0][3].push t
 247
 248         # FIXME read spec, do this right
 249         # note: this assumes it's an open tag
 250         tree_insert_tag = (t) ->
 251                 t[0] = TYPE_TAG # not TYPE_OPEN_TAG
 252                 # convert attributes into a hash
 253                 attrs = {}
 254                 while t[2].length
 255                         a = t[2].pop()
 256                         attrs[a[0]] = a[1]
 257                 t[2] = attrs
 258                 open_tags[0][3].push t
 259                 open_tags.unshift t
 260
 261         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 262         tree_insert_a_comment = (t) ->
 263                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 264                 open_tags[0][3].push t
 265
 266         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
 267         tree_in_body = (t) ->
 268                 switch t[0]
 269                         when TYPE_TEXT
 270                                 switch t[1]
 271                                         when "\u0000"
 272                                                 parse_error()
 273                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
 274                                                 reconstruct_active_formatting_elements()
 275                                                 tree_insert_a_character t
 276                                         else
 277                                                 reconstruct_active_formatting_elements()
 278                                                 tree_insert_a_character t
 279                                                 flag_frameset_ok = false
 280                         when TYPE_COMMENT
 281                                 tree_insert_a_comment t
 282                         when TYPE_DOCTYPE
 283                                 parse_error()
 284                         when TYPE_OPEN_TAG
 285                                 switch t[1]
 286                                         when 'html'
 287                                                 parse_error()
 288                                                 return if template_tag_is_open()
 289                                                 root_attrs = open_tags[open_tags.length - 1][3]
 290                                                 for k, v of t[2]
 291                                                         root_attrs[k] = v unless root_attrs[k]?
 292                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
 293                                                 # FIXME also do this for </template> (end tag)
 294                                                 return tree_in_head t
 295                                         when 'body'
 296                                                 parse_error()
 297                                                 # TODO
 298                                         when 'frameset'
 299                                                 parse_error()
 300                                                 # TODO
 301                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
 302                                                 close_p_if_in_button_scope()
 303                                                 tree_insert_tag t
 304                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
 305                                                 close_p_if_in_button_scope()
 306                                                 if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 307                                                         parse_error()
 308                                                         open_tags.shift()
 309                                                 tree_insert_tag t
 310                                         # TODO lots more to implement here
 311                                         else # any other start tag
 312                                                 reconstruct_active_formatting_elements()
 313                                                 tree_insert_tag t
 314                         when TYPE_EOF
 315                                 ok_tags = {
 316                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
 317                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
 318                                 }
 319                                 for t in open_tags
 320                                         unless ok_tags[t[1]]?
 321                                                 parse_error()
 322                                                 break
 323                                 # TODO stack of template insertion modes thing
 324                                 flag_parsing = false # stop parsing
 325                         when TYPE_END_TAG
 326                                 switch t[1]
 327                                         when 'body'
 328                                                 unless is_in_scope 'body'
 329                                                         parse_error()
 330                                                         return
 331                                                 # TODO implement parse error and move to tree_after_body
 332                                         when 'html'
 333                                                 unless is_in_scope 'body' # weird, but it's what the spec says
 334                                                         parse_error()
 335                                                         return
 336                                                 # TODO implement parse error and move to tree_after_body, reprocess
 337                                         # TODO lots more close tags to implement here
 338                                         else
 339                                                 for node, i in open_tags
 340                                                         if node[1] is t[1]
 341                                                                 # FIXME generate implied end tags except those with name==t[1]
 342                                                                 parse_error() unless i is 0
 343                                                                 while i > 0
 344                                                                         open_tags.shift()
 345                                                                         i -= 1
 346                                                                 open_tags.shift()
 347                                                                 return
 348                                                         if special_elements[node[1]]?
 349                                                                 parse_error()
 350                                                                 return
 351
 352
 353         # the functions below implement the tokenizer stats described here:
 354         # http://www.w3.org/TR/html5/syntax.html#tokenization
 355
 356         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 357         tok_state_data = ->
 358                 switch c = txt.charAt(cur++)
 359                         when '&'
 360                                 return [TYPE_TEXT, tokenize_character_reference()]
 361                         when '<'
 362                                 tok_state = tok_state_tag_open
 363                         when "\u0000"
 364                                 parse_error()
 365                                 return [TYPE_TEXT, c]
 366                         when '' # EOF
 367                                 return [TYPE_EOF]
 368                         else
 369                                 return [TYPE_TEXT, c]
 370                 return null
 371
 372         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
 373         # not needed: tok_state_character_reference_in_data = ->
 374         # just call tok_state_character_reference_in_data()
 375
 376         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 377         tok_state_tag_open = ->
 378                 switch c = txt.charAt(cur++)
 379                         when '!'
 380                                 tok_state = tok_state_markup_declaration_open
 381                         when '/'
 382                                 tok_state = tok_state_end_tag_open
 383                         when '?'
 384                                 parse_error()
 385                                 tok_state = tok_state_bogus_comment
 386                         else
 387                                 if lc_alpha.indexOf(c) > -1
 388                                         tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
 389                                         tok_state = tok_state_tag_name
 390                                 else if uc_alpha.indexOf(c) > -1
 391                                         tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
 392                                         tok_state = tok_state_tag_name
 393                                 else
 394                                         parse_error()
 395                                         tok_state = tok_state_data
 396                                         cur -= 1 # we didn't parse/handle the char after <
 397                                         return [TYPE_TEXT, '<']
 398                 return null
 399
 400         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
 401         tok_state_end_tag_open = ->
 402                 switch c = txt.charAt(cur++)
 403                         when '>'
 404                                 parse_error()
 405                                 tok_state = tok_state_data
 406                         when '' # EOF
 407                                 parse_error()
 408                                 tok_state = tok_state_data
 409                                 return [TYPE_TEXT, '</']
 410                         else
 411                                 if uc_alpha.indexOf(c) > -1
 412                                         tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
 413                                         tok_state = tok_state_tag_name
 414                                 else if lc_alpha.indexOf(c) > -1
 415                                         tok_cur_tag = [TYPE_END_TAG, c, [], []]
 416                                         tok_state = tok_state_tag_name
 417                                 else
 418                                         parse_error()
 419                                         tok_state = tok_state_bogus_comment
 420                 return null
 421
 422         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 423         tok_state_tag_name = ->
 424                 switch c = txt.charAt(cur++)
 425                         when "\t", "\n", "\u000c", ' '
 426                                 tok_state = tok_state_before_attribute_name
 427                         when '/'
 428                                 tok_state = tok_state_self_closing_start_tag
 429                         when '>'
 430                                 tok_state = tok_state_data
 431                                 tmp = tok_cur_tag
 432                                 tok_cur_tag = null
 433                                 return tmp
 434                         when "\u0000"
 435                                 parse_error()
 436                                 tok_cur_tag[1] += "\ufffd"
 437                         when '' # EOF
 438                                 parse_error()
 439                                 tok_state = tok_state_data
 440                         else
 441                                 if uc_alpha.indexOf(c) > -1
 442                                         tok_cur_tag[1] += c.toLowerCase()
 443                                 else
 444                                         tok_cur_tag[1] += c
 445                 return null
 446
 447         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 448         tok_state_before_attribute_name = ->
 449                 attr_name = null
 450                 switch c = txt.charAt(cur++)
 451                         when "\t", "\n", "\u000c", ' '
 452                                 return null
 453                         when '/'
 454                                 tok_state = tok_state_self_closing_start_tag
 455                                 return null
 456                         when '>'
 457                                 tok_state = tok_state_data
 458                                 tmp = tok_cur_tag
 459                                 tok_cur_tag = null
 460                                 return tmp
 461                         when "\u0000"
 462                                 parse_error()
 463                                 attr_name = "\ufffd"
 464                         when '"', "'", '<', '='
 465                                 parse_error()
 466                                 attr_name = c
 467                         when '' # EOF
 468                                 parse_error()
 469                                 tok_state = tok_state_data
 470                         else
 471                                 if uc_alpha.indexOf(c) > -1
 472                                         attr_name = c.toLowerCase()
 473                                 else
 474                                         attr_name = c
 475                 if attr_name?
 476                         tok_cur_tag[2].unshift [attr_name, '']
 477                         tok_state = tok_state_attribute_name
 478                 return null
 479
 480         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
 481         tok_state_attribute_name = ->
 482                 switch c = txt.charAt(cur++)
 483                         when "\t", "\n", "\u000c", ' '
 484                                 tok_state = tok_state_after_attribute_name
 485                         when '/'
 486                                 tok_state = tok_state_self_closing_start_tag
 487                         when '='
 488                                 tok_state = tok_state_before_attribute_value
 489                         when '>'
 490                                 tok_state = tok_state_data
 491                                 tmp = tok_cur_tag
 492                                 tok_cur_tag = null
 493                                 return tmp
 494                         when "\u0000"
 495                                 parse_error()
 496                                 tok_cur_tag[2][0][0] = "\ufffd"
 497                         when '"', "'", '<'
 498                                 parse_error()
 499                                 tok_cur_tag[2][0][0] = c
 500                         when '' # EOF
 501                                 parse_error()
 502                                 tok_state = tok_state_data
 503                         else
 504                                 if uc_alpha.indexOf(c) > -1
 505                                         tok_cur_tag[2][0][0] = c.toLowerCase()
 506                                 else
 507                                         tok_cur_tag[2][0][0] += c
 508                 return null
 509
 510         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 511         tok_state_before_attribute_value = ->
 512                 switch c = txt.charAt(cur++)
 513                         when "\t", "\n", "\u000c", ' '
 514                                 return null
 515                         when '"'
 516                                 tok_state = tok_state_attribute_value_double_quoted
 517                         when '&'
 518                                 tok_state = tok_state_attribute_value_unquoted
 519                                 cur -= 1
 520                         when "'"
 521                                 tok_state = tok_state_attribute_value_single_quoted
 522                         when "\u0000"
 523                                 # Parse error
 524                                 tok_cur_tag[2][0][1] += "\ufffd"
 525                                 tok_state = tok_state_attribute_value_unquoted
 526                         when '>'
 527                                 # Parse error
 528                                 tok_state = tok_state_data
 529                                 tmp = tok_cur_tag
 530                                 tok_cur_tag = null
 531                                 return tmp
 532                         when '' # EOF
 533                                 parse_error()
 534                                 tok_state = tok_state_data
 535                         else
 536                                 tok_cur_tag[2][0][1] += c
 537                                 tok_state = tok_state_attribute_value_unquoted
 538                 return null
 539
 540         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
 541         tok_state_attribute_value_double_quoted = ->
 542                 switch c = txt.charAt(cur++)
 543                         when '"'
 544                                 tok_state = tok_state_after_attribute_value_quoted
 545                         when '&'
 546                                 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
 547                         when "\u0000"
 548                                 # Parse error
 549                                 tok_cur_tag[2][0][1] += "\ufffd"
 550                         when '' # EOF
 551                                 parse_error()
 552                                 tok_state = tok_state_data
 553                         else
 554                                 tok_cur_tag[2][0][1] += c
 555                 return null
 556
 557         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
 558         tok_state_attribute_value_single_quoted = ->
 559                 switch c = txt.charAt(cur++)
 560                         when "'"
 561                                 tok_state = tok_state_after_attribute_value_quoted
 562                         when '&'
 563                                 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
 564                         when "\u0000"
 565                                 # Parse error
 566                                 tok_cur_tag[2][0][1] += "\ufffd"
 567                         when '' # EOF
 568                                 parse_error()
 569                                 tok_state = tok_state_data
 570                         else
 571                                 tok_cur_tag[2][0][1] += c
 572                 return null
 573
 574         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
 575         tok_state_attribute_value_unquoted = ->
 576                 switch c = txt.charAt(cur++)
 577                         when "\t", "\n", "\u000c", ' '
 578                                 tok_state = tok_state_before_attribute_name
 579                         when '&'
 580                                 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
 581                         when '>'
 582                                 tok_state = tok_state_data
 583                                 tmp = tok_cur_tag
 584                                 tok_cur_tag = null
 585                                 return tmp
 586                         when "\u0000"
 587                                 tok_cur_tag[2][0][1] += "\ufffd"
 588                         when '' # EOF
 589                                 parse_error()
 590                                 tok_state = tok_state_data
 591                         else
 592                                 # Parse Error if ', <, = or ` (backtick)
 593                                 tok_cur_tag[2][0][1] += c
 594                 return null
 595
 596         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
 597         tok_state_after_attribute_value_quoted = ->
 598                 switch c = txt.charAt(cur++)
 599                         when "\t", "\n", "\u000c", ' '
 600                                 tok_state = tok_state_before_attribute_name
 601                         when '/'
 602                                 tok_state = tok_state_self_closing_start_tag
 603                         when '>'
 604                                 tok_state = tok_state_data
 605                                 tmp = tok_cur_tag
 606                                 tok_cur_tag = null
 607                                 return tmp
 608                         when '' # EOF
 609                                 parse_error()
 610                                 tok_state = tok_state_data
 611                         else
 612                                 # Parse Error
 613                                 tok_state = tok_state_before_attribute_name
 614                                 cur -= 1 # we didn't handle that char
 615                 return null
 616
 617         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 618         # Don't set this as a state, just call it
 619         # returns a string (NOT a text node)
 620         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
 621                 if cur >= txt.length
 622                         return '&'
 623                 switch c = txt.charAt(cur)
 624                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
 625                                 # explicitly not a parse error
 626                                 return '&'
 627                         when ';'
 628                                 # there has to be "one or more" alnums between & and ; to be a parse error
 629                                 return '&'
 630                         when '#'
 631                                 if cur + 1 >= txt.length
 632                                         return '&'
 633                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
 634                                         prefix = '#x'
 635                                         charset = hex_chars
 636                                         start = cur + 2
 637                                 else
 638                                         charset = digits
 639                                         start = cur + 1
 640                                         prefix = '#'
 641                                 i = 0
 642                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 643                                         i += 1
 644                                 if i is 0
 645                                         return '&'
 646                                 if txt.charAt(start + i) is ';'
 647                                         i += 1
 648                                 # FIXME This is supposed to generate parse errors for some chars
 649                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 650                                 if decoded?
 651                                         cur = start + i
 652                                         return decoded
 653                                 return '&'
 654                         else
 655                                 for i in [0...31]
 656                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
 657                                                 break
 658                                 if i is 0
 659                                         # exit early, because parse_error() below needs at least one alnum
 660                                         return '&'
 661                                 if txt.charAt(cur + i) is ';'
 662                                         i += 1 # include ';' terminator in value
 663                                         decoded = decode_named_char_ref txt.substr(cur, i)
 664                                         if decoded?
 665                                                 cur += i
 666                                                 return decoded
 667                                         parse_error()
 668                                         return '&'
 669                                 else
 670                                         # no ';' terminator (only legacy char refs)
 671                                         max = i
 672                                         for i in [2..max] # no prefix matches, so ok to check shortest first
 673                                                 c = legacy_char_refs[txt.substr(cur, i)]
 674                                                 if c?
 675                                                         if in_attr
 676                                                                 if txt.charAt(cur + i) is '='
 677                                                                         # "because some legacy user agents will
 678                                                                         # misinterpret the markup in those cases"
 679                                                                         parse_error()
 680                                                                         return '&'
 681                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
 682                                                                         # this makes attributes forgiving about url args
 683                                                                         return '&'
 684                                                         # ok, and besides the weird exceptions for attributes...
 685                                                         # return the matching char
 686                                                         cur += i # consume entity chars
 687                                                         parse_error() # because no terminating ";"
 688                                                         return c
 689                                         parse_error()
 690                                         return '&'
 691                 return # never reached
 692
 693         # tree constructor initialization
 694         # see comments on TYPE_TAG/etc for the structure of this data
 695         tree = [TYPE_TAG, 'html', {}, []]
 696         open_tags = [tree]
 697         tree_state = tree_in_body
 698         flag_frameset_ok = true
 699         flag_parsing = true
 700
 701         # tokenizer initialization
 702         tok_state = tok_state_data
 703
 704         # proccess input
 705         while flag_parsing
 706                 t = tok_state()
 707                 if t?
 708                         tree_state t
 709         return tree[3]
 710
 711 # everything below is tests on the above
 712 test_equals = (description, fn, args..., expected_output) ->
 713         output = fn.apply this, args
 714         if output is expected_output
 715                 console.log "passed: #{description}."
 716         else
 717                 console.log "FAILED: #{description}..."
 718                 console.log "   Expected: #{expected_output}"
 719                 console.log "     Actual: #{output}"
 720 html_to_json = (html) ->
 721         return JSON.stringify parse_html html
 722 test_equals "empty", html_to_json, "", '[]'
 723 test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
 724 test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
 725 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
 726 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
 727 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
 728 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
 729 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
 730 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 731 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 732 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 733 test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'
 734 test_equals "mis-matched closing tags", html_to_json, "foo<div>bar<span>baz</div>qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]'