parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body.
  22 #
  23 # Instead, the data structure produced by this parser is an array of nodes.
  24 #
  25 # Each node is an array. The first element in the array is an integer (one of
  26 # the TYPE_* constants below) followed by the appropriate fields for that type
  27 # (shown below in the comments after the TYPE_* definition.)
  28
  29 TYPE_TAG = 0 # name, {attributes}, [children]
  30 TYPE_TEXT = 1 # "text"
  31 TYPE_COMMENT = 2
  32 TYPE_DOCTYPE = 3
  33 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  34 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  35 TYPE_END_TAG = 5 # name
  36 TYPE_EOF = 6
  37
  38 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
  39 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
  40 digits = "0123456789"
  41 alnum = lc_alpha + uc_alpha + digits
  42 hex_chars = digits + "abcdefABCDEF"
  43 scopers = { # FIXME these are supposed to be namespace specific
  44         'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
  45         'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
  46         'foreignObject', 'desc', 'title'
  47 }
  48
  49 # some SVG elements have dashes in them
  50 tag_name_chars = alnum + "-"
  51
  52 # http://www.w3.org/TR/html5/infrastructure.html#space-character
  53 space_chars = "\u0009\u000a\u000c\u000d\u0020"
  54
  55 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
  56 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
  57
  58 # These are the character references that don't need a terminating semicolon
  59 # min length: 2, max: 6, none are a prefix of any other.
  60 legacy_char_refs = {
  61         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
  62         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
  63         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
  64         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
  65         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
  66         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
  67         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
  68         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
  69         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
  70         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
  71         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
  72         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
  73         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
  74         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
  75         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
  76         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
  77         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
  78         yen: '¥', yuml: 'ÿ'
  79 }
  80
  81 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
  82 raw_text_elements = ['script', 'style']
  83 escapable_raw_text_elements = ['textarea', 'title']
  84 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
  85 svg_elements = [
  86         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
  87         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
  88         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
  89         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
  90         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
  91         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
  92         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
  93         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
  94         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
  95         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
  96         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
  97         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
  98         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
  99         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 100         'view', 'vkern'
 101 ]
 102
 103 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 104 mathml_elements = [
 105         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 106         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 107         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 108         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 109         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 110         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 111         'determinant', 'diff', 'divergence', 'divide', 'domain',
 112         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 113         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 114         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 115         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 116         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 117         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 118         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 119         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 120         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 121         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 122         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 123         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 124         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 125         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 126         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 127         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 128         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 129         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 130         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 131         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 132         'vectorproduct', 'xor'
 133 ]
 134 # foreign_elements = [svg_elements..., mathml_elements...]
 135 #normal_elements = All other allowed HTML elements are normal elements.
 136
 137 special_elements = {
 138         # from HTML:
 139         address: true, applet: true, area: true, article: true, aside: true,
 140         base: true, basefont: true, bgsound: true, blockquote: true, body: true,
 141         br: true, button: true, caption: true, center: true, col: true,
 142         colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
 143         dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
 144         footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
 145         h3: true, h4: true, h5: true, h6: true, head: true, header: true,
 146         hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
 147         isindex: true, li: true, link: true, listing: true, main: true,
 148         marquee: true, meta: true, nav: true, noembed: true, noframes: true,
 149         noscript: true, object: true, ol: true, p: true, param: true,
 150         plaintext: true, pre: true, script: true, section: true, select: true,
 151         source: true, style: true, summary: true, table: true, tbody: true,
 152         td: true, template: true, textarea: true, tfoot: true, th: true,
 153         thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
 154         xmp: true,
 155
 156         # from MathML:
 157         mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
 158
 159         # from SVG:
 160         foreignObject: true, desc: true, title: true
 161 }
 162
 163 formatting_elements = {
 164          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 165          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 166          u: true
 167 }
 168
 169
 170 # decode_named_char_ref()
 171 #
 172 # The list of named character references is _huge_ so ask the browser to decode
 173 # for us instead of wasting bandwidth/space on including the table here.
 174 #
 175 # Pass without the "&" but with the ";" examples:
 176 #    for "&amp" pass "amp;"
 177 #    for "&#x2032" pass "x2032;"
 178 g_dncr = {
 179         cache: {}
 180         textarea: document.createElement('textarea')
 181 }
 182 # TODO test this in IE8
 183 decode_named_char_ref = (txt) ->
 184         txt = "&#{txt}"
 185         decoded = g_dncr.cache[txt]
 186         return decoded if decoded?
 187         g_dncr.textarea.innerHTML = txt
 188         decoded = g_dncr.textarea.value
 189         return null if decoded is txt
 190         return g_dncr.cache[txt] = decoded
 191
 192 parse_html = (txt, parse_error_cb = null) ->
 193         cur = 0 # index of next char in txt to be parsed
 194         # declare tree and tokenizer variables so they're in scope below
 195         tree = null
 196         open_tags = [] # stack of open elements
 197         tree_state = null
 198         tok_state = null
 199         tok_cur_tag = null # partially parsed tag
 200         flag_frameset_ok = null
 201         flag_parsing = null
 202
 203         parse_error = ->
 204                 if parse_error_cb?
 205                         parse_error_cb cur
 206                 else
 207                         console.log "Parse error at character #{cur} of #{txt.length}"
 208
 209
 210         # the functions below impliment the Tree Contstruction algorithm
 211         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 212
 213         # But first... the helpers
 214         template_tag_is_open = ->
 215                 for t of open_tags
 216                         if t[0] is TYPE_TAG and t[1] is 'template'
 217                                 return true
 218                 return false
 219         is_in_scope = (tag_name) ->
 220                 for t of open_tags
 221                         if t[0] is TYPE_TAG and t[1] is tag_name
 222                                 return true
 223                         # FIXME bail if in scopers
 224                 return false
 225
 226         reconstruct_active_formatting_elements = ->
 227                 # FIXME implement this
 228
 229         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 230         # FIXME implement this
 231         close_p_if_in_button_scope = ->
 232                 if open_tags[0][1] is 'p' # FIXME
 233                         open_tags.pop()
 234                 return
 235                 #p = find_button_scope 'p'
 236                 #if p?
 237                         # TODO generate_implied_end_tags except for p tags
 238                         # TODO parse_error unless open_tags[0][1] is 'p'
 239                         # TODO pop stack until 'p' popped
 240
 241
 242
 243         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 244         tree_insert_a_character = (t) ->
 245                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 246                 if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
 247                         open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
 248                 else
 249                         open_tags[0][3].push t
 250
 251         # FIXME read spec, do this right
 252         # note: this assumes it's an open tag
 253         tree_insert_tag = (t) ->
 254                 t[0] = TYPE_TAG # not TYPE_OPEN_TAG
 255                 # convert attributes into a hash
 256                 attrs = {}
 257                 while t[2].length
 258                         a = t[2].pop()
 259                         attrs[a[0]] = a[1]
 260                 t[2] = attrs
 261                 open_tags[0][3].push t
 262                 open_tags.unshift t
 263
 264         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 265         tree_insert_a_comment = (t) ->
 266                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 267                 open_tags[0][3].push t
 268
 269         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
 270         tree_in_body = (t) ->
 271                 switch t[0]
 272                         when TYPE_TEXT
 273                                 switch t[1]
 274                                         when "\u0000"
 275                                                 parse_error()
 276                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
 277                                                 reconstruct_active_formatting_elements()
 278                                                 tree_insert_a_character t
 279                                         else
 280                                                 reconstruct_active_formatting_elements()
 281                                                 tree_insert_a_character t
 282                                                 flag_frameset_ok = false
 283                         when TYPE_COMMENT
 284                                 tree_insert_a_comment t
 285                         when TYPE_DOCTYPE
 286                                 parse_error()
 287                         when TYPE_OPEN_TAG
 288                                 switch t[1]
 289                                         when 'html'
 290                                                 parse_error()
 291                                                 return if template_tag_is_open()
 292                                                 root_attrs = open_tags[open_tags.length - 1][3]
 293                                                 for k, v of t[2]
 294                                                         root_attrs[k] = v unless root_attrs[k]?
 295                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
 296                                                 # FIXME also do this for </template> (end tag)
 297                                                 return tree_in_head t
 298                                         when 'body'
 299                                                 parse_error()
 300                                                 # TODO
 301                                         when 'frameset'
 302                                                 parse_error()
 303                                                 # TODO
 304                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
 305                                                 close_p_if_in_button_scope()
 306                                                 tree_insert_tag t
 307                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
 308                                                 close_p_if_in_button_scope()
 309                                                 if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 310                                                         parse_error()
 311                                                         open_tags.shift()
 312                                                 tree_insert_tag t
 313                                         # TODO lots more to implement here
 314                                         else # any other start tag
 315                                                 reconstruct_active_formatting_elements()
 316                                                 tree_insert_tag t
 317                         when TYPE_EOF
 318                                 ok_tags = {
 319                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
 320                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
 321                                 }
 322                                 for t in open_tags
 323                                         unless ok_tags[t[1]]?
 324                                                 parse_error()
 325                                                 break
 326                                 # TODO stack of template insertion modes thing
 327                                 flag_parsing = false # stop parsing
 328                         when TYPE_END_TAG
 329                                 switch t[1]
 330                                         when 'body'
 331                                                 unless is_in_scope 'body'
 332                                                         parse_error()
 333                                                         return
 334                                                 # TODO implement parse error and move to tree_after_body
 335                                         when 'html'
 336                                                 unless is_in_scope 'body' # weird, but it's what the spec says
 337                                                         parse_error()
 338                                                         return
 339                                                 # TODO implement parse error and move to tree_after_body, reprocess
 340                                         # TODO lots more close tags to implement here
 341                                         else
 342                                                 for node, i in open_tags
 343                                                         if node[1] is t[1]
 344                                                                 # FIXME generate implied end tags except those with name==t[1]
 345                                                                 parse_error() unless i is 0
 346                                                                 while i > 0
 347                                                                         open_tags.shift()
 348                                                                         i -= 1
 349                                                                 open_tags.shift()
 350                                                                 return
 351                                                         if special_elements[node[1]]?
 352                                                                 parse_error()
 353                                                                 return
 354
 355
 356         # the functions below implement the tokenizer stats described here:
 357         # http://www.w3.org/TR/html5/syntax.html#tokenization
 358
 359         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 360         tok_state_data = ->
 361                 switch c = txt.charAt(cur++)
 362                         when '&'
 363                                 return [TYPE_TEXT, tokenize_character_reference()]
 364                         when '<'
 365                                 tok_state = tok_state_tag_open
 366                         when "\u0000"
 367                                 parse_error()
 368                                 return [TYPE_TEXT, c]
 369                         when '' # EOF
 370                                 return [TYPE_EOF]
 371                         else
 372                                 return [TYPE_TEXT, c]
 373                 return null
 374
 375         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
 376         # not needed: tok_state_character_reference_in_data = ->
 377         # just call tok_state_character_reference_in_data()
 378
 379         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 380         tok_state_tag_open = ->
 381                 switch c = txt.charAt(cur++)
 382                         when '!'
 383                                 tok_state = tok_state_markup_declaration_open
 384                         when '/'
 385                                 tok_state = tok_state_end_tag_open
 386                         when '?'
 387                                 parse_error()
 388                                 tok_state = tok_state_bogus_comment
 389                         else
 390                                 if lc_alpha.indexOf(c) > -1
 391                                         tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
 392                                         tok_state = tok_state_tag_name
 393                                 else if uc_alpha.indexOf(c) > -1
 394                                         tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
 395                                         tok_state = tok_state_tag_name
 396                                 else
 397                                         parse_error()
 398                                         tok_state = tok_state_data
 399                                         cur -= 1 # we didn't parse/handle the char after <
 400                                         return [TYPE_TEXT, '<']
 401                 return null
 402
 403         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
 404         tok_state_end_tag_open = ->
 405                 switch c = txt.charAt(cur++)
 406                         when '>'
 407                                 parse_error()
 408                                 tok_state = tok_state_data
 409                         when '' # EOF
 410                                 parse_error()
 411                                 tok_state = tok_state_data
 412                                 return [TYPE_TEXT, '</']
 413                         else
 414                                 if uc_alpha.indexOf(c) > -1
 415                                         tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
 416                                         tok_state = tok_state_tag_name
 417                                 else if lc_alpha.indexOf(c) > -1
 418                                         tok_cur_tag = [TYPE_END_TAG, c, [], []]
 419                                         tok_state = tok_state_tag_name
 420                                 else
 421                                         parse_error()
 422                                         tok_state = tok_state_bogus_comment
 423                 return null
 424
 425         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 426         tok_state_tag_name = ->
 427                 switch c = txt.charAt(cur++)
 428                         when "\t", "\n", "\u000c", ' '
 429                                 tok_state = tok_state_before_attribute_name
 430                         when '/'
 431                                 tok_state = tok_state_self_closing_start_tag
 432                         when '>'
 433                                 tok_state = tok_state_data
 434                                 tmp = tok_cur_tag
 435                                 tok_cur_tag = null
 436                                 return tmp
 437                         when "\u0000"
 438                                 parse_error()
 439                                 tok_cur_tag[1] += "\ufffd"
 440                         when '' # EOF
 441                                 parse_error()
 442                                 tok_state = tok_state_data
 443                         else
 444                                 if uc_alpha.indexOf(c) > -1
 445                                         tok_cur_tag[1] += c.toLowerCase()
 446                                 else
 447                                         tok_cur_tag[1] += c
 448                 return null
 449
 450         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 451         tok_state_before_attribute_name = ->
 452                 attr_name = null
 453                 switch c = txt.charAt(cur++)
 454                         when "\t", "\n", "\u000c", ' '
 455                                 return null
 456                         when '/'
 457                                 tok_state = tok_state_self_closing_start_tag
 458                                 return null
 459                         when '>'
 460                                 tok_state = tok_state_data
 461                                 tmp = tok_cur_tag
 462                                 tok_cur_tag = null
 463                                 return tmp
 464                         when "\u0000"
 465                                 parse_error()
 466                                 attr_name = "\ufffd"
 467                         when '"', "'", '<', '='
 468                                 parse_error()
 469                                 attr_name = c
 470                         when '' # EOF
 471                                 parse_error()
 472                                 tok_state = tok_state_data
 473                         else
 474                                 if uc_alpha.indexOf(c) > -1
 475                                         attr_name = c.toLowerCase()
 476                                 else
 477                                         attr_name = c
 478                 if attr_name?
 479                         tok_cur_tag[2].unshift [attr_name, '']
 480                         tok_state = tok_state_attribute_name
 481                 return null
 482
 483         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
 484         tok_state_attribute_name = ->
 485                 switch c = txt.charAt(cur++)
 486                         when "\t", "\n", "\u000c", ' '
 487                                 tok_state = tok_state_after_attribute_name
 488                         when '/'
 489                                 tok_state = tok_state_self_closing_start_tag
 490                         when '='
 491                                 tok_state = tok_state_before_attribute_value
 492                         when '>'
 493                                 tok_state = tok_state_data
 494                                 tmp = tok_cur_tag
 495                                 tok_cur_tag = null
 496                                 return tmp
 497                         when "\u0000"
 498                                 parse_error()
 499                                 tok_cur_tag[2][0][0] = "\ufffd"
 500                         when '"', "'", '<'
 501                                 parse_error()
 502                                 tok_cur_tag[2][0][0] = c
 503                         when '' # EOF
 504                                 parse_error()
 505                                 tok_state = tok_state_data
 506                         else
 507                                 if uc_alpha.indexOf(c) > -1
 508                                         tok_cur_tag[2][0][0] = c.toLowerCase()
 509                                 else
 510                                         tok_cur_tag[2][0][0] += c
 511                 return null
 512
 513         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 514         tok_state_before_attribute_value = ->
 515                 switch c = txt.charAt(cur++)
 516                         when "\t", "\n", "\u000c", ' '
 517                                 return null
 518                         when '"'
 519                                 tok_state = tok_state_attribute_value_double_quoted
 520                         when '&'
 521                                 tok_state = tok_state_attribute_value_unquoted
 522                                 cur -= 1
 523                         when "'"
 524                                 tok_state = tok_state_attribute_value_single_quoted
 525                         when "\u0000"
 526                                 # Parse error
 527                                 tok_cur_tag[2][0][1] += "\ufffd"
 528                                 tok_state = tok_state_attribute_value_unquoted
 529                         when '>'
 530                                 # Parse error
 531                                 tok_state = tok_state_data
 532                                 tmp = tok_cur_tag
 533                                 tok_cur_tag = null
 534                                 return tmp
 535                         when '' # EOF
 536                                 parse_error()
 537                                 tok_state = tok_state_data
 538                         else
 539                                 tok_cur_tag[2][0][1] += c
 540                                 tok_state = tok_state_attribute_value_unquoted
 541                 return null
 542
 543         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
 544         tok_state_attribute_value_double_quoted = ->
 545                 switch c = txt.charAt(cur++)
 546                         when '"'
 547                                 tok_state = tok_state_after_attribute_value_quoted
 548                         when '&'
 549                                 tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
 550                         when "\u0000"
 551                                 # Parse error
 552                                 tok_cur_tag[2][0][1] += "\ufffd"
 553                         when '' # EOF
 554                                 parse_error()
 555                                 tok_state = tok_state_data
 556                         else
 557                                 tok_cur_tag[2][0][1] += c
 558                 return null
 559
 560         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
 561         tok_state_attribute_value_single_quoted = ->
 562                 switch c = txt.charAt(cur++)
 563                         when "'"
 564                                 tok_state = tok_state_after_attribute_value_quoted
 565                         when '&'
 566                                 tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
 567                         when "\u0000"
 568                                 # Parse error
 569                                 tok_cur_tag[2][0][1] += "\ufffd"
 570                         when '' # EOF
 571                                 parse_error()
 572                                 tok_state = tok_state_data
 573                         else
 574                                 tok_cur_tag[2][0][1] += c
 575                 return null
 576
 577         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
 578         tok_state_attribute_value_unquoted = ->
 579                 switch c = txt.charAt(cur++)
 580                         when "\t", "\n", "\u000c", ' '
 581                                 tok_state = tok_state_before_attribute_name
 582                         when '&'
 583                                 tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
 584                         when '>'
 585                                 tok_state = tok_state_data
 586                                 tmp = tok_cur_tag
 587                                 tok_cur_tag = null
 588                                 return tmp
 589                         when "\u0000"
 590                                 tok_cur_tag[2][0][1] += "\ufffd"
 591                         when '' # EOF
 592                                 parse_error()
 593                                 tok_state = tok_state_data
 594                         else
 595                                 # Parse Error if ', <, = or ` (backtick)
 596                                 tok_cur_tag[2][0][1] += c
 597                 return null
 598
 599         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
 600         tok_state_after_attribute_value_quoted = ->
 601                 switch c = txt.charAt(cur++)
 602                         when "\t", "\n", "\u000c", ' '
 603                                 tok_state = tok_state_before_attribute_name
 604                         when '/'
 605                                 tok_state = tok_state_self_closing_start_tag
 606                         when '>'
 607                                 tok_state = tok_state_data
 608                                 tmp = tok_cur_tag
 609                                 tok_cur_tag = null
 610                                 return tmp
 611                         when '' # EOF
 612                                 parse_error()
 613                                 tok_state = tok_state_data
 614                         else
 615                                 # Parse Error
 616                                 tok_state = tok_state_before_attribute_name
 617                                 cur -= 1 # we didn't handle that char
 618                 return null
 619
 620         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 621         # Don't set this as a state, just call it
 622         # returns a string (NOT a text node)
 623         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
 624                 if cur >= txt.length
 625                         return '&'
 626                 switch c = txt.charAt(cur)
 627                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
 628                                 # explicitly not a parse error
 629                                 return '&'
 630                         when ';'
 631                                 # there has to be "one or more" alnums between & and ; to be a parse error
 632                                 return '&'
 633                         when '#'
 634                                 if cur + 1 >= txt.length
 635                                         return '&'
 636                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
 637                                         prefix = '#x'
 638                                         charset = hex_chars
 639                                         start = cur + 2
 640                                 else
 641                                         charset = digits
 642                                         start = cur + 1
 643                                         prefix = '#'
 644                                 i = 0
 645                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 646                                         i += 1
 647                                 if i is 0
 648                                         return '&'
 649                                 if txt.charAt(start + i) is ';'
 650                                         i += 1
 651                                 # FIXME This is supposed to generate parse errors for some chars
 652                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 653                                 if decoded?
 654                                         cur = start + i
 655                                         return decoded
 656                                 return '&'
 657                         else
 658                                 for i in [0...31]
 659                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
 660                                                 break
 661                                 if i is 0
 662                                         # exit early, because parse_error() below needs at least one alnum
 663                                         return '&'
 664                                 if txt.charAt(cur + i) is ';'
 665                                         i += 1 # include ';' terminator in value
 666                                         decoded = decode_named_char_ref txt.substr(cur, i)
 667                                         if decoded?
 668                                                 cur += i
 669                                                 return decoded
 670                                         parse_error()
 671                                         return '&'
 672                                 else
 673                                         # no ';' terminator (only legacy char refs)
 674                                         max = i
 675                                         for i in [2..max] # no prefix matches, so ok to check shortest first
 676                                                 c = legacy_char_refs[txt.substr(cur, i)]
 677                                                 if c?
 678                                                         if in_attr
 679                                                                 if txt.charAt(cur + i) is '='
 680                                                                         # "because some legacy user agents will
 681                                                                         # misinterpret the markup in those cases"
 682                                                                         parse_error()
 683                                                                         return '&'
 684                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
 685                                                                         # this makes attributes forgiving about url args
 686                                                                         return '&'
 687                                                         # ok, and besides the weird exceptions for attributes...
 688                                                         # return the matching char
 689                                                         cur += i # consume entity chars
 690                                                         parse_error() # because no terminating ";"
 691                                                         return c
 692                                         parse_error()
 693                                         return '&'
 694                 return # never reached
 695
 696         # tree constructor initialization
 697         # see comments on TYPE_TAG/etc for the structure of this data
 698         tree = [TYPE_TAG, 'html', {}, []]
 699         open_tags = [tree]
 700         tree_state = tree_in_body
 701         flag_frameset_ok = true
 702         flag_parsing = true
 703
 704         # tokenizer initialization
 705         tok_state = tok_state_data
 706
 707         # proccess input
 708         while flag_parsing
 709                 t = tok_state()
 710                 if t?
 711                         tree_state t
 712         return tree[3]
 713
 714 # everything below is tests on the above
 715 test_equals = (description, output, expected_output) ->
 716         if output is expected_output
 717                 console.log "passed." # don't say name, so smart consoles can merge all of these
 718         else
 719                 console.log "FAILED: \"#{description}\""
 720                 console.log "   Expected: #{expected_output}"
 721                 console.log "     Actual: #{output}"
 722 test_parser = (args) ->
 723         parse_errors = []
 724         errors_cb = (i) ->
 725                 parse_errors.push i
 726         parsed = parse_html args.html, errors_cb
 727         parsed = JSON.stringify parsed
 728         if parsed isnt args.expected or parse_errors.length isnt args.errors
 729                 console.log "test FAILED: \"#{args.name}\""
 730         else
 731                 console.log 'test passed'
 732         if parsed isnt args.expected
 733                 console.log "      Input: #{args.html}"
 734                 console.log "    Correct: #{args.expected}"
 735                 console.log "     Output: #{parsed}"
 736         if parse_errors.length isnt args.errors
 737                 console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
 738
 739 test_parser name: "empty", \
 740         html: "",
 741         expected: '[]',
 742         errors: 0
 743 test_parser name: "just text", \
 744         html: "abc",
 745         expected: '[[1,"abc"]]',
 746         errors: 0
 747 test_parser name: "named entity", \
 748         html: "a&amp;1234",
 749         expected: '[[1,"a&1234"]]',
 750         errors: 0
 751 test_parser name: "broken named character references", \
 752         html: "1&amp2&&amp;3&aabbcc;",
 753         expected: '[[1,"1&2&&3&aabbcc;"]]',
 754         errors: 2
 755 test_parser name: "numbered entity overrides", \
 756         html: "1&#X80&#x80; &#x83",
 757         expected: '[[1,"1€€ ƒ"]]',
 758         errors: 0
 759 test_parser name: "open tag", \
 760         html: "foo<span>bar",
 761         expected: '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]',
 762         errors: 1 # no close tag
 763 test_parser name: "open tag with attributes", \
 764         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
 765         expected: '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]',
 766         errors: 1 # no close tag
 767 test_parser name: "open tag with attributes of various quotings", \
 768         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
 769         expected: '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]',
 770         errors: 1 # no close tag
 771 test_parser name: "attribute entity exceptions dq", \
 772         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
 773         expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
 774         errors: 2 # no close tag, &amp= in attr
 775 test_parser name: "attribute entity exceptions sq", \
 776         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
 777         expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
 778         errors: 2 # no close tag, &amp= in attr
 779 test_parser name: "attribute entity exceptions uq", \
 780         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
 781         expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
 782         errors: 2 # no close tag, &amp= in attr
 783 test_parser name: "matching closing tags", \
 784         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
 785         expected: '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]',
 786         errors: 0
 787 test_parser name: "mis-matched closing tags", \
 788         html: "foo<div>bar<span>baz</div>qux",
 789         expected: '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]',
 790         errors: 1 # close tag mismatch