parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body.
  22 #
  23 # Instead, the data structure produced by this parser is an array of nodes.
  24 #
  25 # Each node is an obect of the Node class. Here are the Node types:
  26 TYPE_TAG = 0 # name, {attributes}, [children]
  27 TYPE_TEXT = 1 # "text"
  28 TYPE_COMMENT = 2
  29 TYPE_DOCTYPE = 3
  30 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  31 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  32 TYPE_END_TAG = 5 # name
  33 TYPE_EOF = 6
  34
  35 class Node
  36         constructor: (type, args = {}) ->
  37                 @type = type # one of the TYPE_* constants above
  38                 @name = args.name ? '' # tag name
  39                 @text = args.text ? '' # contents for text/comment nodes
  40                 @attrs = args.attrs ? {}
  41                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
  42                 @children = args.children ? []
  43         serialize: -> # for unit tests
  44                 ret = ''
  45                 switch @type
  46                         when TYPE_TAG
  47                                 ret += 'tag:'
  48                                 ret += JSON.stringify @name
  49                                 ret += ','
  50                                 ret += JSON.stringify @attrs
  51                                 ret += ','
  52                                 sep = '['
  53                                 for c in @children
  54                                         ret += sep
  55                                         sep = ','
  56                                         ret += c.serialize()
  57                                 ret += ']'
  58                         when TYPE_TEXT
  59                                 ret += 'text:'
  60                                 ret += JSON.stringify @text
  61                         when TYPE_COMMENT
  62                                 ret += 'comment:'
  63                                 ret += JSON.stringify @text
  64                         when TYPE_DOCTYPE
  65                                 ret += 'doctype'
  66                                 # FIXME
  67                         else
  68                                 ret += 'unknown:'
  69                 return ret
  70
  71
  72 # helpers: (only take args that are normally known when parser creates nodes)
  73 new_open_tag = (name) ->
  74         return new Node TYPE_OPEN_TAG, name: name
  75 new_end_tag = (name) ->
  76         return new Node TYPE_END_TAG, name: name
  77 new_text_node = (txt) ->
  78         return new Node TYPE_TEXT, text: txt
  79 new_comment_node = (txt) ->
  80         return new Node TYPE_COMMENT, text: txt
  81 new_eof_token = ->
  82         return new Node TYPE_EOF
  83
  84 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
  85 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
  86 digits = "0123456789"
  87 alnum = lc_alpha + uc_alpha + digits
  88 hex_chars = digits + "abcdefABCDEF"
  89 scopers = { # FIXME these are supposed to be namespace specific
  90         'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
  91         'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
  92         'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
  93         'foreignObject': true, 'desc': true, 'title'
  94 }
  95
  96 # some SVG elements have dashes in them
  97 tag_name_chars = alnum + "-"
  98
  99 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 100 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 101
 102 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 103 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 104
 105 # These are the character references that don't need a terminating semicolon
 106 # min length: 2, max: 6, none are a prefix of any other.
 107 legacy_char_refs = {
 108         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 109         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 110         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 111         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 112         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 113         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 114         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 115         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 116         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 117         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 118         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 119         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 120         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 121         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 122         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 123         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 124         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 125         yen: '¥', yuml: 'ÿ'
 126 }
 127
 128 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 129 raw_text_elements = ['script', 'style']
 130 escapable_raw_text_elements = ['textarea', 'title']
 131 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 132 svg_elements = [
 133         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 134         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 135         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 136         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 137         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 138         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 139         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 140         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 141         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 142         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 143         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 144         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 145         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 146         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 147         'view', 'vkern'
 148 ]
 149
 150 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 151 mathml_elements = [
 152         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 153         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 154         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 155         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 156         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 157         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 158         'determinant', 'diff', 'divergence', 'divide', 'domain',
 159         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 160         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 161         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 162         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 163         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 164         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 165         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 166         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 167         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 168         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 169         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 170         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 171         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 172         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 173         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 174         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 175         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 176         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 177         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 178         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 179         'vectorproduct', 'xor'
 180 ]
 181 # foreign_elements = [svg_elements..., mathml_elements...]
 182 #normal_elements = All other allowed HTML elements are normal elements.
 183
 184 special_elements = {
 185         # from HTML:
 186         address: true, applet: true, area: true, article: true, aside: true,
 187         base: true, basefont: true, bgsound: true, blockquote: true, body: true,
 188         br: true, button: true, caption: true, center: true, col: true,
 189         colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
 190         dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
 191         footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
 192         h3: true, h4: true, h5: true, h6: true, head: true, header: true,
 193         hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
 194         isindex: true, li: true, link: true, listing: true, main: true,
 195         marquee: true, meta: true, nav: true, noembed: true, noframes: true,
 196         noscript: true, object: true, ol: true, p: true, param: true,
 197         plaintext: true, pre: true, script: true, section: true, select: true,
 198         source: true, style: true, summary: true, table: true, tbody: true,
 199         td: true, template: true, textarea: true, tfoot: true, th: true,
 200         thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
 201         xmp: true,
 202
 203         # from MathML:
 204         mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
 205
 206         # from SVG:
 207         foreignObject: true, desc: true, title: true
 208 }
 209
 210 formatting_elements = {
 211          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 212          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 213          u: true
 214 }
 215
 216
 217 # decode_named_char_ref()
 218 #
 219 # The list of named character references is _huge_ so ask the browser to decode
 220 # for us instead of wasting bandwidth/space on including the table here.
 221 #
 222 # Pass without the "&" but with the ";" examples:
 223 #    for "&amp" pass "amp;"
 224 #    for "&#x2032" pass "x2032;"
 225 g_dncr = {
 226         cache: {}
 227         textarea: document.createElement('textarea')
 228 }
 229 # TODO test this in IE8
 230 decode_named_char_ref = (txt) ->
 231         txt = "&#{txt}"
 232         decoded = g_dncr.cache[txt]
 233         return decoded if decoded?
 234         g_dncr.textarea.innerHTML = txt
 235         decoded = g_dncr.textarea.value
 236         return null if decoded is txt
 237         return g_dncr.cache[txt] = decoded
 238
 239 parse_html = (txt, parse_error_cb = null) ->
 240         cur = 0 # index of next char in txt to be parsed
 241         # declare tree and tokenizer variables so they're in scope below
 242         tree = null
 243         open_tags = [] # stack of open elements
 244         tree_state = null
 245         tok_state = null
 246         tok_cur_tag = null # partially parsed tag
 247         flag_frameset_ok = null
 248         flag_parsing = null
 249
 250         parse_error = ->
 251                 if parse_error_cb?
 252                         parse_error_cb cur
 253                 else
 254                         console.log "Parse error at character #{cur} of #{txt.length}"
 255
 256
 257         # the functions below impliment the Tree Contstruction algorithm
 258         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 259
 260         # But first... the helpers
 261         template_tag_is_open = ->
 262                 for t of open_tags
 263                         if t.type is TYPE_TAG and t.name is 'template'
 264                                 return true
 265                 return false
 266         is_in_scope = (tag_name) ->
 267                 for t of open_tags
 268                         if t.name is tag_name
 269                                 return true
 270                         if t.name of scopers
 271                                 return false
 272                 return false
 273
 274         reconstruct_active_formatting_elements = ->
 275                 # FIXME implement this
 276
 277         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 278         # FIXME implement this
 279         close_p_if_in_button_scope = ->
 280                 if open_tags[0].name is 'p'
 281                         open_tags.pop()
 282                 return
 283                 #p = find_button_scope 'p'
 284                 #if p?
 285                         # TODO generate_implied_end_tags except for p tags
 286                         # TODO parse_error unless open_tags[0].name is 'p'
 287                         # TODO pop stack until 'p' popped
 288
 289
 290
 291         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 292         tree_insert_a_character = (t) ->
 293                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 294                 dest = open_tags[0].children
 295                 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
 296                         dest[dest.length - 1].text += t.text
 297                 else
 298                         dest.push t
 299
 300         # FIXME read spec, do this right
 301         # note: this assumes it's an open tag
 302         tree_insert_tag = (t) ->
 303                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
 304                 # convert attributes into a hash
 305                 while t.attrs_a.length
 306                         a = t.attrs_a.pop()
 307                         t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
 308                 open_tags[0].children.push t
 309                 open_tags.unshift t
 310
 311         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 312         tree_insert_a_comment = (t) ->
 313                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 314                 open_tags[0].children.push t
 315
 316         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
 317         tree_in_body = (t) ->
 318                 switch t.type
 319                         when TYPE_TEXT
 320                                 switch t.text
 321                                         when "\u0000"
 322                                                 parse_error()
 323                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
 324                                                 reconstruct_active_formatting_elements()
 325                                                 tree_insert_a_character t
 326                                         else
 327                                                 reconstruct_active_formatting_elements()
 328                                                 tree_insert_a_character t
 329                                                 flag_frameset_ok = false
 330                         when TYPE_COMMENT
 331                                 tree_insert_a_comment t
 332                         when TYPE_DOCTYPE
 333                                 parse_error()
 334                         when TYPE_OPEN_TAG
 335                                 switch t.name
 336                                         when 'html'
 337                                                 parse_error()
 338                                                 return if template_tag_is_open()
 339                                                 root_attrs = open_tags[open_tags.length - 1].children
 340                                                 for k, v of t.attrs
 341                                                         root_attrs[k] = v unless root_attrs[k]?
 342                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
 343                                                 # FIXME also do this for </template> (end tag)
 344                                                 return tree_in_head t
 345                                         when 'body'
 346                                                 parse_error()
 347                                                 # TODO
 348                                         when 'frameset'
 349                                                 parse_error()
 350                                                 # TODO
 351                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
 352                                                 close_p_if_in_button_scope()
 353                                                 tree_insert_tag t
 354                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
 355                                                 close_p_if_in_button_scope()
 356                                                 if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 357                                                         parse_error()
 358                                                         open_tags.shift()
 359                                                 tree_insert_tag t
 360                                         # TODO lots more to implement here
 361                                         else # any other start tag
 362                                                 reconstruct_active_formatting_elements()
 363                                                 tree_insert_tag t
 364                         when TYPE_EOF
 365                                 ok_tags = {
 366                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
 367                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
 368                                 }
 369                                 for t in open_tags
 370                                         unless ok_tags[t.name]?
 371                                                 parse_error()
 372                                                 break
 373                                 # TODO stack of template insertion modes thing
 374                                 flag_parsing = false # stop parsing
 375                         when TYPE_END_TAG
 376                                 switch t.name
 377                                         when 'body'
 378                                                 unless is_in_scope 'body'
 379                                                         parse_error()
 380                                                         return
 381                                                 # TODO implement parse error and move to tree_after_body
 382                                         when 'html'
 383                                                 unless is_in_scope 'body' # weird, but it's what the spec says
 384                                                         parse_error()
 385                                                         return
 386                                                 # TODO implement parse error and move to tree_after_body, reprocess
 387                                         # TODO lots more close tags to implement here
 388                                         else
 389                                                 for node, i in open_tags
 390                                                         if node.name is t.name
 391                                                                 # FIXME generate implied end tags except those with name==t.name
 392                                                                 parse_error() unless i is 0
 393                                                                 while i > 0
 394                                                                         open_tags.shift()
 395                                                                         i -= 1
 396                                                                 open_tags.shift()
 397                                                                 return
 398                                                         if special_elements[node.name]?
 399                                                                 parse_error()
 400                                                                 return
 401
 402
 403         # the functions below implement the tokenizer stats described here:
 404         # http://www.w3.org/TR/html5/syntax.html#tokenization
 405
 406         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 407         tok_state_data = ->
 408                 switch c = txt.charAt(cur++)
 409                         when '&'
 410                                 return new_text_node tokenize_character_reference()
 411                         when '<'
 412                                 tok_state = tok_state_tag_open
 413                         when "\u0000"
 414                                 parse_error()
 415                                 return new_text_node c
 416                         when '' # EOF
 417                                 return new_eof_token()
 418                         else
 419                                 return new_text_node c
 420                 return null
 421
 422         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
 423         # not needed: tok_state_character_reference_in_data = ->
 424         # just call tok_state_character_reference_in_data()
 425
 426         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 427         tok_state_tag_open = ->
 428                 switch c = txt.charAt(cur++)
 429                         when '!'
 430                                 tok_state = tok_state_markup_declaration_open
 431                         when '/'
 432                                 tok_state = tok_state_end_tag_open
 433                         when '?'
 434                                 parse_error()
 435                                 tok_state = tok_state_bogus_comment
 436                         else
 437                                 if lc_alpha.indexOf(c) > -1
 438                                         tok_cur_tag = new_open_tag c
 439                                         tok_state = tok_state_tag_name
 440                                 else if uc_alpha.indexOf(c) > -1
 441                                         tok_cur_tag = new_open_tag c.toLowerCase()
 442                                         tok_state = tok_state_tag_name
 443                                 else
 444                                         parse_error()
 445                                         tok_state = tok_state_data
 446                                         cur -= 1 # we didn't parse/handle the char after <
 447                                         return new_text_node '<'
 448                 return null
 449
 450         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
 451         tok_state_end_tag_open = ->
 452                 switch c = txt.charAt(cur++)
 453                         when '>'
 454                                 parse_error()
 455                                 tok_state = tok_state_data
 456                         when '' # EOF
 457                                 parse_error()
 458                                 tok_state = tok_state_data
 459                                 return new_text_node '</'
 460                         else
 461                                 if uc_alpha.indexOf(c) > -1
 462                                         tok_cur_tag = new_end_tag c.toLowerCase()
 463                                         tok_state = tok_state_tag_name
 464                                 else if lc_alpha.indexOf(c) > -1
 465                                         tok_cur_tag = new_end_tag c
 466                                         tok_state = tok_state_tag_name
 467                                 else
 468                                         parse_error()
 469                                         tok_state = tok_state_bogus_comment
 470                 return null
 471
 472         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 473         tok_state_tag_name = ->
 474                 switch c = txt.charAt(cur++)
 475                         when "\t", "\n", "\u000c", ' '
 476                                 tok_state = tok_state_before_attribute_name
 477                         when '/'
 478                                 tok_state = tok_state_self_closing_start_tag
 479                         when '>'
 480                                 tok_state = tok_state_data
 481                                 tmp = tok_cur_tag
 482                                 tok_cur_tag = null
 483                                 return tmp
 484                         when "\u0000"
 485                                 parse_error()
 486                                 tok_cur_tag.name += "\ufffd"
 487                         when '' # EOF
 488                                 parse_error()
 489                                 tok_state = tok_state_data
 490                         else
 491                                 if uc_alpha.indexOf(c) > -1
 492                                         tok_cur_tag.name += c.toLowerCase()
 493                                 else
 494                                         tok_cur_tag.name += c
 495                 return null
 496
 497         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 498         tok_state_before_attribute_name = ->
 499                 attr_name = null
 500                 switch c = txt.charAt(cur++)
 501                         when "\t", "\n", "\u000c", ' '
 502                                 return null
 503                         when '/'
 504                                 tok_state = tok_state_self_closing_start_tag
 505                                 return null
 506                         when '>'
 507                                 tok_state = tok_state_data
 508                                 tmp = tok_cur_tag
 509                                 tok_cur_tag = null
 510                                 return tmp
 511                         when "\u0000"
 512                                 parse_error()
 513                                 attr_name = "\ufffd"
 514                         when '"', "'", '<', '='
 515                                 parse_error()
 516                                 attr_name = c
 517                         when '' # EOF
 518                                 parse_error()
 519                                 tok_state = tok_state_data
 520                         else
 521                                 if uc_alpha.indexOf(c) > -1
 522                                         attr_name = c.toLowerCase()
 523                                 else
 524                                         attr_name = c
 525                 if attr_name?
 526                         tok_cur_tag.attrs_a.unshift [attr_name, '']
 527                         tok_state = tok_state_attribute_name
 528                 return null
 529
 530         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
 531         tok_state_attribute_name = ->
 532                 switch c = txt.charAt(cur++)
 533                         when "\t", "\n", "\u000c", ' '
 534                                 tok_state = tok_state_after_attribute_name
 535                         when '/'
 536                                 tok_state = tok_state_self_closing_start_tag
 537                         when '='
 538                                 tok_state = tok_state_before_attribute_value
 539                         when '>'
 540                                 tok_state = tok_state_data
 541                                 tmp = tok_cur_tag
 542                                 tok_cur_tag = null
 543                                 return tmp
 544                         when "\u0000"
 545                                 parse_error()
 546                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
 547                         when '"', "'", '<'
 548                                 parse_error()
 549                                 tok_cur_tag.attrs_a[0][0] = c
 550                         when '' # EOF
 551                                 parse_error()
 552                                 tok_state = tok_state_data
 553                         else
 554                                 if uc_alpha.indexOf(c) > -1
 555                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
 556                                 else
 557                                         tok_cur_tag.attrs_a[0][0] += c
 558                 return null
 559
 560         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 561         tok_state_before_attribute_value = ->
 562                 switch c = txt.charAt(cur++)
 563                         when "\t", "\n", "\u000c", ' '
 564                                 return null
 565                         when '"'
 566                                 tok_state = tok_state_attribute_value_double_quoted
 567                         when '&'
 568                                 tok_state = tok_state_attribute_value_unquoted
 569                                 cur -= 1
 570                         when "'"
 571                                 tok_state = tok_state_attribute_value_single_quoted
 572                         when "\u0000"
 573                                 # Parse error
 574                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 575                                 tok_state = tok_state_attribute_value_unquoted
 576                         when '>'
 577                                 # Parse error
 578                                 tok_state = tok_state_data
 579                                 tmp = tok_cur_tag
 580                                 tok_cur_tag = null
 581                                 return tmp
 582                         when '' # EOF
 583                                 parse_error()
 584                                 tok_state = tok_state_data
 585                         else
 586                                 tok_cur_tag.attrs_a[0][1] += c
 587                                 tok_state = tok_state_attribute_value_unquoted
 588                 return null
 589
 590         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
 591         tok_state_attribute_value_double_quoted = ->
 592                 switch c = txt.charAt(cur++)
 593                         when '"'
 594                                 tok_state = tok_state_after_attribute_value_quoted
 595                         when '&'
 596                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
 597                         when "\u0000"
 598                                 # Parse error
 599                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 600                         when '' # EOF
 601                                 parse_error()
 602                                 tok_state = tok_state_data
 603                         else
 604                                 tok_cur_tag.attrs_a[0][1] += c
 605                 return null
 606
 607         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
 608         tok_state_attribute_value_single_quoted = ->
 609                 switch c = txt.charAt(cur++)
 610                         when "'"
 611                                 tok_state = tok_state_after_attribute_value_quoted
 612                         when '&'
 613                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
 614                         when "\u0000"
 615                                 # Parse error
 616                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 617                         when '' # EOF
 618                                 parse_error()
 619                                 tok_state = tok_state_data
 620                         else
 621                                 tok_cur_tag.attrs_a[0][1] += c
 622                 return null
 623
 624         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
 625         tok_state_attribute_value_unquoted = ->
 626                 switch c = txt.charAt(cur++)
 627                         when "\t", "\n", "\u000c", ' '
 628                                 tok_state = tok_state_before_attribute_name
 629                         when '&'
 630                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
 631                         when '>'
 632                                 tok_state = tok_state_data
 633                                 tmp = tok_cur_tag
 634                                 tok_cur_tag = null
 635                                 return tmp
 636                         when "\u0000"
 637                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 638                         when '' # EOF
 639                                 parse_error()
 640                                 tok_state = tok_state_data
 641                         else
 642                                 # Parse Error if ', <, = or ` (backtick)
 643                                 tok_cur_tag.attrs_a[0][1] += c
 644                 return null
 645
 646         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
 647         tok_state_after_attribute_value_quoted = ->
 648                 switch c = txt.charAt(cur++)
 649                         when "\t", "\n", "\u000c", ' '
 650                                 tok_state = tok_state_before_attribute_name
 651                         when '/'
 652                                 tok_state = tok_state_self_closing_start_tag
 653                         when '>'
 654                                 tok_state = tok_state_data
 655                                 tmp = tok_cur_tag
 656                                 tok_cur_tag = null
 657                                 return tmp
 658                         when '' # EOF
 659                                 parse_error()
 660                                 tok_state = tok_state_data
 661                         else
 662                                 # Parse Error
 663                                 tok_state = tok_state_before_attribute_name
 664                                 cur -= 1 # we didn't handle that char
 665                 return null
 666
 667         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 668         # Don't set this as a state, just call it
 669         # returns a string (NOT a text node)
 670         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
 671                 if cur >= txt.length
 672                         return '&'
 673                 switch c = txt.charAt(cur)
 674                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
 675                                 # explicitly not a parse error
 676                                 return '&'
 677                         when ';'
 678                                 # there has to be "one or more" alnums between & and ; to be a parse error
 679                                 return '&'
 680                         when '#'
 681                                 if cur + 1 >= txt.length
 682                                         return '&'
 683                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
 684                                         prefix = '#x'
 685                                         charset = hex_chars
 686                                         start = cur + 2
 687                                 else
 688                                         charset = digits
 689                                         start = cur + 1
 690                                         prefix = '#'
 691                                 i = 0
 692                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 693                                         i += 1
 694                                 if i is 0
 695                                         return '&'
 696                                 if txt.charAt(start + i) is ';'
 697                                         i += 1
 698                                 # FIXME This is supposed to generate parse errors for some chars
 699                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 700                                 if decoded?
 701                                         cur = start + i
 702                                         return decoded
 703                                 return '&'
 704                         else
 705                                 for i in [0...31]
 706                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
 707                                                 break
 708                                 if i is 0
 709                                         # exit early, because parse_error() below needs at least one alnum
 710                                         return '&'
 711                                 if txt.charAt(cur + i) is ';'
 712                                         i += 1 # include ';' terminator in value
 713                                         decoded = decode_named_char_ref txt.substr(cur, i)
 714                                         if decoded?
 715                                                 cur += i
 716                                                 return decoded
 717                                         parse_error()
 718                                         return '&'
 719                                 else
 720                                         # no ';' terminator (only legacy char refs)
 721                                         max = i
 722                                         for i in [2..max] # no prefix matches, so ok to check shortest first
 723                                                 c = legacy_char_refs[txt.substr(cur, i)]
 724                                                 if c?
 725                                                         if in_attr
 726                                                                 if txt.charAt(cur + i) is '='
 727                                                                         # "because some legacy user agents will
 728                                                                         # misinterpret the markup in those cases"
 729                                                                         parse_error()
 730                                                                         return '&'
 731                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
 732                                                                         # this makes attributes forgiving about url args
 733                                                                         return '&'
 734                                                         # ok, and besides the weird exceptions for attributes...
 735                                                         # return the matching char
 736                                                         cur += i # consume entity chars
 737                                                         parse_error() # because no terminating ";"
 738                                                         return c
 739                                         parse_error()
 740                                         return '&'
 741                 return # never reached
 742
 743         # tree constructor initialization
 744         # see comments on TYPE_TAG/etc for the structure of this data
 745         tree = new Node TYPE_TAG, name: 'html'
 746         open_tags = [tree]
 747         tree_state = tree_in_body
 748         flag_frameset_ok = true
 749         flag_parsing = true
 750
 751         # tokenizer initialization
 752         tok_state = tok_state_data
 753
 754         # proccess input
 755         while flag_parsing
 756                 t = tok_state()
 757                 if t?
 758                         tree_state t
 759         return tree.children
 760
 761 # everything below is tests on the above
 762 test_equals = (description, output, expected_output) ->
 763         if output is expected_output
 764                 console.log "passed." # don't say name, so smart consoles can merge all of these
 765         else
 766                 console.log "FAILED: \"#{description}\""
 767                 console.log "   Expected: #{expected_output}"
 768                 console.log "     Actual: #{output}"
 769 test_parser = (args) ->
 770         parse_errors = []
 771         errors_cb = (i) ->
 772                 parse_errors.push i
 773         parsed = parse_html args.html, errors_cb
 774         serialized = ''
 775         sep = ''
 776         for t in parsed
 777                 serialized += sep
 778                 sep = ','
 779                 serialized += t.serialize()
 780         if serialized isnt args.expected or parse_errors.length isnt args.errors
 781                 console.log "test FAILED: \"#{args.name}\""
 782         else
 783                 console.log 'test passed'
 784         if serialized isnt args.expected
 785                 console.log "      Input: #{args.html}"
 786                 console.log "    Correct: #{args.expected}"
 787                 console.log "     Output: #{serialized}"
 788         if parse_errors.length isnt args.errors
 789                 console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
 790
 791 test_parser name: "empty", \
 792         html: "",
 793         expected: '',
 794         errors: 0
 795 test_parser name: "just text", \
 796         html: "abc",
 797         expected: 'text:"abc"',
 798         errors: 0
 799 test_parser name: "named entity", \
 800         html: "a&amp;1234",
 801         expected: 'text:"a&1234"',
 802         errors: 0
 803 test_parser name: "broken named character references", \
 804         html: "1&amp2&&amp;3&aabbcc;",
 805         expected: 'text:"1&2&&3&aabbcc;"',
 806         errors: 2
 807 test_parser name: "numbered entity overrides", \
 808         html: "1&#X80&#x80; &#x83",
 809         expected: 'text:"1€€ ƒ"',
 810         errors: 0
 811 test_parser name: "open tag", \
 812         html: "foo<span>bar",
 813         expected: 'text:"foo",tag:"span",{},[text:"bar"]',
 814         errors: 1 # no close tag
 815 test_parser name: "open tag with attributes", \
 816         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
 817         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
 818         errors: 1 # no close tag
 819 test_parser name: "open tag with attributes of various quotings", \
 820         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
 821         expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
 822         errors: 1 # no close tag
 823 test_parser name: "attribute entity exceptions dq", \
 824         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
 825         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
 826         errors: 2 # no close tag, &amp= in attr
 827 test_parser name: "attribute entity exceptions sq", \
 828         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
 829         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
 830         errors: 2 # no close tag, &amp= in attr
 831 test_parser name: "attribute entity exceptions uq", \
 832         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
 833         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
 834         errors: 2 # no close tag, &amp= in attr
 835 test_parser name: "matching closing tags", \
 836         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
 837         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
 838         errors: 0
 839 test_parser name: "mis-matched closing tags", \
 840         html: "foo<div>bar<span>baz</div>qux",
 841         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
 842         errors: 1 # close tag mismatch