parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body.
  22 #
  23 # Instead, the data structure produced by this parser is an array of nodes.
  24 #
  25 # Each node is an obect of the Node class. Here are the Node types:
  26 TYPE_TAG = 0 # name, {attributes}, [children]
  27 TYPE_TEXT = 1 # "text"
  28 TYPE_COMMENT = 2
  29 TYPE_DOCTYPE = 3
  30 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  31 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  32 TYPE_END_TAG = 5 # name
  33 TYPE_EOF = 6
  34
  35 class Node
  36         constructor: (type, args = {}) ->
  37                 @type = type # one of the TYPE_* constants above
  38                 @name = args.name ? '' # tag name
  39                 @text = args.text ? '' # contents for text/comment nodes
  40                 @attrs = args.attrs ? {}
  41                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
  42                 @children = args.children ? []
  43         serialize: -> # for unit tests
  44                 ret = ''
  45                 switch @type
  46                         when TYPE_TAG
  47                                 ret += 'tag:'
  48                                 ret += JSON.stringify @name
  49                                 ret += ','
  50                                 ret += JSON.stringify @attrs
  51                                 ret += ','
  52                                 sep = '['
  53                                 for c in @children
  54                                         ret += sep
  55                                         sep = ','
  56                                         ret += c.serialize()
  57                                 ret += ']'
  58                         when TYPE_TEXT
  59                                 ret += 'text:'
  60                                 ret += JSON.stringify @text
  61                         when TYPE_COMMENT
  62                                 ret += 'comment:'
  63                                 ret += JSON.stringify @text
  64                         when TYPE_DOCTYPE
  65                                 ret += 'doctype'
  66                                 # FIXME
  67                         else
  68                                 ret += 'unknown:'
  69                 return ret
  70
  71
  72 # helpers: (only take args that are normally known when parser creates nodes)
  73 new_open_tag = (name) ->
  74         return new Node TYPE_OPEN_TAG, name: name
  75 new_end_tag = (name) ->
  76         return new Node TYPE_END_TAG, name: name
  77 new_text_node = (txt) ->
  78         return new Node TYPE_TEXT, text: txt
  79 new_comment_node = (txt) ->
  80         return new Node TYPE_COMMENT, text: txt
  81 new_eof_token = ->
  82         return new Node TYPE_EOF
  83
  84 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
  85 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
  86 digits = "0123456789"
  87 alnum = lc_alpha + uc_alpha + digits
  88 hex_chars = digits + "abcdefABCDEF"
  89
  90 # some SVG elements have dashes in them
  91 tag_name_chars = alnum + "-"
  92
  93 # http://www.w3.org/TR/html5/infrastructure.html#space-character
  94 space_chars = "\u0009\u000a\u000c\u000d\u0020"
  95
  96 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
  97 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
  98
  99 # These are the character references that don't need a terminating semicolon
 100 # min length: 2, max: 6, none are a prefix of any other.
 101 legacy_char_refs = {
 102         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 103         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 104         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 105         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 106         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 107         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 108         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 109         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 110         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 111         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 112         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 113         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 114         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 115         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 116         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 117         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 118         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 119         yen: '¥', yuml: 'ÿ'
 120 }
 121
 122 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 123 raw_text_elements = ['script', 'style']
 124 escapable_raw_text_elements = ['textarea', 'title']
 125 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 126 svg_elements = [
 127         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 128         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 129         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 130         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 131         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 132         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 133         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 134         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 135         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 136         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 137         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 138         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 139         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 140         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 141         'view', 'vkern'
 142 ]
 143
 144 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 145 mathml_elements = [
 146         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 147         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 148         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 149         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 150         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 151         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 152         'determinant', 'diff', 'divergence', 'divide', 'domain',
 153         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 154         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 155         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 156         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 157         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 158         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 159         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 160         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 161         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 162         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 163         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 164         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 165         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 166         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 167         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 168         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 169         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 170         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 171         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 172         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 173         'vectorproduct', 'xor'
 174 ]
 175 # foreign_elements = [svg_elements..., mathml_elements...]
 176 #normal_elements = All other allowed HTML elements are normal elements.
 177
 178 special_elements = {
 179         # from HTML:
 180         address: true, applet: true, area: true, article: true, aside: true,
 181         base: true, basefont: true, bgsound: true, blockquote: true, body: true,
 182         br: true, button: true, caption: true, center: true, col: true,
 183         colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
 184         dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
 185         footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
 186         h3: true, h4: true, h5: true, h6: true, head: true, header: true,
 187         hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
 188         isindex: true, li: true, link: true, listing: true, main: true,
 189         marquee: true, meta: true, nav: true, noembed: true, noframes: true,
 190         noscript: true, object: true, ol: true, p: true, param: true,
 191         plaintext: true, pre: true, script: true, section: true, select: true,
 192         source: true, style: true, summary: true, table: true, tbody: true,
 193         td: true, template: true, textarea: true, tfoot: true, th: true,
 194         thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
 195         xmp: true,
 196
 197         # from MathML:
 198         mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
 199
 200         # from SVG:
 201         foreignObject: true, desc: true, title: true
 202 }
 203
 204 formatting_elements = {
 205          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 206          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 207          u: true
 208 }
 209
 210
 211 # decode_named_char_ref()
 212 #
 213 # The list of named character references is _huge_ so ask the browser to decode
 214 # for us instead of wasting bandwidth/space on including the table here.
 215 #
 216 # Pass without the "&" but with the ";" examples:
 217 #    for "&amp" pass "amp;"
 218 #    for "&#x2032" pass "x2032;"
 219 g_dncr = {
 220         cache: {}
 221         textarea: document.createElement('textarea')
 222 }
 223 # TODO test this in IE8
 224 decode_named_char_ref = (txt) ->
 225         txt = "&#{txt}"
 226         decoded = g_dncr.cache[txt]
 227         return decoded if decoded?
 228         g_dncr.textarea.innerHTML = txt
 229         decoded = g_dncr.textarea.value
 230         return null if decoded is txt
 231         return g_dncr.cache[txt] = decoded
 232
 233 parse_html = (txt, parse_error_cb = null) ->
 234         cur = 0 # index of next char in txt to be parsed
 235         # declare tree and tokenizer variables so they're in scope below
 236         tree = null
 237         open_tags = [] # stack of open elements
 238         tree_state = null
 239         tok_state = null
 240         tok_cur_tag = null # partially parsed tag
 241         flag_frameset_ok = null
 242         flag_parsing = null
 243
 244         parse_error = ->
 245                 if parse_error_cb?
 246                         parse_error_cb cur
 247                 else
 248                         console.log "Parse error at character #{cur} of #{txt.length}"
 249
 250
 251         # the functions below impliment the Tree Contstruction algorithm
 252         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 253
 254         # But first... the helpers
 255         template_tag_is_open = ->
 256                 for t in open_tags
 257                         if t.type is TYPE_TAG and t.name is 'template'
 258                                 return true
 259                 return false
 260         is_in_scope_x = (tag_name, scope) ->
 261                 for t in open_tags
 262                         if t.name is tag_name
 263                                 return true
 264                         if t.name of scope
 265                                 return false
 266                 return false
 267         is_in_scope_x_y = (tag_name, scope, scope2) ->
 268                 for t in open_tags
 269                         if t.name is tag_name
 270                                 return true
 271                         if t.name of scope
 272                                 return false
 273                         if t.name of scope2
 274                                 return false
 275                 return false
 276         standard_scopers = { # FIXME these are supposed to be namespace specific
 277                 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
 278                 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
 279                 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
 280                 'foreignObject': true, 'desc': true, 'title'
 281         }
 282         button_scopers = button: true
 283         li_scopers = ol: true, ul: true
 284         table_scopers = html: true, table: true, template: true
 285         is_in_scope = (tag_name) ->
 286                 return is_in_scope_x tag_name, standard_scopers
 287         is_in_button_scope = (tag_name) ->
 288                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers
 289         is_in_table_scope = (tag_name) ->
 290                 return is_in_scope_x tag_name, table_scopers
 291         is_in_select_scope = (tag_name) ->
 292                 for t in open_tags
 293                         if t.name is tag_name
 294                                 return true
 295                         if t.name isnt 'optgroup' and t.name isnt 'option'
 296                                 return false
 297                 return false
 298
 299         reconstruct_active_formatting_elements = ->
 300                 # FIXME implement this
 301
 302         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 303         # FIXME implement this
 304         close_p_if_in_button_scope = ->
 305                 if open_tags[0].name is 'p'
 306                         open_tags.pop()
 307                 return
 308                 #p = find_button_scope 'p'
 309                 #if p?
 310                         # TODO generate_implied_end_tags except for p tags
 311                         # TODO parse_error unless open_tags[0].name is 'p'
 312                         # TODO pop stack until 'p' popped
 313
 314
 315
 316         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 317         tree_insert_a_character = (t) ->
 318                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 319                 dest = open_tags[0].children
 320                 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
 321                         dest[dest.length - 1].text += t.text
 322                 else
 323                         dest.push t
 324
 325         # FIXME read spec, do this right
 326         # note: this assumes it's an open tag
 327         tree_insert_tag = (t) ->
 328                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
 329                 # convert attributes into a hash
 330                 while t.attrs_a.length
 331                         a = t.attrs_a.pop()
 332                         t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
 333                 open_tags[0].children.push t
 334                 open_tags.unshift t
 335
 336         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 337         tree_insert_a_comment = (t) ->
 338                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
 339                 open_tags[0].children.push t
 340
 341         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
 342         tree_in_body = (t) ->
 343                 switch t.type
 344                         when TYPE_TEXT
 345                                 switch t.text
 346                                         when "\u0000"
 347                                                 parse_error()
 348                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
 349                                                 reconstruct_active_formatting_elements()
 350                                                 tree_insert_a_character t
 351                                         else
 352                                                 reconstruct_active_formatting_elements()
 353                                                 tree_insert_a_character t
 354                                                 flag_frameset_ok = false
 355                         when TYPE_COMMENT
 356                                 tree_insert_a_comment t
 357                         when TYPE_DOCTYPE
 358                                 parse_error()
 359                         when TYPE_OPEN_TAG
 360                                 switch t.name
 361                                         when 'html'
 362                                                 parse_error()
 363                                                 return if template_tag_is_open()
 364                                                 root_attrs = open_tags[open_tags.length - 1].children
 365                                                 for k, v of t.attrs
 366                                                         root_attrs[k] = v unless root_attrs[k]?
 367                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
 368                                                 # FIXME also do this for </template> (end tag)
 369                                                 return tree_in_head t
 370                                         when 'body'
 371                                                 parse_error()
 372                                                 # TODO
 373                                         when 'frameset'
 374                                                 parse_error()
 375                                                 # TODO
 376                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
 377                                                 close_p_if_in_button_scope()
 378                                                 tree_insert_tag t
 379                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
 380                                                 close_p_if_in_button_scope()
 381                                                 if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 382                                                         parse_error()
 383                                                         open_tags.shift()
 384                                                 tree_insert_tag t
 385                                         # TODO lots more to implement here
 386                                         else # any other start tag
 387                                                 reconstruct_active_formatting_elements()
 388                                                 tree_insert_tag t
 389                         when TYPE_EOF
 390                                 ok_tags = {
 391                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
 392                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
 393                                 }
 394                                 for t in open_tags
 395                                         unless ok_tags[t.name]?
 396                                                 parse_error()
 397                                                 break
 398                                 # TODO stack of template insertion modes thing
 399                                 flag_parsing = false # stop parsing
 400                         when TYPE_END_TAG
 401                                 switch t.name
 402                                         when 'body'
 403                                                 unless is_in_scope 'body'
 404                                                         parse_error()
 405                                                         return
 406                                                 # TODO implement parse error and move to tree_after_body
 407                                         when 'html'
 408                                                 unless is_in_scope 'body' # weird, but it's what the spec says
 409                                                         parse_error()
 410                                                         return
 411                                                 # TODO implement parse error and move to tree_after_body, reprocess
 412                                         # TODO lots more close tags to implement here
 413                                         else
 414                                                 for node, i in open_tags
 415                                                         if node.name is t.name
 416                                                                 # FIXME generate implied end tags except those with name==t.name
 417                                                                 parse_error() unless i is 0
 418                                                                 while i > 0
 419                                                                         open_tags.shift()
 420                                                                         i -= 1
 421                                                                 open_tags.shift()
 422                                                                 return
 423                                                         if special_elements[node.name]?
 424                                                                 parse_error()
 425                                                                 return
 426
 427
 428         # the functions below implement the tokenizer stats described here:
 429         # http://www.w3.org/TR/html5/syntax.html#tokenization
 430
 431         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 432         tok_state_data = ->
 433                 switch c = txt.charAt(cur++)
 434                         when '&'
 435                                 return new_text_node tokenize_character_reference()
 436                         when '<'
 437                                 tok_state = tok_state_tag_open
 438                         when "\u0000"
 439                                 parse_error()
 440                                 return new_text_node c
 441                         when '' # EOF
 442                                 return new_eof_token()
 443                         else
 444                                 return new_text_node c
 445                 return null
 446
 447         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
 448         # not needed: tok_state_character_reference_in_data = ->
 449         # just call tok_state_character_reference_in_data()
 450
 451         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 452         tok_state_tag_open = ->
 453                 switch c = txt.charAt(cur++)
 454                         when '!'
 455                                 tok_state = tok_state_markup_declaration_open
 456                         when '/'
 457                                 tok_state = tok_state_end_tag_open
 458                         when '?'
 459                                 parse_error()
 460                                 tok_state = tok_state_bogus_comment
 461                         else
 462                                 if lc_alpha.indexOf(c) > -1
 463                                         tok_cur_tag = new_open_tag c
 464                                         tok_state = tok_state_tag_name
 465                                 else if uc_alpha.indexOf(c) > -1
 466                                         tok_cur_tag = new_open_tag c.toLowerCase()
 467                                         tok_state = tok_state_tag_name
 468                                 else
 469                                         parse_error()
 470                                         tok_state = tok_state_data
 471                                         cur -= 1 # we didn't parse/handle the char after <
 472                                         return new_text_node '<'
 473                 return null
 474
 475         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
 476         tok_state_end_tag_open = ->
 477                 switch c = txt.charAt(cur++)
 478                         when '>'
 479                                 parse_error()
 480                                 tok_state = tok_state_data
 481                         when '' # EOF
 482                                 parse_error()
 483                                 tok_state = tok_state_data
 484                                 return new_text_node '</'
 485                         else
 486                                 if uc_alpha.indexOf(c) > -1
 487                                         tok_cur_tag = new_end_tag c.toLowerCase()
 488                                         tok_state = tok_state_tag_name
 489                                 else if lc_alpha.indexOf(c) > -1
 490                                         tok_cur_tag = new_end_tag c
 491                                         tok_state = tok_state_tag_name
 492                                 else
 493                                         parse_error()
 494                                         tok_state = tok_state_bogus_comment
 495                 return null
 496
 497         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 498         tok_state_tag_name = ->
 499                 switch c = txt.charAt(cur++)
 500                         when "\t", "\n", "\u000c", ' '
 501                                 tok_state = tok_state_before_attribute_name
 502                         when '/'
 503                                 tok_state = tok_state_self_closing_start_tag
 504                         when '>'
 505                                 tok_state = tok_state_data
 506                                 tmp = tok_cur_tag
 507                                 tok_cur_tag = null
 508                                 return tmp
 509                         when "\u0000"
 510                                 parse_error()
 511                                 tok_cur_tag.name += "\ufffd"
 512                         when '' # EOF
 513                                 parse_error()
 514                                 tok_state = tok_state_data
 515                         else
 516                                 if uc_alpha.indexOf(c) > -1
 517                                         tok_cur_tag.name += c.toLowerCase()
 518                                 else
 519                                         tok_cur_tag.name += c
 520                 return null
 521
 522         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 523         tok_state_before_attribute_name = ->
 524                 attr_name = null
 525                 switch c = txt.charAt(cur++)
 526                         when "\t", "\n", "\u000c", ' '
 527                                 return null
 528                         when '/'
 529                                 tok_state = tok_state_self_closing_start_tag
 530                                 return null
 531                         when '>'
 532                                 tok_state = tok_state_data
 533                                 tmp = tok_cur_tag
 534                                 tok_cur_tag = null
 535                                 return tmp
 536                         when "\u0000"
 537                                 parse_error()
 538                                 attr_name = "\ufffd"
 539                         when '"', "'", '<', '='
 540                                 parse_error()
 541                                 attr_name = c
 542                         when '' # EOF
 543                                 parse_error()
 544                                 tok_state = tok_state_data
 545                         else
 546                                 if uc_alpha.indexOf(c) > -1
 547                                         attr_name = c.toLowerCase()
 548                                 else
 549                                         attr_name = c
 550                 if attr_name?
 551                         tok_cur_tag.attrs_a.unshift [attr_name, '']
 552                         tok_state = tok_state_attribute_name
 553                 return null
 554
 555         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
 556         tok_state_attribute_name = ->
 557                 switch c = txt.charAt(cur++)
 558                         when "\t", "\n", "\u000c", ' '
 559                                 tok_state = tok_state_after_attribute_name
 560                         when '/'
 561                                 tok_state = tok_state_self_closing_start_tag
 562                         when '='
 563                                 tok_state = tok_state_before_attribute_value
 564                         when '>'
 565                                 tok_state = tok_state_data
 566                                 tmp = tok_cur_tag
 567                                 tok_cur_tag = null
 568                                 return tmp
 569                         when "\u0000"
 570                                 parse_error()
 571                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
 572                         when '"', "'", '<'
 573                                 parse_error()
 574                                 tok_cur_tag.attrs_a[0][0] = c
 575                         when '' # EOF
 576                                 parse_error()
 577                                 tok_state = tok_state_data
 578                         else
 579                                 if uc_alpha.indexOf(c) > -1
 580                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
 581                                 else
 582                                         tok_cur_tag.attrs_a[0][0] += c
 583                 return null
 584
 585         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
 586         tok_state_before_attribute_value = ->
 587                 switch c = txt.charAt(cur++)
 588                         when "\t", "\n", "\u000c", ' '
 589                                 return null
 590                         when '"'
 591                                 tok_state = tok_state_attribute_value_double_quoted
 592                         when '&'
 593                                 tok_state = tok_state_attribute_value_unquoted
 594                                 cur -= 1
 595                         when "'"
 596                                 tok_state = tok_state_attribute_value_single_quoted
 597                         when "\u0000"
 598                                 # Parse error
 599                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 600                                 tok_state = tok_state_attribute_value_unquoted
 601                         when '>'
 602                                 # Parse error
 603                                 tok_state = tok_state_data
 604                                 tmp = tok_cur_tag
 605                                 tok_cur_tag = null
 606                                 return tmp
 607                         when '' # EOF
 608                                 parse_error()
 609                                 tok_state = tok_state_data
 610                         else
 611                                 tok_cur_tag.attrs_a[0][1] += c
 612                                 tok_state = tok_state_attribute_value_unquoted
 613                 return null
 614
 615         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
 616         tok_state_attribute_value_double_quoted = ->
 617                 switch c = txt.charAt(cur++)
 618                         when '"'
 619                                 tok_state = tok_state_after_attribute_value_quoted
 620                         when '&'
 621                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
 622                         when "\u0000"
 623                                 # Parse error
 624                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 625                         when '' # EOF
 626                                 parse_error()
 627                                 tok_state = tok_state_data
 628                         else
 629                                 tok_cur_tag.attrs_a[0][1] += c
 630                 return null
 631
 632         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
 633         tok_state_attribute_value_single_quoted = ->
 634                 switch c = txt.charAt(cur++)
 635                         when "'"
 636                                 tok_state = tok_state_after_attribute_value_quoted
 637                         when '&'
 638                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
 639                         when "\u0000"
 640                                 # Parse error
 641                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 642                         when '' # EOF
 643                                 parse_error()
 644                                 tok_state = tok_state_data
 645                         else
 646                                 tok_cur_tag.attrs_a[0][1] += c
 647                 return null
 648
 649         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
 650         tok_state_attribute_value_unquoted = ->
 651                 switch c = txt.charAt(cur++)
 652                         when "\t", "\n", "\u000c", ' '
 653                                 tok_state = tok_state_before_attribute_name
 654                         when '&'
 655                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
 656                         when '>'
 657                                 tok_state = tok_state_data
 658                                 tmp = tok_cur_tag
 659                                 tok_cur_tag = null
 660                                 return tmp
 661                         when "\u0000"
 662                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
 663                         when '' # EOF
 664                                 parse_error()
 665                                 tok_state = tok_state_data
 666                         else
 667                                 # Parse Error if ', <, = or ` (backtick)
 668                                 tok_cur_tag.attrs_a[0][1] += c
 669                 return null
 670
 671         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
 672         tok_state_after_attribute_value_quoted = ->
 673                 switch c = txt.charAt(cur++)
 674                         when "\t", "\n", "\u000c", ' '
 675                                 tok_state = tok_state_before_attribute_name
 676                         when '/'
 677                                 tok_state = tok_state_self_closing_start_tag
 678                         when '>'
 679                                 tok_state = tok_state_data
 680                                 tmp = tok_cur_tag
 681                                 tok_cur_tag = null
 682                                 return tmp
 683                         when '' # EOF
 684                                 parse_error()
 685                                 tok_state = tok_state_data
 686                         else
 687                                 # Parse Error
 688                                 tok_state = tok_state_before_attribute_name
 689                                 cur -= 1 # we didn't handle that char
 690                 return null
 691
 692         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 693         # Don't set this as a state, just call it
 694         # returns a string (NOT a text node)
 695         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
 696                 if cur >= txt.length
 697                         return '&'
 698                 switch c = txt.charAt(cur)
 699                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
 700                                 # explicitly not a parse error
 701                                 return '&'
 702                         when ';'
 703                                 # there has to be "one or more" alnums between & and ; to be a parse error
 704                                 return '&'
 705                         when '#'
 706                                 if cur + 1 >= txt.length
 707                                         return '&'
 708                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
 709                                         prefix = '#x'
 710                                         charset = hex_chars
 711                                         start = cur + 2
 712                                 else
 713                                         charset = digits
 714                                         start = cur + 1
 715                                         prefix = '#'
 716                                 i = 0
 717                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 718                                         i += 1
 719                                 if i is 0
 720                                         return '&'
 721                                 if txt.charAt(start + i) is ';'
 722                                         i += 1
 723                                 # FIXME This is supposed to generate parse errors for some chars
 724                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 725                                 if decoded?
 726                                         cur = start + i
 727                                         return decoded
 728                                 return '&'
 729                         else
 730                                 for i in [0...31]
 731                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
 732                                                 break
 733                                 if i is 0
 734                                         # exit early, because parse_error() below needs at least one alnum
 735                                         return '&'
 736                                 if txt.charAt(cur + i) is ';'
 737                                         i += 1 # include ';' terminator in value
 738                                         decoded = decode_named_char_ref txt.substr(cur, i)
 739                                         if decoded?
 740                                                 cur += i
 741                                                 return decoded
 742                                         parse_error()
 743                                         return '&'
 744                                 else
 745                                         # no ';' terminator (only legacy char refs)
 746                                         max = i
 747                                         for i in [2..max] # no prefix matches, so ok to check shortest first
 748                                                 c = legacy_char_refs[txt.substr(cur, i)]
 749                                                 if c?
 750                                                         if in_attr
 751                                                                 if txt.charAt(cur + i) is '='
 752                                                                         # "because some legacy user agents will
 753                                                                         # misinterpret the markup in those cases"
 754                                                                         parse_error()
 755                                                                         return '&'
 756                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
 757                                                                         # this makes attributes forgiving about url args
 758                                                                         return '&'
 759                                                         # ok, and besides the weird exceptions for attributes...
 760                                                         # return the matching char
 761                                                         cur += i # consume entity chars
 762                                                         parse_error() # because no terminating ";"
 763                                                         return c
 764                                         parse_error()
 765                                         return '&'
 766                 return # never reached
 767
 768         # tree constructor initialization
 769         # see comments on TYPE_TAG/etc for the structure of this data
 770         tree = new Node TYPE_TAG, name: 'html'
 771         open_tags = [tree]
 772         tree_state = tree_in_body
 773         flag_frameset_ok = true
 774         flag_parsing = true
 775
 776         # tokenizer initialization
 777         tok_state = tok_state_data
 778
 779         # proccess input
 780         while flag_parsing
 781                 t = tok_state()
 782                 if t?
 783                         tree_state t
 784         return tree.children
 785
 786 # everything below is tests on the above
 787 test_equals = (description, output, expected_output) ->
 788         if output is expected_output
 789                 console.log "passed." # don't say name, so smart consoles can merge all of these
 790         else
 791                 console.log "FAILED: \"#{description}\""
 792                 console.log "   Expected: #{expected_output}"
 793                 console.log "     Actual: #{output}"
 794 test_parser = (args) ->
 795         parse_errors = []
 796         errors_cb = (i) ->
 797                 parse_errors.push i
 798         parsed = parse_html args.html, errors_cb
 799         serialized = ''
 800         sep = ''
 801         for t in parsed
 802                 serialized += sep
 803                 sep = ','
 804                 serialized += t.serialize()
 805         if serialized isnt args.expected or parse_errors.length isnt args.errors
 806                 console.log "FAILED: \"#{args.name}\""
 807         else
 808                 console.log "passed \"#{args.name}\""
 809         if serialized isnt args.expected
 810                 console.log "      Input: #{args.html}"
 811                 console.log "    Correct: #{args.expected}"
 812                 console.log "     Output: #{serialized}"
 813         if parse_errors.length isnt args.errors
 814                 console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
 815
 816 test_parser name: "empty", \
 817         html: "",
 818         expected: '',
 819         errors: 0
 820 test_parser name: "just text", \
 821         html: "abc",
 822         expected: 'text:"abc"',
 823         errors: 0
 824 test_parser name: "named entity", \
 825         html: "a&amp;1234",
 826         expected: 'text:"a&1234"',
 827         errors: 0
 828 test_parser name: "broken named character references", \
 829         html: "1&amp2&&amp;3&aabbcc;",
 830         expected: 'text:"1&2&&3&aabbcc;"',
 831         errors: 2
 832 test_parser name: "numbered entity overrides", \
 833         html: "1&#X80&#x80; &#x83",
 834         expected: 'text:"1€€ ƒ"',
 835         errors: 0
 836 test_parser name: "open tag", \
 837         html: "foo<span>bar",
 838         expected: 'text:"foo",tag:"span",{},[text:"bar"]',
 839         errors: 1 # no close tag
 840 test_parser name: "open tag with attributes", \
 841         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
 842         expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
 843         errors: 1 # no close tag
 844 test_parser name: "open tag with attributes of various quotings", \
 845         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
 846         expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
 847         errors: 1 # no close tag
 848 test_parser name: "attribute entity exceptions dq", \
 849         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
 850         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
 851         errors: 2 # no close tag, &amp= in attr
 852 test_parser name: "attribute entity exceptions sq", \
 853         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
 854         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
 855         errors: 2 # no close tag, &amp= in attr
 856 test_parser name: "attribute entity exceptions uq", \
 857         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
 858         expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
 859         errors: 2 # no close tag, &amp= in attr
 860 test_parser name: "matching closing tags", \
 861         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
 862         expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
 863         errors: 0
 864 test_parser name: "missing closing tag inside", \
 865         html: "foo<div>bar<span>baz</div>qux",
 866         expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
 867         errors: 1 # close tag mismatch
 868 test_parser name: "mis-matched closing tags", \
 869         html: "<span>12<div>34</span>56</div>78",
 870         expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
 871         errors: 2 # misplaced </span>, no </span> at the end
 872 test_parser name: "mis-matched formatting elements", \
 873         html: "12<b>34<i>56</b>78</i>90",
 874         expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
 875         errors: 2 # FIXME dunno how many there should be