# HTML parser meant to run in a browser, in support of WYSIWYG editor # Copyright 2015 Jason Woofenden # # This program is free software: you can redistribute it and/or modify it under # the terms of the GNU Affero General Public License as published by the Free # Software Foundation, either version 3 of the License, or (at your option) any # later version. # # This program is distributed in the hope that it will be useful, but WITHOUT # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more # details. # # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . # This file implements a parser for html snippets, meant to be used by a # WYSIWYG editor. Hence it does not attempt to parse doctypes, , # or tags, nor does it produce the top level "document" node in the dom # tree, nor nodes for html, head or body. # # Instead, the data structure produced by this parser is an array of nodes. # # Each node is an array. The first element in the array is an integer (one of # the TYPE_* constants below) followed by the appropriate fields for that type # (shown below in the comments after the TYPE_* definition.) TYPE_TAG = 0 # name, {attributes}, [children] TYPE_TEXT = 1 # "text" TYPE_COMMENT = 2 TYPE_DOCTYPE = 3 # the following types are emited by the tokenizer, but shouldn't end up in the tree: TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children] TYPE_END_TAG = 5 # name TYPE_EOF = 6 lc_alpha = "abcdefghijklmnopqrstuvwxqz" uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" digits = "0123456789" alnum = lc_alpha + uc_alpha + digits hex_chars = digits + "abcdefABCDEF" scopers = { # FIXME these are supposed to be namespace specific 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object', 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml', 'foreignObject', 'desc', 'title' } # some SVG elements have dashes in them tag_name_chars = alnum + "-" # http://www.w3.org/TR/html5/infrastructure.html#space-character space_chars = "\u0009\u000a\u000c\u000d\u0020" # https://en.wikipedia.org/wiki/Whitespace_character#Unicode whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000" # These are the character references that don't need a terminating semicolon # min length: 2, max: 6, none are a prefix of any other. legacy_char_refs = { Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ', aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å', aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦', Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©', curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê', ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë', euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>', Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì', igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<', lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬', Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô', Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø', Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±', pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§', shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ', times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù', ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý', yen: '¥', yuml: 'ÿ' } void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'] raw_text_elements = ['script', 'style'] escapable_raw_text_elements = ['textarea', 'title'] # http://www.w3.org/TR/SVG/ 1.1 (Second Edition) svg_elements = [ 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor', 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile', 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix', 'feComponentTransfer', 'feComposite', 'feConvolveMatrix', 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood', 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage', 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight', 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter', 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src', 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern', 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata', 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline', 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg', 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use', 'view', 'vkern' ] # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition mathml_elements = [ 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos', 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech', 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card', 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain', 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot', 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree', 'determinant', 'diff', 'divergence', 'divide', 'domain', 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma', 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor', 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary', 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect', 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit', 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup', 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median', 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min', 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode', 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts', 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline', 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup', 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers', 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset', 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece', 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient', 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct', 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff', 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto', 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector', 'vectorproduct', 'xor' ] # foreign_elements = [svg_elements..., mathml_elements...] #normal_elements = All other allowed HTML elements are normal elements. special_elements = { # from HTML: address: true, applet: true, area: true, article: true, aside: true, base: true, basefont: true, bgsound: true, blockquote: true, body: true, br: true, button: true, caption: true, center: true, col: true, colgroup: true, dd: true, details: true, dir: true, div: true, dl: true, dt: true, embed: true, fieldset: true, figcaption: true, figure: true, footer: true, form: true, frame: true, frameset: true, h1: true, h2: true, h3: true, h4: true, h5: true, h6: true, head: true, header: true, hgroup: true, hr: true, html: true, iframe: true, img: true, input: true, isindex: true, li: true, link: true, listing: true, main: true, marquee: true, meta: true, nav: true, noembed: true, noframes: true, noscript: true, object: true, ol: true, p: true, param: true, plaintext: true, pre: true, script: true, section: true, select: true, source: true, style: true, summary: true, table: true, tbody: true, td: true, template: true, textarea: true, tfoot: true, th: true, thead: true, title: true, tr: true, track: true, ul: true, wbr: true, xmp: true, # from MathML: mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true, # from SVG: foreignObject: true, desc: true, title: true } formatting_elements = { a: true, b: true, big: true, code: true, em: true, font: true, i: true, nobr: true, s: true, small: true, strike: true, strong: true, tt: true, u: true } # decode_named_char_ref() # # The list of named character references is _huge_ so ask the browser to decode # for us instead of wasting bandwidth/space on including the table here. # # Pass without the "&" but with the ";" examples: # for "&" pass "amp;" # for "′" pass "x2032;" g_dncr = { cache: {} textarea: document.createElement('textarea') } # TODO test this in IE8 decode_named_char_ref = (txt) -> txt = "&#{txt}" decoded = g_dncr.cache[txt] return decoded if decoded? g_dncr.textarea.innerHTML = txt decoded = g_dncr.textarea.value return null if decoded is txt return g_dncr.cache[txt] = decoded parse_html = (txt) -> cur = 0 # index of next char in txt to be parsed # declare tree and tokenizer variables so they're in scope below tree = null open_tags = [] # stack of open elements tree_state = null tok_state = null tok_cur_tag = null # partially parsed tag flag_frameset_ok = null flag_parsing = null parse_error = -> console.log "Parse error at character #{cur} of #{txt.length}" # the functions below impliment the Tree Contstruction algorithm # http://www.w3.org/TR/html5/syntax.html#tree-construction # But first... the helpers template_tag_is_open = -> for t of open_tags if t[0] is TYPE_TAG and t[1] is 'template' return true return false is_in_scope = (tag_name) -> for t of open_tags if t[0] is TYPE_TAG and t[1] is tag_name return true # FIXME bail if in scopers return false reconstruct_active_formatting_elements = -> # FIXME implement this # http://www.w3.org/TR/html5/syntax.html#close-a-p-element # FIXME implement this close_p_if_in_button_scope = -> if open_tags[0][1] is 'p' # FIXME open_tags.pop() return #p = find_button_scope 'p' #if p? # TODO generate_implied_end_tags except for p tags # TODO parse_error unless open_tags[0][1] is 'p' # TODO pop stack until 'p' popped # http://www.w3.org/TR/html5/syntax.html#insert-a-character tree_insert_a_character = (t) -> # FIXME read spec for "adjusted insertion location, etc, this might be wrong if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT open_tags[0][3][open_tags[0][3].length - 1][1] += t[1] else open_tags[0][3].push t # FIXME read spec, do this right # note: this assumes it's an open tag tree_insert_tag = (t) -> t[0] = TYPE_TAG # not TYPE_OPEN_TAG # convert attributes into a hash attrs = {} while t[2].length a = t[2].pop() attrs[a[0]] = a[1] t[2] = attrs open_tags[0][3].push t open_tags.unshift t # http://www.w3.org/TR/html5/syntax.html#insert-a-comment tree_insert_a_comment = (t) -> # FIXME read spec for "adjusted insertion location, etc, this might be wrong open_tags[0][3].push t # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody tree_in_body = (t) -> switch t[0] when TYPE_TEXT switch t[1] when "\u0000" parse_error() when "\t", "\u000a", "\u000c", "\u000d", ' ' reconstruct_active_formatting_elements() tree_insert_a_character t else reconstruct_active_formatting_elements() tree_insert_a_character t flag_frameset_ok = false when TYPE_COMMENT tree_insert_a_comment t when TYPE_DOCTYPE parse_error() when TYPE_OPEN_TAG switch t[1] when 'html' parse_error() return if template_tag_is_open() root_attrs = open_tags[open_tags.length - 1][3] for k, v of t[2] root_attrs[k] = v unless root_attrs[k]? when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title' # FIXME also do this for (end tag) return tree_in_head t when 'body' parse_error() # TODO when 'frameset' parse_error() # TODO when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul' close_p_if_in_button_scope() tree_insert_tag t when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' close_p_if_in_button_scope() if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] parse_error() open_tags.shift() tree_insert_tag t # TODO lots more to implement here else # any other start tag reconstruct_active_formatting_elements() tree_insert_tag t when TYPE_EOF ok_tags = { dd: true, dt: true, li: true, p: true, tbody: true, td: true, tfoot: true, th: true, thead: true, tr: true, body: true, html: true, } for t in open_tags unless ok_tags[t[1]]? parse_error() break # TODO stack of template insertion modes thing flag_parsing = false # stop parsing when TYPE_END_TAG switch t[1] when 'body' unless is_in_scope 'body' parse_error() return # TODO implement parse error and move to tree_after_body when 'html' unless is_in_scope 'body' # weird, but it's what the spec says parse_error() return # TODO implement parse error and move to tree_after_body, reprocess # TODO lots more close tags to implement here else for node, i in open_tags if node[1] is t[1] # FIXME generate implied end tags except those with name==t[1] parse_error() unless i is 0 while i > 0 open_tags.shift() i -= 1 open_tags.shift() return if special_elements[node[1]]? parse_error() return # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state tok_state_data = -> switch c = txt.charAt(cur++) when '&' return [TYPE_TEXT, tokenize_character_reference()] when '<' tok_state = tok_state_tag_open when "\u0000" parse_error() return [TYPE_TEXT, c] when '' # EOF return [TYPE_EOF] else return [TYPE_TEXT, c] return null # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state # not needed: tok_state_character_reference_in_data = -> # just call tok_state_character_reference_in_data() # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state tok_state_tag_open = -> switch c = txt.charAt(cur++) when '!' tok_state = tok_state_markup_declaration_open when '/' tok_state = tok_state_end_tag_open when '?' parse_error() tok_state = tok_state_bogus_comment else if lc_alpha.indexOf(c) > -1 tok_cur_tag = [TYPE_OPEN_TAG, c, [], []] tok_state = tok_state_tag_name else if uc_alpha.indexOf(c) > -1 tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []] tok_state = tok_state_tag_name else parse_error() tok_state = tok_state_data cur -= 1 # we didn't parse/handle the char after < return [TYPE_TEXT, '<'] return null # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state tok_state_end_tag_open = -> switch c = txt.charAt(cur++) when '>' parse_error() tok_state = tok_state_data when '' # EOF parse_error() tok_state = tok_state_data return [TYPE_TEXT, ' -1 tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []] tok_state = tok_state_tag_name else if lc_alpha.indexOf(c) > -1 tok_cur_tag = [TYPE_END_TAG, c, [], []] tok_state = tok_state_tag_name else parse_error() tok_state = tok_state_bogus_comment return null # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state tok_state_tag_name = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '/' tok_state = tok_state_self_closing_start_tag when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" parse_error() tok_cur_tag[1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 tok_cur_tag[1] += c.toLowerCase() else tok_cur_tag[1] += c return null # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state tok_state_before_attribute_name = -> attr_name = null switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' return null when '/' tok_state = tok_state_self_closing_start_tag return null when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" parse_error() attr_name = "\ufffd" when '"', "'", '<', '=' parse_error() attr_name = c when '' # EOF parse_error() tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 attr_name = c.toLowerCase() else attr_name = c if attr_name? tok_cur_tag[2].unshift [attr_name, ''] tok_state = tok_state_attribute_name return null # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state tok_state_attribute_name = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' tok_state = tok_state_after_attribute_name when '/' tok_state = tok_state_self_closing_start_tag when '=' tok_state = tok_state_before_attribute_value when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" parse_error() tok_cur_tag[2][0][0] = "\ufffd" when '"', "'", '<' parse_error() tok_cur_tag[2][0][0] = c when '' # EOF parse_error() tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 tok_cur_tag[2][0][0] = c.toLowerCase() else tok_cur_tag[2][0][0] += c return null # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state tok_state_before_attribute_value = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' return null when '"' tok_state = tok_state_attribute_value_double_quoted when '&' tok_state = tok_state_attribute_value_unquoted cur -= 1 when "'" tok_state = tok_state_attribute_value_single_quoted when "\u0000" # Parse error tok_cur_tag[2][0][1] += "\ufffd" tok_state = tok_state_attribute_value_unquoted when '>' # Parse error tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when '' # EOF parse_error() tok_state = tok_state_data else tok_cur_tag[2][0][1] += c tok_state = tok_state_attribute_value_unquoted return null # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state tok_state_attribute_value_double_quoted = -> switch c = txt.charAt(cur++) when '"' tok_state = tok_state_after_attribute_value_quoted when '&' tok_cur_tag[2][0][1] += tokenize_character_reference '"', true when "\u0000" # Parse error tok_cur_tag[2][0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else tok_cur_tag[2][0][1] += c return null # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state tok_state_attribute_value_single_quoted = -> switch c = txt.charAt(cur++) when "'" tok_state = tok_state_after_attribute_value_quoted when '&' tok_cur_tag[2][0][1] += tokenize_character_reference "'", true when "\u0000" # Parse error tok_cur_tag[2][0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else tok_cur_tag[2][0][1] += c return null # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state tok_state_attribute_value_unquoted = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '&' tok_cur_tag[2][0][1] += tokenize_character_reference '>', true when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" tok_cur_tag[2][0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else # Parse Error if ', <, = or ` (backtick) tok_cur_tag[2][0][1] += c return null # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state tok_state_after_attribute_value_quoted = -> switch c = txt.charAt(cur++) when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '/' tok_state = tok_state_self_closing_start_tag when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when '' # EOF parse_error() tok_state = tok_state_data else # Parse Error tok_state = tok_state_before_attribute_name cur -= 1 # we didn't handle that char return null # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference # Don't set this as a state, just call it # returns a string (NOT a text node) tokenize_character_reference = (allowed_char = null, in_attr = false) -> if cur >= txt.length return '&' switch c = txt.charAt(cur) when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char # explicitly not a parse error return '&' when ';' # there has to be "one or more" alnums between & and ; to be a parse error return '&' when '#' if cur + 1 >= txt.length return '&' if txt.charAt(cur + 1).toLowerCase() is 'x' prefix = '#x' charset = hex_chars start = cur + 2 else charset = digits start = cur + 1 prefix = '#' i = 0 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1 i += 1 if i is 0 return '&' if txt.charAt(start + i) is ';' i += 1 # FIXME This is supposed to generate parse errors for some chars decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase()) if decoded? cur = start + i return decoded return '&' else for i in [0...31] if alnum.indexOf(txt.charAt(cur + i)) is -1 break if i is 0 # exit early, because parse_error() below needs at least one alnum return '&' if txt.charAt(cur + i) is ';' i += 1 # include ';' terminator in value decoded = decode_named_char_ref txt.substr(cur, i) if decoded? cur += i return decoded parse_error() return '&' else # no ';' terminator (only legacy char refs) max = i for i in [2..max] # no prefix matches, so ok to check shortest first c = legacy_char_refs[txt.substr(cur, i)] if c? if in_attr if txt.charAt(cur + i) is '=' # "because some legacy user agents will # misinterpret the markup in those cases" parse_error() return '&' if alnum.indexOf(txt.charAt(cur + i)) > -1 # this makes attributes forgiving about url args return '&' # ok, and besides the weird exceptions for attributes... # return the matching char cur += i # consume entity chars parse_error() # because no terminating ";" return c parse_error() return '&' return # never reached # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data tree = [TYPE_TAG, 'html', {}, []] open_tags = [tree] tree_state = tree_in_body flag_frameset_ok = true flag_parsing = true # tokenizer initialization tok_state = tok_state_data # proccess input while flag_parsing t = tok_state() if t? tree_state t return tree[3] # everything below is tests on the above test_equals = (description, fn, args..., expected_output) -> output = fn.apply this, args if output is expected_output console.log "passed: #{description}." else console.log "FAILED: #{description}..." console.log " Expected: #{expected_output}" console.log " Actual: #{output}" html_to_json = (html) -> return JSON.stringify parse_html html test_equals "empty", html_to_json, "", '[]' test_equals "just text", html_to_json, "abc", '[[1,"abc"]]' test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]' test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]' test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]' test_equals "open tag", html_to_json, "foobar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]' test_equals "open tag with attributes", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]' test_equals "open tag with attributes of various quotings", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]' test_equals "attribute entity exceptions dq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]' test_equals "attribute entity exceptions sq", html_to_json, "foo bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]' test_equals "attribute entity exceptions uq", html_to_json, "foo bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]' test_equals "matching closing tags", html_to_json, "foo hi

foo

bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]' test_equals "mis-matched closing tags", html_to_json, "foo

barbaz

qux", '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]'