X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=ef5545f0d603da284694c18c3dca69c857d62ccd;hb=41c743381bdcfdf5303ff0f23eaaf3e121e4ebef;hp=318422b89a0760fd8954a51824dcc2c273d59cbd;hpb=47d40ff2cb949e10270189a1b902d6ce7f4bf1f0;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 318422b..ef5545f 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -22,22 +22,76 @@ # # Instead, the data structure produced by this parser is an array of nodes. # -# Each node is an array. The first element in the array is an integer (one of -# the TYPE_* constants below) followed by the appropriate fields for that type -# (shown below in the comments after the TYPE_* definition.) - +# Each node is an obect of the Node class. Here are the Node types: TYPE_TAG = 0 # name, {attributes}, [children] TYPE_TEXT = 1 # "text" -TYPE_WHITESPACE = 2 -TYPE_COMMENT = 3 +TYPE_COMMENT = 2 +TYPE_DOCTYPE = 3 # the following types are emited by the tokenizer, but shouldn't end up in the tree: -TYPE_OPEN_TAG = 4 +TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children] +TYPE_END_TAG = 5 # name +TYPE_EOF = 6 + +class Node + constructor: (type, args = {}) -> + @type = type # one of the TYPE_* constants above + @name = args.name ? '' # tag name + @text = args.text ? '' # contents for text/comment nodes + @attrs = args.attrs ? {} + @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only + @children = args.children ? [] + serialize: -> # for unit tests + ret = '' + switch @type + when TYPE_TAG + ret += 'tag:' + ret += JSON.stringify @name + ret += ',' + ret += JSON.stringify @attrs + ret += ',' + sep = '[' + for c in @children + ret += sep + sep = ',' + ret += c.serialize() + ret += ']' + when TYPE_TEXT + ret += 'text:' + ret += JSON.stringify @text + when TYPE_COMMENT + ret += 'comment:' + ret += JSON.stringify @text + when TYPE_DOCTYPE + ret += 'doctype' + # FIXME + else + ret += 'unknown:' + return ret + + +# helpers: (only take args that are normally known when parser creates nodes) +new_open_tag = (name) -> + return new Node TYPE_OPEN_TAG, name: name +new_end_tag = (name) -> + return new Node TYPE_END_TAG, name: name +new_text_node = (txt) -> + return new Node TYPE_TEXT, text: txt +new_comment_node = (txt) -> + return new Node TYPE_COMMENT, text: txt +new_eof_token = -> + return new Node TYPE_EOF lc_alpha = "abcdefghijklmnopqrstuvwxqz" uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" digits = "0123456789" alnum = lc_alpha + uc_alpha + digits hex_chars = digits + "abcdefABCDEF" +scopers = { # FIXME these are supposed to be namespace specific + 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true, + 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true, + 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true, + 'foreignObject': true, 'desc': true, 'title' +} # some SVG elements have dashes in them tag_name_chars = alnum + "-" @@ -127,6 +181,38 @@ mathml_elements = [ # foreign_elements = [svg_elements..., mathml_elements...] #normal_elements = All other allowed HTML elements are normal elements. +special_elements = { + # from HTML: + address: true, applet: true, area: true, article: true, aside: true, + base: true, basefont: true, bgsound: true, blockquote: true, body: true, + br: true, button: true, caption: true, center: true, col: true, + colgroup: true, dd: true, details: true, dir: true, div: true, dl: true, + dt: true, embed: true, fieldset: true, figcaption: true, figure: true, + footer: true, form: true, frame: true, frameset: true, h1: true, h2: true, + h3: true, h4: true, h5: true, h6: true, head: true, header: true, + hgroup: true, hr: true, html: true, iframe: true, img: true, input: true, + isindex: true, li: true, link: true, listing: true, main: true, + marquee: true, meta: true, nav: true, noembed: true, noframes: true, + noscript: true, object: true, ol: true, p: true, param: true, + plaintext: true, pre: true, script: true, section: true, select: true, + source: true, style: true, summary: true, table: true, tbody: true, + td: true, template: true, textarea: true, tfoot: true, th: true, + thead: true, title: true, tr: true, track: true, ul: true, wbr: true, + xmp: true, + + # from MathML: + mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true, + + # from SVG: + foreignObject: true, desc: true, title: true +} + +formatting_elements = { + a: true, b: true, big: true, code: true, em: true, font: true, i: true, + nobr: true, s: true, small: true, strike: true, strong: true, tt: true, + u: true +} + # decode_named_char_ref() # @@ -150,34 +236,194 @@ decode_named_char_ref = (txt) -> return null if decoded is txt return g_dncr.cache[txt] = decoded -parse_html = (txt) -> +parse_html = (txt, parse_error_cb = null) -> cur = 0 # index of next char in txt to be parsed # declare tree and tokenizer variables so they're in scope below tree = null - tree_append_point = null + open_tags = [] # stack of open elements tree_state = null tok_state = null - tok_cur = null # partially parsed tag + tok_cur_tag = null # partially parsed tag + flag_frameset_ok = null + flag_parsing = null + + parse_error = -> + if parse_error_cb? + parse_error_cb cur + else + console.log "Parse error at character #{cur} of #{txt.length}" + + + # the functions below impliment the Tree Contstruction algorithm + # http://www.w3.org/TR/html5/syntax.html#tree-construction + + # But first... the helpers + template_tag_is_open = -> + for t of open_tags + if t.type is TYPE_TAG and t.name is 'template' + return true + return false + is_in_scope = (tag_name) -> + for t of open_tags + if t.name is tag_name + return true + if t.name of scopers + return false + return false + + reconstruct_active_formatting_elements = -> + # FIXME implement this + + # http://www.w3.org/TR/html5/syntax.html#close-a-p-element + # FIXME implement this + close_p_if_in_button_scope = -> + if open_tags[0].name is 'p' + open_tags.pop() + return + #p = find_button_scope 'p' + #if p? + # TODO generate_implied_end_tags except for p tags + # TODO parse_error unless open_tags[0].name is 'p' + # TODO pop stack until 'p' popped + + + + # http://www.w3.org/TR/html5/syntax.html#insert-a-character + tree_insert_a_character = (t) -> + # FIXME read spec for "adjusted insertion location, etc, this might be wrong + dest = open_tags[0].children + if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT + dest[dest.length - 1].text += t.text + else + dest.push t + + # FIXME read spec, do this right + # note: this assumes it's an open tag + tree_insert_tag = (t) -> + t.type = TYPE_TAG # not TYPE_OPEN_TAG + # convert attributes into a hash + while t.attrs_a.length + a = t.attrs_a.pop() + t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs + open_tags[0].children.push t + open_tags.unshift t + + # http://www.w3.org/TR/html5/syntax.html#insert-a-comment + tree_insert_a_comment = (t) -> + # FIXME read spec for "adjusted insertion location, etc, this might be wrong + open_tags[0].children.push t + + # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody + tree_in_body = (t) -> + switch t.type + when TYPE_TEXT + switch t.text + when "\u0000" + parse_error() + when "\t", "\u000a", "\u000c", "\u000d", ' ' + reconstruct_active_formatting_elements() + tree_insert_a_character t + else + reconstruct_active_formatting_elements() + tree_insert_a_character t + flag_frameset_ok = false + when TYPE_COMMENT + tree_insert_a_comment t + when TYPE_DOCTYPE + parse_error() + when TYPE_OPEN_TAG + switch t.name + when 'html' + parse_error() + return if template_tag_is_open() + root_attrs = open_tags[open_tags.length - 1].children + for k, v of t.attrs + root_attrs[k] = v unless root_attrs[k]? + when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title' + # FIXME also do this for (end tag) + return tree_in_head t + when 'body' + parse_error() + # TODO + when 'frameset' + parse_error() + # TODO + when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul' + close_p_if_in_button_scope() + tree_insert_tag t + when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' + close_p_if_in_button_scope() + if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + parse_error() + open_tags.shift() + tree_insert_tag t + # TODO lots more to implement here + else # any other start tag + reconstruct_active_formatting_elements() + tree_insert_tag t + when TYPE_EOF + ok_tags = { + dd: true, dt: true, li: true, p: true, tbody: true, td: true, + tfoot: true, th: true, thead: true, tr: true, body: true, html: true, + } + for t in open_tags + unless ok_tags[t.name]? + parse_error() + break + # TODO stack of template insertion modes thing + flag_parsing = false # stop parsing + when TYPE_END_TAG + switch t.name + when 'body' + unless is_in_scope 'body' + parse_error() + return + # TODO implement parse error and move to tree_after_body + when 'html' + unless is_in_scope 'body' # weird, but it's what the spec says + parse_error() + return + # TODO implement parse error and move to tree_after_body, reprocess + # TODO lots more close tags to implement here + else + for node, i in open_tags + if node.name is t.name + # FIXME generate implied end tags except those with name==t.name + parse_error() unless i is 0 + while i > 0 + open_tags.shift() + i -= 1 + open_tags.shift() + return + if special_elements[node.name]? + parse_error() + return # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization - # http://www.w3.org/TR/html5/syntax.html#data-state + # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state tok_state_data = -> switch c = txt.charAt(cur++) when '&' - tok_state = tok_state_character_reference_in_data + return new_text_node tokenize_character_reference() when '<' tok_state = tok_state_tag_open when "\u0000" - # Parse error - return [TYPE_TEXT, c] + parse_error() + return new_text_node c + when '' # EOF + return new_eof_token() else - return [TYPE_TEXT, c] + return new_text_node c return null - # http://www.w3.org/TR/html5/syntax.html#tag-open-state + # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state + # not needed: tok_state_character_reference_in_data = -> + # just call tok_state_character_reference_in_data() + + # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state tok_state_tag_open = -> switch c = txt.charAt(cur++) when '!' @@ -185,56 +431,255 @@ parse_html = (txt) -> when '/' tok_state = tok_state_end_tag_open when '?' - # Parse error + parse_error() tok_state = tok_state_bogus_comment else if lc_alpha.indexOf(c) > -1 - tok_cur = [TYPE_OPEN_TAG, c, {}, []] + tok_cur_tag = new_open_tag c tok_state = tok_state_tag_name else if uc_alpha.indexOf(c) > -1 - tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []] + tok_cur_tag = new_open_tag c.toLowerCase() tok_state = tok_state_tag_name else - # Parse error + parse_error() tok_state = tok_state_data cur -= 1 # we didn't parse/handle the char after < - return [TYPE_TEXT, '<'] + return new_text_node '<' + return null + + # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state + tok_state_end_tag_open = -> + switch c = txt.charAt(cur++) + when '>' + parse_error() + tok_state = tok_state_data + when '' # EOF + parse_error() + tok_state = tok_state_data + return new_text_node ' -1 + tok_cur_tag = new_end_tag c.toLowerCase() + tok_state = tok_state_tag_name + else if lc_alpha.indexOf(c) > -1 + tok_cur_tag = new_end_tag c + tok_state = tok_state_tag_name + else + parse_error() + tok_state = tok_state_bogus_comment return null - # http://www.w3.org/TR/html5/syntax.html#tag-name-state + # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state tok_state_tag_name = -> switch c = txt.charAt(cur++) - when "\t", "\n", ' ' + when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '/' tok_state = tok_state_self_closing_start_tag when '>' tok_state = tok_state_data - tmp = tok_cur - tok_cur = null + tmp = tok_cur_tag + tok_cur_tag = null return tmp when "\u0000" - # Parse error - tok_cur[1] += "\ufffd" + parse_error() + tok_cur_tag.name += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + if uc_alpha.indexOf(c) > -1 + tok_cur_tag.name += c.toLowerCase() + else + tok_cur_tag.name += c + return null + + # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state + tok_state_before_attribute_name = -> + attr_name = null + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + return null + when '/' + tok_state = tok_state_self_closing_start_tag + return null + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + parse_error() + attr_name = "\ufffd" + when '"', "'", '<', '=' + parse_error() + attr_name = c + when '' # EOF + parse_error() + tok_state = tok_state_data + else + if uc_alpha.indexOf(c) > -1 + attr_name = c.toLowerCase() + else + attr_name = c + if attr_name? + tok_cur_tag.attrs_a.unshift [attr_name, ''] + tok_state = tok_state_attribute_name + return null + + # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state + tok_state_attribute_name = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_after_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '=' + tok_state = tok_state_before_attribute_value + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + parse_error() + tok_cur_tag.attrs_a[0][0] = "\ufffd" + when '"', "'", '<' + parse_error() + tok_cur_tag.attrs_a[0][0] = c + when '' # EOF + parse_error() + tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 - tok_cur[1] += c.toLowerCase() + tok_cur_tag.attrs_a[0][0] = c.toLowerCase() else - tok_cur[1] += c + tok_cur_tag.attrs_a[0][0] += c + return null + + # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state + tok_state_before_attribute_value = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + return null + when '"' + tok_state = tok_state_attribute_value_double_quoted + when '&' + tok_state = tok_state_attribute_value_unquoted + cur -= 1 + when "'" + tok_state = tok_state_attribute_value_single_quoted + when "\u0000" + # Parse error + tok_cur_tag.attrs_a[0][1] += "\ufffd" + tok_state = tok_state_attribute_value_unquoted + when '>' + # Parse error + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when '' # EOF + parse_error() + tok_state = tok_state_data + else + tok_cur_tag.attrs_a[0][1] += c + tok_state = tok_state_attribute_value_unquoted + return null + + # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state + tok_state_attribute_value_double_quoted = -> + switch c = txt.charAt(cur++) + when '"' + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true + when "\u0000" + # Parse error + tok_cur_tag.attrs_a[0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + tok_cur_tag.attrs_a[0][1] += c + return null + + # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state + tok_state_attribute_value_single_quoted = -> + switch c = txt.charAt(cur++) + when "'" + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true + when "\u0000" + # Parse error + tok_cur_tag.attrs_a[0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + tok_cur_tag.attrs_a[0][1] += c + return null + + # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state + tok_state_attribute_value_unquoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '&' + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + tok_cur_tag.attrs_a[0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + # Parse Error if ', <, = or ` (backtick) + tok_cur_tag.attrs_a[0][1] += c + return null + + # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state + tok_state_after_attribute_value_quoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when '' # EOF + parse_error() + tok_state = tok_state_data + else + # Parse Error + tok_state = tok_state_before_attribute_name + cur -= 1 # we didn't handle that char return null - # http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state - # & just got consumed - tok_state_character_reference_in_data = -> - tok_state = tok_state_data + # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference + # Don't set this as a state, just call it + # returns a string (NOT a text node) + tokenize_character_reference = (allowed_char = null, in_attr = false) -> if cur >= txt.length - return [TYPE_TEXT, '&'] + return '&' switch c = txt.charAt(cur) + when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char + # explicitly not a parse error + return '&' when ';' - return [TYPE_TEXT, '&'] + # there has to be "one or more" alnums between & and ; to be a parse error + return '&' when '#' if cur + 1 >= txt.length - return [TYPE_TEXT, '&'] + return '&' if txt.charAt(cur + 1).toLowerCase() is 'x' prefix = '#x' charset = hex_chars @@ -247,82 +692,151 @@ parse_html = (txt) -> while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1 i += 1 if i is 0 - return [TYPE_TEXT, '&'] + return '&' if txt.charAt(start + i) is ';' i += 1 + # FIXME This is supposed to generate parse errors for some chars decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase()) if decoded? cur = start + i - return [TYPE_TEXT, decoded] - return [TYPE_TEXT, '&'] + return decoded + return '&' else for i in [0...31] if alnum.indexOf(txt.charAt(cur + i)) is -1 break if i is 0 - return [TYPE_TEXT, '&'] + # exit early, because parse_error() below needs at least one alnum + return '&' if txt.charAt(cur + i) is ';' i += 1 # include ';' terminator in value decoded = decode_named_char_ref txt.substr(cur, i) if decoded? cur += i - return [TYPE_TEXT, decoded] - return [TYPE_TEXT, '&'] + return decoded + parse_error() + return '&' else # no ';' terminator (only legacy char refs) - if i < 2 or i > 6 - return [TYPE_TEXT, '&'] - # FIXME: if we're inside an attribute: - # 1. don't parse refs that are followed by = - # 2. don't parse refs that are followed by alnum max = i for i in [2..max] # no prefix matches, so ok to check shortest first c = legacy_char_refs[txt.substr(cur, i)] if c? + if in_attr + if txt.charAt(cur + i) is '=' + # "because some legacy user agents will + # misinterpret the markup in those cases" + parse_error() + return '&' + if alnum.indexOf(txt.charAt(cur + i)) > -1 + # this makes attributes forgiving about url args + return '&' + # ok, and besides the weird exceptions for attributes... + # return the matching char cur += i # consume entity chars - return [TYPE_TEXT, c] - return null - - # the functions below impliment the Tree Contstruction algorithm here: - # http://www.w3.org/TR/html5/syntax.html#tree-construction - # FIXME this is just a bit of a hack that makes sense... read spec and do it that way - tree_append = (t) -> - if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT - tree_append_point[tree_append_point.length - 1][1] += t[1] - else - tree_append_point.push t - if t[0] is TYPE_OPEN_TAG - t[0] = TYPE_TAG - tree_append_point = t[3] + parse_error() # because no terminating ";" + return c + parse_error() + return '&' + return # never reached # tree constructor initialization - tree = [] # see comments on TYPE_TAG/etc for the structure of this data - tree_append_point = tree - tree_state = tree_append + # see comments on TYPE_TAG/etc for the structure of this data + tree = new Node TYPE_TAG, name: 'html' + open_tags = [tree] + tree_state = tree_in_body + flag_frameset_ok = true + flag_parsing = true # tokenizer initialization tok_state = tok_state_data # proccess input - while cur < txt.length + while flag_parsing t = tok_state() if t? tree_state t - - return tree + return tree.children # everything below is tests on the above -test_equals = (description, fn, args..., expected_output) -> - output = fn.apply this, args +test_equals = (description, output, expected_output) -> if output is expected_output - console.log "passed: #{description}." + console.log "passed." # don't say name, so smart consoles can merge all of these else - console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}" -html_to_json = (html) -> - return JSON.stringify parse_html html -test_equals "empty", html_to_json, "", '[]' -test_equals "just text", html_to_json, "abc", '[[1,"abc"]]' -test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]' -test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]' -test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]' -test_equals "open_tag", html_to_json, "foobar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]' + console.log "FAILED: \"#{description}\"" + console.log " Expected: #{expected_output}" + console.log " Actual: #{output}" +test_parser = (args) -> + parse_errors = [] + errors_cb = (i) -> + parse_errors.push i + parsed = parse_html args.html, errors_cb + serialized = '' + sep = '' + for t in parsed + serialized += sep + sep = ',' + serialized += t.serialize() + if serialized isnt args.expected or parse_errors.length isnt args.errors + console.log "test FAILED: \"#{args.name}\"" + else + console.log 'test passed' + if serialized isnt args.expected + console.log " Input: #{args.html}" + console.log " Correct: #{args.expected}" + console.log " Output: #{serialized}" + if parse_errors.length isnt args.errors + console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}" + +test_parser name: "empty", \ + html: "", + expected: '', + errors: 0 +test_parser name: "just text", \ + html: "abc", + expected: 'text:"abc"', + errors: 0 +test_parser name: "named entity", \ + html: "a&1234", + expected: 'text:"a&1234"', + errors: 0 +test_parser name: "broken named character references", \ + html: "1&2&&3&aabbcc;", + expected: 'text:"1&2&&3&aabbcc;"', + errors: 2 +test_parser name: "numbered entity overrides", \ + html: "1€€ ƒ", + expected: 'text:"1€€ ƒ"', + errors: 0 +test_parser name: "open tag", \ + html: "foobar", + expected: 'text:"foo",tag:"span",{},[text:"bar"]', + errors: 1 # no close tag +test_parser name: "open tag with attributes", \ + html: "foobar", + expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]', + errors: 1 # no close tag +test_parser name: "open tag with attributes of various quotings", \ + html: "foobar", + expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]', + errors: 1 # no close tag +test_parser name: "attribute entity exceptions dq", \ + html: "foobar", + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', + errors: 2 # no close tag, &= in attr +test_parser name: "attribute entity exceptions sq", \ + html: "foobar", + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', + errors: 2 # no close tag, &= in attr +test_parser name: "attribute entity exceptions uq", \ + html: "foobar", + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', + errors: 2 # no close tag, &= in attr +test_parser name: "matching closing tags", \ + html: "foohi
1
foo
2
bar", + expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"', + errors: 0 +test_parser name: "mis-matched closing tags", \ + html: "foo
barbaz
qux", + expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"', + errors: 1 # close tag mismatch