From 41c743381bdcfdf5303ff0f23eaaf3e121e4ebef Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Tue, 15 Dec 2015 13:23:17 -0500 Subject: [PATCH] use Node class for data structures, improve is_in_scope --- parse-html.coffee | 216 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 134 insertions(+), 82 deletions(-) diff --git a/parse-html.coffee b/parse-html.coffee index 927cb95..ef5545f 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -22,10 +22,7 @@ # # Instead, the data structure produced by this parser is an array of nodes. # -# Each node is an array. The first element in the array is an integer (one of -# the TYPE_* constants below) followed by the appropriate fields for that type -# (shown below in the comments after the TYPE_* definition.) - +# Each node is an obect of the Node class. Here are the Node types: TYPE_TAG = 0 # name, {attributes}, [children] TYPE_TEXT = 1 # "text" TYPE_COMMENT = 2 @@ -35,15 +32,65 @@ TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [child TYPE_END_TAG = 5 # name TYPE_EOF = 6 +class Node + constructor: (type, args = {}) -> + @type = type # one of the TYPE_* constants above + @name = args.name ? '' # tag name + @text = args.text ? '' # contents for text/comment nodes + @attrs = args.attrs ? {} + @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only + @children = args.children ? [] + serialize: -> # for unit tests + ret = '' + switch @type + when TYPE_TAG + ret += 'tag:' + ret += JSON.stringify @name + ret += ',' + ret += JSON.stringify @attrs + ret += ',' + sep = '[' + for c in @children + ret += sep + sep = ',' + ret += c.serialize() + ret += ']' + when TYPE_TEXT + ret += 'text:' + ret += JSON.stringify @text + when TYPE_COMMENT + ret += 'comment:' + ret += JSON.stringify @text + when TYPE_DOCTYPE + ret += 'doctype' + # FIXME + else + ret += 'unknown:' + return ret + + +# helpers: (only take args that are normally known when parser creates nodes) +new_open_tag = (name) -> + return new Node TYPE_OPEN_TAG, name: name +new_end_tag = (name) -> + return new Node TYPE_END_TAG, name: name +new_text_node = (txt) -> + return new Node TYPE_TEXT, text: txt +new_comment_node = (txt) -> + return new Node TYPE_COMMENT, text: txt +new_eof_token = -> + return new Node TYPE_EOF + lc_alpha = "abcdefghijklmnopqrstuvwxqz" uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" digits = "0123456789" alnum = lc_alpha + uc_alpha + digits hex_chars = digits + "abcdefABCDEF" scopers = { # FIXME these are supposed to be namespace specific - 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object', - 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml', - 'foreignObject', 'desc', 'title' + 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true, + 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true, + 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true, + 'foreignObject': true, 'desc': true, 'title' } # some SVG elements have dashes in them @@ -213,14 +260,15 @@ parse_html = (txt, parse_error_cb = null) -> # But first... the helpers template_tag_is_open = -> for t of open_tags - if t[0] is TYPE_TAG and t[1] is 'template' + if t.type is TYPE_TAG and t.name is 'template' return true return false is_in_scope = (tag_name) -> for t of open_tags - if t[0] is TYPE_TAG and t[1] is tag_name + if t.name is tag_name return true - # FIXME bail if in scopers + if t.name of scopers + return false return false reconstruct_active_formatting_elements = -> @@ -229,13 +277,13 @@ parse_html = (txt, parse_error_cb = null) -> # http://www.w3.org/TR/html5/syntax.html#close-a-p-element # FIXME implement this close_p_if_in_button_scope = -> - if open_tags[0][1] is 'p' # FIXME + if open_tags[0].name is 'p' open_tags.pop() return #p = find_button_scope 'p' #if p? # TODO generate_implied_end_tags except for p tags - # TODO parse_error unless open_tags[0][1] is 'p' + # TODO parse_error unless open_tags[0].name is 'p' # TODO pop stack until 'p' popped @@ -243,34 +291,33 @@ parse_html = (txt, parse_error_cb = null) -> # http://www.w3.org/TR/html5/syntax.html#insert-a-character tree_insert_a_character = (t) -> # FIXME read spec for "adjusted insertion location, etc, this might be wrong - if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT - open_tags[0][3][open_tags[0][3].length - 1][1] += t[1] + dest = open_tags[0].children + if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT + dest[dest.length - 1].text += t.text else - open_tags[0][3].push t + dest.push t # FIXME read spec, do this right # note: this assumes it's an open tag tree_insert_tag = (t) -> - t[0] = TYPE_TAG # not TYPE_OPEN_TAG + t.type = TYPE_TAG # not TYPE_OPEN_TAG # convert attributes into a hash - attrs = {} - while t[2].length - a = t[2].pop() - attrs[a[0]] = a[1] - t[2] = attrs - open_tags[0][3].push t + while t.attrs_a.length + a = t.attrs_a.pop() + t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs + open_tags[0].children.push t open_tags.unshift t # http://www.w3.org/TR/html5/syntax.html#insert-a-comment tree_insert_a_comment = (t) -> # FIXME read spec for "adjusted insertion location, etc, this might be wrong - open_tags[0][3].push t + open_tags[0].children.push t # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody tree_in_body = (t) -> - switch t[0] + switch t.type when TYPE_TEXT - switch t[1] + switch t.text when "\u0000" parse_error() when "\t", "\u000a", "\u000c", "\u000d", ' ' @@ -285,12 +332,12 @@ parse_html = (txt, parse_error_cb = null) -> when TYPE_DOCTYPE parse_error() when TYPE_OPEN_TAG - switch t[1] + switch t.name when 'html' parse_error() return if template_tag_is_open() - root_attrs = open_tags[open_tags.length - 1][3] - for k, v of t[2] + root_attrs = open_tags[open_tags.length - 1].children + for k, v of t.attrs root_attrs[k] = v unless root_attrs[k]? when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title' # FIXME also do this for (end tag) @@ -306,7 +353,7 @@ parse_html = (txt, parse_error_cb = null) -> tree_insert_tag t when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6' close_p_if_in_button_scope() - if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] + if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6'] parse_error() open_tags.shift() tree_insert_tag t @@ -320,13 +367,13 @@ parse_html = (txt, parse_error_cb = null) -> tfoot: true, th: true, thead: true, tr: true, body: true, html: true, } for t in open_tags - unless ok_tags[t[1]]? + unless ok_tags[t.name]? parse_error() break # TODO stack of template insertion modes thing flag_parsing = false # stop parsing when TYPE_END_TAG - switch t[1] + switch t.name when 'body' unless is_in_scope 'body' parse_error() @@ -340,15 +387,15 @@ parse_html = (txt, parse_error_cb = null) -> # TODO lots more close tags to implement here else for node, i in open_tags - if node[1] is t[1] - # FIXME generate implied end tags except those with name==t[1] + if node.name is t.name + # FIXME generate implied end tags except those with name==t.name parse_error() unless i is 0 while i > 0 open_tags.shift() i -= 1 open_tags.shift() return - if special_elements[node[1]]? + if special_elements[node.name]? parse_error() return @@ -360,16 +407,16 @@ parse_html = (txt, parse_error_cb = null) -> tok_state_data = -> switch c = txt.charAt(cur++) when '&' - return [TYPE_TEXT, tokenize_character_reference()] + return new_text_node tokenize_character_reference() when '<' tok_state = tok_state_tag_open when "\u0000" parse_error() - return [TYPE_TEXT, c] + return new_text_node c when '' # EOF - return [TYPE_EOF] + return new_eof_token() else - return [TYPE_TEXT, c] + return new_text_node c return null # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state @@ -388,16 +435,16 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_bogus_comment else if lc_alpha.indexOf(c) > -1 - tok_cur_tag = [TYPE_OPEN_TAG, c, [], []] + tok_cur_tag = new_open_tag c tok_state = tok_state_tag_name else if uc_alpha.indexOf(c) > -1 - tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []] + tok_cur_tag = new_open_tag c.toLowerCase() tok_state = tok_state_tag_name else parse_error() tok_state = tok_state_data cur -= 1 # we didn't parse/handle the char after < - return [TYPE_TEXT, '<'] + return new_text_node '<' return null # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state @@ -409,13 +456,13 @@ parse_html = (txt, parse_error_cb = null) -> when '' # EOF parse_error() tok_state = tok_state_data - return [TYPE_TEXT, ' -1 - tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []] + tok_cur_tag = new_end_tag c.toLowerCase() tok_state = tok_state_tag_name else if lc_alpha.indexOf(c) > -1 - tok_cur_tag = [TYPE_END_TAG, c, [], []] + tok_cur_tag = new_end_tag c tok_state = tok_state_tag_name else parse_error() @@ -436,15 +483,15 @@ parse_html = (txt, parse_error_cb = null) -> return tmp when "\u0000" parse_error() - tok_cur_tag[1] += "\ufffd" + tok_cur_tag.name += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 - tok_cur_tag[1] += c.toLowerCase() + tok_cur_tag.name += c.toLowerCase() else - tok_cur_tag[1] += c + tok_cur_tag.name += c return null # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state @@ -476,7 +523,7 @@ parse_html = (txt, parse_error_cb = null) -> else attr_name = c if attr_name? - tok_cur_tag[2].unshift [attr_name, ''] + tok_cur_tag.attrs_a.unshift [attr_name, ''] tok_state = tok_state_attribute_name return null @@ -496,18 +543,18 @@ parse_html = (txt, parse_error_cb = null) -> return tmp when "\u0000" parse_error() - tok_cur_tag[2][0][0] = "\ufffd" + tok_cur_tag.attrs_a[0][0] = "\ufffd" when '"', "'", '<' parse_error() - tok_cur_tag[2][0][0] = c + tok_cur_tag.attrs_a[0][0] = c when '' # EOF parse_error() tok_state = tok_state_data else if uc_alpha.indexOf(c) > -1 - tok_cur_tag[2][0][0] = c.toLowerCase() + tok_cur_tag.attrs_a[0][0] = c.toLowerCase() else - tok_cur_tag[2][0][0] += c + tok_cur_tag.attrs_a[0][0] += c return null # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state @@ -524,7 +571,7 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_attribute_value_single_quoted when "\u0000" # Parse error - tok_cur_tag[2][0][1] += "\ufffd" + tok_cur_tag.attrs_a[0][1] += "\ufffd" tok_state = tok_state_attribute_value_unquoted when '>' # Parse error @@ -536,7 +583,7 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() tok_state = tok_state_data else - tok_cur_tag[2][0][1] += c + tok_cur_tag.attrs_a[0][1] += c tok_state = tok_state_attribute_value_unquoted return null @@ -546,15 +593,15 @@ parse_html = (txt, parse_error_cb = null) -> when '"' tok_state = tok_state_after_attribute_value_quoted when '&' - tok_cur_tag[2][0][1] += tokenize_character_reference '"', true + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true when "\u0000" # Parse error - tok_cur_tag[2][0][1] += "\ufffd" + tok_cur_tag.attrs_a[0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else - tok_cur_tag[2][0][1] += c + tok_cur_tag.attrs_a[0][1] += c return null # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state @@ -563,15 +610,15 @@ parse_html = (txt, parse_error_cb = null) -> when "'" tok_state = tok_state_after_attribute_value_quoted when '&' - tok_cur_tag[2][0][1] += tokenize_character_reference "'", true + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true when "\u0000" # Parse error - tok_cur_tag[2][0][1] += "\ufffd" + tok_cur_tag.attrs_a[0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else - tok_cur_tag[2][0][1] += c + tok_cur_tag.attrs_a[0][1] += c return null # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state @@ -580,20 +627,20 @@ parse_html = (txt, parse_error_cb = null) -> when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '&' - tok_cur_tag[2][0][1] += tokenize_character_reference '>', true + tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true when '>' tok_state = tok_state_data tmp = tok_cur_tag tok_cur_tag = null return tmp when "\u0000" - tok_cur_tag[2][0][1] += "\ufffd" + tok_cur_tag.attrs_a[0][1] += "\ufffd" when '' # EOF parse_error() tok_state = tok_state_data else # Parse Error if ', <, = or ` (backtick) - tok_cur_tag[2][0][1] += c + tok_cur_tag.attrs_a[0][1] += c return null # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state @@ -695,7 +742,7 @@ parse_html = (txt, parse_error_cb = null) -> # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data - tree = [TYPE_TAG, 'html', {}, []] + tree = new Node TYPE_TAG, name: 'html' open_tags = [tree] tree_state = tree_in_body flag_frameset_ok = true @@ -709,7 +756,7 @@ parse_html = (txt, parse_error_cb = null) -> t = tok_state() if t? tree_state t - return tree[3] + return tree.children # everything below is tests on the above test_equals = (description, output, expected_output) -> @@ -724,67 +771,72 @@ test_parser = (args) -> errors_cb = (i) -> parse_errors.push i parsed = parse_html args.html, errors_cb - parsed = JSON.stringify parsed - if parsed isnt args.expected or parse_errors.length isnt args.errors + serialized = '' + sep = '' + for t in parsed + serialized += sep + sep = ',' + serialized += t.serialize() + if serialized isnt args.expected or parse_errors.length isnt args.errors console.log "test FAILED: \"#{args.name}\"" else console.log 'test passed' - if parsed isnt args.expected + if serialized isnt args.expected console.log " Input: #{args.html}" console.log " Correct: #{args.expected}" - console.log " Output: #{parsed}" + console.log " Output: #{serialized}" if parse_errors.length isnt args.errors console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}" test_parser name: "empty", \ html: "", - expected: '[]', + expected: '', errors: 0 test_parser name: "just text", \ html: "abc", - expected: '[[1,"abc"]]', + expected: 'text:"abc"', errors: 0 test_parser name: "named entity", \ html: "a&1234", - expected: '[[1,"a&1234"]]', + expected: 'text:"a&1234"', errors: 0 test_parser name: "broken named character references", \ html: "1&2&&3&aabbcc;", - expected: '[[1,"1&2&&3&aabbcc;"]]', + expected: 'text:"1&2&&3&aabbcc;"', errors: 2 test_parser name: "numbered entity overrides", \ html: "1€€ ƒ", - expected: '[[1,"1€€ ƒ"]]', + expected: 'text:"1€€ ƒ"', errors: 0 test_parser name: "open tag", \ html: "foobar", - expected: '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]', + expected: 'text:"foo",tag:"span",{},[text:"bar"]', errors: 1 # no close tag test_parser name: "open tag with attributes", \ html: "foobar", - expected: '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]', + expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]', errors: 1 # no close tag test_parser name: "open tag with attributes of various quotings", \ html: "foobar", - expected: '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]', + expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]', errors: 1 # no close tag test_parser name: "attribute entity exceptions dq", \ html: "foobar", - expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]', + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', errors: 2 # no close tag, &= in attr test_parser name: "attribute entity exceptions sq", \ html: "foobar", - expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]', + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', errors: 2 # no close tag, &= in attr test_parser name: "attribute entity exceptions uq", \ html: "foobar", - expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]', + expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]', errors: 2 # no close tag, &= in attr test_parser name: "matching closing tags", \ html: "foohi
1
foo
2
bar", - expected: '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]', + expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"', errors: 0 test_parser name: "mis-matched closing tags", \ html: "foo
barbaz
qux", - expected: '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]', + expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"', errors: 1 # close tag mismatch -- 1.7.10.4