From b1041cd8d6358a3dcc545cb25acac30fb87f281b Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Sun, 13 Dec 2015 21:36:29 -0500 Subject: [PATCH] parsing some attributes --- parse-html.coffee | 156 +++++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 141 insertions(+), 15 deletions(-) diff --git a/parse-html.coffee b/parse-html.coffee index 318422b..7ed736e 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -31,7 +31,7 @@ TYPE_TEXT = 1 # "text" TYPE_WHITESPACE = 2 TYPE_COMMENT = 3 # the following types are emited by the tokenizer, but shouldn't end up in the tree: -TYPE_OPEN_TAG = 4 +TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children] lc_alpha = "abcdefghijklmnopqrstuvwxqz" uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" @@ -157,13 +157,13 @@ parse_html = (txt) -> tree_append_point = null tree_state = null tok_state = null - tok_cur = null # partially parsed tag + tok_cur_tag = null # partially parsed tag # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization - # http://www.w3.org/TR/html5/syntax.html#data-state + # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state tok_state_data = -> switch c = txt.charAt(cur++) when '&' @@ -177,7 +177,7 @@ parse_html = (txt) -> return [TYPE_TEXT, c] return null - # http://www.w3.org/TR/html5/syntax.html#tag-open-state + # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state tok_state_tag_open = -> switch c = txt.charAt(cur++) when '!' 
@@ -189,10 +189,10 @@ parse_html = (txt) -> tok_state = tok_state_bogus_comment else if lc_alpha.indexOf(c) > -1 - tok_cur = [TYPE_OPEN_TAG, c, {}, []] + tok_cur_tag = [TYPE_OPEN_TAG, c, [], []] tok_state = tok_state_tag_name else if uc_alpha.indexOf(c) > -1 - tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []] + tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []] tok_state = tok_state_tag_name else # Parse error @@ -201,29 +201,149 @@ parse_html = (txt) -> return [TYPE_TEXT, '<'] return null - # http://www.w3.org/TR/html5/syntax.html#tag-name-state + # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state tok_state_tag_name = -> switch c = txt.charAt(cur++) - when "\t", "\n", ' ' + when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '/' tok_state = tok_state_self_closing_start_tag when '>' tok_state = tok_state_data - tmp = tok_cur - tok_cur = null + tmp = tok_cur_tag + tok_cur_tag = null return tmp when "\u0000" # Parse error - tok_cur[1] += "\ufffd" + tok_cur_tag[1] += "\ufffd" else if uc_alpha.indexOf(c) > -1 - tok_cur[1] += c.toLowerCase() + tok_cur_tag[1] += c.toLowerCase() else - tok_cur[1] += c + tok_cur_tag[1] += c return null - # http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state + # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state + tok_state_before_attribute_name = -> + attr_name = null + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + return null + when '/' + tok_state = tok_state_self_closing_start_tag + return null + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + # Parse error + attr_name = "\ufffd" + when '"', "'", '<', '=' + # Parse error + attr_name = c + else + if uc_alpha.indexOf(c) > -1 + attr_name = c.toLowerCase() + else + attr_name = c + if attr_name? 
+ tok_cur_tag[2].unshift [attr_name, ''] + tok_state = tok_state_attribute_name + return null + + # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state + tok_state_attribute_name = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_after_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '=' + tok_state = tok_state_before_attribute_value + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + # Parse error + tok_cur_tag[2][0][0] += "\ufffd" + else + if uc_alpha.indexOf(c) > -1 + tok_cur_tag[2][0][0] += c.toLowerCase() + else + # Parse error if ", ' or < + tok_cur_tag[2][0][0] += c + return null + + # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state + tok_state_before_attribute_value = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + return null + when '"' + tok_state = tok_state_attribute_value_double_quoted + when '&' + tok_state = tok_state_attribute_value_unquoted + cur -= 1 + when "'" + tok_state = tok_state_attribute_value_single_quoted + when "\u0000" + # Parse error + tok_cur_tag[2][0][1] += "\ufffd" + tok_state = tok_state_attribute_value_unquoted + when '>' + # Parse error + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + else + if uc_alpha.indexOf(c) > -1 + tok_cur_tag[2][0][1] += c.toLowerCase() + else + # Parse error if <, = or ` (that's a backtick) + tok_cur_tag[2][0][1] += c + return null + + # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state + tok_state_attribute_value_double_quoted = -> + switch c = txt.charAt(cur++) + when '"' + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_state = tok_state_character_reference_in_attribute_value + tok_char_ref_addl_allowed = '"' # FIXME + when "\u0000" + # Parse error + tok_cur_tag[2][0][1] += "\ufffd" + tok_state = 
tok_state_attribute_value_unquoted + else + tok_cur_tag[2][0][1] += c + return null + + # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state + tok_state_after_attribute_value_quoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + else + # Parse Error + tok_state = tok_state_before_attribute_name + cur -= 1 # we didn't handle that char + return null + + + # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state # & just got consumed tok_state_character_reference_in_data = -> tok_state = tok_state_data @@ -293,6 +413,11 @@ parse_html = (txt) -> tree_append_point.push t if t[0] is TYPE_OPEN_TAG t[0] = TYPE_TAG + attrs = {} + while t[2].length + a = t[2].pop() + attrs[a[0]] = a[1] + t[2] = attrs tree_append_point = t[3] # tree constructor initialization @@ -325,4 +450,5 @@ test_equals "just text", html_to_json, "abc", '[[1,"abc"]]' test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]' test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]' test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]' -test_equals "open_tag", html_to_json, "foobar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]' +test_equals "open tag", html_to_json, "foobar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]' +test_equals "open tag with attributes", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]' -- 1.7.10.4