From 06466aac55914c23fc6ba986c28ccf069386767c Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Sun, 13 Dec 2015 22:07:41 -0500 Subject: [PATCH] parse unquoted and singlequoted attributes --- parse-html.coffee | 46 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/parse-html.coffee b/parse-html.coffee index 204c6ff..1ef077a 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -361,11 +361,8 @@ parse_html = (txt) -> tok_cur_tag = null return tmp else - if uc_alpha.indexOf(c) > -1 - tok_cur_tag[2][0][1] += c.toLowerCase() - else - # Parse error if ", ` or < (that's a backtick) - tok_cur_tag[2][0][1] += c + tok_cur_tag[2][0][1] += c + tok_state = tok_state_attribute_value_unquoted return null # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state @@ -379,11 +376,45 @@ parse_html = (txt) -> when "\u0000" # Parse error tok_cur_tag[2][0][1] += "\ufffd" - tok_state = tok_state_attribute_value_unquoted else tok_cur_tag[2][0][1] += c return null + # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state + tok_state_attribute_value_single_quoted = -> + switch c = txt.charAt(cur++) + when "'" + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_state = tok_state_character_reference_in_attribute_value + tok_char_ref_addl_allowed = "'" # FIXME + when "\u0000" + # Parse error + tok_cur_tag[2][0][1] += "\ufffd" + else + tok_cur_tag[2][0][1] += c + return null + + # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state + tok_state_attribute_value_unquoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '&' + tok_state = tok_state_character_reference_in_attribute_value + tok_char_ref_addl_allowed = '>' # FIXME + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + tok_cur_tag[2][0][1] += "\ufffd" + else + # Parse Error if ', <, = or ` (backtick) + tok_cur_tag[2][0][1] += c + return null + # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state tok_state_after_attribute_value_quoted = -> switch c = txt.charAt(cur++) @@ -450,4 +481,5 @@ test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]' test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]' test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]' test_equals "open tag", html_to_json, "foobar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]' -test_equals "open tag with attributes", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]' +test_equals "open tag with attributes", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]' +test_equals "open tag with attributes of various quotings", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]' -- 1.7.10.4