X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=1ef077ad26323307c25ea6ee4ec2c338a2ff2d78;hb=06466aac55914c23fc6ba986c28ccf069386767c;hp=7ed736ed1dae9720a73be6ebb849bf534b72d7a9;hpb=b1041cd8d6358a3dcc545cb25acac30fb87f281b;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 7ed736e..1ef077a 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -177,6 +177,66 @@ parse_html = (txt) -> return [TYPE_TEXT, c] return null + # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state + # & just got consumed + tok_state_character_reference_in_data = -> + tok_state = tok_state_data + if cur >= txt.length + return [TYPE_TEXT, '&'] + switch c = txt.charAt(cur) + when ';' + return [TYPE_TEXT, '&'] + when '#' + if cur + 1 >= txt.length + return [TYPE_TEXT, '&'] + if txt.charAt(cur + 1).toLowerCase() is 'x' + prefix = '#x' + charset = hex_chars + start = cur + 2 + else + charset = digits + start = cur + 1 + prefix = '#' + i = 0 + while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1 + i += 1 + if i is 0 + return [TYPE_TEXT, '&'] + if txt.charAt(start + i) is ';' + i += 1 + decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase()) + if decoded? + cur = start + i + return [TYPE_TEXT, decoded] + return [TYPE_TEXT, '&'] + else + for i in [0...31] + if alnum.indexOf(txt.charAt(cur + i)) is -1 + break + if i is 0 + return [TYPE_TEXT, '&'] + if txt.charAt(cur + i) is ';' + i += 1 # include ';' terminator in value + decoded = decode_named_char_ref txt.substr(cur, i) + if decoded? + cur += i + return [TYPE_TEXT, decoded] + return [TYPE_TEXT, '&'] + else + # no ';' terminator (only legacy char refs) + if i < 2 or i > 6 + return [TYPE_TEXT, '&'] + # FIXME: if we're inside an attribute: + # 1. don't parse refs that are followed by = + # 2. don't parse refs that are followed by alnum + max = i + for i in [2..max] # no prefix matches, so ok to check shortest first + c = legacy_char_refs[txt.substr(cur, i)] + if c? + cur += i # consume entity chars + return [TYPE_TEXT, c] + return null + # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state tok_state_tag_open = -> switch c = txt.charAt(cur++) @@ -301,11 +361,8 @@ parse_html = (txt) -> tok_cur_tag = null return tmp else - if uc_alpha.indexOf(c) > -1 - tok_cur_tag[2][0][1] += c.toLowerCase() - else - # Parse error if ", ` or < (that's a backtick) - tok_cur_tag[2][0][1] += c + tok_cur_tag[2][0][1] += c + tok_state = tok_state_attribute_value_unquoted return null # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state @@ -319,11 +376,45 @@ parse_html = (txt) -> when "\u0000" # Parse error tok_cur_tag[2][0][1] += "\ufffd" - tok_state = tok_state_attribute_value_unquoted else tok_cur_tag[2][0][1] += c return null + # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state + tok_state_attribute_value_single_quoted = -> + switch c = txt.charAt(cur++) + when "'" + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_state = tok_state_character_reference_in_attribute_value + tok_char_ref_addl_allowed = "'" # FIXME + when "\u0000" + # Parse error + tok_cur_tag[2][0][1] += "\ufffd" + else + tok_cur_tag[2][0][1] += c + return null + + # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state + tok_state_attribute_value_unquoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '&' + tok_state = tok_state_character_reference_in_attribute_value + tok_char_ref_addl_allowed = '>' # FIXME + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + tok_cur_tag[2][0][1] += "\ufffd" + else + # Parse Error if ', <, = or ` (backtick) + tok_cur_tag[2][0][1] += c + return null + # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state tok_state_after_attribute_value_quoted = -> switch c = txt.charAt(cur++) @@ -342,67 +433,6 @@ parse_html = (txt) -> cur -= 1 # we didn't handle that char return null - - # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state - # & just got consumed - tok_state_character_reference_in_data = -> - tok_state = tok_state_data - if cur >= txt.length - return [TYPE_TEXT, '&'] - switch c = txt.charAt(cur) - when ';' - return [TYPE_TEXT, '&'] - when '#' - if cur + 1 >= txt.length - return [TYPE_TEXT, '&'] - if txt.charAt(cur + 1).toLowerCase() is 'x' - prefix = '#x' - charset = hex_chars - start = cur + 2 - else - charset = digits - start = cur + 1 - prefix = '#' - i = 0 - while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1 - i += 1 - if i is 0 - return [TYPE_TEXT, '&'] - if txt.charAt(start + i) is ';' - i += 1 - decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase()) - if decoded? - cur = start + i - return [TYPE_TEXT, decoded] - return [TYPE_TEXT, '&'] - else - for i in [0...31] - if alnum.indexOf(txt.charAt(cur + i)) is -1 - break - if i is 0 - return [TYPE_TEXT, '&'] - if txt.charAt(cur + i) is ';' - i += 1 # include ';' terminator in value - decoded = decode_named_char_ref txt.substr(cur, i) - if decoded? - cur += i - return [TYPE_TEXT, decoded] - return [TYPE_TEXT, '&'] - else - # no ';' terminator (only legacy char refs) - if i < 2 or i > 6 - return [TYPE_TEXT, '&'] - # FIXME: if we're inside an attribute: - # 1. don't parse refs that are followed by = - # 2. don't parse refs that are followed by alnum - max = i - for i in [2..max] # no prefix matches, so ok to check shortest first - c = legacy_char_refs[txt.substr(cur, i)] - if c? - cur += i # consume entity chars - return [TYPE_TEXT, c] - return null - # the functions below impliment the Tree Contstruction algorithm here: # http://www.w3.org/TR/html5/syntax.html#tree-construction # FIXME this is just a bit of a hack that makes sense... read spec and do it that way @@ -451,4 +481,5 @@ test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]' test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]' test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]' test_equals "open tag", html_to_json, "foobar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]' -test_equals "open tag with attributes", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]' +test_equals "open tag with attributes", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]' +test_equals "open tag with attributes of various quotings", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'