From: Jason Woofenden
Date: Mon, 14 Dec 2015 19:37:25 +0000 (-0500)
Subject: parse_errors, EOF, &/etc in attrs
X-Git-Url: https://jasonwoof.com/gitweb/?a=commitdiff_plain;h=3cf2532965e15d15b4f62868abb1ed5cf3eaee45;p=peach-html5-editor.git

parse_errors, EOF, &/etc in attrs
---

diff --git a/parse-html.coffee b/parse-html.coffee
index 1ef077a..a393dc0 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -32,6 +32,7 @@ TYPE_WHITESPACE = 2
 TYPE_COMMENT = 3
 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_EOF = 5
 
 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -159,6 +160,8 @@ parse_html = (txt) ->
 	tok_state = null
 	tok_cur_tag = null # partially parsed tag
 
+	parse_error = ->
+		console.log "Parse error at character #{cur} of #{txt.length}"
 
 	# the functions below implement the tokenizer states described here:
 	# http://www.w3.org/TR/html5/syntax.html#tokenization
@@ -167,75 +170,21 @@ parse_html = (txt) ->
 	tok_state_data = ->
 		switch c = txt.charAt(cur++)
 			when '&'
-				tok_state = tok_state_character_reference_in_data
+				return [TYPE_TEXT, tokenize_character_reference()]
 			when '<'
 				tok_state = tok_state_tag_open
 			when "\u0000"
-				# Parse error
+				parse_error()
 				return [TYPE_TEXT, c]
+			when '' # EOF
+				return [TYPE_EOF]
 			else
 				return [TYPE_TEXT, c]
 		return null
 
 	# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
-	# & just got consumed
-	tok_state_character_reference_in_data = ->
-		tok_state = tok_state_data
-		if cur >= txt.length
-			return [TYPE_TEXT, '&']
-		switch c = txt.charAt(cur)
-			when ';'
-				return [TYPE_TEXT, '&']
-			when '#'
-				if cur + 1 >= txt.length
-					return [TYPE_TEXT, '&']
-				if txt.charAt(cur + 1).toLowerCase() is 'x'
-					prefix = '#x'
-					charset = hex_chars
-					start = cur + 2
-				else
-					charset = digits
-					start = cur + 1
-					prefix = '#'
-				i = 0
-				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
-					i += 1
-				if i is 0
-					return [TYPE_TEXT, '&']
-				if txt.charAt(start + i) is ';'
-					i += 1
-				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
-				if decoded?
-					cur = start + i
-					return [TYPE_TEXT, decoded]
-				return [TYPE_TEXT, '&']
-			else
-				for i in [0...31]
-					if alnum.indexOf(txt.charAt(cur + i)) is -1
-						break
-				if i is 0
-					return [TYPE_TEXT, '&']
-				if txt.charAt(cur + i) is ';'
-					i += 1 # include ';' terminator in value
-					decoded = decode_named_char_ref txt.substr(cur, i)
-					if decoded?
-						cur += i
-						return [TYPE_TEXT, decoded]
-					return [TYPE_TEXT, '&']
-				else
-					# no ';' terminator (only legacy char refs)
-					if i < 2 or i > 6
-						return [TYPE_TEXT, '&']
-					# FIXME: if we're inside an attribute:
-					# 1. don't parse refs that are followed by =
-					# 2. don't parse refs that are followed by alnum
-					max = i
-					for i in [2..max] # no prefix matches, so ok to check shortest first
-						c = legacy_char_refs[txt.substr(cur, i)]
-						if c?
-							cur += i # consume entity chars
-							return [TYPE_TEXT, c]
-		return null
+	# not needed: tok_state_character_reference_in_data = ->
+	# just call tokenize_character_reference()
 
 	# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 	tok_state_tag_open = ->
@@ -245,7 +194,7 @@ parse_html = (txt) ->
 			when '/'
 				tok_state = tok_state_end_tag_open
 			when '?'
-				# Parse error
+				parse_error()
 				tok_state = tok_state_bogus_comment
 			else
 				if lc_alpha.indexOf(c) > -1
@@ -255,7 +204,7 @@ parse_html = (txt) ->
 					tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
 					tok_state = tok_state_tag_name
 				else
-					# Parse error
+					parse_error()
 					tok_state = tok_state_data
 					cur -= 1 # we didn't parse/handle the char after <
 					return [TYPE_TEXT, '<']
@@ -274,8 +223,11 @@ parse_html = (txt) ->
 				tok_cur_tag = null
 				return tmp
 			when "\u0000"
-				# Parse error
+				parse_error()
 				tok_cur_tag[1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				if uc_alpha.indexOf(c) > -1
 					tok_cur_tag[1] += c.toLowerCase()
@@ -298,11 +250,14 @@ parse_html = (txt) ->
 				tok_cur_tag = null
 				return tmp
 			when "\u0000"
-				# Parse error
+				parse_error()
 				attr_name = "\ufffd"
 			when '"', "'", '<', '='
-				# Parse error
+				parse_error()
 				attr_name = c
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				if uc_alpha.indexOf(c) > -1
 					attr_name = c.toLowerCase()
@@ -328,13 +283,18 @@ parse_html = (txt) ->
 				tmp = tok_cur_tag
 				tok_cur_tag = null
 				return tmp
 			when "\u0000"
-				# Parse error
+				parse_error()
 				tok_cur_tag[2][0][0] += "\ufffd"
+			when '"', "'", '<'
+				parse_error()
+				tok_cur_tag[2][0][0] += c
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				if uc_alpha.indexOf(c) > -1
 					tok_cur_tag[2][0][0] += c.toLowerCase()
 				else
-					# Parse error if ", ' or <
 					tok_cur_tag[2][0][0] += c
 		return null
@@ -360,6 +320,9 @@ parse_html = (txt) ->
 				tmp = tok_cur_tag
 				tok_cur_tag = null
 				return tmp
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				tok_cur_tag[2][0][1] += c
 				tok_state = tok_state_attribute_value_unquoted
@@ -371,11 +334,13 @@ parse_html = (txt) ->
 			when '"'
 				tok_state = tok_state_after_attribute_value_quoted
 			when '&'
-				tok_state = tok_state_character_reference_in_attribute_value
-				tok_char_ref_addl_allowed = '"' # FIXME
+				tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
 			when "\u0000"
 				# Parse error
 				tok_cur_tag[2][0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				tok_cur_tag[2][0][1] += c
 		return null
@@ -386,11 +351,13 @@ parse_html = (txt) ->
 			when "'"
 				tok_state = tok_state_after_attribute_value_quoted
 			when '&'
-				tok_state = tok_state_character_reference_in_attribute_value
-				tok_char_ref_addl_allowed = "'" # FIXME
+				tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
 			when "\u0000"
 				# Parse error
 				tok_cur_tag[2][0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				tok_cur_tag[2][0][1] += c
 		return null
@@ -401,8 +368,7 @@ parse_html = (txt) ->
 			when "\t", "\n", "\u000c", ' '
 				tok_state = tok_state_before_attribute_name
 			when '&'
-				tok_state = tok_state_character_reference_in_attribute_value
-				tok_char_ref_addl_allowed = '>' # FIXME
+				tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
 			when '>'
 				tok_state = tok_state_data
 				tmp = tok_cur_tag
@@ -410,6 +376,9 @@ parse_html = (txt) ->
 				return tmp
 			when "\u0000"
 				tok_cur_tag[2][0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				# Parse Error if ', <, = or ` (backtick)
 				tok_cur_tag[2][0][1] += c
@@ -427,28 +396,119 @@ parse_html = (txt) ->
 				tmp = tok_cur_tag
 				tok_cur_tag = null
 				return tmp
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				# Parse Error
 				tok_state = tok_state_before_attribute_name
 				cur -= 1 # we didn't handle that char
 		return null
 
+	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+	# Don't set this as a state, just call it
+	# returns a string (NOT a text node)
+	tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+		if cur >= txt.length
+			return '&'
+		switch c = txt.charAt(cur)
+			when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+				# explicitly not a parse error
+				return '&'
+			when ';'
+				# only a parse error if there are "one or more" alnums between & and ;
+				return '&'
+			when '#'
+				if cur + 1 >= txt.length
+					return '&'
+				if txt.charAt(cur + 1).toLowerCase() is 'x'
+					prefix = '#x'
+					charset = hex_chars
+					start = cur + 2
+				else
+					charset = digits
+					start = cur + 1
+					prefix = '#'
+				i = 0
+				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+					i += 1
+				if i is 0
+					return '&'
+				if txt.charAt(start + i) is ';'
+					i += 1
+				# FIXME this is supposed to generate parse errors for some chars
+				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+				if decoded?
+					cur = start + i
+					return decoded
+				return '&'
+			else
+				for i in [0...31]
+					if alnum.indexOf(txt.charAt(cur + i)) is -1
+						break
+				if i is 0
+					# exit early, because parse_error() below needs at least one alnum
+					return '&'
+				if txt.charAt(cur + i) is ';'
+					i += 1 # include ';' terminator in value
+					decoded = decode_named_char_ref txt.substr(cur, i)
+					if decoded?
+						cur += i
+						return decoded
+					parse_error()
+					return '&'
+				else
+					# no ';' terminator (only legacy char refs)
+					max = i
+					for i in [2..max] # no prefix matches, so ok to check shortest first
+						c = legacy_char_refs[txt.substr(cur, i)]
+						if c?
+							if in_attr
+								if txt.charAt(cur + i) is '='
+									# "because some legacy user agents will
+									# misinterpret the markup in those cases"
+									parse_error()
+									return '&'
+								if alnum.indexOf(txt.charAt(cur + i)) > -1
+									# this makes attributes forgiving about url args
+									return '&'
+							# ok, and besides the weird exceptions for attributes...
+							# return the matching char
+							cur += i # consume entity chars
+							parse_error() # because no terminating ";"
+							return c
+					parse_error()
+					return '&'
+		return # never reached
+
 	# the functions below implement the Tree Construction algorithm described here:
 	# http://www.w3.org/TR/html5/syntax.html#tree-construction
 
 	# FIXME this is just a bit of a hack that makes sense... read spec and do it that way
 	tree_append = (t) ->
-		if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
-			tree_append_point[tree_append_point.length - 1][1] += t[1]
-		else
-			tree_append_point.push t
-		if t[0] is TYPE_OPEN_TAG
+		switch t[0]
+			when TYPE_TEXT
+				if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
+					tree_append_point[tree_append_point.length - 1][1] += t[1]
+				else
+					tree_append_point.push t
+			when TYPE_OPEN_TAG
 				t[0] = TYPE_TAG
+				# convert attributes into a hash
 				attrs = {}
 				while t[2].length
 					a = t[2].pop()
 					attrs[a[0]] = a[1]
 				t[2] = attrs
+				tree_append_point.push t
 				tree_append_point = t[3]
+				# TODO implement stack of open elements
+				# TODO implement formatting elements thing
+			when TYPE_EOF
+				return
+			# TODO implement close tags
+			# TODO implement self-closing tags
+			else
+				console.log "UNIMPLEMENTED tag type: #{t[0]}"
 
 	# tree constructor initialization
 	tree = [] # see comments on TYPE_TAG/etc for the structure of this data
@@ -459,12 +519,13 @@ parse_html = (txt) ->
 	tok_state = tok_state_data
 
 	# process input
-	while cur < txt.length
+	loop
 		t = tok_state()
 		if t?
 			tree_state t
-
-	return tree
+			if t[0] is TYPE_EOF
+				return tree
+	return # never reached
 
 # everything below is tests on the above
 test_equals = (description, fn, args..., expected_output) ->
@@ -472,7 +533,9 @@ test_equals = (description, fn, args..., expected_output) ->
 	if output is expected_output
 		console.log "passed: #{description}."
 	else
-		console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
+		console.log "FAILED: #{description}..."
+		console.log "  Expected: #{expected_output}"
+		console.log "  Actual:   #{output}"
 html_to_json = (html) ->
 	return JSON.stringify parse_html html
 test_equals "empty", html_to_json, "", '[]'
@@ -483,3 +546,6 @@ test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[
 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=def g=\"hij\" klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt;=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt;=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt;=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
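
Note on the attribute entity exceptions (an illustrative sketch, not code from
the commit; "legacy" and "decode_legacy_in_attr" are made-up names, and the
two-entry table stands in for the full legacy_char_refs map). The new
tokenize_character_reference only decodes a legacy (no-";") reference inside
an attribute value when the next character is neither "=" nor alphanumeric,
which is what keeps URL query strings like "?t=1&lt=foo" intact:

	# stand-ins for the real tables in parse-html.coffee
	legacy = { 'lt': '<', 'amp': '&' }
	alnum = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'

	# decide whether a legacy ref named `name`, followed by `next_char`,
	# should be decoded inside an attribute value
	decode_legacy_in_attr = (name, next_char) ->
		return '&' + name unless legacy[name]?              # unknown ref: leave as-is
		return '&' + name if next_char is '='               # "&lt=" stays literal
		return '&' + name if alnum.indexOf(next_char) > -1  # "&ltx" stays literal
		legacy[name]                                        # otherwise decode

	console.log decode_legacy_in_attr 'lt', '='   # prints: &lt
	console.log decode_legacy_in_attr 'lt', 'x'   # prints: &lt
	console.log decode_legacy_in_attr 'lt', ' '   # prints: <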
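
Why the new when '' branches detect EOF (again a sketch, not commit code):
JavaScript's String::charAt returns the empty string for any index past the
end of the string, so txt.charAt(cur++) can use '' as an EOF sentinel and
the tokenizer never needs a separate cur < txt.length check:

	txt = 'ab'
	cur = 0
	loop
		c = txt.charAt(cur++)
		if c is ''           # past the end: charAt gives "", i.e. EOF
			console.log 'EOF'
			break
		console.log "read #{c}"
	# prints: read a, read b, EOF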