X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=db1837bd6d02bfc0ef5f6a5a7cd52e0a90750cf3;hb=50c30e8b7c0663890369776992b1f57876e6226e;hp=1ef077ad26323307c25ea6ee4ec2c338a2ff2d78;hpb=06466aac55914c23fc6ba986c28ccf069386767c;p=peach-html5-editor.git

diff --git a/parse-html.coffee b/parse-html.coffee
index 1ef077a..db1837b 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -32,6 +32,8 @@ TYPE_WHITESPACE = 2
 TYPE_COMMENT = 3
 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_CLOSE_TAG = 5 # name
+TYPE_EOF = 6
 
 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -154,11 +156,13 @@ parse_html = (txt) ->
 	cur = 0 # index of next char in txt to be parsed
 	# declare tree and tokenizer variables so they're in scope below
 	tree = null
-	tree_append_point = null
+	open_tags = [] # stack of open elements
 	tree_state = null
 	tok_state = null
 	tok_cur_tag = null # partially parsed tag
 
+	parse_error = ->
+		console.log "Parse error at character #{cur} of #{txt.length}"
 
 	# the functions below implement the tokenizer states described here:
 	# http://www.w3.org/TR/html5/syntax.html#tokenization
@@ -167,75 +171,21 @@ parse_html = (txt) ->
 	tok_state_data = ->
 		switch c = txt.charAt(cur++)
 			when '&'
-				tok_state = tok_state_character_reference_in_data
+				return [TYPE_TEXT, tokenize_character_reference()]
 			when '<'
 				tok_state = tok_state_tag_open
 			when "\u0000"
-				# Parse error
+				parse_error()
 				return [TYPE_TEXT, c]
+			when '' # EOF
+				return [TYPE_EOF]
 			else
 				return [TYPE_TEXT, c]
 		return null
 
 	# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
-	# & just got consumed
-	tok_state_character_reference_in_data = ->
-		tok_state = tok_state_data
-		if cur >= txt.length
-			return [TYPE_TEXT, '&']
-		switch c = txt.charAt(cur)
-			when ';'
-				return [TYPE_TEXT, '&']
-			when '#'
-				if cur + 1 >= txt.length
-					return [TYPE_TEXT, '&']
-				if txt.charAt(cur + 1).toLowerCase() is 'x'
-					prefix = '#x'
-					charset = hex_chars
-					start = cur + 2
-				else
-					charset = digits
-					start = cur + 1
-					prefix = '#'
-				i = 0
-				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
-					i += 1
-				if i is 0
-					return [TYPE_TEXT, '&']
-				if txt.charAt(start + i) is ';'
-					i += 1
-				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
-				if decoded?
-					cur = start + i
-					return [TYPE_TEXT, decoded]
-				return [TYPE_TEXT, '&']
-			else
-				for i in [0...31]
-					if alnum.indexOf(txt.charAt(cur + i)) is -1
-						break
-				if i is 0
-					return [TYPE_TEXT, '&']
-				if txt.charAt(cur + i) is ';'
-					i += 1 # include ';' terminator in value
-					decoded = decode_named_char_ref txt.substr(cur, i)
-					if decoded?
-						cur += i
-						return [TYPE_TEXT, decoded]
-					return [TYPE_TEXT, '&']
-				else
-					# no ';' terminator (only legacy char refs)
-					if i < 2 or i > 6
-						return [TYPE_TEXT, '&']
-					# FIXME: if we're inside an attribute:
-					# 1. don't parse refs that are followed by =
-					# 2. don't parse refs that are followed by alnum
-					max = i
-					for i in [2..max] # no prefix matches, so ok to check shortest first
-						c = legacy_char_refs[txt.substr(cur, i)]
-						if c?
-							cur += i # consume entity chars
-							return [TYPE_TEXT, c]
-		return null
+	# not needed: tok_state_character_reference_in_data = ->
+	# just call tokenize_character_reference()
 
 	# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 	tok_state_tag_open = ->
@@ -245,7 +195,7 @@ parse_html = (txt) ->
 			when '/'
 				tok_state = tok_state_end_tag_open
 			when '?'
-				# Parse error
+				parse_error()
 				tok_state = tok_state_bogus_comment
 			else
 				if lc_alpha.indexOf(c) > -1
@@ -255,12 +205,34 @@ parse_html = (txt) ->
 					tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
 					tok_state = tok_state_tag_name
 				else
-					# Parse error
+					parse_error()
 					tok_state = tok_state_data
 					cur -= 1 # we didn't parse/handle the char after <
 					return [TYPE_TEXT, '<']
 		return null
 
+	# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+	tok_state_end_tag_open = ->
+		switch c = txt.charAt(cur++)
+			when '>'
+				parse_error()
+				tok_state = tok_state_data
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				return [TYPE_TEXT, '</']
+			else
+				if uc_alpha.indexOf(c) > -1
+					tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []]
+					tok_state = tok_state_tag_name
+				else if lc_alpha.indexOf(c) > -1
+					tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
+					tok_state = tok_state_tag_name
+				else
+					parse_error()
+					tok_state = tok_state_bogus_comment
+		return null
+
 	# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 	tok_state_tag_name = ->
 		switch c = txt.charAt(cur++)
@@ -274,8 +246,11 @@ parse_html = (txt) ->
 				tok_state = tok_state_data
 				tmp = tok_cur_tag
 				tok_cur_tag = null
 				return tmp
 			when "\u0000"
-				# Parse error
+				parse_error()
 				tok_cur_tag[1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				if uc_alpha.indexOf(c) > -1
 					tok_cur_tag[1] += c.toLowerCase()
@@ -298,11 +273,14 @@ parse_html = (txt) ->
 				tok_cur_tag = null
 				return tmp
 			when "\u0000"
-				# Parse error
+				parse_error()
 				attr_name = "\ufffd"
 			when '"', "'", '<', '='
-				# Parse error
+				parse_error()
 				attr_name = c
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				if uc_alpha.indexOf(c) > -1
 					attr_name = c.toLowerCase()
@@ -328,13 +306,18 @@ parse_html = (txt) ->
 			tmp = tok_cur_tag
 			tok_cur_tag = null
 			return tmp
 			when "\u0000"
-				# Parse error
+				parse_error()
 				tok_cur_tag[2][0][0] += "\ufffd"
+			when '"', "'", '<'
+				parse_error()
+				tok_cur_tag[2][0][0] += c
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				if uc_alpha.indexOf(c) > -1
 					tok_cur_tag[2][0][0] += c.toLowerCase()
 				else
-					# Parse error if ", ' or <
 					tok_cur_tag[2][0][0] += c
 		return null
@@ -360,6 +343,9 @@ parse_html = (txt) ->
 				tmp = tok_cur_tag
 				tok_cur_tag = null
 				return tmp
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				tok_cur_tag[2][0][1] += c
 				tok_state = tok_state_attribute_value_unquoted
 		return null
@@ -371,11 +357,13 @@ parse_html = (txt) ->
 			when '"'
 				tok_state = tok_state_after_attribute_value_quoted
 			when '&'
-				tok_state = tok_state_character_reference_in_attribute_value
-				tok_char_ref_addl_allowed = '"' # FIXME
+				tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
 			when "\u0000"
 				# Parse error
 				tok_cur_tag[2][0][1] += "\ufffd"
@@ -386,11 +374,13 @@ parse_html = (txt) ->
 			when "'"
 				tok_state = tok_state_after_attribute_value_quoted
 			when '&'
-				tok_state = tok_state_character_reference_in_attribute_value
-				tok_char_ref_addl_allowed = "'" # FIXME
+				tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
 			when "\u0000"
 				# Parse error
 				tok_cur_tag[2][0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				tok_cur_tag[2][0][1] += c
 		return null
@@ -401,8 +391,7 @@ parse_html = (txt) ->
 			when "\t", "\n", "\u000c", ' '
 				tok_state = tok_state_before_attribute_name
 			when '&'
-				tok_state = tok_state_character_reference_in_attribute_value
-				tok_char_ref_addl_allowed = '>' # FIXME
+				tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
 			when '>'
 				tok_state = tok_state_data
 				tmp = tok_cur_tag
@@ -410,6 +399,9 @@ parse_html = (txt) ->
 				return tmp
 			when "\u0000"
 				tok_cur_tag[2][0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				# Parse Error if ', <, = or ` (backtick)
 				tok_cur_tag[2][0][1] += c
@@ -427,44 +419,147 @@ parse_html = (txt) ->
 				tmp = tok_cur_tag
 				tok_cur_tag = null
 				return tmp
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				# Parse Error
 				tok_state = tok_state_before_attribute_name
 				cur -= 1 # we didn't handle that char
 		return null
 
+	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+	# Don't set this as a state, just call it
+	# returns a string (NOT a text node)
+	tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+		if cur >= txt.length
+			return '&'
+		switch c = txt.charAt(cur)
+			when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+				# explicitly not a parse error
+				return '&'
+			when ';'
+				# there have to be "one or more" alnums between & and ; for this to be a parse error
+				return '&'
+			when '#'
+				if cur + 1 >= txt.length
+					return '&'
+				if txt.charAt(cur + 1).toLowerCase() is 'x'
+					prefix = '#x'
+					charset = hex_chars
+					start = cur + 2
+				else
+					charset = digits
+					start = cur + 1
+					prefix = '#'
+				i = 0
+				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+					i += 1
+				if i is 0
+					return '&'
+				if txt.charAt(start + i) is ';'
+					i += 1
+				# FIXME This is supposed to generate parse errors for some chars
+				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+				if decoded?
+					cur = start + i
+					return decoded
+				return '&'
+			else
+				for i in [0...31]
+					if alnum.indexOf(txt.charAt(cur + i)) is -1
+						break
+				if i is 0
+					# exit early, because parse_error() below needs at least one alnum
+					return '&'
+				if txt.charAt(cur + i) is ';'
+					i += 1 # include ';' terminator in value
+					decoded = decode_named_char_ref txt.substr(cur, i)
+					if decoded?
+						cur += i
+						return decoded
+					parse_error()
+					return '&'
+				else
+					# no ';' terminator (only legacy char refs)
+					max = i
+					for i in [2..max] # no prefix matches, so ok to check shortest first
+						c = legacy_char_refs[txt.substr(cur, i)]
+						if c?
+							if in_attr
+								if txt.charAt(cur + i) is '='
+									# "because some legacy user agents will
+									# misinterpret the markup in those cases"
+									parse_error()
+									return '&'
+								if alnum.indexOf(txt.charAt(cur + i)) > -1
+									# this makes attributes forgiving about url args
+									return '&'
+							# ok, and besides the weird exceptions for attributes...
+							# return the matching char
+							cur += i # consume entity chars
+							parse_error() # because no terminating ";"
+							return c
+					parse_error()
+					return '&'
+		return # never reached
 
 	# the functions below implement the Tree Construction algorithm here:
 	# http://www.w3.org/TR/html5/syntax.html#tree-construction
 
 	# FIXME this is just a bit of a hack that mostly works...
	# read the spec and do it that way
 	tree_append = (t) ->
-		if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
-			tree_append_point[tree_append_point.length - 1][1] += t[1]
-		else
-			tree_append_point.push t
-			if t[0] is TYPE_OPEN_TAG
+		switch t[0]
+			when TYPE_TEXT
+				if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
+					open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
+				else
+					open_tags[0][3].push t
+			when TYPE_OPEN_TAG
 				t[0] = TYPE_TAG
+				# convert attributes into a hash
 				attrs = {}
 				while t[2].length
 					a = t[2].pop()
 					attrs[a[0]] = a[1]
 				t[2] = attrs
-				tree_append_point = t[3]
+				# FIXME probs have to auto-close things first
+				open_tags[0][3].push t
+				open_tags.unshift t
+				# TODO implement formatting elements thing
+			when TYPE_CLOSE_TAG
+				# FIXME this is just a hack for now
+				if open_tags.length < 2
+					parse_error()
+					return
+				if open_tags[0][1] isnt t[1]
+					parse_error()
+					# fall through and close something anyway
+				open_tags.shift()
+			when TYPE_EOF
+				return
+				# TODO implement close tags properly
+				# TODO implement self-closing tags
+			else
+				console.log "UNIMPLEMENTED tag type: #{t[0]}"
+		return
 
 	# tree constructor initialization
-	tree = [] # see comments on TYPE_TAG/etc for the structure of this data
-	tree_append_point = tree
+	# see comments on TYPE_TAG/etc for the structure of this data
+	tree = [TYPE_TAG, 'html', {}, []]
+	open_tags = [tree]
 	tree_state = tree_append
 
 	# tokenizer initialization
 	tok_state = tok_state_data
 
 	# process input
-	while cur < txt.length
+	loop
 		t = tok_state()
 		if t?
 			tree_state t
-
-	return tree
+			if t[0] is TYPE_EOF
+				return tree[3]
+	return # never reached
 
 # everything below is tests on the above
 test_equals = (description, fn, args..., expected_output) ->
@@ -472,7 +567,9 @@ test_equals = (description, fn, args..., expected_output) ->
 	output = fn.apply this, args
 	if output is expected_output
 		console.log "passed: #{description}."
 	else
-		console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
+		console.log "FAILED: #{description}..."
+		console.log "  Expected: #{expected_output}"
+		console.log "  Actual: #{output}"
 html_to_json = (html) ->
 	return JSON.stringify parse_html html
 test_equals "empty", html_to_json, "", '[]'
 test_equals "numbered entity overrides", html_to_json, "1&#x80;&#X80; &#x83;", '[[1,"1€€ ƒ"]]'
 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g='hij' klm=nopqrstuv&quot; autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'