X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=db1837bd6d02bfc0ef5f6a5a7cd52e0a90750cf3;hb=50c30e8b7c0663890369776992b1f57876e6226e;hp=04733fbc6dcf33b149ed81c61094a78601c87dfc;hpb=cc2911623e5e9a50e19ae8233a990c2e633c218e;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 04733fb..db1837b 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -30,10 +30,16 @@ TYPE_TAG = 0 # name, {attributes}, [children] TYPE_TEXT = 1 # "text" TYPE_WHITESPACE = 2 TYPE_COMMENT = 3 +# the following types are emited by the tokenizer, but shouldn't end up in the tree: +TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children] +TYPE_CLOSE_TAG = 5 # name +TYPE_EOF = 6 -alnum = "abcdefghijklmnopqrstuvwxqzABCDEFGHIJKLMNOPQRSTUVWXQZ0123456789" -hex_chars = "0123456789abcdefABCDEF" +lc_alpha = "abcdefghijklmnopqrstuvwxqz" +uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" digits = "0123456789" +alnum = lc_alpha + uc_alpha + digits +hex_chars = digits + "abcdefABCDEF" # some SVG elements have dashes in them tag_name_chars = alnum + "-" @@ -150,40 +156,294 @@ parse_html = (txt) -> cur = 0 # index of next char in txt to be parsed # declare tree and tokenizer variables so they're in scope below tree = null - tree_append_point = null + open_tags = [] # stack of open elements tree_state = null tok_state = null + tok_cur_tag = null # partially parsed tag + parse_error = -> + console.log "Parse error at character #{cur} of #{txt.length}" # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization + # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state tok_state_data = -> - if cur >= txt.length - return null switch c = txt.charAt(cur++) when '&' - tok_state = tok_state_character_reference_in_data + return [TYPE_TEXT, tokenize_character_reference()] when '<' tok_state = tok_state_tag_open when "\u0000" - # Parse error + parse_error() return [TYPE_TEXT, c] + when '' # EOF + return [TYPE_EOF] else return [TYPE_TEXT, c] return null - # & just got consumed - tok_state_character_reference_in_data = -> - tok_state = tok_state_data + # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state + # not needed: tok_state_character_reference_in_data = -> + # just call tok_state_character_reference_in_data() + + # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state + tok_state_tag_open = -> + switch c = txt.charAt(cur++) + when '!' + tok_state = tok_state_markup_declaration_open + when '/' + tok_state = tok_state_end_tag_open + when '?' + parse_error() + tok_state = tok_state_bogus_comment + else + if lc_alpha.indexOf(c) > -1 + tok_cur_tag = [TYPE_OPEN_TAG, c, [], []] + tok_state = tok_state_tag_name + else if uc_alpha.indexOf(c) > -1 + tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []] + tok_state = tok_state_tag_name + else + parse_error() + tok_state = tok_state_data + cur -= 1 # we didn't parse/handle the char after < + return [TYPE_TEXT, '<'] + return null + + # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state + tok_state_end_tag_open = -> + switch c = txt.charAt(cur++) + when '>' + parse_error() + tok_state = tok_state_data + when '' # EOF + parse_error() + tok_state = tok_state_data + return [TYPE_TEXT, ' -1 + tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []] + tok_state = tok_state_tag_name + else if lc_alpha.indexOf(c) > -1 + tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []] + tok_state = tok_state_tag_name + else + parse_error() + tok_state = tok_state_bogus_comment + return null + + # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state + tok_state_tag_name = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + parse_error() + tok_cur_tag[1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + if uc_alpha.indexOf(c) > -1 + tok_cur_tag[1] += c.toLowerCase() + else + tok_cur_tag[1] += c + return null + + # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state + tok_state_before_attribute_name = -> + attr_name = null + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + return null + when '/' + tok_state = tok_state_self_closing_start_tag + return null + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + parse_error() + attr_name = "\ufffd" + when '"', "'", '<', '=' + parse_error() + attr_name = c + when '' # EOF + parse_error() + tok_state = tok_state_data + else + if uc_alpha.indexOf(c) > -1 + attr_name = c.toLowerCase() + else + attr_name = c + if attr_name? + tok_cur_tag[2].unshift [attr_name, ''] + tok_state = tok_state_attribute_name + return null + + # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state + tok_state_attribute_name = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_after_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '=' + tok_state = tok_state_before_attribute_value + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + parse_error() + tok_cur_tag[2][0][0] = "\ufffd" + when '"', "'", '<' + parse_error() + tok_cur_tag[2][0][0] = c + when '' # EOF + parse_error() + tok_state = tok_state_data + else + if uc_alpha.indexOf(c) > -1 + tok_cur_tag[2][0][0] = c.toLowerCase() + else + tok_cur_tag[2][0][0] += c + return null + + # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state + tok_state_before_attribute_value = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + return null + when '"' + tok_state = tok_state_attribute_value_double_quoted + when '&' + tok_state = tok_state_attribute_value_unquoted + cur -= 1 + when "'" + tok_state = tok_state_attribute_value_single_quoted + when "\u0000" + # Parse error + tok_cur_tag[2][0][1] += "\ufffd" + tok_state = tok_state_attribute_value_unquoted + when '>' + # Parse error + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when '' # EOF + parse_error() + tok_state = tok_state_data + else + tok_cur_tag[2][0][1] += c + tok_state = tok_state_attribute_value_unquoted + return null + + # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state + tok_state_attribute_value_double_quoted = -> + switch c = txt.charAt(cur++) + when '"' + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_cur_tag[2][0][1] += tokenize_character_reference '"', true + when "\u0000" + # Parse error + tok_cur_tag[2][0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + tok_cur_tag[2][0][1] += c + return null + + # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state + tok_state_attribute_value_single_quoted = -> + switch c = txt.charAt(cur++) + when "'" + tok_state = tok_state_after_attribute_value_quoted + when '&' + tok_cur_tag[2][0][1] += tokenize_character_reference "'", true + when "\u0000" + # Parse error + tok_cur_tag[2][0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + tok_cur_tag[2][0][1] += c + return null + + # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state + tok_state_attribute_value_unquoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '&' + tok_cur_tag[2][0][1] += tokenize_character_reference '>', true + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when "\u0000" + tok_cur_tag[2][0][1] += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + else + # Parse Error if ', <, = or ` (backtick) + tok_cur_tag[2][0][1] += c + return null + + # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state + tok_state_after_attribute_value_quoted = -> + switch c = txt.charAt(cur++) + when "\t", "\n", "\u000c", ' ' + tok_state = tok_state_before_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '>' + tok_state = tok_state_data + tmp = tok_cur_tag + tok_cur_tag = null + return tmp + when '' # EOF + parse_error() + tok_state = tok_state_data + else + # Parse Error + tok_state = tok_state_before_attribute_name + cur -= 1 # we didn't handle that char + return null + + # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference + # Don't set this as a state, just call it + # returns a string (NOT a text node) + tokenize_character_reference = (allowed_char = null, in_attr = false) -> if cur >= txt.length - return [TYPE_TEXT, '&'] + return '&' switch c = txt.charAt(cur) + when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char + # explicitly not a parse error + return '&' when ';' - return [TYPE_TEXT, '&'] + # there has to be "one or more" alnums between & and ; to be a parse error + return '&' when '#' if cur + 1 >= txt.length - return [TYPE_TEXT, '&'] + return '&' if txt.charAt(cur + 1).toLowerCase() is 'x' prefix = '#x' charset = hex_chars @@ -196,66 +456,110 @@ parse_html = (txt) -> while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1 i += 1 if i is 0 - return [TYPE_TEXT, '&'] + return '&' if txt.charAt(start + i) is ';' i += 1 + # FIXME This is supposed to generate parse errors for some chars decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase()) if decoded? cur = start + i - return [TYPE_TEXT, decoded] - return [TYPE_TEXT, '&'] + return decoded + return '&' else for i in [0...31] if alnum.indexOf(txt.charAt(cur + i)) is -1 break if i is 0 - return [TYPE_TEXT, '&'] + # exit early, because parse_error() below needs at least one alnum + return '&' if txt.charAt(cur + i) is ';' i += 1 # include ';' terminator in value decoded = decode_named_char_ref txt.substr(cur, i) if decoded? cur += i - return [TYPE_TEXT, decoded] - return [TYPE_TEXT, '&'] + return decoded + parse_error() + return '&' else # no ';' terminator (only legacy char refs) - if i < 2 or i > 6 - return [TYPE_TEXT, '&'] - # FIXME: if we're inside an attribute: - # 1. don't parse refs that are followed by = - # 2. don't parse refs that are followed by alnum max = i for i in [2..max] # no prefix matches, so ok to check shortest first c = legacy_char_refs[txt.substr(cur, i)] if c? + if in_attr + if txt.charAt(cur + i) is '=' + # "because some legacy user agents will + # misinterpret the markup in those cases" + parse_error() + return '&' + if alnum.indexOf(txt.charAt(cur + i)) > -1 + # this makes attributes forgiving about url args + return '&' + # ok, and besides the weird exceptions for attributes... + # return the matching char cur += i # consume entity chars - return [TYPE_TEXT, c] - return null - + parse_error() # because no terminating ";" + return c + parse_error() + return '&' + return # never reached # the functions below impliment the Tree Contstruction algorithm here: # http://www.w3.org/TR/html5/syntax.html#tree-construction + # FIXME this is just a bit of a hack that makes sense... read spec and do it that way tree_append = (t) -> - if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT - tree_append_point[tree_append_point.length - 1][1] += t[1] - else - tree_append_point.push t + switch t[0] + when TYPE_TEXT + if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT + open_tags[0][3][open_tags[0][3].length - 1][1] += t[1] + else + open_tags[0][3].push t + when TYPE_OPEN_TAG + t[0] = TYPE_TAG + # convert attributes into a hash + attrs = {} + while t[2].length + a = t[2].pop() + attrs[a[0]] = a[1] + t[2] = attrs + # FIXME probs have to auto-close things first + open_tags[0][3].push t + open_tags.unshift t + # TODO implement formatting elements thing + when TYPE_CLOSE_TAG + # FIXME this is just a hack for now + if open_tags.length < 2 + parse_error() + return + if open_tags[0][1] isnt t[1] + parse_error() + # fall through and close something anyway + open_tags.shift() + when TYPE_EOF + return + # TODO implement close tags + # TODO implement self-closing tags + else + console.log "UNIMPLEMENTED tag type: #{t[0]}" + return # tree constructor initialization - tree = [] # see comments on TYPE_TAG/etc for the structure of this data - tree_append_point = tree + # see comments on TYPE_TAG/etc for the structure of this data + tree = [TYPE_TAG, 'html', {}, []] + open_tags = [tree] tree_state = tree_append # tokenizer initialization tok_state = tok_state_data # proccess input - while cur < txt.length + loop t = tok_state() if t? tree_state t - - return tree + if t[0] is TYPE_EOF + return tree[3] + return # never reached # everything below is tests on the above test_equals = (description, fn, args..., expected_output) -> @@ -263,7 +567,9 @@ test_equals = (description, fn, args..., expected_output) -> if output is expected_output console.log "passed: #{description}." else - console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}" + console.log "FAILED: #{description}..." + console.log " Expected: #{expected_output}" + console.log " Actual: #{output}" html_to_json = (html) -> return JSON.stringify parse_html html test_equals "empty", html_to_json, "", '[]' @@ -271,3 +577,10 @@ test_equals "just text", html_to_json, "abc", '[[1,"abc"]]' test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]' test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]' test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]' +test_equals "open tag", html_to_json, "foobar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]' +test_equals "open tag with attributes", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]' +test_equals "open tag with attributes of various quotings", html_to_json, "foobar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]' +test_equals "attribute entity exceptions dq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]' +test_equals "attribute entity exceptions sq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]' +test_equals "attribute entity exceptions uq", html_to_json, "foobar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]' +test_equals "matching closing tags", html_to_json, "foohi
1
foo
2
bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'