From: Jason Woofenden Date: Mon, 14 Dec 2015 01:49:40 +0000 (-0500) Subject: start parsing open tags X-Git-Url: https://jasonwoof.com/gitweb/?a=commitdiff_plain;h=47d40ff2cb949e10270189a1b902d6ce7f4bf1f0;p=peach-html5-editor.git start parsing open tags --- diff --git a/parse-html.coffee b/parse-html.coffee index 04733fb..318422b 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -30,10 +30,14 @@ TYPE_TAG = 0 # name, {attributes}, [children] TYPE_TEXT = 1 # "text" TYPE_WHITESPACE = 2 TYPE_COMMENT = 3 +# the following types are emited by the tokenizer, but shouldn't end up in the tree: +TYPE_OPEN_TAG = 4 -alnum = "abcdefghijklmnopqrstuvwxqzABCDEFGHIJKLMNOPQRSTUVWXQZ0123456789" -hex_chars = "0123456789abcdefABCDEF" +lc_alpha = "abcdefghijklmnopqrstuvwxqz" +uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" digits = "0123456789" +alnum = lc_alpha + uc_alpha + digits +hex_chars = digits + "abcdefABCDEF" # some SVG elements have dashes in them tag_name_chars = alnum + "-" @@ -153,14 +157,14 @@ parse_html = (txt) -> tree_append_point = null tree_state = null tok_state = null + tok_cur = null # partially parsed tag # the functions below implement the tokenizer stats described here: # http://www.w3.org/TR/html5/syntax.html#tokenization + # http://www.w3.org/TR/html5/syntax.html#data-state tok_state_data = -> - if cur >= txt.length - return null switch c = txt.charAt(cur++) when '&' tok_state = tok_state_character_reference_in_data @@ -173,6 +177,53 @@ parse_html = (txt) -> return [TYPE_TEXT, c] return null + # http://www.w3.org/TR/html5/syntax.html#tag-open-state + tok_state_tag_open = -> + switch c = txt.charAt(cur++) + when '!' + tok_state = tok_state_markup_declaration_open + when '/' + tok_state = tok_state_end_tag_open + when '?' 
+ # Parse error + tok_state = tok_state_bogus_comment + else + if lc_alpha.indexOf(c) > -1 + tok_cur = [TYPE_OPEN_TAG, c, {}, []] + tok_state = tok_state_tag_name + else if uc_alpha.indexOf(c) > -1 + tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []] + tok_state = tok_state_tag_name + else + # Parse error + tok_state = tok_state_data + cur -= 1 # we didn't parse/handle the char after < + return [TYPE_TEXT, '<'] + return null + + # http://www.w3.org/TR/html5/syntax.html#tag-name-state + tok_state_tag_name = -> + switch c = txt.charAt(cur++) + when "\t", "\n", ' ' + tok_state = tok_state_before_attribute_name + when '/' + tok_state = tok_state_self_closing_start_tag + when '>' + tok_state = tok_state_data + tmp = tok_cur + tok_cur = null + return tmp + when "\u0000" + # Parse error + tok_cur[1] += "\ufffd" + else + if uc_alpha.indexOf(c) > -1 + tok_cur[1] += c.toLowerCase() + else + tok_cur[1] += c + return null + + # http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state # & just got consumed tok_state_character_reference_in_data = -> tok_state = tok_state_data @@ -231,15 +282,18 @@ parse_html = (txt) -> cur += i # consume entity chars return [TYPE_TEXT, c] return null - # the functions below impliment the Tree Contstruction algorithm here: # http://www.w3.org/TR/html5/syntax.html#tree-construction + # FIXME this is just a bit of a hack that makes sense... read spec and do it that way tree_append = (t) -> if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT tree_append_point[tree_append_point.length - 1][1] += t[1] else tree_append_point.push t + if t[0] is TYPE_OPEN_TAG + t[0] = TYPE_TAG + tree_append_point = t[3] # tree constructor initialization tree = [] # see comments on TYPE_TAG/etc for the structure of this data @@ -254,7 +308,7 @@ parse_html = (txt) -> t = tok_state() if t? 
+ tree_state t - + return tree # everything below is tests on the above @@ -271,3 +325,4 @@ test_equals "just text", html_to_json, "abc", '[[1,"abc"]]' test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]' test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]' test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]' +test_equals "open_tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'