JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
parse unquoted and singlequoted attributes
[peach-html5-editor.git] / parse-html.coffee
index 318422b..1ef077a 100644 (file)
@@ -31,7 +31,7 @@ TYPE_TEXT = 1 # "text"
 TYPE_WHITESPACE = 2
 TYPE_COMMENT = 3
 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
-TYPE_OPEN_TAG = 4
+TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
 
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
@@ -157,13 +157,13 @@ parse_html = (txt) ->
        tree_append_point = null
        tree_state = null
        tok_state = null
-       tok_cur = null # partially parsed tag
+       tok_cur_tag = null # partially parsed tag
 
 
        # the functions below implement the tokenizer states described here:
        # http://www.w3.org/TR/html5/syntax.html#tokenization
 
-       # http://www.w3.org/TR/html5/syntax.html#data-state
+       # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
        tok_state_data = ->
                switch c = txt.charAt(cur++)
                        when '&'
@@ -177,53 +177,7 @@ parse_html = (txt) ->
                                return [TYPE_TEXT, c]
                return null
 
-       # http://www.w3.org/TR/html5/syntax.html#tag-open-state
-       tok_state_tag_open = ->
-               switch c = txt.charAt(cur++)
-                       when '!'
-                               tok_state = tok_state_markup_declaration_open
-                       when '/'
-                               tok_state = tok_state_end_tag_open
-                       when '?'
-                               # Parse error
-                               tok_state = tok_state_bogus_comment
-                       else
-                               if lc_alpha.indexOf(c) > -1
-                                       tok_cur = [TYPE_OPEN_TAG, c, {}, []]
-                                       tok_state = tok_state_tag_name
-                               else if uc_alpha.indexOf(c) > -1
-                                       tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []]
-                                       tok_state = tok_state_tag_name
-                               else
-                                       # Parse error
-                                       tok_state = tok_state_data
-                                       cur -= 1 # we didn't parse/handle the char after <
-                                       return [TYPE_TEXT, '<']
-               return null
-
-       # http://www.w3.org/TR/html5/syntax.html#tag-name-state
-       tok_state_tag_name = ->
-               switch c = txt.charAt(cur++)
-                       when "\t", "\n", ' '
-                               tok_state = tok_state_before_attribute_name
-                       when '/'
-                               tok_state = tok_state_self_closing_start_tag
-                       when '>'
-                               tok_state = tok_state_data
-                               tmp = tok_cur
-                               tok_cur = null
-                               return tmp
-                       when "\u0000"
-                               # Parse error
-                               tok_cur[1] += "\ufffd"
-                       else
-                               if uc_alpha.indexOf(c) > -1
-                                       tok_cur[1] += c.toLowerCase()
-                               else
-                                       tok_cur[1] += c
-               return null
-
-       # http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+       # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
        # & just got consumed
        tok_state_character_reference_in_data = ->
                tok_state = tok_state_data
@@ -283,6 +237,202 @@ parse_html = (txt) ->
                                                        return [TYPE_TEXT, c]
                return null
 
+       # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
+       tok_state_tag_open = -> # consumes one char and picks the next tokenizer state from it
+               switch c = txt.charAt(cur++)
+                       when '!'
+                               tok_state = tok_state_markup_declaration_open
+                       when '/'
+                               tok_state = tok_state_end_tag_open
+                       when '?'
+                               # Parse error
+                               tok_state = tok_state_bogus_comment
+                       else
+                               if lc_alpha.indexOf(c) > -1 # NOTE(review): charAt gives '' past the end and indexOf('') is 0, so EOF lands here - confirm the caller stops first
+                                       tok_cur_tag = [TYPE_OPEN_TAG, c, [], []] # new tag token; c is the first letter of its name
+                                       tok_state = tok_state_tag_name
+                               else if uc_alpha.indexOf(c) > -1
+                                       tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []] # same, with the first letter lowercased
+                                       tok_state = tok_state_tag_name
+                               else
+                                       # Parse error
+                                       tok_state = tok_state_data
+                                       cur -= 1 # we didn't parse/handle the char after <
+                                       return [TYPE_TEXT, '<'] # the '<' is returned as a text token
+               return null # this char did not produce a token
+
+       # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
+       tok_state_tag_name = -> # appends one char to the tag name, or leaves this state on whitespace, '/' or '>'
+               switch c = txt.charAt(cur++)
+                       when "\t", "\n", "\u000c", ' '
+                               tok_state = tok_state_before_attribute_name
+                       when '/'
+                               tok_state = tok_state_self_closing_start_tag
+                       when '>'
+                               tok_state = tok_state_data
+                               tmp = tok_cur_tag
+                               tok_cur_tag = null # clear the partial tag before returning it
+                               return tmp # the completed open-tag token
+                       when "\u0000"
+                               # Parse error
+                               tok_cur_tag[1] += "\ufffd" # NUL is replaced by U+FFFD
+                       else
+                               if uc_alpha.indexOf(c) > -1
+                                       tok_cur_tag[1] += c.toLowerCase() # tag name (index 1) is stored lowercased
+                               else
+                                       tok_cur_tag[1] += c
+               return null # tag not finished by this char
+
+       # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+       tok_state_before_attribute_name = -> # skips whitespace; any other char ends the tag or starts a new attribute
+               attr_name = null # stays null unless this char begins an attribute name
+               switch c = txt.charAt(cur++)
+                       when "\t", "\n", "\u000c", ' '
+                               return null
+                       when '/'
+                               tok_state = tok_state_self_closing_start_tag
+                               return null
+                       when '>'
+                               tok_state = tok_state_data
+                               tmp = tok_cur_tag
+                               tok_cur_tag = null
+                               return tmp # the completed open-tag token
+                       when "\u0000"
+                               # Parse error
+                               attr_name = "\ufffd"
+                       when '"', "'", '<', '='
+                               # Parse error
+                               attr_name = c # the char is still used as the start of the name
+                       else
+                               if uc_alpha.indexOf(c) > -1
+                                       attr_name = c.toLowerCase()
+                               else
+                                       attr_name = c
+               if attr_name?
+                       tok_cur_tag[2].unshift [attr_name, ''] # [name, value] goes to index 0, so the list is newest-first
+                       tok_state = tok_state_attribute_name
+               return null
+
+       # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
+       tok_state_attribute_name = -> # appends one char to the current attribute's name, or leaves this state
+               switch c = txt.charAt(cur++)
+                       when "\t", "\n", "\u000c", ' '
+                               tok_state = tok_state_after_attribute_name # NOTE(review): not defined in the visible hunks - confirm it exists
+                       when '/'
+                               tok_state = tok_state_self_closing_start_tag
+                       when '='
+                               tok_state = tok_state_before_attribute_value
+                       when '>'
+                               tok_state = tok_state_data
+                               tmp = tok_cur_tag
+                               tok_cur_tag = null
+                               return tmp # attribute keeps the '' value it was created with
+                       when "\u0000"
+                               # Parse error
+                               tok_cur_tag[2][0][0] += "\ufffd" # [2][0] is the attribute most recently unshifted; [0] is its name
+                       else
+                               if uc_alpha.indexOf(c) > -1
+                                       tok_cur_tag[2][0][0] += c.toLowerCase()
+                               else
+                                       # Parse error if ", ' or <
+                                       tok_cur_tag[2][0][0] += c
+               return null
+
+       # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+       tok_state_before_attribute_value = -> # skips whitespace, then selects the quoted or unquoted value state
+               switch c = txt.charAt(cur++)
+                       when "\t", "\n", "\u000c", ' '
+                               return null
+                       when '"'
+                               tok_state = tok_state_attribute_value_double_quoted
+                       when '&'
+                               tok_state = tok_state_attribute_value_unquoted
+                               cur -= 1 # back up so the unquoted state reads the '&' itself
+                       when "'"
+                               tok_state = tok_state_attribute_value_single_quoted
+                       when "\u0000"
+                               # Parse error
+                               tok_cur_tag[2][0][1] += "\ufffd" # [2][0][1] is the current attribute's value
+                               tok_state = tok_state_attribute_value_unquoted
+                       when '>'
+                               # Parse error
+                               tok_state = tok_state_data
+                               tmp = tok_cur_tag
+                               tok_cur_tag = null
+                               return tmp
+                       else
+                               tok_cur_tag[2][0][1] += c # first char of an unquoted value
+                               tok_state = tok_state_attribute_value_unquoted
+               return null
+
+       # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
+       tok_state_attribute_value_double_quoted = -> # appends chars to the current attribute's value until the closing "
+               switch c = txt.charAt(cur++)
+                       when '"'
+                               tok_state = tok_state_after_attribute_value_quoted
+                       when '&'
+                               tok_state = tok_state_character_reference_in_attribute_value
+                               tok_char_ref_addl_allowed = '"' # FIXME NOTE(review): unless declared in an enclosing scope this is local to the function and lost on return
+                       when "\u0000"
+                               # Parse error
+                               tok_cur_tag[2][0][1] += "\ufffd"
+                       else
+                               tok_cur_tag[2][0][1] += c
+               return null # this state never returns a token
+
+       # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
+       tok_state_attribute_value_single_quoted = -> # appends chars to the current attribute's value until the closing '
+               switch c = txt.charAt(cur++)
+                       when "'"
+                               tok_state = tok_state_after_attribute_value_quoted
+                       when '&'
+                               tok_state = tok_state_character_reference_in_attribute_value
+                               tok_char_ref_addl_allowed = "'" # FIXME NOTE(review): unless declared in an enclosing scope this is local to the function and lost on return
+                       when "\u0000"
+                               # Parse error
+                               tok_cur_tag[2][0][1] += "\ufffd"
+                       else
+                               tok_cur_tag[2][0][1] += c
+               return null # this state never returns a token
+
+       # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
+       tok_state_attribute_value_unquoted = -> # appends chars to the current attribute's value until whitespace or '>'
+               switch c = txt.charAt(cur++)
+                       when "\t", "\n", "\u000c", ' '
+                               tok_state = tok_state_before_attribute_name
+                       when '&'
+                               tok_state = tok_state_character_reference_in_attribute_value
+                               tok_char_ref_addl_allowed = '>' # FIXME NOTE(review): unless declared in an enclosing scope this is local to the function and lost on return
+                       when '>'
+                               tok_state = tok_state_data
+                               tmp = tok_cur_tag
+                               tok_cur_tag = null
+                               return tmp # the completed open-tag token
+                       when "\u0000"
+                               tok_cur_tag[2][0][1] += "\ufffd" # Parse error
+                       else
+                               # Parse Error if ", ', <, = or ` (backtick)
+                               tok_cur_tag[2][0][1] += c
+               return null
+
+       # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
+       tok_state_after_attribute_value_quoted = -> # handles the char right after a closing quote
+               switch c = txt.charAt(cur++)
+                       when "\t", "\n", "\u000c", ' '
+                               tok_state = tok_state_before_attribute_name
+                       when '/'
+                               tok_state = tok_state_self_closing_start_tag
+                       when '>'
+                               tok_state = tok_state_data
+                               tmp = tok_cur_tag
+                               tok_cur_tag = null
+                               return tmp # the completed open-tag token
+                       else
+                               # Parse Error
+                               tok_state = tok_state_before_attribute_name
+                               cur -= 1 # we didn't handle that char; the before-attribute-name state will read it again
+               return null
+
        # the functions below implement the Tree Construction algorithm here:
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
        # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
@@ -293,6 +443,11 @@ parse_html = (txt) ->
                        tree_append_point.push t
                        if t[0] is TYPE_OPEN_TAG
                                t[0] = TYPE_TAG
+                               attrs = {}
+                               while t[2].length
+                                       a = t[2].pop()
+                                       attrs[a[0]] = a[1]
+                               t[2] = attrs
                                tree_append_point = t[3]
 
        # tree constructor initialization
@@ -325,4 +480,6 @@ test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
 test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
-test_equals "open_tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
+test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]' # tag without attributes gets an empty attribute object
+test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]' # two double-quoted attributes, expected in source order
+test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]' # double-quoted, unquoted, single-quoted (containing a "), and valueless attribute