JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
parse unquoted and singlequoted attributes
[peach-html5-editor.git] / parse-html.coffee
index 7ed736e..1ef077a 100644 (file)
@@ -177,6 +177,66 @@ parse_html = (txt) ->
                                return [TYPE_TEXT, c]
                return null
 
+       # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+       # & just got consumed. Always returns a [TYPE_TEXT, ...] token: the decoded value, or a literal '&' when nothing decodes
+       tok_state_character_reference_in_data = ->
+               tok_state = tok_state_data # whatever happens below, the next char is handled by the data state
+               if cur >= txt.length
+                       return [TYPE_TEXT, '&']
+               switch c = txt.charAt(cur)
+                       when ';'
+                               return [TYPE_TEXT, '&']
+                       when '#' # numeric reference: '#' + digits, or '#x' + hex digits
+                               if cur + 1 >= txt.length
+                                       return [TYPE_TEXT, '&']
+                               if txt.charAt(cur + 1).toLowerCase() is 'x'
+                                       prefix = '#x'
+                                       charset = hex_chars
+                                       start = cur + 2
+                               else
+                                       charset = digits
+                                       start = cur + 1
+                                       prefix = '#'
+                               i = 0 # number of chars accepted from charset, starting at start
+                               while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+                                       i += 1
+                               if i is 0
+                                       return [TYPE_TEXT, '&']
+                               if txt.charAt(start + i) is ';'
+                                       i += 1 # include the optional ';' terminator in the lookup key
+                               decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+                               if decoded?
+                                       cur = start + i # consume the whole reference
+                                       return [TYPE_TEXT, decoded]
+                               return [TYPE_TEXT, '&']
+                       else
+                               for i in [0...31] # scan at most 31 alnum chars; i ends up as the length of the run
+                                       if alnum.indexOf(txt.charAt(cur + i)) is -1
+                                               break
+                               if i is 0
+                                       return [TYPE_TEXT, '&']
+                               if txt.charAt(cur + i) is ';'
+                                       i += 1 # include ';' terminator in value
+                                       decoded = decode_named_char_ref txt.substr(cur, i)
+                                       if decoded?
+                                               cur += i
+                                               return [TYPE_TEXT, decoded]
+                                       return [TYPE_TEXT, '&']
+                               else
+                                       # no ';' terminator (only legacy char refs)
+                                       if i < 2 or i > 6
+                                               return [TYPE_TEXT, '&']
+                                       # FIXME: if we're inside an attribute:
+                                       # 1.    don't parse refs that are followed by =
+                                       # 2.    don't parse refs that are followed by alnum
+                                       max = i
+                                       for i in [2..max] # no prefix matches, so ok to check shortest first
+                                               c = legacy_char_refs[txt.substr(cur, i)]
+                                               if c?
+                                                       cur += i # consume entity chars
+                                                       return [TYPE_TEXT, c]
+               return [TYPE_TEXT, '&'] # only reached when no legacy char ref matched: emit the consumed '&' instead of dropping it
+
        # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
        tok_state_tag_open = ->
                switch c = txt.charAt(cur++)
@@ -301,11 +361,8 @@ parse_html = (txt) ->
                                tok_cur_tag = null
                                return tmp
                        else
-                               if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag[2][0][1] += c.toLowerCase()
-                               else
-                                       # Parse error if ", ` or < (that's a backtick)
-                                       tok_cur_tag[2][0][1] += c
+                               tok_cur_tag[2][0][1] += c
+                               tok_state = tok_state_attribute_value_unquoted
                return null
 
        # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
@@ -319,11 +376,45 @@ parse_html = (txt) ->
                        when "\u0000"
                                # Parse error
                                tok_cur_tag[2][0][1] += "\ufffd"
-                               tok_state = tok_state_attribute_value_unquoted
                        else
                                tok_cur_tag[2][0][1] += c
                return null
 
+       # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
+       tok_state_attribute_value_single_quoted = ->
+               c = txt.charAt(cur++) # consume one character of the attribute value
+               if c is "'"
+                       tok_state = tok_state_after_attribute_value_quoted
+               else if c is '&'
+                       tok_char_ref_addl_allowed = "'" # FIXME
+                       tok_state = tok_state_character_reference_in_attribute_value
+               else if c is "\u0000"
+                       # Parse error
+                       tok_cur_tag[2][0][1] += "\ufffd"
+               else
+                       tok_cur_tag[2][0][1] += c
+               return null
+
+       # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
+       tok_state_attribute_value_unquoted = ->
+               c = txt.charAt(cur++) # consume one character of the attribute value
+               if c in ["\t", "\n", "\u000c", ' ']
+                       tok_state = tok_state_before_attribute_name
+               else if c is '&'
+                       tok_char_ref_addl_allowed = '>' # FIXME
+                       tok_state = tok_state_character_reference_in_attribute_value
+               else if c is '>'
+                       tmp = tok_cur_tag
+                       tok_cur_tag = null
+                       tok_state = tok_state_data
+                       return tmp
+               else if c is "\u0000"
+                       tok_cur_tag[2][0][1] += "\ufffd"
+               else
+                       # Parse Error if ', <, = or ` (backtick)
+                       tok_cur_tag[2][0][1] += c
+               return null
+
        # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
        tok_state_after_attribute_value_quoted = ->
                switch c = txt.charAt(cur++)
@@ -342,67 +433,6 @@ parse_html = (txt) ->
                                cur -= 1 # we didn't handle that char
                return null
 
-
-       # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
-       # & just got consumed
-       tok_state_character_reference_in_data = ->
-               tok_state = tok_state_data
-               if cur >= txt.length
-                       return [TYPE_TEXT, '&']
-               switch c = txt.charAt(cur)
-                       when ';'
-                               return [TYPE_TEXT, '&']
-                       when '#'
-                               if cur + 1 >= txt.length
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(cur + 1).toLowerCase() is 'x'
-                                       prefix = '#x'
-                                       charset = hex_chars
-                                       start = cur + 2
-                               else
-                                       charset = digits
-                                       start = cur + 1
-                                       prefix = '#'
-                               i = 0
-                               while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
-                                       i += 1
-                               if i is 0
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(start + i) is ';'
-                                       i += 1
-                               decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
-                               if decoded?
-                                       cur = start + i
-                                       return [TYPE_TEXT, decoded]
-                               return [TYPE_TEXT, '&']
-                       else
-                               for i in [0...31]
-                                       if alnum.indexOf(txt.charAt(cur + i)) is -1
-                                               break
-                               if i is 0
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(cur + i) is ';'
-                                       i += 1 # include ';' terminator in value
-                                       decoded = decode_named_char_ref txt.substr(cur, i)
-                                       if decoded?
-                                               cur += i
-                                               return [TYPE_TEXT, decoded]
-                                       return [TYPE_TEXT, '&']
-                               else
-                                       # no ';' terminator (only legacy char refs)
-                                       if i < 2 or i > 6
-                                               return [TYPE_TEXT, '&']
-                                       # FIXME: if we're inside an attribute:
-                                       # 1.    don't parse refs that are followed by =
-                                       # 2.    don't parse refs that are followed by alnum
-                                       max = i
-                                       for i in [2..max] # no prefix matches, so ok to check shortest first
-                                               c = legacy_char_refs[txt.substr(cur, i)]
-                                               if c?
-                                                       cur += i # consume entity chars
-                                                       return [TYPE_TEXT, c]
-               return null
-
        # the functions below impliment the Tree Contstruction algorithm here:
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
        # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
@@ -451,4 +481,5 @@ test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
-test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]'
+test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
+test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'