X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=1ef077ad26323307c25ea6ee4ec2c338a2ff2d78;hb=06466aac55914c23fc6ba986c28ccf069386767c;hp=7ed736ed1dae9720a73be6ebb849bf534b72d7a9;hpb=b1041cd8d6358a3dcc545cb25acac30fb87f281b;p=peach-html5-editor.git

diff --git a/parse-html.coffee b/parse-html.coffee
index 7ed736e..1ef077a 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -177,6 +177,66 @@ parse_html = (txt) ->
 				return [TYPE_TEXT, c]
 		return null
 
+	# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+	# '&' just got consumed. Always returns a text token: the decoded ref, or a literal '&' if none is valid
+	tok_state_character_reference_in_data = ->
+		tok_state = tok_state_data
+		if cur >= txt.length
+			return [TYPE_TEXT, '&']
+		switch c = txt.charAt(cur)
+			when ';'
+				return [TYPE_TEXT, '&']
+			when '#'
+				if cur + 1 >= txt.length
+					return [TYPE_TEXT, '&']
+				if txt.charAt(cur + 1).toLowerCase() is 'x'
+					prefix = '#x'
+					charset = hex_chars
+					start = cur + 2
+				else
+					charset = digits
+					start = cur + 1
+					prefix = '#'
+				i = 0
+				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+					i += 1
+				if i is 0
+					return [TYPE_TEXT, '&']
+				if txt.charAt(start + i) is ';'
+					i += 1
+				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+				if decoded?
+					cur = start + i
+					return [TYPE_TEXT, decoded]
+				return [TYPE_TEXT, '&']
+			else
+				for i in [0...31]
+					if cur + i >= txt.length or alnum.indexOf(txt.charAt(cur + i)) is -1
+						break
+				if i is 0
+					return [TYPE_TEXT, '&']
+				if txt.charAt(cur + i) is ';'
+					i += 1 # include ';' terminator in value
+					decoded = decode_named_char_ref txt.substr(cur, i)
+					if decoded?
+						cur += i
+						return [TYPE_TEXT, decoded]
+					return [TYPE_TEXT, '&']
+				else
+					# no ';' terminator (only legacy char refs)
+					if i < 2
+						return [TYPE_TEXT, '&']
+					# FIXME: if we're inside an attribute:
+					# 1.	don't parse refs that are followed by =
+					# 2.	don't parse refs that are followed by alnum
+					max = Math.min i, 6 # only try lengths 2..6; trailing alnums stay as text
+					for i in [2..max] # no prefix matches, so ok to check shortest first
+						c = legacy_char_refs[txt.substr(cur, i)]
+						if c?
+							cur += i # consume entity chars
+							return [TYPE_TEXT, c]
+		return [TYPE_TEXT, '&'] # no legacy ref matched: emit the '&' instead of dropping it
+
 	# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 	tok_state_tag_open = ->
 		switch c = txt.charAt(cur++)
@@ -301,11 +361,8 @@ parse_html = (txt) ->
 				tok_cur_tag = null
 				return tmp
 			else
-				if uc_alpha.indexOf(c) > -1
-					tok_cur_tag[2][0][1] += c.toLowerCase()
-				else
-					# Parse error if ", ` or < (that's a backtick)
-					tok_cur_tag[2][0][1] += c
+				tok_cur_tag[2][0][1] += c
+				tok_state = tok_state_attribute_value_unquoted
 		return null
 
 	# 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
@@ -319,11 +376,45 @@ parse_html = (txt) ->
 			when "\u0000"
 				# Parse error
 				tok_cur_tag[2][0][1] += "\ufffd"
-				tok_state = tok_state_attribute_value_unquoted
 			else
 				tok_cur_tag[2][0][1] += c
 		return null
 
+	# 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
+	tok_state_attribute_value_single_quoted = ->
+		c = txt.charAt(cur++)
+		if c is "'" # closing quote ends the value
+			tok_state = tok_state_after_attribute_value_quoted
+		else if c is '&' # hand off to the character reference state
+			tok_state = tok_state_character_reference_in_attribute_value
+			tok_char_ref_addl_allowed = "'" # FIXME
+		else if c is "\u0000"
+			# Parse error: NUL is stored as the replacement character
+			tok_cur_tag[2][0][1] += "\ufffd"
+		else
+			tok_cur_tag[2][0][1] += c
+		return null
+
+	# 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
+	tok_state_attribute_value_unquoted = ->
+		c = txt.charAt(cur++)
+		if c in ["\t", "\n", "\u000c", ' '] # whitespace ends the value
+			tok_state = tok_state_before_attribute_name
+		else if c is '&' # hand off to the character reference state
+			tok_state = tok_state_character_reference_in_attribute_value
+			tok_char_ref_addl_allowed = '>' # FIXME
+		else if c is '>' # end of tag: emit the finished tag token
+			tok_state = tok_state_data
+			tmp = tok_cur_tag
+			tok_cur_tag = null
+			return tmp
+		else if c is "\u0000"
+			tok_cur_tag[2][0][1] += "\ufffd"
+		else
+			# Parse Error if ", ', <, = or ` (backtick)
+			tok_cur_tag[2][0][1] += c
+		return null
+
 	# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
 	tok_state_after_attribute_value_quoted = ->
 		switch c = txt.charAt(cur++)
@@ -342,67 +433,6 @@ parse_html = (txt) ->
 				cur -= 1 # we didn't handle that char
 		return null
 
-
-	# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
-	# & just got consumed
-	tok_state_character_reference_in_data = ->
-		tok_state = tok_state_data
-		if cur >= txt.length
-			return [TYPE_TEXT, '&']
-		switch c = txt.charAt(cur)
-			when ';'
-				return [TYPE_TEXT, '&']
-			when '#'
-				if cur + 1 >= txt.length
-					return [TYPE_TEXT, '&']
-				if txt.charAt(cur + 1).toLowerCase() is 'x'
-					prefix = '#x'
-					charset = hex_chars
-					start = cur + 2
-				else
-					charset = digits
-					start = cur + 1
-					prefix = '#'
-				i = 0
-				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
-					i += 1
-				if i is 0
-					return [TYPE_TEXT, '&']
-				if txt.charAt(start + i) is ';'
-					i += 1
-				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
-				if decoded?
-					cur = start + i
-					return [TYPE_TEXT, decoded]
-				return [TYPE_TEXT, '&']
-			else
-				for i in [0...31]
-					if alnum.indexOf(txt.charAt(cur + i)) is -1
-						break
-				if i is 0
-					return [TYPE_TEXT, '&']
-				if txt.charAt(cur + i) is ';'
-					i += 1 # include ';' terminator in value
-					decoded = decode_named_char_ref txt.substr(cur, i)
-					if decoded?
-						cur += i
-						return [TYPE_TEXT, decoded]
-					return [TYPE_TEXT, '&']
-				else
-					# no ';' terminator (only legacy char refs)
-					if i < 2 or i > 6
-						return [TYPE_TEXT, '&']
-					# FIXME: if we're inside an attribute:
-					# 1.	don't parse refs that are followed by =
-					# 2.	don't parse refs that are followed by alnum
-					max = i
-					for i in [2..max] # no prefix matches, so ok to check shortest first
-						c = legacy_char_refs[txt.substr(cur, i)]
-						if c?
-							cur += i # consume entity chars
-							return [TYPE_TEXT, c]
-		return null
-
 	# the functions below impliment the Tree Contstruction algorithm here:
 	# http://www.w3.org/TR/html5/syntax.html#tree-construction
 	# FIXME this is just a bit of a hack that makes sense... read spec and do it that way
@@ -451,4 +481,5 @@ test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
-test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]'
+test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
+test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'