X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=db1837bd6d02bfc0ef5f6a5a7cd52e0a90750cf3;hb=50c30e8b7c0663890369776992b1f57876e6226e;hp=04733fbc6dcf33b149ed81c61094a78601c87dfc;hpb=cc2911623e5e9a50e19ae8233a990c2e633c218e;p=peach-html5-editor.git

diff --git a/parse-html.coffee b/parse-html.coffee
index 04733fb..db1837b 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -30,10 +30,16 @@ TYPE_TAG = 0 # name, {attributes}, [children]
 TYPE_TEXT = 1 # "text"
 TYPE_WHITESPACE = 2
 TYPE_COMMENT = 3
+# the following types are emited by the tokenizer, but shouldn't end up in the tree:
+TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_CLOSE_TAG = 5 # name
+TYPE_EOF = 6
 
-alnum = "abcdefghijklmnopqrstuvwxqzABCDEFGHIJKLMNOPQRSTUVWXQZ0123456789"
-hex_chars = "0123456789abcdefABCDEF"
+lc_alpha = "abcdefghijklmnopqrstuvwxqz"
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
 digits = "0123456789"
+alnum = lc_alpha + uc_alpha + digits
+hex_chars = digits + "abcdefABCDEF"
 
 # some SVG elements have dashes in them
 tag_name_chars = alnum + "-"
@@ -150,40 +156,294 @@ parse_html = (txt) ->
 	cur = 0 # index of next char in txt to be parsed
 	# declare tree and tokenizer variables so they're in scope below
 	tree = null
-	tree_append_point = null
+	open_tags = [] # stack of open elements
 	tree_state = null
 	tok_state = null
+	tok_cur_tag = null # partially parsed tag
 
+	parse_error = ->
+		console.log "Parse error at character #{cur} of #{txt.length}"
 
 	# the functions below implement the tokenizer stats described here:
 	# http://www.w3.org/TR/html5/syntax.html#tokenization
 
+	# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 	tok_state_data = ->
-		if cur >= txt.length
-			return null
 		switch c = txt.charAt(cur++)
 			when '&'
-				tok_state = tok_state_character_reference_in_data
+				return [TYPE_TEXT, tokenize_character_reference()]
 			when '<'
 				tok_state = tok_state_tag_open
 			when "\u0000"
-				# Parse error
+				parse_error()
 				return [TYPE_TEXT, c]
+			when '' # EOF
+				return [TYPE_EOF]
 			else
 				return [TYPE_TEXT, c]
 		return null
 
-	# & just got consumed
-	tok_state_character_reference_in_data = ->
-		tok_state = tok_state_data
+	# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+	# not needed: tok_state_character_reference_in_data = ->
+	# just call tok_state_character_reference_in_data()
+
+	# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
+	tok_state_tag_open = ->
+		switch c = txt.charAt(cur++)
+			when '!'
+				tok_state = tok_state_markup_declaration_open
+			when '/'
+				tok_state = tok_state_end_tag_open
+			when '?'
+				parse_error()
+				tok_state = tok_state_bogus_comment
+			else
+				if lc_alpha.indexOf(c) > -1
+					tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
+					tok_state = tok_state_tag_name
+				else if uc_alpha.indexOf(c) > -1
+					tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
+					tok_state = tok_state_tag_name
+				else
+					parse_error()
+					tok_state = tok_state_data
+					cur -= 1 # we didn't parse/handle the char after <
+					return [TYPE_TEXT, '<']
+		return null
+
+	# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+	tok_state_end_tag_open = ->
+		switch c = txt.charAt(cur++)
+			when '>'
+				parse_error()
+				tok_state = tok_state_data
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				return [TYPE_TEXT, '</']
+			else
+				if uc_alpha.indexOf(c) > -1
+					tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []]
+					tok_state = tok_state_tag_name
+				else if lc_alpha.indexOf(c) > -1
+					tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
+					tok_state = tok_state_tag_name
+				else
+					parse_error()
+					tok_state = tok_state_bogus_comment
+		return null
+
+	# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
+	tok_state_tag_name = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_before_attribute_name
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				parse_error()
+				tok_cur_tag[1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				if uc_alpha.indexOf(c) > -1
+					tok_cur_tag[1] += c.toLowerCase()
+				else
+					tok_cur_tag[1] += c
+		return null
+
+	# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+	tok_state_before_attribute_name = ->
+		attr_name = null
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				return null
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+				return null
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				parse_error()
+				attr_name = "\ufffd"
+			when '"', "'", '<', '='
+				parse_error()
+				attr_name = c
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				if uc_alpha.indexOf(c) > -1
+					attr_name = c.toLowerCase()
+				else
+					attr_name = c
+		if attr_name?
+			tok_cur_tag[2].unshift [attr_name, '']
+			tok_state = tok_state_attribute_name
+		return null
+
+	# 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
+	tok_state_attribute_name = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_after_attribute_name
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+			when '='
+				tok_state = tok_state_before_attribute_value
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				parse_error()
+				tok_cur_tag[2][0][0] = "\ufffd"
+			when '"', "'", '<'
+				parse_error()
+				tok_cur_tag[2][0][0] = c
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				if uc_alpha.indexOf(c) > -1
+					tok_cur_tag[2][0][0] = c.toLowerCase()
+				else
+					tok_cur_tag[2][0][0] += c
+		return null
+
+	# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+	tok_state_before_attribute_value = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				return null
+			when '"'
+				tok_state = tok_state_attribute_value_double_quoted
+			when '&'
+				tok_state = tok_state_attribute_value_unquoted
+				cur -= 1
+			when "'"
+				tok_state = tok_state_attribute_value_single_quoted
+			when "\u0000"
+				# Parse error
+				tok_cur_tag[2][0][1] += "\ufffd"
+				tok_state = tok_state_attribute_value_unquoted
+			when '>'
+				# Parse error
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				tok_cur_tag[2][0][1] += c
+				tok_state = tok_state_attribute_value_unquoted
+		return null
+
+	# 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
+	tok_state_attribute_value_double_quoted = ->
+		switch c = txt.charAt(cur++)
+			when '"'
+				tok_state = tok_state_after_attribute_value_quoted
+			when '&'
+				tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
+			when "\u0000"
+				# Parse error
+				tok_cur_tag[2][0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				tok_cur_tag[2][0][1] += c
+		return null
+
+	# 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
+	tok_state_attribute_value_single_quoted = ->
+		switch c = txt.charAt(cur++)
+			when "'"
+				tok_state = tok_state_after_attribute_value_quoted
+			when '&'
+				tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
+			when "\u0000"
+				# Parse error
+				tok_cur_tag[2][0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				tok_cur_tag[2][0][1] += c
+		return null
+
+	# 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
+	tok_state_attribute_value_unquoted = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_before_attribute_name
+			when '&'
+				tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				tok_cur_tag[2][0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				# Parse Error if ', <, = or ` (backtick)
+				tok_cur_tag[2][0][1] += c
+		return null
+
+	# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
+	tok_state_after_attribute_value_quoted = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_before_attribute_name
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				# Parse Error
+				tok_state = tok_state_before_attribute_name
+				cur -= 1 # we didn't handle that char
+		return null
+
+	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+	# Don't set this as a state, just call it
+	# returns a string (NOT a text node)
+	tokenize_character_reference = (allowed_char = null, in_attr = false) ->
 		if cur >= txt.length
-			return [TYPE_TEXT, '&']
+			return '&'
 		switch c = txt.charAt(cur)
+			when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+				# explicitly not a parse error
+				return '&'
 			when ';'
-				return [TYPE_TEXT, '&']
+				# there has to be "one or more" alnums between & and ; to be a parse error
+				return '&'
 			when '#'
 				if cur + 1 >= txt.length
-					return [TYPE_TEXT, '&']
+					return '&'
 				if txt.charAt(cur + 1).toLowerCase() is 'x'
 					prefix = '#x'
 					charset = hex_chars
@@ -196,66 +456,110 @@ parse_html = (txt) ->
 				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 					i += 1
 				if i is 0
-					return [TYPE_TEXT, '&']
+					return '&'
 				if txt.charAt(start + i) is ';'
 					i += 1
+				# FIXME This is supposed to generate parse errors for some chars
 				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 				if decoded?
 					cur = start + i
-					return [TYPE_TEXT, decoded]
-				return [TYPE_TEXT, '&']
+					return decoded
+				return '&'
 			else
 				for i in [0...31]
 					if alnum.indexOf(txt.charAt(cur + i)) is -1
 						break
 				if i is 0
-					return [TYPE_TEXT, '&']
+					# exit early, because parse_error() below needs at least one alnum
+					return '&'
 				if txt.charAt(cur + i) is ';'
 					i += 1 # include ';' terminator in value
 					decoded = decode_named_char_ref txt.substr(cur, i)
 					if decoded?
 						cur += i
-						return [TYPE_TEXT, decoded]
-					return [TYPE_TEXT, '&']
+						return decoded
+					parse_error()
+					return '&'
 				else
 					# no ';' terminator (only legacy char refs)
-					if i < 2 or i > 6
-						return [TYPE_TEXT, '&']
-					# FIXME: if we're inside an attribute:
-					# 1.	don't parse refs that are followed by =
-					# 2.	don't parse refs that are followed by alnum
 					max = i
 					for i in [2..max] # no prefix matches, so ok to check shortest first
 						c = legacy_char_refs[txt.substr(cur, i)]
 						if c?
+							if in_attr
+								if txt.charAt(cur + i) is '='
+									# "because some legacy user agents will
+									# misinterpret the markup in those cases"
+									parse_error()
+									return '&'
+								if alnum.indexOf(txt.charAt(cur + i)) > -1
+									# this makes attributes forgiving about url args
+									return '&'
+							# ok, and besides the weird exceptions for attributes...
+							# return the matching char
 							cur += i # consume entity chars
-							return [TYPE_TEXT, c]
-		return null
-				
+							parse_error() # because no terminating ";"
+							return c
+					parse_error()
+					return '&'
+		return # never reached
 
 	# the functions below impliment the Tree Contstruction algorithm here:
 	# http://www.w3.org/TR/html5/syntax.html#tree-construction
+	# FIXME this is just a bit of a hack that makes sense... read spec and do it that way
 	tree_append = (t) ->
-		if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
-			tree_append_point[tree_append_point.length - 1][1] += t[1]
-		else
-			tree_append_point.push t
+		switch t[0]
+			when TYPE_TEXT
+				if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
+					open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
+				else
+					open_tags[0][3].push t
+			when TYPE_OPEN_TAG
+				t[0] = TYPE_TAG
+				# convert attributes into a hash
+				attrs = {}
+				while t[2].length
+					a = t[2].pop()
+					attrs[a[0]] = a[1]
+				t[2] = attrs
+				# FIXME probs have to auto-close things first
+				open_tags[0][3].push t
+				open_tags.unshift t
+				# TODO implement formatting elements thing
+			when TYPE_CLOSE_TAG
+				# FIXME this is just a hack for now
+				if open_tags.length < 2
+					parse_error()
+					return
+				if open_tags[0][1] isnt t[1]
+					parse_error()
+					# fall through and close something anyway
+				open_tags.shift()
+			when TYPE_EOF
+				return
+			# TODO implement close tags
+			# TODO implement self-closing tags
+			else
+				console.log "UNIMPLEMENTED tag type: #{t[0]}"
+		return
 
 	# tree constructor initialization
-	tree = [] # see comments on TYPE_TAG/etc for the structure of this data
-	tree_append_point = tree
+	# see comments on TYPE_TAG/etc for the structure of this data
+	tree = [TYPE_TAG, 'html', {}, []]
+	open_tags = [tree]
 	tree_state = tree_append
 
 	# tokenizer initialization
 	tok_state = tok_state_data
 
 	# proccess input
-	while cur < txt.length
+	loop
 		t = tok_state()
 		if t?
 			tree_state t
-	
-	return tree
+			if t[0] is TYPE_EOF
+				return tree[3]
+	return # never reached
 
 # everything below is tests on the above
 test_equals = (description, fn, args..., expected_output) ->
@@ -263,7 +567,9 @@ test_equals = (description, fn, args..., expected_output) ->
 	if output is expected_output
 		console.log "passed: #{description}."
 	else
-		console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
+		console.log "FAILED: #{description}..."
+		console.log "   Expected: #{expected_output}"
+		console.log "     Actual: #{output}"
 html_to_json = (html) ->
 	return JSON.stringify parse_html html
 test_equals "empty", html_to_json, "", '[]'
@@ -271,3 +577,10 @@ test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
 test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1â¬â¬ Æ"]]'
+test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
+test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
+test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'