From b1041cd8d6358a3dcc545cb25acac30fb87f281b Mon Sep 17 00:00:00 2001
From: Jason Woofenden <jason@jasonwoof.com>
Date: Sun, 13 Dec 2015 21:36:29 -0500
Subject: [PATCH] parsing some attributes

---
 parse-html.coffee |  156 +++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 141 insertions(+), 15 deletions(-)

diff --git a/parse-html.coffee b/parse-html.coffee
index 318422b..7ed736e 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -31,7 +31,7 @@ TYPE_TEXT = 1 # "text"
 TYPE_WHITESPACE = 2
 TYPE_COMMENT = 3
 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
-TYPE_OPEN_TAG = 4
+TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
 
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
@@ -157,13 +157,13 @@ parse_html = (txt) ->
 	tree_append_point = null
 	tree_state = null
 	tok_state = null
-	tok_cur = null # partially parsed tag
+	tok_cur_tag = null # partially parsed tag
 
 
 	# the functions below implement the tokenizer stats described here:
 	# http://www.w3.org/TR/html5/syntax.html#tokenization
 
-	# http://www.w3.org/TR/html5/syntax.html#data-state
+	# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 	tok_state_data = ->
 		switch c = txt.charAt(cur++)
 			when '&'
@@ -177,7 +177,7 @@ parse_html = (txt) ->
 				return [TYPE_TEXT, c]
 		return null
 
-	# http://www.w3.org/TR/html5/syntax.html#tag-open-state
+	# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 	tok_state_tag_open = ->
 		switch c = txt.charAt(cur++)
 			when '!'
@@ -189,10 +189,10 @@ parse_html = (txt) ->
 				tok_state = tok_state_bogus_comment
 			else
 				if lc_alpha.indexOf(c) > -1
-					tok_cur = [TYPE_OPEN_TAG, c, {}, []]
+					tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
 					tok_state = tok_state_tag_name
 				else if uc_alpha.indexOf(c) > -1
-					tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []]
+					tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
 					tok_state = tok_state_tag_name
 				else
 					# Parse error
@@ -201,29 +201,149 @@ parse_html = (txt) ->
 					return [TYPE_TEXT, '<']
 		return null
 
-	# http://www.w3.org/TR/html5/syntax.html#tag-name-state
+	# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 	tok_state_tag_name = ->
 		switch c = txt.charAt(cur++)
-			when "\t", "\n", ' '
+			when "\t", "\n", "\u000c", ' '
 				tok_state = tok_state_before_attribute_name
 			when '/'
 				tok_state = tok_state_self_closing_start_tag
 			when '>'
 				tok_state = tok_state_data
-				tmp = tok_cur
-				tok_cur = null
+				tmp = tok_cur_tag
+				tok_cur_tag = null
 				return tmp
 			when "\u0000"
 				# Parse error
-				tok_cur[1] += "\ufffd"
+				tok_cur_tag[1] += "\ufffd"
 			else
 				if uc_alpha.indexOf(c) > -1
-					tok_cur[1] += c.toLowerCase()
+					tok_cur_tag[1] += c.toLowerCase()
 				else
-					tok_cur[1] += c
+					tok_cur_tag[1] += c
 		return null
 
-	# http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+	# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+	tok_state_before_attribute_name = -> # skips whitespace, then starts a new attribute
+		attr_name = null # stays null unless this char begins an attribute name
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				return null
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+				return null
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				# Parse error: NUL starts the attribute name as U+FFFD
+				attr_name = "\ufffd"
+			when '"', "'", '<', '='
+				# Parse error, but the char still becomes the start of the name
+				attr_name = c
+			else
+				if uc_alpha.indexOf(c) > -1
+					attr_name = c.toLowerCase()
+				else
+					attr_name = c
+		if attr_name?
+			tok_cur_tag[2].unshift [attr_name, ''] # newest attribute lives at index 0
+			tok_state = tok_state_attribute_name
+		return null
+
+	# 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
+	tok_state_attribute_name = -> # extends the name of the attribute at tok_cur_tag[2][0]
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_after_attribute_name
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+			when '='
+				tok_state = tok_state_before_attribute_value
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				# Parse error: NUL is appended to the name as U+FFFD
+				tok_cur_tag[2][0][0] += "\ufffd"
+			else
+				if uc_alpha.indexOf(c) > -1
+					tok_cur_tag[2][0][0] += c.toLowerCase()
+				else
+					# Parse error if ", ' or < (appended to the name regardless)
+					tok_cur_tag[2][0][0] += c
+		return null
+
+	# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+	tok_state_before_attribute_value = -> # after '=': picks quoted vs unquoted value state
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				return null
+			when '"'
+				tok_state = tok_state_attribute_value_double_quoted
+			when '&'
+				tok_state = tok_state_attribute_value_unquoted
+				cur -= 1 # reconsume the '&' in the unquoted state
+			when "'"
+				tok_state = tok_state_attribute_value_single_quoted
+			when "\u0000"
+				# Parse error: NUL becomes U+FFFD and begins an unquoted value
+				tok_cur_tag[2][0][1] += "\ufffd"
+				tok_state = tok_state_attribute_value_unquoted
+			when '>'
+				# Parse error: tag ends with an empty attribute value
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			else
+				# Parse error if <, = or ` (that's a backtick)
+				# Values are case-sensitive (never lowercased), and this char
+				# is the first of an unquoted value, so continue in that state
+				tok_cur_tag[2][0][1] += c
+				tok_state = tok_state_attribute_value_unquoted
+		return null
+
+	# 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
+	tok_state_attribute_value_double_quoted = -> # accumulates the value up to the closing "
+		switch c = txt.charAt(cur++)
+			when '"'
+				tok_state = tok_state_after_attribute_value_quoted
+			when '&'
+				tok_state = tok_state_character_reference_in_attribute_value
+				tok_char_ref_addl_allowed = '"' # FIXME
+			when "\u0000"
+				# Parse error: NUL becomes U+FFFD; we stay in this state because
+				# the closing quote has not been seen yet
+				tok_cur_tag[2][0][1] += "\ufffd"
+			else
+				tok_cur_tag[2][0][1] += c
+		return null
+
+	# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
+	tok_state_after_attribute_value_quoted = -> # handles the char after a closing quote
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_before_attribute_name
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			else
+				# Parse error: back up so the char is reprocessed as an attribute name
+				tok_state = tok_state_before_attribute_name
+				cur -= 1 # we didn't handle that char
+		return null
+
+
+	# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
 	# & just got consumed
 	tok_state_character_reference_in_data = ->
 		tok_state = tok_state_data
@@ -293,6 +413,11 @@ parse_html = (txt) ->
 			tree_append_point.push t
 			if t[0] is TYPE_OPEN_TAG
 				t[0] = TYPE_TAG
+				attrs = {}
+				while t[2].length
+					a = t[2].pop()
+					attrs[a[0]] = a[1]
+				t[2] = attrs
 				tree_append_point = t[3]
 
 	# tree constructor initialization
@@ -325,4 +450,5 @@ test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
 test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
 test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
 test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1â¬â¬ Æ"]]'
-test_equals "open_tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
+test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
+test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]'
-- 
1.7.10.4