From adc7477c34f3a2aa480e7f2af5ea954d2421d000 Mon Sep 17 00:00:00 2001
From: Jason Woofenden <jason@jasonwoof.com>
Date: Tue, 22 Dec 2015 17:30:45 -0500
Subject: [PATCH] implement rest of tokenizer states

---
 parse-html.coffee |  421 +++++++++++++++++++++++++++++++++++++++++++++++++----
 test.coffee       |   11 +-
 2 files changed, 398 insertions(+), 34 deletions(-)

diff --git a/parse-html.coffee b/parse-html.coffee
index e193118..f64c734 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -181,6 +181,11 @@ digits = "0123456789"
 alnum = lc_alpha + uc_alpha + digits
 hex_chars = digits + "abcdefABCDEF"
 
+is_uc_alpha = (str) -> # single-character membership test in uc_alpha; the length check rejects '' (EOF), which indexOf would match at 0
+	return str.length is 1 and uc_alpha.indexOf(str) >= 0
+is_lc_alpha = (str) -> # single-character membership test in lc_alpha; the length check rejects '' (EOF), which indexOf would match at 0
+	return str.length is 1 and lc_alpha.indexOf(str) >= 0
+
 # some SVG elements have dashes in them
 tag_name_chars = alnum + "-"
 
@@ -191,6 +196,15 @@ is_space = (txt) ->
 is_space_tok = (t) ->
 	return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 
+is_input_hidden_tok = (t) -> # true iff t is a start tag whose first "type" entry in attrs_a has the value "hidden" (case-insensitive)
+	return false unless t.type is TYPE_START_TAG
+	for a in t.attrs_a # "in" walks the [name, value] pairs; "of" would yield the index keys instead
+		if a[0] is 'type'
+			if a[1].toLowerCase() is 'hidden'
+				return true
+			return false # only the first "type" entry is considered
+	return false
+
 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 
@@ -450,7 +464,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		for t in open_els
 			if t.name is tag_name and (namespace is null or namespace is t.namespace)
 				return true
-			if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
+			if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
 				return false
 		return false
 	# this checks for a particular element, not by name
@@ -499,9 +513,11 @@ parse_html = (txt, parse_error_cb = null) ->
 		return
 	clear_afe_to_marker = ->
 		loop
+			return unless afe.length > 0 # this happens in fragment case, ?spec error
 			el = afe.shift()
 			if el.type is TYPE_AFE_MARKER
 				return
+		return
 
 	# 8.2.3.1 ...
 	# http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
@@ -1018,7 +1034,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				# last template's template contents, after its last child (if
 				# any), and abort these substeps.
 				if last_template and (last_table is null or last_template_i < last_table_i)
-					target = template # fixfull should be it's contents
+					target = last_template # fixfull should be it's contents
 					target_i = target.children.length
 					break
 				# 4. If there is no last table, then let adjusted insertion
@@ -1272,7 +1288,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			return
 		if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
 			insert_html_element t
-			insertion_mode = in_head_noscript # FIXME implement
+			insertion_mode = ins_mode_in_head_noscript # FIXME implement
 			return
 		if t.type is TYPE_START_TAG and t.name is 'script'
 			ail = adjusted_insertion_location()
@@ -1614,7 +1630,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					when 'style', 'script', 'template'
 						ins_mode_in_head t
 					when 'input'
-						if token_is_input_hidden t
+						if is_input_hidden_tok t
 							ins_mode_in_table_else t
 						else
 							parse_error()
@@ -1691,7 +1707,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					if el.name is 'caption'
 						break
 				clear_afe_to_marker()
-				insertion_mode = in_table
+				insertion_mode = ins_mode_in_table
 			else
 				parse_error()
 				# fragment case
@@ -1704,7 +1720,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					if el.name is 'caption'
 						break
 				clear_afe_to_marker()
-				insertion_mode = in_table
+				insertion_mode = ins_mode_in_table
 				insertion_mode t
 			# else fragment case
 			return
@@ -1735,7 +1751,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			return
 		if t.type is TYPE_END_TAG and t.name is 'colgroup'
 			if open_els[0].name is 'colgroup'
-				open_els[0].shift()
+				open_els.shift()
 				insertion_mode = ins_mode_in_table
 			else
 				parse_error()
@@ -2049,7 +2065,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		if t.type is TYPE_END_TAG
 			parse_error()
 			return
-		if t.type is EOF
+		if t.type is TYPE_EOF
 			unless template_tag_is_open()
 				stop_parsing()
 				return
@@ -2120,7 +2136,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			open_els.shift()
 			t.acknowledge_self_closing()
 			return
-		if t.type is TYPE_START TAG and t.name is 'noframes'
+		if t.type is TYPE_START_TAG and t.name is 'noframes'
 			ins_mode_in_head t
 			return
 		if t.type is TYPE_EOF
@@ -2291,10 +2307,10 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_cur_tag = new_comment_token '?'
 				tok_state = tok_state_bogus_comment
 			else
-				if lc_alpha.indexOf(c) > -1
+				if is_lc_alpha(c)
 					tok_cur_tag = new_open_tag c
 					tok_state = tok_state_tag_name
-				else if uc_alpha.indexOf(c) > -1
+				else if is_uc_alpha(c)
 					tok_cur_tag = new_open_tag c.toLowerCase()
 					tok_state = tok_state_tag_name
 				else
@@ -2315,10 +2331,10 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_data
 				return new_text_node '</'
 			else
-				if uc_alpha.indexOf(c) > -1
+				if is_uc_alpha(c)
 					tok_cur_tag = new_end_tag c.toLowerCase()
 					tok_state = tok_state_tag_name
-				else if lc_alpha.indexOf(c) > -1
+				else if is_lc_alpha(c)
 					tok_cur_tag = new_end_tag c
 					tok_state = tok_state_tag_name
 				else
@@ -2346,7 +2362,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				parse_error()
 				tok_state = tok_state_data
 			else
-				if uc_alpha.indexOf(c) > -1
+				if is_uc_alpha(c)
 					tok_cur_tag.name += c.toLowerCase()
 				else
 					tok_cur_tag.name += c
@@ -2367,12 +2383,12 @@ parse_html = (txt, parse_error_cb = null) ->
 	# 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
 	tok_state_rcdata_end_tag_open = ->
 		c = txt.charAt(cur++)
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag = new_end_tag c.toLowerCase()
 			temporary_buffer += c
 			tok_state = tok_state_rcdata_end_tag_name
 			return null
-		if lc_alpha.indexOf(c) > -1
+		if is_lc_alpha(c)
 			tok_cur_tag = new_end_tag c
 			temporary_buffer += c
 			tok_state = tok_state_rcdata_end_tag_name
@@ -2410,11 +2426,11 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_data
 				return tok_cur_tag
 			# else fall through to "Anything else"
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			temporary_buffer += c
 			return null
-		if lc_alpha.indexOf(c) > -1
+		if is_lc_alpha(c)
 			tok_cur_tag.name += c
 			temporary_buffer += c
 			return null
@@ -2438,12 +2454,12 @@ parse_html = (txt, parse_error_cb = null) ->
 	# 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
 	tok_state_rawtext_end_tag_open = ->
 		c = txt.charAt(cur++)
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag = new_end_tag c.toLowerCase()
 			temporary_buffer += c
 			tok_state = tok_state_rawtext_end_tag_name
 			return null
-		if lc_alpha.indexOf(c) > -1
+		if is_lc_alpha(c)
 			tok_cur_tag = new_end_tag c
 			temporary_buffer += c
 			tok_state = tok_state_rawtext_end_tag_name
@@ -2471,11 +2487,11 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_data
 				return tok_cur_tag
 			# else fall through to "Anything else"
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			temporary_buffer += c
 			return null
-		if lc_alpha.indexOf(c) > -1
+		if is_lc_alpha(c)
 			tok_cur_tag.name += c
 			temporary_buffer += c
 			return null
@@ -2484,7 +2500,334 @@ parse_html = (txt, parse_error_cb = null) ->
 		cur -= 1 # reconsume the input character
 		return new_character_token '</' + temporary_buffer # fixfull separate these
 
-	# TODO _all_ of the missing states here (17-33) are for parsing script tags
+	# 8.2.4.17 Script data less-than sign state: http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
+	tok_state_script_data_less_than_sign = ->
+		switch c = txt.charAt(cur++)
+			when '/'
+				temporary_buffer = ''
+				tok_state = tok_state_script_data_end_tag_open
+				return
+			when '!'
+				tok_state = tok_state_script_data_escape_start
+				return new_character_token '<!' # fixfull split
+		# Anything else
+		tok_state = tok_state_script_data
+		cur -= 1 # Reconsume
+		return new_character_token '<'
+
+	# 8.2.4.18 Script data end tag open state: http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
+	tok_state_script_data_end_tag_open = ->
+		c = txt.charAt(cur++)
+		sd_end_tag_initial = null # first letter of the end tag name (uppercase input lowercased); stays null when c is not a letter
+		if is_uc_alpha(c)
+			sd_end_tag_initial = c.toLowerCase()
+		else if is_lc_alpha(c)
+			sd_end_tag_initial = c
+		if sd_end_tag_initial?
+			tok_cur_tag = new_end_tag sd_end_tag_initial
+			temporary_buffer += c # the buffer keeps the character exactly as written
+			tok_state = tok_state_script_data_end_tag_name
+			return
+		# Anything else
+		tok_state = tok_state_script_data
+		cur -= 1 # Reconsume
+		return new_character_token '</'
+
+	# 8.2.4.19 Script data end tag name state: http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
+	tok_state_script_data_end_tag_name = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
+			if is_appropriate_end_tag tok_cur_tag # otherwise fall through to "Anything else"
+				if c is '/'
+					tok_state = tok_state_self_closing_start_tag
+					return
+				if c is '>' # closes the end tag (e.g. "</script>"): emit the tag token
+					tok_state = tok_state_data
+					return tok_cur_tag
+				tok_state = tok_state_before_attribute_name # whitespace
+				return
+		if is_uc_alpha(c)
+			tok_cur_tag.name += c.toLowerCase()
+			temporary_buffer += c
+			return
+		if is_lc_alpha(c)
+			tok_cur_tag.name += c
+			temporary_buffer += c
+			return
+		# Anything else
+		tok_state = tok_state_script_data
+		cur -= 1 # Reconsume
+		return new_character_token "</#{temporary_buffer}" # fixfull split
+
+	# 8.2.4.20 Script data escape start state: http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
+	tok_state_script_data_escape_start = ->
+		c = txt.charAt(cur++)
+		unless c is '-'
+			# Anything else
+			tok_state = tok_state_script_data
+			cur -= 1 # Reconsume
+			return
+		tok_state = tok_state_script_data_escape_start_dash
+		return new_character_token '-'
+
+	# 8.2.4.21 Script data escape start dash state: http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
+	tok_state_script_data_escape_start_dash = ->
+		c = txt.charAt(cur++)
+		unless c is '-'
+			# Anything else
+			tok_state = tok_state_script_data
+			cur -= 1 # Reconsume
+			return
+		tok_state = tok_state_script_data_escaped_dash_dash
+		return new_character_token '-'
+
+	# 8.2.4.22 Script data escaped state: http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
+	tok_state_script_data_escaped = ->
+		switch c = txt.charAt(cur++)
+			when '-'
+				tok_state = tok_state_script_data_escaped_dash
+				return new_character_token '-'
+			when '<'
+				tok_state = tok_state_script_data_escaped_less_than_sign
+				return
+			when "\u0000"
+				parse_error()
+				return new_character_token "\ufffd"
+			when '' # EOF
+				tok_state = tok_state_data
+				parse_error()
+				cur -= 1 # Reconsume
+				return
+		# Anything else
+		return new_character_token c
+
+	# 8.2.4.23 Script data escaped dash state: http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
+	tok_state_script_data_escaped_dash = ->
+		switch c = txt.charAt(cur++)
+			when '-'
+				tok_state = tok_state_script_data_escaped_dash_dash
+				return new_character_token '-'
+			when '<'
+				tok_state = tok_state_script_data_escaped_less_than_sign
+				return
+			when "\u0000"
+				parse_error()
+				tok_state = tok_state_script_data_escaped
+				return new_character_token "\ufffd"
+			when '' # EOF
+				tok_state = tok_state_data
+				parse_error()
+				cur -= 1 # Reconsume
+				return
+		# Anything else
+		tok_state = tok_state_script_data_escaped
+		return new_character_token c
+
+	# 8.2.4.24 Script data escaped dash dash state: http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
+	tok_state_script_data_escaped_dash_dash = ->
+		switch c = txt.charAt(cur++)
+			when '-'
+				return new_character_token '-'
+			when '<'
+				tok_state = tok_state_script_data_escaped_less_than_sign
+				return
+			when '>'
+				tok_state = tok_state_script_data
+				return new_character_token '>'
+			when "\u0000"
+				parse_error()
+				tok_state = tok_state_script_data_escaped
+				return new_character_token "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+		# Anything else
+		tok_state = tok_state_script_data_escaped
+		return new_character_token c
+
+	# 8.2.4.25 Script data escaped less-than sign state: http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
+	tok_state_script_data_escaped_less_than_sign = ->
+		c = txt.charAt(cur++)
+		if c is '/'
+			temporary_buffer = ''
+			tok_state = tok_state_script_data_escaped_end_tag_open
+			return
+		if is_uc_alpha(c)
+			temporary_buffer = c.toLowerCase() # yes, really
+			tok_state = tok_state_script_data_double_escape_start
+			return new_character_token "<#{c}" # fixfull split
+		if is_lc_alpha(c)
+			temporary_buffer = c
+			tok_state = tok_state_script_data_double_escape_start
+			return new_character_token "<#{c}" # fixfull split
+		# Anything else
+		tok_state = tok_state_script_data_escaped
+		cur -= 1 # Reconsume
+		return new_character_token '<' # emit the pending '<'; c is reconsumed by the next state, so emitting c here would duplicate it
+
+	# 8.2.4.26 Script data escaped end tag open state: http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
+	tok_state_script_data_escaped_end_tag_open = ->
+		c = txt.charAt(cur++)
+		sde_end_tag_initial = null # first letter of the end tag name (uppercase input lowercased); stays null when c is not a letter
+		if is_uc_alpha(c)
+			sde_end_tag_initial = c.toLowerCase()
+		else if is_lc_alpha(c)
+			sde_end_tag_initial = c
+		if sde_end_tag_initial?
+			tok_cur_tag = new_end_tag sde_end_tag_initial
+			temporary_buffer += c # the buffer keeps the character exactly as written
+			tok_state = tok_state_script_data_escaped_end_tag_name
+			return
+		# Anything else
+		tok_state = tok_state_script_data_escaped
+		cur -= 1 # Reconsume
+		return new_character_token '</' # fixfull split
+
+	# 8.2.4.27 Script data escaped end tag name state: http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
+	tok_state_script_data_escaped_end_tag_name = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
+			if is_appropriate_end_tag tok_cur_tag # otherwise fall through to "Anything else"
+				if c is '/'
+					tok_state = tok_state_self_closing_start_tag
+					return
+				if c is '>' # closes the end tag (e.g. "</script>"): emit the tag token
+					tok_state = tok_state_data
+					return tok_cur_tag
+				tok_state = tok_state_before_attribute_name # whitespace
+				return
+		if is_uc_alpha(c)
+			tok_cur_tag.name += c.toLowerCase()
+			temporary_buffer += c # keep original case: the buffer is re-emitted as text below
+			return
+		if is_lc_alpha(c)
+			tok_cur_tag.name += c
+			temporary_buffer += c
+			return
+		# Anything else
+		tok_state = tok_state_script_data_escaped
+		cur -= 1 # Reconsume
+		return new_character_token "</#{temporary_buffer}" # fixfull split
+
+	# 8.2.4.28 Script data double escape start state: http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
+	tok_state_script_data_double_escape_start = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\u000a", "\u000c", ' ', '/', '>'
+				if temporary_buffer isnt 'script'
+					tok_state = tok_state_script_data_escaped
+				else
+					tok_state = tok_state_script_data_double_escaped
+				return new_character_token c
+		if is_uc_alpha(c)
+			temporary_buffer += c.toLowerCase() # yes, really lowercase
+			return new_character_token c
+		if is_lc_alpha(c)
+			temporary_buffer += c
+			return new_character_token c
+		# Anything else
+		tok_state = tok_state_script_data_escaped
+		cur -= 1 # Reconsume
+		return
+
+	# 8.2.4.29 Script data double escaped state: http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
+	tok_state_script_data_double_escaped = ->
+		switch c = txt.charAt(cur++)
+			when '-'
+				tok_state = tok_state_script_data_double_escaped_dash
+				return new_character_token '-'
+			when '<'
+				tok_state = tok_state_script_data_double_escaped_less_than_sign
+				return new_character_token '<'
+			when "\u0000"
+				parse_error()
+				return new_character_token "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+		# Anything else
+		return new_character_token c
+
+	# 8.2.4.30 Script data double escaped dash state: http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
+	tok_state_script_data_double_escaped_dash = ->
+		switch c = txt.charAt(cur++)
+			when '-'
+				tok_state = tok_state_script_data_double_escaped_dash_dash
+				return new_character_token '-'
+			when '<'
+				tok_state = tok_state_script_data_double_escaped_less_than_sign
+				return new_character_token '<'
+			when "\u0000"
+				parse_error()
+				tok_state = tok_state_script_data_double_escaped
+				return new_character_token "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+		# Anything else
+		tok_state = tok_state_script_data_double_escaped
+		return new_character_token c
+
+	# 8.2.4.31 Script data double escaped dash dash state: http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
+	tok_state_script_data_double_escaped_dash_dash = ->
+		switch c = txt.charAt(cur++)
+			when '-'
+				return new_character_token '-'
+			when '<'
+				tok_state = tok_state_script_data_double_escaped_less_than_sign
+				return new_character_token '<'
+			when '>'
+				tok_state = tok_state_script_data
+				return new_character_token '>'
+			when "\u0000"
+				parse_error()
+				tok_state = tok_state_script_data_double_escaped
+				return new_character_token "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+		# Anything else
+		tok_state = tok_state_script_data_double_escaped
+		return new_character_token c
+
+	# 8.2.4.32 Script data double escaped less-than sign state: http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
+	tok_state_script_data_double_escaped_less_than_sign = ->
+		c = txt.charAt(cur++)
+		unless c is '/'
+			# Anything else
+			tok_state = tok_state_script_data_double_escaped
+			cur -= 1 # Reconsume
+			return
+		temporary_buffer = ''
+		tok_state = tok_state_script_data_double_escape_end
+		return new_character_token '/'
+
+	# 8.2.4.33 Script data double escape end state: http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
+	tok_state_script_data_double_escape_end = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\u000a", "\u000c", ' ', '/', '>'
+				if temporary_buffer isnt 'script'
+					tok_state = tok_state_script_data_double_escaped
+				else
+					tok_state = tok_state_script_data_escaped
+				return new_character_token c
+		if is_uc_alpha(c)
+			temporary_buffer += c.toLowerCase() # yes, really lowercase
+			return new_character_token c
+		if is_lc_alpha(c)
+			temporary_buffer += c
+			return new_character_token c
+		# Anything else
+		tok_state = tok_state_script_data_double_escaped
+		cur -= 1 # Reconsume
+		return
 
 	# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 	tok_state_before_attribute_name = ->
@@ -2510,7 +2853,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				parse_error()
 				tok_state = tok_state_data
 			else
-				if uc_alpha.indexOf(c) > -1
+				if is_uc_alpha(c)
 					attr_name = c.toLowerCase()
 				else
 					attr_name = c
@@ -2543,7 +2886,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				parse_error()
 				tok_state = tok_state_data
 			else
-				if uc_alpha.indexOf(c) > -1
+				if is_uc_alpha(c)
 					tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
 				else
 					tok_cur_tag.attrs_a[0][0] += c
@@ -2563,7 +2906,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		if c is '>'
 			tok_state = tok_state_data
 			return
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
 			tok_state = tok_state_attribute_name
 			return
@@ -2691,6 +3034,24 @@ parse_html = (txt, parse_error_cb = null) ->
 				cur -= 1 # we didn't handle that char
 		return null
 
+	# 8.2.4.43 Self-closing start tag state: http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
+	tok_state_self_closing_start_tag = ->
+		switch c = txt.charAt(cur++)
+			when '>'
+				tok_cur_tag.flag 'self-closing'
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+		# Anything else
+		parse_error()
+		tok_state = tok_state_before_attribute_name
+		cur -= 1 # Reconsume
+		return
+
 	# 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
 	# WARNING: put a comment token in tok_cur_tag before setting this state
 	tok_state_bogus_comment = ->
@@ -2718,7 +3079,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			tok_state = tok_state_doctype
 			return
 		acn = adjusted_current_node()
-		if acn and acn.namespace isnt NS_HTML and text.substr(cur, 7) is '[CDATA['
+		if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
 			cur += 7
 			tok_state = tok_state_cdata_section
 			return
@@ -2881,7 +3242,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		c = txt.charAt(cur++)
 		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
 			return
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag = new_doctype_token c.toLowerCase()
 			tok_state = tok_state_doctype_name
 			return
@@ -2917,7 +3278,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		if c is '>'
 			tok_state = tok_state_data
 			return tok_cur_tag
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			return
 		if c is "\u0000"
@@ -3379,7 +3740,7 @@ parse_html = (txt, parse_error_cb = null) ->
 	pending_table_character_tokens = []
 	head_element_pointer = null
 	flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
-	context_element = null # FIXME initialize from args.fragment
+	context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
 
 	# tokenizer initialization
 	tok_state = tok_state_data
diff --git a/test.coffee b/test.coffee
index 9e82bdd..3098bbd 100644
--- a/test.coffee
+++ b/test.coffee
@@ -7390,6 +7390,7 @@ tests = [
 		expected: "| <frame>\n"
 	}, {
 		name: "tests_innerHTML_1.dat #85"
+		html: ""
 		fragment: "html"
 		expected: "| <head>\n| <body>\n"
 	}, {
@@ -7832,6 +7833,8 @@ test_parser = (args) ->
 		#	console.log str
 		console.log "FAILED: \"#{args.name}\""
 		console.log "      Input: #{args.html}"
+		if args.fragment?
+			console.log "   Fragment: #{args.fragment}"
 		console.log "    Correct: #{args.expected}"
 		console.log "     Output: #{serialized}"
 		if parse_errors.length > 0
@@ -7843,8 +7846,7 @@ test_parser = (args) ->
 		console.log "passed \"#{args.name}\""
 		test_results.passed += 1
 test_summary = ->
-	console.log "Tests passed: #{test_results.passed}"
-	console.log "Tests Failed: #{test_results.failed}"
+	console.log "Tests passed: #{test_results.passed}, Tests Failed: #{test_results.failed}"
 
 
 next_test = 0
@@ -7852,11 +7854,12 @@ run_tests_and_breathe = ->
 	start_time = new Date()
 	loop
 		if next_test >= tests.length
+			test_summary()
 			return
 		test_parser tests[next_test]
 		next_test += 1
 		now = new Date()
 		if now - start_time > 100 # miliseconds
-			setTimeout run_tests_and_breathe, 1
+			break
+	setTimeout run_tests_and_breathe, 1
 run_tests_and_breathe()
-test_summary()
-- 
1.7.10.4