X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=f64c734e13e7dab898248150aad813cacce83041;hb=adc7477c34f3a2aa480e7f2af5ea954d2421d000;hp=c6ed9a5769cdb683f41dbe747f3fd9b82a4033ba;hpb=ffc91832d8b2c91ddd4407cf4036b6fc0eeca928;p=peach-html5-editor.git

diff --git a/parse-html.coffee b/parse-html.coffee
index c6ed9a5..f64c734 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -47,6 +47,12 @@
 #   0: a "end of the list", "current node", "bottommost", "last"
 
 
+# Browser fallback: when no module.exports is available, create window.wheic
+# and alias it as module.exports. Note: to get this to run outside a browser,
+# you'll have to write a native implementation of decode_named_char_ref()
+unless module?.exports?
+	window.wheic = {}
+	module = exports: window.wheic # NOTE(review): this assignment makes `module` a file-local variable; confirm the check above still sees Node's module
 
 # Each node is an obect of the Node class. Here are the Node types:
 TYPE_TAG = 0 # name, {attributes}, [children]
@@ -158,8 +164,10 @@ new_element = (name) ->
 new_text_node = (txt) ->
 	return new Node TYPE_TEXT, text: txt
 new_character_token = new_text_node
-new_comment_node = (txt) ->
+new_comment_token = (txt) ->
 	return new Node TYPE_COMMENT, text: txt
+new_doctype_token = (name) -> # builds a TYPE_DOCTYPE node carrying the given name
+	new Node TYPE_DOCTYPE, {name}
 new_eof_token = ->
 	return new Node TYPE_EOF
 new_afe_marker = ->
@@ -173,6 +181,11 @@ digits = "0123456789"
 alnum = lc_alpha + uc_alpha + digits
 hex_chars = digits + "abcdefABCDEF"
 
+is_uc_alpha = (str) -> # true only for a single character that occurs in uc_alpha
+	str.length is 1 and uc_alpha.indexOf(str) isnt -1
+is_lc_alpha = (str) -> # true only for a single character that occurs in lc_alpha
+	str.length is 1 and lc_alpha.indexOf(str) isnt -1
+
 # some SVG elements have dashes in them
 tag_name_chars = alnum + "-"
 
@@ -183,6 +196,15 @@ is_space = (txt) ->
 is_space_tok = (t) ->
 	return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 
+# Is t a start tag whose "type" attribute equals "hidden" (any letter case)?
+is_input_hidden_tok = (t) ->
+	return false unless t.type is TYPE_START_TAG
+	# attrs_a is an array of [name, value] pairs, so iterate its values with
+	# "in"; "of" would yield the array indices and never find the attribute
+	for a in t.attrs_a
+		return a[1].toLowerCase() is 'hidden' if a[0] is 'type'
+	return false
+
 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 
@@ -367,6 +389,7 @@ parse_html = (txt, parse_error_cb = null) ->
 	pending_table_character_tokens = null
 	head_element_pointer = null
 	flag_fragment_parsing = null
+	context_element = null
 
 	stop_parsing = ->
 		flag_parsing = false
@@ -441,7 +464,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		for t in open_els
 			if t.name is tag_name and (namespace is null or namespace is t.namespace)
 				return true
-			if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
+			if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
 				return false
 		return false
 	# this checks for a particular element, not by name
@@ -490,9 +513,11 @@ parse_html = (txt, parse_error_cb = null) ->
 		return
 	clear_afe_to_marker = ->
 		loop
+			return unless afe.length > 0 # this happens in fragment case, ?spec error
 			el = afe.shift()
 			if el.type is TYPE_AFE_MARKER
 				return
+		return
 
 	# 8.2.3.1 ...
 	# http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
@@ -617,6 +642,14 @@ parse_html = (txt, parse_error_cb = null) ->
 			node = open_els[node_i]
 			# 19. Return to the step labeled loop.
 
+	# 8.2.3.2
+
+	# http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
+	adjusted_current_node = ->
+		# fragment parsing with a single open element: the context element stands in
+		return context_element if flag_fragment_parsing and open_els.length is 1
+		open_els[0] # otherwise the current node (index 0 of the open elements)
+
 	# http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 	# this implementation is structured (mostly) as described at the link above.
 	# capitalized comments are the "labels" described at the link above.
@@ -1001,7 +1034,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				# last template's template contents, after its last child (if
 				# any), and abort these substeps.
 				if last_template and (last_table is null or last_template_i < last_table_i)
-					target = template # fixfull should be it's contents
+					target = last_template # fixfull should be it's contents
 					target_i = target.children.length
 					break
 				# 4. If there is no last table, then let adjusted insertion
@@ -1140,12 +1173,12 @@ parse_html = (txt, parse_error_cb = null) ->
 		if is_space_tok t
 			return
 		if t.type is TYPE_COMMENT
-			# fixfull this is supposed to be "the last child of the document object"
+			# ?fixfull
 			doc.children.push t
 			return
 		if t.type is TYPE_DOCTYPE
+			# FIXME check identifiers, set quirks, etc
 			# fixfull
-			t.name = 'html'
 			doc.children.push t
 			insertion_mode = ins_mode_before_html
 			return
@@ -1167,6 +1200,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			return
 		if t.type is TYPE_START_TAG and t.name is 'html'
 			el = token_to_element t, NS_HTML, doc
+			doc.children.push el
 			open_els.unshift(el)
 			# fixfull (big paragraph in spec about manifest, fragment, urls, etc)
 			insertion_mode = ins_mode_before_head
@@ -1247,19 +1281,19 @@ parse_html = (txt, parse_error_cb = null) ->
 			# fixfull encoding stuff
 			return
 		if t.type is TYPE_START_TAG and t.name is 'title'
-			parse_generic_rcdata_element t
+			parse_generic_rcdata_text t
 			return
 		if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
 			parse_generic_raw_text t
 			return
 		if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
 			insert_html_element t
-			insertion_mode = in_head_noscript # FIXME implement
+			insertion_mode = ins_mode_in_head_noscript # FIXME implement
 			return
 		if t.type is TYPE_START_TAG and t.name is 'script'
 			ail = adjusted_insertion_location()
 			el = token_to_element t, NS_HTML, ail
-			el.flag_parser_inserted true # FIXME implement
+			el.flag 'parser-inserted', true # FIXME implement
 			# fixfull frament case
 			ail[0].children.splice ail[1], 0, el
 			open_els.unshift el
@@ -1300,12 +1334,12 @@ parse_html = (txt, parse_error_cb = null) ->
 			parse_error()
 			return
 		ins_mode_in_head_else t
-	
+
 	# 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
 	ins_mode_in_head_noscript = (t) ->
 		# FIXME ?fixfull
 		console.log "ins_mode_in_head_noscript unimplemented"
-	
+
 	# 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
 	ins_mode_after_head_else = (t) ->
 		body_tok = new_open_tag 'body'
@@ -1596,7 +1630,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					when 'style', 'script', 'template'
 						ins_mode_in_head t
 					when 'input'
-						if token_is_input_hidden t
+						if is_input_hidden_tok t
 							ins_mode_in_table_else t
 						else
 							parse_error()
@@ -1673,7 +1707,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					if el.name is 'caption'
 						break
 				clear_afe_to_marker()
-				insertion_mode = in_table
+				insertion_mode = ins_mode_in_table
 			else
 				parse_error()
 				# fragment case
@@ -1686,7 +1720,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					if el.name is 'caption'
 						break
 				clear_afe_to_marker()
-				insertion_mode = in_table
+				insertion_mode = ins_mode_in_table
 				insertion_mode t
 			# else fragment case
 			return
@@ -1717,7 +1751,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			return
 		if t.type is TYPE_END_TAG and t.name is 'colgroup'
 			if open_els[0].name is 'colgroup'
-				open_els[0].shift()
+				open_els.shift()
 				insertion_mode = ins_mode_in_table
 			else
 				parse_error()
@@ -2031,7 +2065,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		if t.type is TYPE_END_TAG
 			parse_error()
 			return
-		if t.type is EOF
+		if t.type is TYPE_EOF
 			unless template_tag_is_open()
 				stop_parsing()
 				return
@@ -2102,7 +2136,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			open_els.shift()
 			t.acknowledge_self_closing()
 			return
-		if t.type is TYPE_START TAG and t.name is 'noframes'
+		if t.type is TYPE_START_TAG and t.name is 'noframes'
 			ins_mode_in_head t
 			return
 		if t.type is TYPE_EOF
@@ -2270,12 +2304,13 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_end_tag_open
 			when '?'
 				parse_error()
+				tok_cur_tag = new_comment_token '?'
 				tok_state = tok_state_bogus_comment
 			else
-				if lc_alpha.indexOf(c) > -1
+				if is_lc_alpha(c)
 					tok_cur_tag = new_open_tag c
 					tok_state = tok_state_tag_name
-				else if uc_alpha.indexOf(c) > -1
+				else if is_uc_alpha(c)
 					tok_cur_tag = new_open_tag c.toLowerCase()
 					tok_state = tok_state_tag_name
 				else
@@ -2296,14 +2331,15 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_data
 				return new_text_node '</'
 			else
-				if uc_alpha.indexOf(c) > -1
+				if is_uc_alpha(c)
 					tok_cur_tag = new_end_tag c.toLowerCase()
 					tok_state = tok_state_tag_name
-				else if lc_alpha.indexOf(c) > -1
+				else if is_lc_alpha(c)
 					tok_cur_tag = new_end_tag c
 					tok_state = tok_state_tag_name
 				else
 					parse_error()
+					tok_cur_tag = new_comment_token '/'
 					tok_state = tok_state_bogus_comment
 		return null
 
@@ -2326,7 +2362,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				parse_error()
 				tok_state = tok_state_data
 			else
-				if uc_alpha.indexOf(c) > -1
+				if is_uc_alpha(c)
 					tok_cur_tag.name += c.toLowerCase()
 				else
 					tok_cur_tag.name += c
@@ -2347,12 +2383,12 @@ parse_html = (txt, parse_error_cb = null) ->
 	# 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
 	tok_state_rcdata_end_tag_open = ->
 		c = txt.charAt(cur++)
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag = new_end_tag c.toLowerCase()
 			temporary_buffer += c
 			tok_state = tok_state_rcdata_end_tag_name
 			return null
-		if lc_alpha.indexOf(c) > -1
+		if is_lc_alpha(c)
 			tok_cur_tag = new_end_tag c
 			temporary_buffer += c
 			tok_state = tok_state_rcdata_end_tag_name
@@ -2390,11 +2426,11 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_data
 				return tok_cur_tag
 			# else fall through to "Anything else"
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			temporary_buffer += c
 			return null
-		if lc_alpha.indexOf(c) > -1
+		if is_lc_alpha(c)
 			tok_cur_tag.name += c
 			temporary_buffer += c
 			return null
@@ -2418,12 +2454,12 @@ parse_html = (txt, parse_error_cb = null) ->
 	# 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
 	tok_state_rawtext_end_tag_open = ->
 		c = txt.charAt(cur++)
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag = new_end_tag c.toLowerCase()
 			temporary_buffer += c
 			tok_state = tok_state_rawtext_end_tag_name
 			return null
-		if lc_alpha.indexOf(c) > -1
+		if is_lc_alpha(c)
 			tok_cur_tag = new_end_tag c
 			temporary_buffer += c
 			tok_state = tok_state_rawtext_end_tag_name
@@ -2451,11 +2487,11 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_data
 				return tok_cur_tag
 			# else fall through to "Anything else"
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			temporary_buffer += c
 			return null
-		if lc_alpha.indexOf(c) > -1
+		if is_lc_alpha(c)
 			tok_cur_tag.name += c
 			temporary_buffer += c
 			return null
@@ -2464,7 +2500,334 @@ parse_html = (txt, parse_error_cb = null) ->
 		cur -= 1 # reconsume the input character
 		return new_character_token '</' + temporary_buffer # fixfull separate these
 
-	# TODO _all_ of the missing states here (17-33) are for parsing script tags
+	# 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
+	tok_state_script_data_less_than_sign = ->
+		switch c = txt.charAt(cur++)
+			when '/' # possible end tag: start collecting its name in a fresh buffer
+				temporary_buffer = ''
+				tok_state = tok_state_script_data_end_tag_open
+				return
+			when '!' # "<!" is emitted as text; what follows may open an escaped section
+				tok_state = tok_state_script_data_escape_start
+				return new_character_token '<!' # fixfull split
+			else # Anything else: emit the "<" and hand the character back
+				tok_state = tok_state_script_data
+				cur -= 1 # Reconsume
+				return new_character_token '<'
+
+	# 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
+	tok_state_script_data_end_tag_open = ->
+		c = txt.charAt(cur++)
+		unless is_uc_alpha(c) or is_lc_alpha(c)
+			# Anything else: "</" did not start an end tag, so emit it as text
+			# and let the script data state process this character again
+			tok_state = tok_state_script_data
+			cur -= 1 # Reconsume
+			return new_character_token '</'
+		# A letter: start an end tag token. The tag name is stored in
+		# lowercase (which changes nothing for a letter that already is
+		# lowercase), while the temporary buffer records the character
+		# exactly as it was written.
+		tok_cur_tag = new_end_tag c.toLowerCase()
+		temporary_buffer += c
+		tok_state = tok_state_script_data_end_tag_name
+		return
+
+	# 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
+	# Reads the name of a possible end tag found inside script data.
+	tok_state_script_data_end_tag_name = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_before_attribute_name
+				return
+		if c is '/'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_self_closing_start_tag
+				return
+		if c is '>'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_data
+				return tok_cur_tag # the end tag is complete: emit it
+			# not an appropriate end tag: fall through to "Anything else"
+		if is_uc_alpha(c) or is_lc_alpha(c)
+			tok_cur_tag.name += c.toLowerCase() # no-op for lowercase letters
+			temporary_buffer += c # kept as written; re-emitted below if needed
+			return
+		# Anything else: this was not an end tag, so emit the consumed text
+		tok_state = tok_state_script_data
+		cur -= 1 # Reconsume
+		return new_character_token "</#{temporary_buffer}" # fixfull split
+
+	# 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
+	tok_state_script_data_escape_start = ->
+		unless (c = txt.charAt(cur++)) is '-'
+			# Anything else: no escape sequence here, back to plain script data
+			tok_state = tok_state_script_data
+			cur -= 1 # Reconsume
+			return
+		# first "-": emit it and check whether a second one follows
+		tok_state = tok_state_script_data_escape_start_dash
+		return new_character_token '-'
+
+	# 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
+	tok_state_script_data_escape_start_dash = ->
+		unless (c = txt.charAt(cur++)) is '-'
+			# Anything else: only one "-" was found, back to plain script data
+			tok_state = tok_state_script_data
+			cur -= 1 # Reconsume
+			return
+		# second "-": emit it and continue in the escaped dash-dash state
+		tok_state = tok_state_script_data_escaped_dash_dash
+		return new_character_token '-'
+
+	# 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
+	tok_state_script_data_escaped = ->
+		switch c = txt.charAt(cur++)
+			when '-' # emitted as text, but remembered via the dash state
+				tok_state = tok_state_script_data_escaped_dash
+				return new_character_token '-'
+			when '<' # nothing is emitted until the next character is known
+				tok_state = tok_state_script_data_escaped_less_than_sign
+				return
+			when "\u0000" # NUL is replaced by U+FFFD
+				parse_error()
+				return new_character_token "\ufffd"
+			when '' # EOF
+				tok_state = tok_state_data
+				parse_error()
+				cur -= 1 # Reconsume
+				return
+			else # Anything else is emitted unchanged
+				return new_character_token c
+
+	# 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
+	tok_state_script_data_escaped_dash = ->
+		switch c = txt.charAt(cur++)
+			when '-' # second "-" in a row
+				tok_state = tok_state_script_data_escaped_dash_dash
+				return new_character_token '-'
+			when '<' # nothing is emitted until the next character is known
+				tok_state = tok_state_script_data_escaped_less_than_sign
+				return
+			when "\u0000" # NUL is replaced by U+FFFD
+				parse_error()
+				tok_state = tok_state_script_data_escaped
+				return new_character_token "\ufffd"
+			when '' # EOF
+				tok_state = tok_state_data
+				parse_error()
+				cur -= 1 # Reconsume
+				return
+			else # Anything else: emit it and drop back to the escaped state
+				tok_state = tok_state_script_data_escaped
+				return new_character_token c
+
+	# 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
+	tok_state_script_data_escaped_dash_dash = ->
+		switch c = txt.charAt(cur++)
+			when '-' # further dashes are emitted without leaving this state
+				return new_character_token '-'
+			when '<' # nothing is emitted until the next character is known
+				tok_state = tok_state_script_data_escaped_less_than_sign
+				return
+			when '>' # "-->" seen: back to plain script data
+				tok_state = tok_state_script_data
+				return new_character_token '>'
+			when "\u0000" # NUL is replaced by U+FFFD
+				parse_error()
+				tok_state = tok_state_script_data_escaped
+				return new_character_token "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+			else # Anything else: emit it and drop back to the escaped state
+				tok_state = tok_state_script_data_escaped
+				return new_character_token c
+
+	# 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
+	# Handles the character that follows a "<" seen inside escaped script data.
+	tok_state_script_data_escaped_less_than_sign = ->
+		c = txt.charAt(cur++)
+		if c is '/'
+			temporary_buffer = ''
+			tok_state = tok_state_script_data_escaped_end_tag_open
+			return
+		if is_uc_alpha(c) or is_lc_alpha(c)
+			# the buffer gets the lowercased letter (it is later compared with
+			# 'script'), while the emitted text keeps the letter as written
+			temporary_buffer = c.toLowerCase()
+			tok_state = tok_state_script_data_double_escape_start
+			return new_character_token "<#{c}" # fixfull split
+		# Anything else: the "<" was ordinary text. Emit the "<" itself, not c:
+		# c is reconsumed and will be handled by the escaped state
+		tok_state = tok_state_script_data_escaped
+		cur -= 1 # Reconsume
+		return new_character_token '<'
+
+	# 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
+	tok_state_script_data_escaped_end_tag_open = ->
+		c = txt.charAt(cur++)
+		unless is_uc_alpha(c) or is_lc_alpha(c)
+			# Anything else: "</" did not start an end tag, so emit it as text
+			# and let the escaped state process this character again
+			tok_state = tok_state_script_data_escaped
+			cur -= 1 # Reconsume
+			return new_character_token '</' # fixfull split
+		# A letter: start an end tag token. The tag name is stored in
+		# lowercase (which changes nothing for a letter that already is
+		# lowercase), while the temporary buffer records the character
+		# exactly as it was written.
+		tok_cur_tag = new_end_tag c.toLowerCase()
+		temporary_buffer += c
+		tok_state = tok_state_script_data_escaped_end_tag_name
+		return
+
+	# 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
+	# Reads the name of a possible end tag found inside escaped script data.
+	tok_state_script_data_escaped_end_tag_name = ->
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_before_attribute_name
+				return
+		if c is '/'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_self_closing_start_tag
+				return
+		if c is '>'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_data
+				return tok_cur_tag # the end tag is complete: emit it
+			# not an appropriate end tag: fall through to "Anything else"
+		if is_uc_alpha(c) or is_lc_alpha(c)
+			tok_cur_tag.name += c.toLowerCase() # no-op for lowercase letters
+			temporary_buffer += c # as written, so re-emitted text keeps its case
+			return
+		# Anything else: this was not an end tag, so emit the consumed text
+		tok_state = tok_state_script_data_escaped
+		cur -= 1 # Reconsume
+		return new_character_token "</#{temporary_buffer}" # fixfull split
+
+	# 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
+	tok_state_script_data_double_escape_start = ->
+		c = txt.charAt(cur++)
+		if c in ["\t", "\u000a", "\u000c", ' ', '/', '>']
+			# the buffered name is complete; only "script" double-escapes
+			tok_state = if temporary_buffer is 'script'
+				tok_state_script_data_double_escaped
+			else
+				tok_state_script_data_escaped
+			return new_character_token c
+		if is_uc_alpha(c) or is_lc_alpha(c)
+			# emitted as written but buffered in lowercase (yes, really);
+			# lowercasing changes nothing for a lowercase letter
+			temporary_buffer += c.toLowerCase()
+			return new_character_token c
+		# Anything else
+		tok_state = tok_state_script_data_escaped
+		cur -= 1 # Reconsume
+		return
+
+	# 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
+	tok_state_script_data_double_escaped = ->
+		switch c = txt.charAt(cur++)
+			when '-' # emitted as text, but remembered via the dash state
+				tok_state = tok_state_script_data_double_escaped_dash
+				return new_character_token '-'
+			when '<' # emitted right away, unlike in the single-escaped state
+				tok_state = tok_state_script_data_double_escaped_less_than_sign
+				return new_character_token '<'
+			when "\u0000" # NUL is replaced by U+FFFD
+				parse_error()
+				return new_character_token "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+			else # Anything else is emitted unchanged
+				return new_character_token c
+
+	# 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
+	tok_state_script_data_double_escaped_dash = ->
+		switch c = txt.charAt(cur++)
+			when '-' # second "-" in a row
+				tok_state = tok_state_script_data_double_escaped_dash_dash
+				return new_character_token '-'
+			when '<'
+				tok_state = tok_state_script_data_double_escaped_less_than_sign
+				return new_character_token '<'
+			when "\u0000" # NUL is replaced by U+FFFD
+				parse_error()
+				tok_state = tok_state_script_data_double_escaped
+				return new_character_token "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+			else # Anything else: emit it and drop back to the double escaped state
+				tok_state = tok_state_script_data_double_escaped
+				return new_character_token c
+
+	# 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
+	tok_state_script_data_double_escaped_dash_dash = ->
+		switch c = txt.charAt(cur++)
+			when '-' # further dashes are emitted without leaving this state
+				return new_character_token '-'
+			when '<'
+				tok_state = tok_state_script_data_double_escaped_less_than_sign
+				return new_character_token '<'
+			when '>' # "-->" seen: back to plain script data
+				tok_state = tok_state_script_data
+				return new_character_token '>'
+			when "\u0000" # NUL is replaced by U+FFFD
+				parse_error()
+				tok_state = tok_state_script_data_double_escaped
+				return new_character_token "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return
+			else # Anything else: emit it and drop back to the double escaped state
+				tok_state = tok_state_script_data_double_escaped
+				return new_character_token c
+
+	# 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
+	tok_state_script_data_double_escaped_less_than_sign = ->
+		unless (c = txt.charAt(cur++)) is '/'
+			# Anything else: stay double-escaped and process the character again
+			tok_state = tok_state_script_data_double_escaped
+			cur -= 1 # Reconsume
+			return
+		# "</": emit the "/" and start collecting the name that follows it
+		temporary_buffer = ''
+		tok_state = tok_state_script_data_double_escape_end
+		return new_character_token '/'
+
+	# 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
+	tok_state_script_data_double_escape_end = ->
+		c = txt.charAt(cur++)
+		if c in ["\t", "\u000a", "\u000c", ' ', '/', '>']
+			# the buffered name is complete; only "script" ends double escaping
+			tok_state = if temporary_buffer is 'script'
+				tok_state_script_data_escaped
+			else
+				tok_state_script_data_double_escaped
+			return new_character_token c
+		if is_uc_alpha(c) or is_lc_alpha(c)
+			# emitted as written but buffered in lowercase (yes, really);
+			# lowercasing changes nothing for a lowercase letter
+			temporary_buffer += c.toLowerCase()
+			return new_character_token c
+		# Anything else
+		tok_state = tok_state_script_data_double_escaped
+		cur -= 1 # Reconsume
+		return
 
 	# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 	tok_state_before_attribute_name = ->
@@ -2490,7 +2853,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				parse_error()
 				tok_state = tok_state_data
 			else
-				if uc_alpha.indexOf(c) > -1
+				if is_uc_alpha(c)
 					attr_name = c.toLowerCase()
 				else
 					attr_name = c
@@ -2523,7 +2886,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				parse_error()
 				tok_state = tok_state_data
 			else
-				if uc_alpha.indexOf(c) > -1
+				if is_uc_alpha(c)
 					tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
 				else
 					tok_cur_tag.attrs_a[0][0] += c
@@ -2543,7 +2906,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		if c is '>'
 			tok_state = tok_state_data
 			return
-		if uc_alpha.indexOf(c) > -1
+		if is_uc_alpha(c)
 			tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
 			tok_state = tok_state_attribute_name
 			return
@@ -2671,6 +3034,619 @@ parse_html = (txt, parse_error_cb = null) ->
 				cur -= 1 # we didn't handle that char
 		return null
 
+	# 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
+	tok_state_self_closing_start_tag = -> # decides whether a "/" inside a tag really closes it
+		c = txt.charAt(cur++)
+		if c is '>'
+			tok_cur_tag.flag 'self-closing', true # explicit value, like every other flag-setting call
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			cur -= 1 # Reconsume
+			return
+		# Anything else: the "/" was stray; carry on reading attributes
+		parse_error()
+		tok_state = tok_state_before_attribute_name
+		cur -= 1 # Reconsume
+		return
+
+	# 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
+	# Consumes input through the next ">" (or to the end) and emits it as a comment.
+	# WARNING: put a comment token in tok_cur_tag before setting this state
+	tok_state_bogus_comment = ->
+		next_gt = txt.indexOf '>', cur
+		if next_gt is -1
+			val = txt.substr cur # no ">" left: the comment runs to the end of input
+			cur = txt.length
+		else
+			val = txt.substr cur, (next_gt - cur)
+			cur = next_gt + 1 # step over the ">" itself
+		tok_cur_tag.text += val.replace(/\u0000/g, "\ufffd") # global regex: replace every NUL, not just the first
+		tok_state = tok_state_data
+		return tok_cur_tag
+
+	# 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
+	tok_state_markup_declaration_open = -> # picks between comment, doctype, CDATA section and bogus comment
+		if txt.substr(cur, 2) is '--'
+			cur += 2
+			tok_cur_tag = new_comment_token ''
+			tok_state = tok_state_comment_start
+			return
+		if txt.substr(cur, 7).toLowerCase() is 'doctype' # the keyword matches in any letter case
+			cur += 7
+			tok_state = tok_state_doctype
+			return
+		acn = adjusted_current_node()
+		if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA[' # only in foreign content
+			cur += 7
+			tok_state = tok_state_cdata_section
+			return
+		# Otherwise: a bogus comment whose text starts with the next character
+		parse_error()
+		tok_cur_tag = new_comment_token '' # the "!" is not part of the comment text
+		tok_state = tok_state_bogus_comment
+		return
+
+	# 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
+	# Handles the first character after "<!--". Every branch that does not end
+	# the comment must also leave this state.
+	tok_state_comment_start = ->
+		switch c = txt.charAt(cur++)
+			when '-'
+				tok_state = tok_state_comment_start_dash
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.text += "\ufffd" # goes into the comment, not out as text
+				tok_state = tok_state_comment
+			when '>', '' # comment cut short by ">" or by EOF ('')
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 if c is '' # Reconsume the EOF
+				return tok_cur_tag
+			else
+				tok_cur_tag.text += c
+				tok_state = tok_state_comment
+		return null
+
+	# 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
+	tok_state_comment_start_dash = ->
+		c = txt.charAt(cur++)
+		if c is '-'
+			tok_state = tok_state_comment_end
+		else if c is "\u0000" # the pending "-" is kept, the NUL becomes U+FFFD
+			parse_error()
+			tok_cur_tag.text += "-\ufffd"
+			tok_state = tok_state_comment
+		else if c is '>' # comment closed early: emit it as it stands
+			parse_error()
+			tok_state = tok_state_data
+			return tok_cur_tag
+		else if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		else # the pending "-" turned out to be ordinary comment text
+			tok_cur_tag.text += "-#{c}"
+			tok_state = tok_state_comment
+		return null
+
+	# 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
+	tok_state_comment = ->
+		c = txt.charAt(cur++)
+		if c is '-' # held back: it may be the start of the closing "-->"
+			tok_state = tok_state_comment_end_dash
+		else if c is "\u0000" # NUL is stored as U+FFFD
+			parse_error()
+			tok_cur_tag.text += "\ufffd"
+		else if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		else # ordinary comment text
+			tok_cur_tag.text += c
+		return null
+
+	# 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
+	tok_state_comment_end_dash = ->
+		c = txt.charAt(cur++)
+		if c is '-' # "--" seen: the comment may be about to end
+			tok_state = tok_state_comment_end
+		else if c is "\u0000" # the pending "-" is kept, the NUL becomes U+FFFD
+			parse_error()
+			tok_cur_tag.text += "-\ufffd"
+			tok_state = tok_state_comment
+		else if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		else # the pending "-" turned out to be ordinary comment text
+			tok_cur_tag.text += "-#{c}"
+			tok_state = tok_state_comment
+		return null
+
+	# 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
+	tok_state_comment_end = ->
+		c = txt.charAt(cur++)
+		if c is '>' # "-->" completes the comment
+			tok_state = tok_state_data
+			return tok_cur_tag
+		else if c is "\u0000" # the pending "--" is kept, the NUL becomes U+FFFD
+			parse_error()
+			tok_cur_tag.text += "--\ufffd"
+			tok_state = tok_state_comment
+		else if c is '!' # "--!" may still end the comment
+			parse_error()
+			tok_state = tok_state_comment_end_bang
+		else if c is '-' # a third "-": one dash becomes text, two stay pending
+			parse_error()
+			tok_cur_tag.text += '-'
+		else if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		else # the pending "--" turned out to be ordinary comment text
+			parse_error()
+			tok_cur_tag.text += "--#{c}"
+			tok_state = tok_state_comment
+		return null
+
+	# 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
+	tok_state_comment_end_bang = -> # handles the character after "--!" inside a comment
+		switch c = txt.charAt(cur++)
+			when '-'
+				tok_cur_tag.text += "--!" # this "-" is not added: the end dash state accounts for it
+				tok_state = tok_state_comment_end_dash
+			when '>' # "--!>" closes the comment
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when "\u0000" # the pending "--!" is kept, the NUL becomes U+FFFD
+				parse_error()
+				tok_cur_tag.text += "--!\ufffd"
+				tok_state = tok_state_comment
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+			else # the pending "--!" turned out to be ordinary comment text
+				tok_cur_tag.text += "--!#{c}"
+				tok_state = tok_state_comment
+		return null
+
+	# 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
+	tok_state_doctype = ->
+		c = txt.charAt(cur++)
+		if c in ["\t", "\u000a", "\u000c", ' ']
+			tok_state = tok_state_before_doctype_name
+		else if c is '' # EOF: emit an empty, force-quirks doctype
+			parse_error()
+			tok_state = tok_state_data
+			el = new_doctype_token ''
+			el.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return el
+		else # missing whitespace: an error, but carry on with the name
+			parse_error()
+			tok_state = tok_state_before_doctype_name
+			cur -= 1 # Reconsume
+		return null
+
+	# 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
+	tok_state_before_doctype_name = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\u000a", "\u000c", ' ' # leading whitespace is skipped
+				return
+			when "\u0000" # a NUL starts the name as U+FFFD
+				parse_error()
+				tok_cur_tag = new_doctype_token "\ufffd"
+				tok_state = tok_state_doctype_name
+				return
+			when '>' # no name at all: emit an empty, force-quirks doctype
+				parse_error()
+				el = new_doctype_token ''
+				el.flag 'force-quirks', true
+				tok_state = tok_state_data
+				return el
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				el = new_doctype_token ''
+				el.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return el
+			else
+				if is_uc_alpha(c) # the name is stored in lowercase
+					tok_cur_tag = new_doctype_token c.toLowerCase()
+					tok_state = tok_state_doctype_name
+					return
+				tok_cur_tag = new_doctype_token c # Anything else starts the name as is
+				tok_state = tok_state_doctype_name
+				return null
+
+	# 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
+	tok_state_doctype_name = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\u000a", "\u000c", ' ' # whitespace ends the name
+				tok_state = tok_state_after_doctype_name
+				return
+			when '>' # the doctype is complete
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when "\u0000" # NUL is stored as U+FFFD
+				parse_error()
+				tok_cur_tag.name += "\ufffd"
+				return
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+			else
+				if is_uc_alpha(c) # the name is stored in lowercase
+					tok_cur_tag.name += c.toLowerCase()
+					return
+				tok_cur_tag.name += c # Anything else is appended as is
+				return null
+
+	# 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
+	tok_state_after_doctype_name = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\u000a", "\u000c", ' ' # whitespace is skipped
+				return
+			when '>' # the doctype is complete
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+			else # Anything else: c was already consumed, so the keyword starts at cur - 1
+				if txt.substr(cur - 1, 6).toLowerCase() is 'public'
+					cur += 5
+					tok_state = tok_state_after_doctype_public_keyword
+					return
+				if txt.substr(cur - 1, 6).toLowerCase() is 'system'
+					cur += 5
+					tok_state = tok_state_after_doctype_system_keyword
+					return
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_bogus_doctype
+				return null
+
+	# 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
+	tok_state_after_doctype_public_keyword = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\u000a", "\u000c", ' '
+				tok_state = tok_state_before_doctype_public_identifier
+				return
+			when '"' # a quote directly after the keyword is an error, but still parsed
+				parse_error()
+				tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+				tok_state = tok_state_doctype_public_identifier_double_quoted
+				return
+			when "'"
+				parse_error()
+				tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+				tok_state = tok_state_doctype_public_identifier_single_quoted
+				return
+			when '>' # doctype ended without an identifier
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+			else # Anything else
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_bogus_doctype
+				return null
+
+	# 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
+	tok_state_before_doctype_public_identifier = ->
+		# Entered once whitespace has followed the PUBLIC keyword. Skips further
+		# whitespace and expects the opening quote of the public identifier.
+		# Returns tok_cur_tag when the doctype is emitted ('>' or EOF, both
+		# flagged force-quirks); returns nothing (or null) otherwise.
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			return
+		if c is '"'
+			# Not a parse error in this state: the quote is correctly separated
+			# from the keyword (only the after-keyword state reports one).
+			tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+			tok_state = tok_state_doctype_public_identifier_double_quoted
+			return
+		if c is "'"
+			# Not a parse error in this state (see the double-quote case)
+			tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+			tok_state = tok_state_doctype_public_identifier_single_quoted
+			return
+		if c is '>'
+			parse_error()
+			tok_cur_tag.flag 'force-quirks', true
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+
+	# 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
+	tok_state_doctype_public_identifier_double_quoted = ->
+		# Accumulates tok_cur_tag.public_identifier one character per call
+		# until the closing double quote. NUL is a parse error and is stored as
+		# U+FFFD. '>' and EOF are parse errors that return tok_cur_tag flagged
+		# force-quirks.
+		c = txt.charAt(cur++)
+		switch c
+			when '"'
+				tok_state = tok_state_after_doctype_public_identifier
+				return
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.public_identifier += "\ufffd"
+				return
+			when '>'
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+		# Anything else
+		tok_cur_tag.public_identifier += c
+		return null
+
+	# 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
+	tok_state_doctype_public_identifier_single_quoted = ->
+		# Accumulates tok_cur_tag.public_identifier one character per call
+		# until the closing single quote. NUL is a parse error and is stored as
+		# U+FFFD. '>' and EOF are parse errors that return tok_cur_tag flagged
+		# force-quirks.
+		c = txt.charAt(cur++)
+		switch c
+			when "'"
+				tok_state = tok_state_after_doctype_public_identifier
+				return
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.public_identifier += "\ufffd"
+				return
+			when '>'
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+		# Anything else
+		tok_cur_tag.public_identifier += c
+		return null
+
+	# 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
+	tok_state_after_doctype_public_identifier = ->
+		# Consumes the character right after the closing quote of the public
+		# identifier. Whitespace moves to the between-identifiers state and '>'
+		# returns tok_cur_tag. A quote with no separating whitespace is a parse
+		# error but still starts the system identifier. EOF returns tok_cur_tag
+		# flagged force-quirks. Anything else goes to the bogus doctype state.
+		c = txt.charAt(cur++)
+		switch c
+			when "\t", "\u000a", "\u000c", ' '
+				tok_state = tok_state_between_doctype_public_and_system_identifiers
+				return
+			when '>'
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '"', "'"
+				parse_error()
+				tok_cur_tag.system_identifier = ''
+				if c is '"'
+					tok_state = tok_state_doctype_system_identifier_double_quoted
+				else
+					tok_state = tok_state_doctype_system_identifier_single_quoted
+				return
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
+	tok_state_between_doctype_public_and_system_identifiers = ->
+		# Entered once whitespace has followed the public identifier. Skips
+		# further whitespace, then either ends the doctype ('>') or expects the
+		# opening quote of the system identifier. Returns tok_cur_tag when the
+		# doctype is emitted ('>' or EOF; EOF also flags force-quirks); returns
+		# nothing (or null) otherwise.
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+			return
+		if c is '>'
+			tok_state = tok_state_data
+			return tok_cur_tag
+		if c is '"'
+			# Not a parse error in this state: whitespace already separates the
+			# two identifiers (only the after-public-identifier state reports one).
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_double_quoted
+			return
+		if c is "'"
+			# Not a parse error in this state (see the double-quote case)
+			tok_cur_tag.system_identifier = ''
+			tok_state = tok_state_doctype_system_identifier_single_quoted
+			return
+		if c is '' # EOF
+			parse_error()
+			tok_state = tok_state_data
+			tok_cur_tag.flag 'force-quirks', true
+			cur -= 1 # Reconsume
+			return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
+	tok_state_after_doctype_system_keyword = ->
+		# Consumes the character right after the SYSTEM keyword. Whitespace
+		# leads to the before-system-identifier state. A quote directly after
+		# the keyword is a parse error but still starts the identifier. '>' and
+		# EOF return tok_cur_tag flagged force-quirks. Anything else goes to the
+		# bogus doctype state.
+		c = txt.charAt(cur++)
+		switch c
+			when "\t", "\u000a", "\u000c", ' '
+				tok_state = tok_state_before_doctype_system_identifier
+				return
+			when '"', "'"
+				parse_error()
+				tok_cur_tag.system_identifier = ''
+				if c is '"'
+					tok_state = tok_state_doctype_system_identifier_double_quoted
+				else
+					tok_state = tok_state_doctype_system_identifier_single_quoted
+				return
+			when '>'
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
+	tok_state_before_doctype_system_identifier = ->
+		# Skips whitespace, then expects the opening quote of the system
+		# identifier (no parse error for the quote here). '>' and EOF are parse
+		# errors that return tok_cur_tag flagged force-quirks. Anything else
+		# goes to the bogus doctype state.
+		c = txt.charAt(cur++)
+		switch c
+			when "\t", "\u000a", "\u000c", ' '
+				return
+			when '"', "'"
+				tok_cur_tag.system_identifier = ''
+				if c is '"'
+					tok_state = tok_state_doctype_system_identifier_double_quoted
+				else
+					tok_state = tok_state_doctype_system_identifier_single_quoted
+				return
+			when '>'
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+		# Anything else
+		parse_error()
+		tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
+	tok_state_doctype_system_identifier_double_quoted = ->
+		# Accumulates tok_cur_tag.system_identifier one character per call
+		# until the closing double quote. NUL is a parse error and is stored as
+		# U+FFFD. '>' and EOF are parse errors that return tok_cur_tag flagged
+		# force-quirks.
+		c = txt.charAt(cur++)
+		switch c
+			when '"'
+				tok_state = tok_state_after_doctype_system_identifier
+				return
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.system_identifier += "\ufffd"
+				return
+			when '>'
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+		# Anything else
+		tok_cur_tag.system_identifier += c
+		return null
+
+	# 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
+	tok_state_doctype_system_identifier_single_quoted = ->
+		# Accumulates tok_cur_tag.system_identifier one character per call
+		# until the closing single quote. NUL is a parse error and is stored as
+		# U+FFFD. '>' and EOF are parse errors that return tok_cur_tag flagged
+		# force-quirks.
+		c = txt.charAt(cur++)
+		switch c
+			when "'"
+				tok_state = tok_state_after_doctype_system_identifier
+				return
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.system_identifier += "\ufffd"
+				return
+			when '>'
+				parse_error()
+				tok_cur_tag.flag 'force-quirks', true
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+		# Anything else
+		tok_cur_tag.system_identifier += c
+		return null
+
+	# 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
+	tok_state_after_doctype_system_identifier = ->
+		# Consumes characters after the closing quote of the system identifier.
+		# Whitespace is skipped and '>' returns tok_cur_tag. EOF is a parse
+		# error that returns tok_cur_tag flagged force-quirks. Any other
+		# character is a parse error that moves to the bogus doctype state.
+		c = txt.charAt(cur++)
+		switch c
+			when "\t", "\u000a", "\u000c", ' '
+				return
+			when '>'
+				tok_state = tok_state_data
+				return tok_cur_tag
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				tok_cur_tag.flag 'force-quirks', true
+				cur -= 1 # Reconsume
+				return tok_cur_tag
+		# Anything else
+		parse_error()
+		# do _not_ tok_cur_tag.flag 'force-quirks', true
+		tok_state = tok_state_bogus_doctype
+		return null
+
+	# 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
+	tok_state_bogus_doctype = ->
+		# Discards characters until '>' or EOF, then switches back to the data
+		# state and returns tok_cur_tag. No parse error is reported here.
+		c = txt.charAt(cur++)
+		# Anything else: keep skipping
+		unless c is '>' or c is ''
+			return null
+		tok_state = tok_state_data
+		if c is '' # EOF
+			cur -= 1 # Reconsume
+		return tok_cur_tag
+
+
 	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 	# Don't set this as a state, just call it
 	# returns a string (NOT a text node)
@@ -2750,7 +3726,7 @@ parse_html = (txt, parse_error_cb = null) ->
 	# tree constructor initialization
 	# see comments on TYPE_TAG/etc for the structure of this data
 	doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
-	open_els = [doc]
+	open_els = []
 	afe = [] # active formatting elements
 	template_insertion_modes = []
 	insertion_mode = ins_mode_initial
@@ -2764,6 +3740,7 @@ parse_html = (txt, parse_error_cb = null) ->
 	pending_table_character_tokens = []
 	head_element_pointer = null
 	flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
+	context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
 
 	# tokenizer initialization
 	tok_state = tok_state_data
@@ -2776,15 +3753,6 @@ parse_html = (txt, parse_error_cb = null) ->
 			# fixfull parse error if has self-closing flag, but it wasn't acknolwedged
 	return doc.children
 
-test_results = passed: 0, failed: 0
-# everything below is tests on the above
-test_equals = (description, output, expected_output) ->
-	if output is expected_output
-		console.log "passed." # don't say name, so smart consoles can merge all of these
-	else
-		console.log "FAILED: \"#{description}\""
-		console.log "   Expected: #{expected_output}"
-		console.log "     Actual: #{output}"
 serialize_els = (els, shallow, show_ids) ->
 	serialized = ''
 	sep = ''
@@ -2793,205 +3761,12 @@ serialize_els = (els, shallow, show_ids) ->
 		sep = ','
 		serialized += t.serialize shallow, show_ids
 	return serialized
-test_parser = (args) ->
-	debug_log_reset()
-	parse_errors = []
-	errors_cb = (i) ->
-		parse_errors.push i
-	prev_node_id = 0 # reset counter
-	parsed = parse_html args.html, errors_cb
-	serialized = serialize_els parsed, false, false
-	expected = 'tag:"html",{},[tag:"head",{},[],tag:"body",{},[' + args.expected + ']]'
-	if serialized isnt expected
-		debug_log_each (str) ->
-			console.log str
-		console.log "FAILED: \"#{args.name}\""
-		console.log "      Input: #{args.html}"
-		console.log "    Correct: #{expected}"
-		console.log "     Output: #{serialized}"
-		if parse_errors.length > 0
-			console.log " parse errs: #{JSON.stringify parse_errors}"
-		else
-			console.log "   No parse errors"
-		test_results.failed += 1
-	else
-		#console.log "passed \"#{args.name}\""
-		test_results.passed += 1
-test_summary = ->
-	console.log "Tests passed: #{test_results.passed}"
-	console.log "Tests Failed: #{test_results.failed}"
-
-test_parser name: "empty", \
-	html: "",
-	expected: ''
-test_parser name: "just text", \
-	html: "abc",
-	expected: 'text:"abc"'
-test_parser name: "named entity", \
-	html: "a&amp;1234",
-	expected: 'text:"a&1234"'
-test_parser name: "broken named character references", \
-	html: "1&amp2&&amp;3&aabbcc;",
-	expected: 'text:"1&2&&3&aabbcc;"'
-test_parser name: "numbered entity overrides", \
-	html: "1&#X80&#x80; &#x83",
-	expected: 'text:"1â¬â¬ Æ"'
-test_parser name: "open tag", \
-	html: "foo<span>bar",
-	expected: 'text:"foo",tag:"span",{},[text:"bar"]'
-test_parser name: "open tag with attributes", \
-	html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
-	expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
-test_parser name: "open tag with attributes of various quotings", \
-	html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
-	expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
-test_parser name: "attribute entity exceptions dq", \
-	html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
-	expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
-test_parser name: "attribute entity exceptions sq", \
-	html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
-	expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
-test_parser name: "attribute entity exceptions uq", \
-	html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
-	expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
-test_parser name: "matching closing tags", \
-	html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
-	expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
-test_parser name: "missing closing tag inside", \
-	html: "foo<div>bar<span>baz</div>qux",
-	expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
-test_parser name: "mis-matched closing tags", \
-	html: "<span>12<div>34</span>56</div>78",
-	expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
-test_parser name: "mis-matched formatting elements", \
-	html: "12<b>34<i>56</b>78</i>90",
-	expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
-test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
-	html: '<p>1<b>2<i>3</b>4</i>5</p>',
-	expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
-test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
-	html: '<b>1<p>2</b>3</p>',
-	expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
-test_parser name: "crazy formatting elements test", \
-	html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
-	# chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
-	# firefox does this:
-	expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
-# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
-test_parser name: "html5lib aaa 1", \
-	html: '<a><p></a></p>',
-	expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
-test_parser name: "html5lib aaa 2", \
-	html: '<a>1<p>2</a>3</p>',
-	expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
-test_parser name: "html5lib aaa 3", \
-	html: '<a>1<button>2</a>3</button>',
-	expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
-test_parser name: "html5lib aaa 4", \
-	html: '<a>1<b>2</a>3</b>',
-	expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
-test_parser name: "html5lib aaa 5 (two divs deep)", \
-	html: '<a>1<div>2<div>3</a>4</div>5</div>',
-	expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
-test_parser name: "html5lib aaa 6 (foster parenting)", \
-	html: '<table><a>1<p>2</a>3</p>',
-	expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
-test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
-	html: '<b><b><a><p></a>',
-	expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
-test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
-	html: '<b><a><b><p></a>',
-	expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
-test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
-	html: '<a><b><b><p></a>',
-	expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
-test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
-	html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
-	expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
-test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
-	html: '<table><a>1<td>2</td>3</table>',
-	expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
-test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
-	html: '<table>A<td>B</td>C</table>',
-	expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
-# TODO implement svg and namespacing
-#test_parser name: "html5lib aaa 13 (svg tr input)", \
-#	html: '<a><svg><tr><input></a>',
-#	expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
-test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
-	html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
-	expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
-test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
-	html: '<div><a><b><u><i><code><div></a>',
-	expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
-test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
-	html: '<b><b><b><b>x</b></b></b></b>y',
-	expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
-test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
-	html: '<p><b><b><b><b><p>x',
-	expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
-test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
-	html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
-	expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
-test_parser name: "junk after attribute close-quote", \
-	html: '<p><b c="d", e="f">foo<p>x',
-	expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
-test_parser name: "html5lib aaa02 1", \
-	html: '<b>1<i>2<p>3</b>4',
-	expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
-test_parser name: "html5lib aaa02 2", \
-	html: '<a><div><style></style><address><a>',
-	expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
-test_parser name: "html5lib tables 1", \
-	html: '<table><th>',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
-test_parser name: "html5lib tables 2", \
-	html: '<table><td>',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
-test_parser name: "html5lib tables 3", \
-	html: "<table><col foo='bar'>",
-	expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
-test_parser name: "html5lib tables 4", \
-	html: '<table><colgroup></html>foo',
-	expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
-test_parser name: "html5lib tables 5", \
-	html: '<table></table><p>foo',
-	expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
-test_parser name: "html5lib tables 6", \
-	html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
-test_parser name: "html5lib tables 7", \
-	html: '<table><select><option>3</select></table>',
-	expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
-test_parser name: "html5lib tables 8", \
-	html: '<table><select><table></table></select></table>',
-	expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
-test_parser name: "html5lib tables 9", \
-	html: '<table><select></table>',
-	expected: 'tag:"select",{},[],tag:"table",{},[]'
-test_parser name: "html5lib tables 10", \
-	html: '<table><select><option>A<tr><td>B</td></tr></table>',
-	expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
-test_parser name: "html5lib tables 11", \
-	html: '<table><td></body></caption></col></colgroup></html>foo',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
-test_parser name: "html5lib tables 12", \
-	html: '<table><td>A</table>B',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
-test_parser name: "html5lib tables 13", \
-	html: '<table><tr><caption>',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
-test_parser name: "html5lib tables 14", \
-	html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
-test_parser name: "html5lib tables 15", \
-	html: '<table><td><tr>',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
-test_parser name: "html5lib tables 16", \
-	html: '<table><td><button><td>',
-	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
-# TODO implement svg parsing
-#test_parser name: "html5lib tables 17", \
-#	html: '<table><tr><td><svg><desc><td>',
-#	expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'
-test_summary()
+
+# Public API of this file. When there is no module.exports (the browser case),
+# the shim near the top of this file points module.exports at window.wheic, so
+# these assignments end up on that object instead.
+# TODO export TYPE_* (only four of the TYPE_* constants are exported below)
+module.exports.parse_html = parse_html
+module.exports.debug_log_reset = debug_log_reset
+module.exports.debug_log_each = debug_log_each
+module.exports.TYPE_TAG = TYPE_TAG
+module.exports.TYPE_TEXT = TYPE_TEXT
+module.exports.TYPE_COMMENT = TYPE_COMMENT
+module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE