From: Jason Woofenden <jason@jasonwoof.com>
Date: Sun, 20 Dec 2015 14:49:10 +0000 (-0500)
Subject: implement lots of raw-ish text parsing
X-Git-Url: https://jasonwoof.com/gitweb/?a=commitdiff_plain;h=7d7b5713e2ae68559b1cfb1fcb86cfc7b83f7967;p=peach-html5-editor.git

implement lots of raw-ish text parsing
---

diff --git a/parse-html.coffee b/parse-html.coffee
index 86d0136..d271706 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -94,6 +94,8 @@ class Node
 		attrs = {}
 		attrs[k] = v for k, v of @attrs
 		return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
+	acknowledge_self_closing: -> # called by the tree builder right after it inserts and pops an element
+		# fixfull: placeholder, the body is intentionally empty for now
 	serialize: (shallow = false, show_ids = false) -> # for unit tests
 		ret = ''
 		switch @type
@@ -149,6 +151,7 @@ new_element = (name) ->
 	return new Node TYPE_TAG, name: name
 new_text_node = (txt) ->
 	return new Node TYPE_TEXT, text: txt
+new_character_token = new_text_node # alias: character tokens are built as TYPE_TEXT nodes
 new_comment_node = (txt) ->
 	return new Node TYPE_COMMENT, text: txt
 new_eof_token = ->
@@ -158,8 +161,8 @@ new_afe_marker = ->
 new_aaa_bookmark = ->
 	return new Node TYPE_AAA_BOOKMARK
 
-lc_alpha = "abcdefghijklmnopqrstuvwxqz"
-uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
+lc_alpha = "abcdefghijklmnopqrstuvwxyz"
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 digits = "0123456789"
 alnum = lc_alpha + uc_alpha + digits
 hex_chars = digits + "abcdefABCDEF"
@@ -340,15 +343,19 @@ parse_html = (txt, parse_error_cb = null) ->
 	cur = 0 # index of next char in txt to be parsed
 	# declare tree and tokenizer variables so they're in scope below
 	tree = null
-	open_els = [] # stack of open elements
+	open_els = null # stack of open elements
+	afe = null # active formatting elements
+	template_insertion_modes = null
 	insertion_mode = null
+	original_insertion_mode = null
 	tok_state = null
 	tok_cur_tag = null # partially parsed tag
+	flag_scripting = null
 	flag_frameset_ok = null
 	flag_parsing = null
 	flag_foster_parenting = null
 	form_element_pointer = null
-	afe = [] # active formatting elements
+	temporary_buffer = null
 
 	parse_error = ->
 		if parse_error_cb?
@@ -872,7 +879,6 @@ parse_html = (txt, parse_error_cb = null) ->
 		debug_log "AAA DONE"
 
 	# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
-	# FIXME test this (particularly emplied end tags)
 	close_p_element = ->
 		generate_implied_end_tags 'p' # arg is exception
 		if open_els[0].name isnt 'p'
@@ -886,7 +892,8 @@ parse_html = (txt, parse_error_cb = null) ->
 			close_p_element()
 
 	# http://www.w3.org/TR/html5/syntax.html#insert-a-character
-	tree_insert_text = (t) ->
+	# aka insert_a_character = (t) ->
+	insert_character = (t) ->
 		dest = adjusted_insertion_location()
 		# fixfull check for Document node
 		if dest[1] > 0
@@ -1046,17 +1053,114 @@ parse_html = (txt, parse_error_cb = null) ->
 
 	# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 	# position should be [node, index_within_children]
-	tree_insert_a_comment = (t, position = null) ->
+	tree_insert_comment = (t, position = null) ->
 		position ?= adjusted_insertion_location()
 		position[0].children.splice position[1], 0, t
 
+	# 8.2.5.2
+	# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
+	parse_generic_raw_text = (t) -> # t: start tag token whose content is raw text
+		insert_html_element t
+		tok_state = tok_state_rawtext # tokenizer continues in the rawtext state
+		original_insertion_mode = insertion_mode # ins_mode_text restores this when it is done
+		insertion_mode = ins_mode_text
+	parse_generic_rcdata_text = (t) -> # t: start tag token whose content is rcdata
+		insert_html_element t
+		tok_state = tok_state_rcdata # tokenizer continues in the rcdata state
+		original_insertion_mode = insertion_mode # ins_mode_text restores this when it is done
+		insertion_mode = ins_mode_text
+
 	# 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
 	# http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
 	generate_implied_end_tags = (except = null) ->
 		while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
 			open_els.shift()
 
-	# 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+	# 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
+	ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
+		open_els.shift() # spec says this will be a 'head' node
+		insertion_mode = ins_mode_after_head
+		insertion_mode t # hand the same token to the mode that was just selected
+	ins_mode_in_head = (t) -> # "in head" insertion mode: processes one token t, returns nothing
+		if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
+			insert_character t
+			return
+		if t.type is TYPE_COMMENT
+			tree_insert_comment t
+			return
+		if t.type is TYPE_DOCTYPE
+			parse_error()
+			return
+		if t.type is TYPE_START_TAG and t.name is 'html'
+			ins_mode_in_body t
+			return
+		if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
+			el = insert_html_element t
+			open_els.shift() # these elements have no content: pop immediately
+			el.acknowledge_self_closing()
+			return
+		if t.type is TYPE_START_TAG and t.name is 'meta'
+			el = insert_html_element t
+			open_els.shift() # no content: pop immediately
+			el.acknowledge_self_closing()
+			# fixfull encoding stuff
+			return
+		if t.type is TYPE_START_TAG and t.name is 'title'
+			parse_generic_rcdata_text t # the rcdata helper defined next to parse_generic_raw_text
+			return
+		if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
+			parse_generic_raw_text t
+			return
+		if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
+			insert_html_element t
+			insertion_mode = in_head_noscript # FIXME implement
+			return
+		if t.type is TYPE_START_TAG and t.name is 'script'
+			ail = adjusted_insertion_location()
+			el = token_to_element t, NS_HTML, ail
+			el.flag_parser_inserted true # FIXME implement
+			# fixfull fragment case
+			ail[0].children.splice ail[1], 0, el
+			open_els.unshift el
+			tok_state = tok_state_script_data
+			original_insertion_mode = insertion_mode # make sure orig... is defined
+			insertion_mode = ins_mode_text # FIXME implement
+			return
+		if t.type is TYPE_END_TAG and t.name is 'head'
+			open_els.shift() # will be a head element... spec says so
+			insertion_mode = ins_mode_after_head
+			return
+		if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
+			ins_mode_in_head_else t
+			return
+		if t.type is TYPE_START_TAG and t.name is 'template'
+			insert_html_element t
+			afe_push_marker()
+			flag_frameset_ok = false
+			insertion_mode = ins_mode_in_template
+			template_insertion_modes.unshift ins_mode_in_template # FIXME implement
+			return
+		if t.type is TYPE_END_TAG and t.name is 'template'
+			if template_tag_is_open()
+				generate_implied_end_tags() # parentheses required: a bare name is not a call
+				if open_els[0].name isnt 'template'
+					parse_error()
+				loop # pop up to and including the template element
+					el = open_els.shift()
+					if el.name is 'template'
+						break
+				clear_afe_to_marker()
+				template_insertion_modes.shift()
+				reset_insertion_mode()
+			else
+				parse_error()
+			return
+		if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
+			parse_error() # stray <head> start tag or any other end tag: ignore the token
+			return
+		ins_mode_in_head_else t
+
+	# 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
 	in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
 		for node, i in open_els
 			if node.name is name # FIXME check namespace too
@@ -1077,13 +1181,13 @@ parse_html = (txt, parse_error_cb = null) ->
 						parse_error()
 					when "\t", "\u000a", "\u000c", "\u000d", ' '
 						reconstruct_active_formatting_elements()
-						tree_insert_text t
+						insert_character t
 					else
 						reconstruct_active_formatting_elements()
-						tree_insert_text t
+						insert_character t
 						flag_frameset_ok = false
 			when TYPE_COMMENT
-				tree_insert_a_comment t
+				tree_insert_comment t
 			when TYPE_DOCTYPE
 				parse_error()
 			when TYPE_START_TAG
@@ -1096,7 +1200,7 @@ parse_html = (txt, parse_error_cb = null) ->
 							root_attrs[k] = v unless root_attrs[k]?
 					when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
 						# FIXME also do this for </template> (end tag)
-						return tree_in_head t
+						return ins_mode_in_head t
 					when 'body'
 						parse_error()
 						# TODO
@@ -1255,6 +1359,36 @@ parse_html = (txt, parse_error_cb = null) ->
 			el = afe.shift()
 			if el.type is TYPE_AFE_MARKER
 				return
+
+	# 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
+	ins_mode_text = (t) ->
+		# "text" insertion mode: active while the content of a raw text,
+		# rcdata or script element is being read. Every exit path restores
+		# the insertion mode that was saved in original_insertion_mode.
+		switch t.type
+			when TYPE_TEXT
+				insert_character t
+			when TYPE_EOF
+				parse_error()
+				if open_els[0].name is 'script'
+					open_els[0].flag 'already started', true
+				open_els.shift()
+				insertion_mode = original_insertion_mode
+				insertion_mode t # let the restored mode see the EOF too
+			when TYPE_END_TAG
+				# fixfull the spec seems to assume that I'm going to run the script
+				# http://www.w3.org/TR/html5/syntax.html#scriptEndTag
+				# (so for now </script> is treated like every other end tag)
+				open_els.shift()
+				insertion_mode = original_insertion_mode
+			else
+				console.log 'warning: end of ins_mode_text reached'
+		return
+
+	# the functions below implement the tokenizer states described here:
+	# http://www.w3.org/TR/html5/syntax.html#tokenization
+
+	# 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
 	ins_mode_in_table = (t) ->
 		switch t.type
 			when TYPE_TEXT
@@ -1265,7 +1399,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				else
 					ins_mode_in_table_else t
 			when TYPE_COMMENT
-				tree_insert_a_comment t
+				tree_insert_comment t
 			when TYPE_DOCTYPE
 				parse_error()
 			when TYPE_START_TAG
@@ -1309,9 +1443,9 @@ parse_html = (txt, parse_error_cb = null) ->
 							ins_mode_in_table_else t
 						else
 							parse_error()
-							insert_html_element t
+							el = insert_html_element t
 							open_els.shift()
-							# fixfull acknowledge sef-closing flag
+							el.acknowledge_self_closing()
 					when 'form'
 						parse_error()
 						if form_element_pointer?
@@ -1345,6 +1479,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				ins_mode_in_table_else t
 
 
+	# 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
 	ins_mode_in_table_text = (t) ->
 		switch t.type
 			when TYPE_TEXT
@@ -1355,6 +1490,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		console.log "unimplemented ins_mode_in_table_text"
 		# FIXME CONTINUE
 
+	# 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
 	ins_mode_in_table_body = (t) ->
 		if t.type is TYPE_START_TAG and t.name is 'tr'
 			clear_stack_to_table_body_context()
@@ -1397,6 +1533,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		# Anything else
 		ins_mode_in_table t
 
+	# 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
 	ins_mode_in_row = (t) ->
 		if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
 			clear_stack_to_table_row_context()
@@ -1449,7 +1586,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		clear_afe_to_marker()
 		insertion_mode = ins_mode_in_row
 
-	# http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
+	# 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
 	ins_mode_in_cell = (t) ->
 		if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
 			if is_in_table_scope t.name
@@ -1492,15 +1629,11 @@ parse_html = (txt, parse_error_cb = null) ->
 		# Anything Else
 		ins_mode_in_body t
 
-
-	# the functions below implement the tokenizer stats described here:
-	# http://www.w3.org/TR/html5/syntax.html#tokenization
-
 	# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 	tok_state_data = ->
 		switch c = txt.charAt(cur++)
 			when '&'
-				return new_text_node tokenize_character_reference()
+				return new_text_node parse_character_reference()
 			when '<'
 				tok_state = tok_state_tag_open
 			when "\u0000"
@@ -1514,7 +1647,68 @@ parse_html = (txt, parse_error_cb = null) ->
 
 	# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
 	# not needed: tok_state_character_reference_in_data = ->
-	# just call tok_state_character_reference_in_data()
+	# just call parse_character_reference()
+
+	# 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
+	tok_state_rcdata = ->
+		c = txt.charAt(cur++)
+		if c is '' # EOF
+			return new_eof_token()
+		if c is '&'
+			return new_text_node parse_character_reference()
+		if c is "\u0000"
+			parse_error()
+			return new_character_token "\ufffd"
+		if c isnt '<'
+			return new_character_token c
+		# '<' might begin an end tag; no token is produced yet
+		tok_state = tok_state_rcdata_less_than_sign
+		return null
+
+	# 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
+	# not needed: tok_state_character_reference_in_rcdata = ->
+	# just call parse_character_reference()
+
+	# 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
+	tok_state_rawtext = ->
+		c = txt.charAt(cur++)
+		if c is '' # EOF
+			return new_eof_token()
+		if c is "\u0000"
+			parse_error()
+			return new_character_token "\ufffd"
+		if c isnt '<'
+			return new_character_token c
+		# '<' might begin an end tag; no token is produced yet
+		tok_state = tok_state_rawtext_less_than_sign
+		return null
+
+	# 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
+	tok_state_script_data = ->
+		c = txt.charAt(cur++)
+		if c is '' # EOF
+			return new_eof_token()
+		if c is "\u0000"
+			parse_error()
+			return new_character_token "\ufffd"
+		if c isnt '<'
+			return new_character_token c
+		# NOTE(review): the state assigned below is not defined in the visible part of this file
+		tok_state = tok_state_script_data_less_than_sign
+		return null
+
+	# 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
+	tok_state_plaintext = ->
+		c = txt.charAt(cur++)
+		if c is '' # EOF
+			return new_eof_token()
+		if c is "\u0000"
+			parse_error()
+			return new_character_token "\ufffd"
+		# every other character is emitted as-is; this state never
+		# assigns tok_state, so the tokenizer stays here
+		return new_character_token c
+
 
 	# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 	tok_state_tag_open = ->
@@ -1587,6 +1781,140 @@ parse_html = (txt, parse_error_cb = null) ->
 					tok_cur_tag.name += c
 		return null
 
+	# 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
+	tok_state_rcdata_less_than_sign = ->
+		c = txt.charAt(cur) # peek only; consumed below if it is '/'
+		unless c is '/'
+			# Anything else: leave the character for the rcdata state
+			tok_state = tok_state_rcdata
+			return new_character_token '<'
+		cur += 1
+		temporary_buffer = ''
+		tok_state = tok_state_rcdata_end_tag_open
+		return null
+
+	# 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
+	tok_state_rcdata_end_tag_open = ->
+		# Entered after "</" was seen inside rcdata content. Starts an end
+		# tag token if a letter follows, otherwise emits "</" as text.
+		c = txt.charAt(cur++)
+		# At EOF charAt returns '' and indexOf('') is 0, so the empty
+		# string has to be excluded explicitly or EOF would be taken
+		# for a letter and an end tag with an empty name would be started.
+		if c isnt '' and (uc_alpha + lc_alpha).indexOf(c) > -1
+			tok_cur_tag = new_end_tag c.toLowerCase() # no-op for lowercase input
+			temporary_buffer += c # original case, re-emitted if this is no end tag
+			tok_state = tok_state_rcdata_end_tag_name
+			return null
+		# Anything else (including EOF)
+		tok_state = tok_state_rcdata
+		cur -= 1 # reconsume the input character
+		return new_character_token "</" # fixfull separate these
+
+	# http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
+	is_appropriate_end_tag = (t) -> # t: the end tag token being built; returns a boolean
+		# spec says to check against "the tag name of the last start tag to
+		# have been emitted from this tokenizer", but this is only called from
+		# the various "raw" states, which I'm pretty sure all push the start
+		# token onto open_els. TODO: verify this after the script data states
+		# are implemented
+		debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
+		return t.type is TYPE_END_TAG and t.name is open_els[0].name # open_els[0] stands in for the last start tag
+
+	# 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
+	tok_state_rcdata_end_tag_name = ->
+		# Accumulates the name of a possible end tag inside rcdata content.
+		# Returns the end tag token, a text token, or null (no token yet).
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_before_attribute_name
+				return null # explicit null, like the other "no token yet" paths
+			# else fall through to "Anything else"
+		if c is '/'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
+				return null
+			# else fall through to "Anything else"
+		if c is '>'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_data
+				return tok_cur_tag
+			# else fall through to "Anything else"
+		# c is '' at EOF and indexOf('') is 0, so exclude it explicitly;
+		# otherwise this state keeps returning null without ever leaving
+		if c isnt '' and (uc_alpha + lc_alpha).indexOf(c) > -1
+			tok_cur_tag.name += c.toLowerCase() # no-op for lowercase input
+			temporary_buffer += c
+			return null
+		# Anything else (including EOF): this was not an end tag after all
+		tok_state = tok_state_rcdata
+		cur -= 1 # reconsume the input character
+		return new_character_token '</' + temporary_buffer # fixfull separate these
+
+	# 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
+	tok_state_rawtext_less_than_sign = ->
+		c = txt.charAt(cur) # peek only; consumed below if it is '/'
+		unless c is '/'
+			# Anything else: leave the character for the rawtext state
+			tok_state = tok_state_rawtext
+			return new_character_token '<'
+		cur += 1
+		temporary_buffer = ''
+		tok_state = tok_state_rawtext_end_tag_open
+		return null
+
+	# 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
+	tok_state_rawtext_end_tag_open = ->
+		# Entered after "</" was seen inside rawtext content. Starts an end
+		# tag token if a letter follows, otherwise emits "</" as text.
+		c = txt.charAt(cur++)
+		# At EOF charAt returns '' and indexOf('') is 0, so the empty
+		# string has to be excluded explicitly or EOF would be taken
+		# for a letter and an end tag with an empty name would be started.
+		if c isnt '' and (uc_alpha + lc_alpha).indexOf(c) > -1
+			tok_cur_tag = new_end_tag c.toLowerCase() # no-op for lowercase input
+			temporary_buffer += c # original case, re-emitted if this is no end tag
+			tok_state = tok_state_rawtext_end_tag_name
+			return null
+		# Anything else (including EOF)
+		tok_state = tok_state_rawtext
+		cur -= 1 # reconsume the input character
+		return new_character_token "</" # fixfull separate these
+
+	# 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
+	tok_state_rawtext_end_tag_name = ->
+		# Accumulates the name of a possible end tag inside rawtext content.
+		# Returns the end tag token, a text token, or null (no token yet).
+		c = txt.charAt(cur++)
+		if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_before_attribute_name
+				return null # explicit null, like the other "no token yet" paths
+			# else fall through to "Anything else"
+		if c is '/'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_self_closing_start_tag
+				return null
+			# else fall through to "Anything else"
+		if c is '>'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_data
+				return tok_cur_tag
+			# else fall through to "Anything else"
+		# c is '' at EOF and indexOf('') is 0, so exclude it explicitly;
+		# otherwise this state keeps returning null without ever leaving
+		if c isnt '' and (uc_alpha + lc_alpha).indexOf(c) > -1
+			tok_cur_tag.name += c.toLowerCase() # no-op for lowercase input
+			temporary_buffer += c
+			return null
+		# Anything else (including EOF): this was not an end tag after all
+		tok_state = tok_state_rawtext
+		cur -= 1 # reconsume the input character
+		return new_character_token '</' + temporary_buffer # fixfull separate these
+
+	# TODO _all_ of the missing states here (17-33) are for parsing script tags
+
 	# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
 	tok_state_before_attribute_name = ->
 		attr_name = null
@@ -1721,7 +2049,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			when '"'
 				tok_state = tok_state_after_attribute_value_quoted
 			when '&'
-				tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
+				tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
 			when "\u0000"
 				# Parse error
 				tok_cur_tag.attrs_a[0][1] += "\ufffd"
@@ -1738,7 +2066,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			when "'"
 				tok_state = tok_state_after_attribute_value_quoted
 			when '&'
-				tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
+				tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
 			when "\u0000"
 				# Parse error
 				tok_cur_tag.attrs_a[0][1] += "\ufffd"
@@ -1755,7 +2083,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			when "\t", "\n", "\u000c", ' '
 				tok_state = tok_state_before_attribute_name
 			when '&'
-				tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
+				tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
 			when '>'
 				tok_state = tok_state_data
 				tmp = tok_cur_tag
@@ -1795,7 +2123,7 @@ parse_html = (txt, parse_error_cb = null) ->
 	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 	# Don't set this as a state, just call it
 	# returns a string (NOT a text node)
-	tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+	parse_character_reference = (allowed_char = null, in_attr = false) ->
 		if cur >= txt.length
 			return '&'
 		switch c = txt.charAt(cur)
@@ -1872,12 +2200,16 @@ parse_html = (txt, parse_error_cb = null) ->
 	# see comments on TYPE_TAG/etc for the structure of this data
 	tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
 	open_els = [tree]
+	afe = [] # active formatting elements
+	template_insertion_modes = []
 	insertion_mode = ins_mode_in_body
+	original_insertion_mode = insertion_mode # TODO check spec
+	flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
 	flag_frameset_ok = true
 	flag_parsing = true
 	flag_foster_parenting = false
 	form_element_pointer = null
-	afe = [] # active formatting elements
+	temporary_buffer = null
 
 	# tokenizer initialization
 	tok_state = tok_state_data
@@ -2042,3 +2374,9 @@ test_parser name: "variation on html5lib aaa 17 (with attributes in various orde
 test_parser name: "junk after attribute close-quote", \
 	html: '<p><b c="d", e="f">foo<p>x',
 	expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
+test_parser name: "html5lib aaa02 1", \
+	html: '<b>1<i>2<p>3</b>4',
+	expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
+test_parser name: "html5lib aaa02 2", \
+	html: '<a><div><style></style><address><a>',
+	expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'