X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=425fe3c6e32b7991ef51daabb137774f11225779;hb=a88ccdd930221ffd086134f2e3890602d9e17d9d;hp=3d4db8459018f7972650435da642c1982df727e8;hpb=9d56a837ea14fd1324617b1a85f8a1d52db76319;p=peach-html5-editor.git

diff --git a/parse-html.coffee b/parse-html.coffee
index 3d4db84..425fe3c 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -668,18 +668,29 @@ parse_html = (args) ->
 		else
 			console.log "Parse error at character #{cur} of #{txt.length}"
 
+	# http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
+	# "Noah's Ark clause" but with three
 	afe_push = (new_el) ->
 		matches = 0
 		for el, i in afe
+			if el.type is TYPE_AFE_MARKER
+				break
 			if el.name is new_el.name and el.namespace is new_el.namespace
+				attrs_match = true
 				for k, v of el.attrs
-					continue unless new_el.attrs[k] is v
-				for k, v of new_el.attrs
-					continue unless el.attrs[k] is v
-				matches += 1
-				if matches is 3
-					afe.splice i, 1
-					break
+					unless new_el.attrs[k] is v
+						attrs_match = false
+						break
+				if attrs_match
+					for k, v of new_el.attrs
+						unless el.attrs[k] is v
+							attrs_match = false
+							break
+				if attrs_match
+					matches += 1
+					if matches is 3
+						afe.splice i, 1
+						break
 		afe.unshift new_el
 	afe_push_marker = ->
 		afe.unshift new_afe_marker()
@@ -806,8 +817,8 @@ parse_html = (args) ->
 		loop
 			if node_i is open_els.length - 1
 				last = true
-				# fixfull (fragment case)
-
+				if flag_fragment_parsing
+					node = context_element
 			# 4. If node is a select element, run these substeps:
 			if node.name is 'select' and node.namespace is NS_HTML
 				# 1. If last is true, jump to the step below labeled done.
@@ -1550,6 +1561,7 @@ parse_html = (args) ->
 		if t.type is TYPE_START_TAG and t.name is 'html'
 			el = token_to_element t, NS_HTML, doc
 			doc.children.push el
+			el.document = doc
 			open_els.unshift(el)
 			# fixfull (big paragraph in spec about manifest, fragment, urls, etc)
 			ins_mode = ins_mode_before_head
@@ -1563,7 +1575,7 @@ parse_html = (args) ->
 		# Anything else
 		el = token_to_element new_open_tag('html'), NS_HTML, doc
 		doc.children.push el
-		el.parent = doc
+		el.document = doc
 		open_els.unshift el
 		# ?fixfull browsing context
 		ins_mode = ins_mode_before_head
@@ -1769,17 +1781,23 @@ parse_html = (args) ->
 
 	# 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
 	in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
-		for el, i in open_els
-			if el.name is name and el.namespace is NS_HTML
+		node = open_els[0]
+		loop
+			if node.name is name and node.namespace is NS_HTML
 				generate_implied_end_tags name # arg is exception
-				parse_error() unless i is 0
-				while i >= 0
-					open_els.shift()
-					i -= 1
-				return
-			if special_elements[el.name] is el.namespace
+				unless node is open_els[0]
+					parse_error()
+				loop
+					el = open_els.shift()
+					if el is node
+						return
+			if special_elements[node.name] is node.namespace
 				parse_error()
 				return
+			for el, i in open_els
+				if node is el
+					node = open_els[i + 1]
+					break
 		return
 	ins_mode_in_body = (t) ->
 		if t.type is TYPE_TEXT and t.text is "\u0000"
@@ -1907,11 +1925,7 @@ parse_html = (args) ->
 		if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
 			close_p_if_in_button_scope()
 			insert_html_element t
-			# spec: If the next token is a "LF" (U+000A) character token, then
-			# ignore that token and move on to the next one. (Newlines at the
-			# start of pre blocks are ignored as an authoring convenience.)
-			if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
-				cur += 1
+			eat_next_token_if_newline()
 			flag_frameset_ok = false
 			return
 		if t.type is TYPE_START_TAG and t.name is 'form'
@@ -2106,6 +2120,10 @@ parse_html = (args) ->
 			return
 		if t.type is TYPE_START_TAG and t.name is 'nobr'
 			reconstruct_afe()
+			if is_in_scope 'nobr', NS_HTML
+				parse_error()
+				adoption_agency 'nobr'
+				reconstruct_afe()
 			el = insert_html_element t
 			afe_push el
 			return
@@ -2140,7 +2158,8 @@ parse_html = (args) ->
 			return
 		if t.type is TYPE_END_TAG and t.name is 'br'
 			parse_error()
-			t.type = TYPE_START_TAG
+			# W3C: t.type = TYPE_START_TAG
+			t = new_open_tag 'br' # WHATWG
 			# fall through
 		if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
 			reconstruct_afe()
@@ -2157,7 +2176,8 @@ parse_html = (args) ->
 			unless is_input_hidden_tok t
 				flag_frameset_ok = false
 			return
-		if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
+		if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
+			# WHATWG adds 'menuitem' for this block
 			insert_html_element t
 			open_els.shift()
 			t.acknowledge_self_closing()
@@ -2217,8 +2237,7 @@ parse_html = (args) ->
 			return
 		if t.type is TYPE_START_TAG and t.name is 'textarea'
 			insert_html_element t
-			if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
-				cur += 1
+			eat_next_token_if_newline()
 			tok_state = tok_state_rcdata
 			original_ins_mode = ins_mode
 			flag_frameset_ok = false
@@ -2713,7 +2732,7 @@ parse_html = (args) ->
 			insert_html_element t
 			return
 		if t.type is TYPE_END_TAG and t.name is 'optgroup'
-			if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
+			if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
 				if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
 					open_els.shift()
 			if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
@@ -2749,7 +2768,7 @@ parse_html = (args) ->
 			return
 		if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
 			parse_error()
-			if is_in_select_scope 'select', NS_HTML
+			unless is_in_select_scope 'select', NS_HTML
 				return
 			loop
 				el = open_els.shift()
@@ -3075,7 +3094,7 @@ parse_html = (args) ->
 				tok_state = tok_state_tag_open
 			when "\u0000"
 				parse_error()
-				return new_text_node "\ufffd"
+				return new_text_node c
 			when '' # EOF
 				return new_eof_token()
 			else
@@ -3772,7 +3791,7 @@ parse_html = (args) ->
 			return
 		if c is '>'
 			tok_state = tok_state_data
-			return
+			return tok_cur_tag
 		if is_uc_alpha(c)
 			tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
 			tok_state = tok_state_attribute_name
@@ -4525,7 +4544,10 @@ parse_html = (args) ->
 		else
 			val = txt.substr cur, (next_gt - cur)
 			cur = next_gt + 3
-		return new_character_token val # fixfull split
+		val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
+		if val.length > 0
+			return new_character_token val # fixfull split
+		return null
 
 	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
 	# Don't set this as a state, just call it
@@ -4616,12 +4638,31 @@ parse_html = (args) ->
 					return '&'
 		return # never reached
 
+	eat_next_token_if_newline = ->
+		old_cur = cur
+		t = null
+		until t?
+			t = tok_state()
+		if t.type is TYPE_TEXT
+			# definition of a newline depends on whether it was a character ref or not
+			if cur - old_cur is 1
+				# not a character reference
+				if t.text is "\u000d" or t.text is "\u000a"
+					return
+			else
+				if t.text is "\u000a"
+					return
+		# not a "newline"
+		cur = old_cur
+		return
+
 	# tree constructor initialization
 	# see comments on TYPE_TAG/etc for the structure of this data
 	txt = args.html
 	cur = 0
-	doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+	doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
 	doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
+	fragment_root = null # fragment parsing algorithm returns children of this
 	open_els = []
 	afe = [] # active formatting elements
 	template_ins_modes = []
@@ -4635,28 +4676,105 @@ parse_html = (args) ->
 	temporary_buffer = null
 	pending_table_character_tokens = []
 	head_element_pointer = null
-	flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
-	context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+	flag_fragment_parsing = false
+	context_element = null
 	prev_node_id = 0 # just for debugging
 
 	# tokenizer initialization
 	tok_state = tok_state_data
 
-	# text pre-processing
-	# FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
-	txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
-	txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
-	txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+	parse_init = ->
+		# fragment parsing (text arg)
+		if args.fragment?
+			# this handles the fragment from the tests in the format described here:
+			# https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
+			f = args.fragment
+			ns = NS_HTML
+			if f.substr(0, 5) is 'math '
+				f = f.substr 5
+				ns = NS_MATHML
+			else if f.substr(0, 4) is 'svg '
+				f = f.substr 4
+				ns = NS_SVG
+			t = new_open_tag f
+			context_element = token_to_element t, ns
+			context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+			context_element.document.flag 'quirks mode', QUIRKS_NO
+		# fragment parsing (Node arg)
+		if args.context?
+			context_element = args.context
+
+		# http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+		# fragment parsing algorithm
+		if context_element?
+			flag_fragment_parsing = true
+			doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+			# search up the tree from context, to try to find its document,
+			# because this file only puts a "document" property on the root
+			# element.
+			old_doc = null
+			el = context_element
+			loop
+				if el.document?
+					old_doc = el.document
+					break
+				if el.parent
+					el = el.parent
+				else
+					break
+			if old_doc
+				doc.flag 'quirks mode', old_doc.flag 'quirks mode'
+			# set tok_state
+			if context_element.namespace is NS_HTML
+				switch context_element.name
+					when 'title', 'textarea'
+						tok_state = tok_state_rcdata
+					when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
+						tok_state = tok_state_rawtext
+					when 'script'
+						tok_state = tok_state_script_data
+					when 'noscript'
+						if flag_scripting
+							tok_state = tok_state_rawtext
+					when 'plaintext'
+						tok_state = tok_state_plaintext
+			fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+			doc.children.push fragment_root
+			fragment_root.document = doc
+			open_els = [fragment_root]
+			if context_element.name is 'template' and context_element.namespace is NS_HTML
+				template_ins_modes.unshift ins_mode_in_template
+			# fixfull create token for context (it should have its original one already)
+			reset_ins_mode()
+			# set form_element pointer... in the foreign doc?!
+			el = context_element
+			loop
+				if el.name is 'form' and el.namespace is NS_HTML
+					form_element_pointer = el
+					break
+				if el.parent
+					el = el.parent
+				else
+					break
+
+		# text pre-processing
+		# FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+		txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+		txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
 
-	if args.name is "tests20.dat #22"
-		console.log "hi"
-	# proccess input
 	# http://www.w3.org/TR/html5/syntax.html#tree-construction
-	while flag_parsing
-		t = tok_state()
-		if t?
-			process_token t
-			# fixfull parse error if has self-closing flag, but it wasn't acknolwedged
+	parse_main_loop = ->
+		while flag_parsing
+			t = tok_state()
+			if t?
+				process_token t
+				# fixfull parse error if has self-closing flag, but it wasn't acknowledged
+		return
+	parse_init()
+	parse_main_loop()
+
+	if flag_fragment_parsing
+		return fragment_root.children
 	return doc.children
 
 serialize_els = (els, shallow, show_ids) ->