From: Jason Woofenden <jason@jasonwoof.com>
Date: Thu, 17 Dec 2015 03:16:08 +0000 (-0500)
Subject: implemented adoption agency algorithm, tested a littl
X-Git-Url: https://jasonwoof.com/gitweb/?a=commitdiff_plain;h=2fe7c46d8d2af3427d10fff4ac63d44d5314dedd;p=peach-html5-editor.git

implemented adoption agency algorithm, tested a littl
---

diff --git a/parse-html.coffee b/parse-html.coffee
index b7421e5..5b1b175 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -31,6 +31,13 @@ TYPE_DOCTYPE = 3
 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
 TYPE_END_TAG = 5 # name
 TYPE_EOF = 6
+TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
+TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
+
+# namespace constants
+NS_HTML = 1
+NS_MATHML = 2
+NS_SVG = 3
 
 class Node
 	constructor: (type, args = {}) ->
@@ -40,6 +47,13 @@ class Node
 		@attrs = args.attrs ? {}
 		@attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
 		@children = args.children ? []
+		@namespace = args.namespace ? NS_HTML
+		@parent = args.parent ? null
+	shallow_clone: -> # return a new node that's the same except without the children or parent
+		# WARNING this doesn't work right on open tags that are still being parsed
+		attrs = {}
+		attrs[k] = v for k, v of @attrs
+		return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
 	serialize: -> # for unit tests
 		ret = ''
 		switch @type
@@ -66,9 +80,9 @@ class Node
 				# FIXME
 			else
 				ret += 'unknown:'
+				console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 		return ret
 
-
 # helpers: (only take args that are normally known when parser creates nodes)
 new_open_tag = (name) ->
 	return new Node TYPE_OPEN_TAG, name: name
@@ -80,6 +94,8 @@ new_comment_node = (txt) ->
 	return new Node TYPE_COMMENT, text: txt
 new_eof_token = ->
 	return new Node TYPE_EOF
+new_aaa_bookmark = ->
+	return new Node TYPE_AAA_BOOKMARK
 
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
@@ -176,29 +192,32 @@ mathml_elements = [
 #normal_elements = All other allowed HTML elements are normal elements.
 
 special_elements = {
-	# from HTML:
-	address: true, applet: true, area: true, article: true, aside: true,
-	base: true, basefont: true, bgsound: true, blockquote: true, body: true,
-	br: true, button: true, caption: true, center: true, col: true,
-	colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
-	dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
-	footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
-	h3: true, h4: true, h5: true, h6: true, head: true, header: true,
-	hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
-	isindex: true, li: true, link: true, listing: true, main: true,
-	marquee: true, meta: true, nav: true, noembed: true, noframes: true,
-	noscript: true, object: true, ol: true, p: true, param: true,
-	plaintext: true, pre: true, script: true, section: true, select: true,
-	source: true, style: true, summary: true, table: true, tbody: true,
-	td: true, template: true, textarea: true, tfoot: true, th: true,
-	thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
-	xmp: true,
-
-	# from MathML:
-	mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
-
-	# from SVG:
-	foreignObject: true, desc: true, title: true
+	# HTML:
+	address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
+	aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
+	blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
+	caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
+	details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
+	embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
+	footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
+	h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
+	header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
+	img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
+	listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
+	noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
+	ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
+	script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
+	style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
+	template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
+	thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
+	wbr:NS_HTML, xmp:NS_HTML,
+
+	# MathML:
+	mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
+	'annotation-xml':NS_MATHML,
+
+	# SVG:
+	foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 }
 
 formatting_elements = {
@@ -207,6 +226,8 @@ formatting_elements = {
 	 u: true
 }
 
+el_is_special = (e) ->
+	return special_elements[e] is e.namespace
 
 # decode_named_char_ref()
 #
@@ -234,12 +255,13 @@ parse_html = (txt, parse_error_cb = null) ->
 	cur = 0 # index of next char in txt to be parsed
 	# declare tree and tokenizer variables so they're in scope below
 	tree = null
-	open_tags = [] # stack of open elements
+	open_els = [] # stack of open elements
 	tree_state = null
 	tok_state = null
 	tok_cur_tag = null # partially parsed tag
 	flag_frameset_ok = null
 	flag_parsing = null
+	afe = [] # active formatting elements
 
 	parse_error = ->
 		if parse_error_cb?
@@ -253,19 +275,19 @@ parse_html = (txt, parse_error_cb = null) ->
 
 	# But first... the helpers
 	template_tag_is_open = ->
-		for t in open_tags
+		for t in open_els
 			if t.type is TYPE_TAG and t.name is 'template'
 				return true
 		return false
 	is_in_scope_x = (tag_name, scope) ->
-		for t in open_tags
+		for t in open_els
 			if t.name is tag_name
 				return true
 			if t.name of scope
 				return false
 		return false
 	is_in_scope_x_y = (tag_name, scope, scope2) ->
-		for t in open_tags
+		for t in open_els
 			if t.name is tag_name
 				return true
 			if t.name of scope
@@ -289,56 +311,277 @@ parse_html = (txt, parse_error_cb = null) ->
 	is_in_table_scope = (tag_name) ->
 		return is_in_scope_x tag_name, table_scopers
 	is_in_select_scope = (tag_name) ->
-		for t in open_tags
+		for t in open_els
 			if t.name is tag_name
 				return true
 			if t.name isnt 'optgroup' and t.name isnt 'option'
 				return false
 		return false
+	# this checks for a particular element, not by name
+	el_is_in_scope = (el) ->
+		for t in open_els
+			if t is el
+				return true
+			if t.name of standard_scopers
+				return false
+		return false
 
+	# http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
+	# this implementation is structured (mostly) as described at the link above.
+	# capitalized comments are the "labels" described at the link above.
 	reconstruct_active_formatting_elements = ->
-		# FIXME implement this
+		return if afe.length is 0
+		if afe[0].type is TYPE_MARKER or afe[0] in open_els
+			return
+		# Rewind
+		i = 0
+		loop
+			if i is afe.length - 1
+				break
+			i += 1
+			if afe[i].type is TYPE_MARKER or afe[i] in open_els
+				i -= 1 # Advance
+				break
+		# Create
+		loop
+			el = afe[i].shallow_clone()
+			tree_insert_tag el
+			afe[i] = el
+			break if i is 0
+			i -= 1
+
+	# http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
+	# adoption agency algorithm
+	adoption_agency = (subject) ->
+		if open_els[0].name is subject
+			el = open_els[0]
+			open_els.shift()
+			# remove it from the list of active formatting elements (if found)
+			for t, i in afe
+				if t is el
+					afe.splice i, 1
+					break
+			return
+		outer = 0
+		loop
+			if outer >= 8
+				return
+			outer += 1
+			fe = null
+			for t, fe_index in afe
+				if t.type is TYPE_MARKER
+					break
+				if t.name is subject
+					fe = t
+					break
+			if fe is null
+				in_body_any_other_end_tag subject
+				return
+			in_open_els = false
+			for t in open_els
+				if t is fe
+					in_open_els = true
+					break
+			unless in_open_els
+				parse_error()
+				# "remove it from the list" must mean afe, since it's not in open_els
+				afe.splice fe_index, 1
+				return
+			unless el_is_in_scope fe
+				parse_error()
+				return
+			unless open_els[0] is fe
+				parse_error()
+				# continue
+			fb = null
+			fb_index
+			for t, i in open_els
+				if t is fe
+					break
+				if el_is_special t
+					fb = t
+					fb_index = i
+			if fb is null
+				loop
+					t = open_els.shift()
+					if t is fe
+						afe.splice fe_index, 1
+						return
+			ca = open_els[fe_index + 1] # common ancestor
+			node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
+			# 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
+			bookmark = new_aaa_bookmark()
+			for t, i in afe
+				if t is fe
+					afe.splice i, 0, bookmark
+			node = last_node = fb
+			inner = 0
+			loop
+				inner += 1
+				node_next = null
+				for t, i in open_els
+					if t is node
+						node_next = open_els[i + 1]
+						break
+				node = node_next ? node_above
+				# TODO make sure node_above gets re-set if/when node is removed from open_els
+				if node is fe
+					break
+				node_in_afe = false
+				for t, i of afe
+					if t is node
+						if inner > 3
+							afe.splice i, 1
+						else
+							node_in_afe = true
+						break
+				unless node_in_afe
+					for t, i in open_els
+						if t is node
+							node_above = open_els[i + 1]
+							open_els.splice i, 1
+							break
+					continue
+				# 7. reate an element for the token for which the element node
+				# was created, in the HTML namespace, with common ancestor as
+				# the intended parent; replace the entry for node in the list
+				# of active formatting elements with an entry for the new
+				# element, replace the entry for node in the stack of open
+				# elements with an entry for the new element, and let node be
+				# the new element.
+				new_node = node.shallow_clone()
+				for t, i in afe
+					if t is node
+						afe[i] = new_node
+						break
+				for t, i in open_els
+					if t is node
+						open_els[i] = new_node
+						break
+				node = new_node
+				# 8. If last node is furthest block, then move the
+				# aforementioned bookmark to be immediately after the new node
+				# in the list of active formatting elements.
+				if last_node is fb
+					for t, i in afe
+						if t is bookmark
+							afe.splice i, 1
+					for t, i in afe
+						if t is node
+							# TODO test: position i gets you "after"?
+							afe.splice i, 0, new_aaa_bookmark()
+				# 9. Insert last node into node, first removing it from its
+				# previous parent node if any.
+				if last_node.parent?
+					for c, i of last_node.parent.children
+						if c is last_node
+							last_node.parent.children.splice i, 1
+				node.children.push last_node
+				last_node.parent = node
+				# 10. Let last node be node.
+				last_node = node
+				# 11. Return to the step labeled inner loop.
+			# 14. Insert whatever last node ended up being in the previous step
+			# at the appropriate place for inserting a node, but using common
+			# ancestor as the override target.
+			tree_insert_tag last_node, ca
+			# 15. Create an element for the token for which formatting element
+			# was created, in the HTML namespace, with furthest block as the
+			# intended parent.
+			new_element = fe.shallow_clone()
+			# 16. Take all of the child nodes of furthest block and append them
+			# to the element created in the last step.
+			while fb.children.length
+				t = fb.children.shift()
+				t.parent = new_element
+				new_element.children.push t
+			# 17. Append that new element to furthest block.
+			new_element.parent = fb
+			fb.children.push new_element
+			# 18. Remove formatting element from the list of active formatting
+			# elements, and insert the new element into the list of active
+			# formatting elements at the position of the aforementioned
+			# bookmark.
+			for t, i in afe
+				if t is fe
+					afe.splice i, 1
+					break
+			for t, i in afe
+				if t is bookmark
+					afe[i] = node
+					break
+			# 19. Remove formatting element from the stack of open elements,
+			# and insert the new element into the stack of open elements
+			# immediately below the position of furthest block in that stack.
+			for t, i of open_els
+				if t is fe
+					open_els.splice i, 1
+					break
+			for t, i of open_els
+				if t is fb
+					open_els.splice i, 0, new_element
+					break
+			# 20. Jump back to the step labeled outer loop.
 
 	# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 	# FIXME implement this
 	close_p_if_in_button_scope = ->
-		if open_tags[0].name is 'p'
-			open_tags.pop()
+		if open_els[0].name is 'p'
+			open_els.pop()
 		return
 		#p = find_button_scope 'p'
 		#if p?
 			# TODO generate_implied_end_tags except for p tags
-			# TODO parse_error unless open_tags[0].name is 'p'
+			# TODO parse_error unless open_els[0].name is 'p'
 			# TODO pop stack until 'p' popped
 
-
-
 	# http://www.w3.org/TR/html5/syntax.html#insert-a-character
 	tree_insert_a_character = (t) ->
 		# FIXME read spec for "adjusted insertion location, etc, this might be wrong
-		dest = open_tags[0].children
+		dest = open_els[0].children
 		if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
 			dest[dest.length - 1].text += t.text
 		else
 			dest.push t
 
 	# FIXME read spec, do this right
+	# FIXME implement the override target thing
 	# note: this assumes it's an open tag
-	tree_insert_tag = (t) ->
+	tree_insert_tag = (t, override_target = null) ->
 		t.type = TYPE_TAG # not TYPE_OPEN_TAG
 		# convert attributes into a hash
 		while t.attrs_a.length
 			a = t.attrs_a.pop()
 			t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
-		open_tags[0].children.push t
-		open_tags.unshift t
+		if t.parent?
+			for c, i of t.parent.children
+				if c is t
+					t.parent.children.splice i, 1
+		# FIXME spec says to do something to figure out what parent should be
+		parent = open_els[0]
+		open_els.unshift t
+		parent.children.push t
+		t.parent = parent
 
 	# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
 	tree_insert_a_comment = (t) ->
 		# FIXME read spec for "adjusted insertion location, etc, this might be wrong
-		open_tags[0].children.push t
+		open_els[0].children.push t
 
 	# 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+	in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
+		for node, i in open_els
+			if node.name is name
+				# FIXME generate implied end tags except those with name==name
+				parse_error() unless i is 0
+				while i > 0
+					open_els.shift()
+					i -= 1
+				open_els.shift()
+				return
+			if special_elements[node.name]?
+				parse_error()
+				return
 	tree_in_body = (t) ->
 		switch t.type
 			when TYPE_TEXT
@@ -361,7 +604,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					when 'html'
 						parse_error()
 						return if template_tag_is_open()
-						root_attrs = open_tags[open_tags.length - 1].children
+						root_attrs = open_els[open_els.length - 1].children
 						for k, v of t.attrs
 							root_attrs[k] = v unless root_attrs[k]?
 					when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
@@ -378,11 +621,16 @@ parse_html = (txt, parse_error_cb = null) ->
 						tree_insert_tag t
 					when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
 						close_p_if_in_button_scope()
-						if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+						if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
 							parse_error()
-							open_tags.shift()
+							open_els.shift()
 						tree_insert_tag t
 					# TODO lots more to implement here
+					when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
+						reconstruct_active_formatting_elements()
+						tree_insert_tag t
+						afe.push t
+					# TODO lots more to implement here
 					else # any other start tag
 						reconstruct_active_formatting_elements()
 						tree_insert_tag t
@@ -391,7 +639,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					dd: true, dt: true, li: true, p: true, tbody: true, td: true,
 					tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
 				}
-				for t in open_tags
+				for t in open_els
 					unless ok_tags[t.name]?
 						parse_error()
 						break
@@ -410,19 +658,12 @@ parse_html = (txt, parse_error_cb = null) ->
 							return
 						# TODO implement parse error and move to tree_after_body, reprocess
 					# TODO lots more close tags to implement here
+					when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
+						adoption_agency t.name
+					# TODO lots more close tags to implement here
 					else
-						for node, i in open_tags
-							if node.name is t.name
-								# FIXME generate implied end tags except those with name==t.name
-								parse_error() unless i is 0
-								while i > 0
-									open_tags.shift()
-									i -= 1
-								open_tags.shift()
-								return
-							if special_elements[node.name]?
-								parse_error()
-								return
+						in_body_any_other_end_tag t.name
+		return
 
 
 	# the functions below implement the tokenizer stats described here:
@@ -768,10 +1009,11 @@ parse_html = (txt, parse_error_cb = null) ->
 	# tree constructor initialization
 	# see comments on TYPE_TAG/etc for the structure of this data
 	tree = new Node TYPE_TAG, name: 'html'
-	open_tags = [tree]
+	open_els = [tree]
 	tree_state = tree_in_body
 	flag_frameset_ok = true
 	flag_parsing = true
+	afe = [] # active formatting elements
 
 	# tokenizer initialization
 	tok_state = tok_state_data
@@ -872,4 +1114,10 @@ test_parser name: "mis-matched closing tags", \
 test_parser name: "mis-matched formatting elements", \
 	html: "12<b>34<i>56</b>78</i>90",
 	expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
-	errors: 2 # FIXME dunno how many there should be
+	errors: 1 # no idea how many their should be
+test_parser name: "crazy formatting elements test", \
+	html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
+	# chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
+	# firefox does this:
+	expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
+	errors: 6 # no idea how many there should be