X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=ef5545f0d603da284694c18c3dca69c857d62ccd;hb=41c743381bdcfdf5303ff0f23eaaf3e121e4ebef;hp=318422b89a0760fd8954a51824dcc2c273d59cbd;hpb=47d40ff2cb949e10270189a1b902d6ce7f4bf1f0;p=peach-html5-editor.git

diff --git a/parse-html.coffee b/parse-html.coffee
index 318422b..ef5545f 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -22,22 +22,76 @@
 #
 # Instead, the data structure produced by this parser is an array of nodes.
 #
-# Each node is an array. The first element in the array is an integer (one of
-# the TYPE_* constants below) followed by the appropriate fields for that type
-# (shown below in the comments after the TYPE_* definition.)
-
+# Each node is an object of the Node class. Here are the Node types:
 TYPE_TAG = 0 # name, {attributes}, [children]
 TYPE_TEXT = 1 # "text"
-TYPE_WHITESPACE = 2
-TYPE_COMMENT = 3
+TYPE_COMMENT = 2
+TYPE_DOCTYPE = 3
 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
-TYPE_OPEN_TAG = 4
+TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_END_TAG = 5 # name
+TYPE_EOF = 6
+
+class Node # one tokenizer token or tree node; @type decides which fields are meaningful
+	constructor: (type, args = {}) ->
+		@type = type # one of the TYPE_* constants above
+		@name = args.name ? '' # tag name
+		@text = args.text ? '' # contents for text/comment nodes
+		@attrs = args.attrs ? {} # attribute hash (key -> value)
+		@attrs_a = args.attrs_a ? args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only (attr_k: old key name)
+		@children = args.children ? []
+	serialize: -> # for unit tests: returns a compact string describing this node and its children
+		ret = ''
+		switch @type
+			when TYPE_TAG
+				ret += 'tag:'
+				ret += JSON.stringify @name
+				ret += ','
+				ret += JSON.stringify @attrs
+				ret += ','
+				sep = '[' # first child is preceded by '[', later ones by ','
+				for c in @children
+					ret += sep
+					sep = ','
+					ret += c.serialize()
+				ret += ']'
+			when TYPE_TEXT
+				ret += 'text:'
+				ret += JSON.stringify @text
+			when TYPE_COMMENT
+				ret += 'comment:'
+				ret += JSON.stringify @text
+			when TYPE_DOCTYPE
+				ret += 'doctype'
+				# FIXME
+			else
+				ret += 'unknown:'
+		return ret
+
+
+# helpers: (only take args that are normally known when parser creates nodes)
+new_open_tag = (name) ->
+	return new Node TYPE_OPEN_TAG, name: name
+new_end_tag = (name) ->
+	return new Node TYPE_END_TAG, name: name
+new_text_node = (txt) ->
+	return new Node TYPE_TEXT, text: txt
+new_comment_node = (txt) ->
+	return new Node TYPE_COMMENT, text: txt
+new_eof_token = ->
+	return new Node TYPE_EOF
 
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
 digits = "0123456789"
 alnum = lc_alpha + uc_alpha + digits
 hex_chars = digits + "abcdefABCDEF"
+scopers = { # FIXME these are supposed to be namespace specific
+	'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
+	'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
+	'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
+	'foreignObject': true, 'desc': true, 'title': true
+}
 
 # some SVG elements have dashes in them
 tag_name_chars = alnum + "-"
@@ -127,6 +181,38 @@ mathml_elements = [
 # foreign_elements = [svg_elements..., mathml_elements...]
 #normal_elements = All other allowed HTML elements are normal elements.
 
+special_elements = {
+	# from HTML:
+	address: true, applet: true, area: true, article: true, aside: true,
+	base: true, basefont: true, bgsound: true, blockquote: true, body: true,
+	br: true, button: true, caption: true, center: true, col: true,
+	colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
+	dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
+	footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
+	h3: true, h4: true, h5: true, h6: true, head: true, header: true,
+	hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
+	isindex: true, li: true, link: true, listing: true, main: true,
+	marquee: true, meta: true, nav: true, noembed: true, noframes: true,
+	noscript: true, object: true, ol: true, p: true, param: true,
+	plaintext: true, pre: true, script: true, section: true, select: true,
+	source: true, style: true, summary: true, table: true, tbody: true,
+	td: true, template: true, textarea: true, tfoot: true, th: true,
+	thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
+	xmp: true,
+
+	# from MathML:
+	mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
+
+	# from SVG:
+	foreignObject: true, desc: true, title: true
+}
+
+formatting_elements = {
+	 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
+	 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
+	 u: true
+}
+
 
 # decode_named_char_ref()
 #
@@ -150,34 +236,194 @@ decode_named_char_ref = (txt) ->
 	return null if decoded is txt
 	return g_dncr.cache[txt] = decoded
 
-parse_html = (txt) ->
+parse_html = (txt, parse_error_cb = null) ->
 	cur = 0 # index of next char in txt to be parsed
 	# declare tree and tokenizer variables so they're in scope below
 	tree = null
-	tree_append_point = null
+	open_tags = [] # stack of open elements
 	tree_state = null
 	tok_state = null
-	tok_cur = null # partially parsed tag
+	tok_cur_tag = null # partially parsed tag
+	flag_frameset_ok = null
+	flag_parsing = null
+
+	parse_error = ->
+		if parse_error_cb?
+			parse_error_cb cur
+		else
+			console.log "Parse error at character #{cur} of #{txt.length}"
+
+
+	# the functions below implement the Tree Construction algorithm
+	# http://www.w3.org/TR/html5/syntax.html#tree-construction
+
+	# But first... the helpers
+	template_tag_is_open = -> # true if any element on the open-element stack is a <template>
+		for t in open_tags # "in" iterates the nodes; "of" would iterate array indices
+			if t.type is TYPE_TAG and t.name is 'template'
+				return true
+		return false
+	is_in_scope = (tag_name) -> # walks the stack from index 0; true if tag_name is found before a scoping element
+		for t in open_tags # "in" iterates the nodes; "of" would iterate array indices
+			if t.name is tag_name
+				return true
+			if t.name of scopers # hit a scope boundary before finding tag_name
+				return false
+		return false
+
+	reconstruct_active_formatting_elements = ->
+		# FIXME implement this
+
+	# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
+	# FIXME implement this
+	close_p_if_in_button_scope = -> # simplified: only closes a <p> that is the current (innermost) open element
+		if open_tags[0].name is 'p'
+			open_tags.shift() # index 0 is the current node (tags are added with unshift); pop() would drop the root
+		return
+		#p = find_button_scope 'p'
+		#if p?
+			# TODO generate_implied_end_tags except for p tags
+			# TODO parse_error unless open_tags[0].name is 'p'
+			# TODO pop stack until 'p' popped
+
+
+
+	# http://www.w3.org/TR/html5/syntax.html#insert-a-character
+	tree_insert_a_character = (t) ->
+		# FIXME read spec for "adjusted insertion location, etc, this might be wrong
+		dest = open_tags[0].children
+		if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
+			dest[dest.length - 1].text += t.text
+		else
+			dest.push t
+
+	# FIXME read spec, do this right
+	# note: this assumes it's an open tag (TYPE_OPEN_TAG token with attrs_a filled in)
+	tree_insert_tag = (t) ->
+		t.type = TYPE_TAG # not TYPE_OPEN_TAG
+		# convert the [key, value] pairs in attrs_a into the attrs hash
+		while t.attrs_a.length
+			a = t.attrs_a.pop()
+			t.attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
+		open_tags[0].children.push t
+		open_tags.unshift t
+
+	# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
+	tree_insert_a_comment = (t) ->
+		# FIXME read spec for "adjusted insertion location, etc, this might be wrong
+		open_tags[0].children.push t
+
+	# 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+	tree_in_body = (t) ->
+		switch t.type
+			when TYPE_TEXT
+				switch t.text
+					when "\u0000"
+						parse_error()
+					when "\t", "\u000a", "\u000c", "\u000d", ' '
+						reconstruct_active_formatting_elements()
+						tree_insert_a_character t
+					else
+						reconstruct_active_formatting_elements()
+						tree_insert_a_character t
+						flag_frameset_ok = false
+			when TYPE_COMMENT
+				tree_insert_a_comment t
+			when TYPE_DOCTYPE
+				parse_error()
+			when TYPE_OPEN_TAG
+				switch t.name
+					when 'html'
+						parse_error()
+						return if template_tag_is_open()
+						root_attrs = open_tags[open_tags.length - 1].attrs # root element is the last stack entry
+						for a in t.attrs_a # token attrs are still [key, value] pairs here; t.attrs is empty
+							root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
+					when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
+						# FIXME also do this for </template> (end tag)
+						return tree_in_head t
+					when 'body'
+						parse_error()
+						# TODO
+					when 'frameset'
+						parse_error()
+						# TODO
+					when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
+						close_p_if_in_button_scope()
+						tree_insert_tag t
+					when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
+						close_p_if_in_button_scope()
+						if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+							parse_error()
+							open_tags.shift()
+						tree_insert_tag t
+					# TODO lots more to implement here
+					else # any other start tag
+						reconstruct_active_formatting_elements()
+						tree_insert_tag t
+			when TYPE_EOF
+				ok_tags = {
+					dd: true, dt: true, li: true, p: true, tbody: true, td: true,
+					tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
+				}
+				for t in open_tags
+					unless ok_tags[t.name]?
+						parse_error()
+						break
+				# TODO stack of template insertion modes thing
+				flag_parsing = false # stop parsing
+			when TYPE_END_TAG
+				switch t.name
+					when 'body'
+						unless is_in_scope 'body'
+							parse_error()
+							return
+						# TODO implement parse error and move to tree_after_body
+					when 'html'
+						unless is_in_scope 'body' # weird, but it's what the spec says
+							parse_error()
+							return
+						# TODO implement parse error and move to tree_after_body, reprocess
+					# TODO lots more close tags to implement here
+					else
+						for node, i in open_tags
+							if node.name is t.name
+								# FIXME generate implied end tags except those with name==t.name
+								parse_error() unless i is 0
+								while i > 0
+									open_tags.shift()
+									i -= 1
+								open_tags.shift()
+								return
+							if special_elements[node.name]?
+								parse_error()
+								return
 
 
 	# the functions below implement the tokenizer stats described here:
 	# http://www.w3.org/TR/html5/syntax.html#tokenization
 
-	# http://www.w3.org/TR/html5/syntax.html#data-state
+	# 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
 	tok_state_data = ->
 		switch c = txt.charAt(cur++)
 			when '&'
-				tok_state = tok_state_character_reference_in_data
+				return new_text_node tokenize_character_reference()
 			when '<'
 				tok_state = tok_state_tag_open
 			when "\u0000"
-				# Parse error
-				return [TYPE_TEXT, c]
+				parse_error()
+				return new_text_node c
+			when '' # EOF
+				return new_eof_token()
 			else
-				return [TYPE_TEXT, c]
+				return new_text_node c
 		return null
 
-	# http://www.w3.org/TR/html5/syntax.html#tag-open-state
+	# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+	# not needed: tok_state_character_reference_in_data = ->
+	# just call tokenize_character_reference()
+
+	# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
 	tok_state_tag_open = ->
 		switch c = txt.charAt(cur++)
 			when '!'
@@ -185,56 +431,255 @@ parse_html = (txt) ->
 			when '/'
 				tok_state = tok_state_end_tag_open
 			when '?'
-				# Parse error
+				parse_error()
 				tok_state = tok_state_bogus_comment
 			else
 				if lc_alpha.indexOf(c) > -1
-					tok_cur = [TYPE_OPEN_TAG, c, {}, []]
+					tok_cur_tag = new_open_tag c
 					tok_state = tok_state_tag_name
 				else if uc_alpha.indexOf(c) > -1
-					tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []]
+					tok_cur_tag = new_open_tag c.toLowerCase()
 					tok_state = tok_state_tag_name
 				else
-					# Parse error
+					parse_error()
 					tok_state = tok_state_data
 					cur -= 1 # we didn't parse/handle the char after <
-					return [TYPE_TEXT, '<']
+					return new_text_node '<'
+		return null
+
+	# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+	tok_state_end_tag_open = ->
+		switch c = txt.charAt(cur++)
+			when '>'
+				parse_error()
+				tok_state = tok_state_data
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+				return new_text_node '</'
+			else
+				if uc_alpha.indexOf(c) > -1
+					tok_cur_tag = new_end_tag c.toLowerCase()
+					tok_state = tok_state_tag_name
+				else if lc_alpha.indexOf(c) > -1
+					tok_cur_tag = new_end_tag c
+					tok_state = tok_state_tag_name
+				else
+					parse_error()
+					tok_state = tok_state_bogus_comment
 		return null
 
-	# http://www.w3.org/TR/html5/syntax.html#tag-name-state
+	# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
 	tok_state_tag_name = ->
 		switch c = txt.charAt(cur++)
-			when "\t", "\n", ' '
+			when "\t", "\n", "\u000c", ' '
 				tok_state = tok_state_before_attribute_name
 			when '/'
 				tok_state = tok_state_self_closing_start_tag
 			when '>'
 				tok_state = tok_state_data
-				tmp = tok_cur
-				tok_cur = null
+				tmp = tok_cur_tag
+				tok_cur_tag = null
 				return tmp
 			when "\u0000"
-				# Parse error
-				tok_cur[1] += "\ufffd"
+				parse_error()
+				tok_cur_tag.name += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				if uc_alpha.indexOf(c) > -1
+					tok_cur_tag.name += c.toLowerCase()
+				else
+					tok_cur_tag.name += c
+		return null
+
+	# 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+	tok_state_before_attribute_name = ->
+		attr_name = null
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				return null
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+				return null
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				parse_error()
+				attr_name = "\ufffd"
+			when '"', "'", '<', '='
+				parse_error()
+				attr_name = c
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				if uc_alpha.indexOf(c) > -1
+					attr_name = c.toLowerCase()
+				else
+					attr_name = c
+		if attr_name?
+			tok_cur_tag.attrs_a.unshift [attr_name, '']
+			tok_state = tok_state_attribute_name
+		return null
+
+	# 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
+	tok_state_attribute_name = -> # attrs_a[0] is the attribute being built: [name, value]
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_after_attribute_name
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+			when '='
+				tok_state = tok_state_before_attribute_value
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.attrs_a[0][0] += "\ufffd" # append, don't overwrite the name so far
+			when '"', "'", '<'
+				parse_error()
+				tok_cur_tag.attrs_a[0][0] += c # append, don't overwrite the name so far
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
 			else
 				if uc_alpha.indexOf(c) > -1
-					tok_cur[1] += c.toLowerCase()
+					tok_cur_tag.attrs_a[0][0] += c.toLowerCase() # append, don't overwrite the name so far
 				else
-					tok_cur[1] += c
+					tok_cur_tag.attrs_a[0][0] += c
+		return null
+
+	# 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+	tok_state_before_attribute_value = -> # picks the quoted/unquoted value state from the first non-space char
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				return null
+			when '"'
+				tok_state = tok_state_attribute_value_double_quoted
+			when '&'
+				tok_state = tok_state_attribute_value_unquoted
+				cur -= 1 # reconsume the '&' in the unquoted state
+			when "'"
+				tok_state = tok_state_attribute_value_single_quoted
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.attrs_a[0][1] += "\ufffd"
+				tok_state = tok_state_attribute_value_unquoted
+			when '>'
+				parse_error()
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				tok_cur_tag.attrs_a[0][1] += c
+				tok_state = tok_state_attribute_value_unquoted
+		return null
+
+	# 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
+	tok_state_attribute_value_double_quoted = -> # appends to the value in attrs_a[0][1] until the closing '"'
+		switch c = txt.charAt(cur++)
+			when '"'
+				tok_state = tok_state_after_attribute_value_quoted
+			when '&'
+				tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.attrs_a[0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				tok_cur_tag.attrs_a[0][1] += c
+		return null
+
+	# 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
+	tok_state_attribute_value_single_quoted = -> # appends to the value in attrs_a[0][1] until the closing "'"
+		switch c = txt.charAt(cur++)
+			when "'"
+				tok_state = tok_state_after_attribute_value_quoted
+			when '&'
+				tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
+			when "\u0000"
+				parse_error()
+				tok_cur_tag.attrs_a[0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				tok_cur_tag.attrs_a[0][1] += c
+		return null
+
+	# 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
+	tok_state_attribute_value_unquoted = ->
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_before_attribute_name
+			when '&'
+				tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when "\u0000"
+				tok_cur_tag.attrs_a[0][1] += "\ufffd"
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				# Parse Error if ', <, = or ` (backtick)
+				tok_cur_tag.attrs_a[0][1] += c
+		return null
+
+	# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
+	tok_state_after_attribute_value_quoted = -> # runs right after a closing quote of an attribute value
+		switch c = txt.charAt(cur++)
+			when "\t", "\n", "\u000c", ' '
+				tok_state = tok_state_before_attribute_name
+			when '/'
+				tok_state = tok_state_self_closing_start_tag
+			when '>'
+				tok_state = tok_state_data
+				tmp = tok_cur_tag
+				tok_cur_tag = null
+				return tmp
+			when '' # EOF
+				parse_error()
+				tok_state = tok_state_data
+			else
+				parse_error()
+				tok_state = tok_state_before_attribute_name
+				cur -= 1 # we didn't handle that char
 		return null
 
-	# http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
-	# & just got consumed
-	tok_state_character_reference_in_data = ->
-		tok_state = tok_state_data
+	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+	# Don't set this as a state, just call it
+	# returns a string (NOT a text node)
+	tokenize_character_reference = (allowed_char = null, in_attr = false) ->
 		if cur >= txt.length
-			return [TYPE_TEXT, '&']
+			return '&'
 		switch c = txt.charAt(cur)
+			when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+				# explicitly not a parse error
+				return '&'
 			when ';'
-				return [TYPE_TEXT, '&']
+				# there has to be "one or more" alnums between & and ; to be a parse error
+				return '&'
 			when '#'
 				if cur + 1 >= txt.length
-					return [TYPE_TEXT, '&']
+					return '&'
 				if txt.charAt(cur + 1).toLowerCase() is 'x'
 					prefix = '#x'
 					charset = hex_chars
@@ -247,82 +692,151 @@ parse_html = (txt) ->
 				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 					i += 1
 				if i is 0
-					return [TYPE_TEXT, '&']
+					return '&'
 				if txt.charAt(start + i) is ';'
 					i += 1
+				# FIXME This is supposed to generate parse errors for some chars
 				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
 				if decoded?
 					cur = start + i
-					return [TYPE_TEXT, decoded]
-				return [TYPE_TEXT, '&']
+					return decoded
+				return '&'
 			else
 				for i in [0...31]
 					if alnum.indexOf(txt.charAt(cur + i)) is -1
 						break
 				if i is 0
-					return [TYPE_TEXT, '&']
+					# exit early, because parse_error() below needs at least one alnum
+					return '&'
 				if txt.charAt(cur + i) is ';'
 					i += 1 # include ';' terminator in value
 					decoded = decode_named_char_ref txt.substr(cur, i)
 					if decoded?
 						cur += i
-						return [TYPE_TEXT, decoded]
-					return [TYPE_TEXT, '&']
+						return decoded
+					parse_error()
+					return '&'
 				else
 					# no ';' terminator (only legacy char refs)
-					if i < 2 or i > 6
-						return [TYPE_TEXT, '&']
-					# FIXME: if we're inside an attribute:
-					# 1.	don't parse refs that are followed by =
-					# 2.	don't parse refs that are followed by alnum
 					max = i
 					for i in [2..max] # no prefix matches, so ok to check shortest first
 						c = legacy_char_refs[txt.substr(cur, i)]
 						if c?
+							if in_attr
+								if txt.charAt(cur + i) is '='
+									# "because some legacy user agents will
+									# misinterpret the markup in those cases"
+									parse_error()
+									return '&'
+								if alnum.indexOf(txt.charAt(cur + i)) > -1
+									# this makes attributes forgiving about url args
+									return '&'
+							# ok, and besides the weird exceptions for attributes...
+							# return the matching char
 							cur += i # consume entity chars
-							return [TYPE_TEXT, c]
-		return null
-
-	# the functions below impliment the Tree Contstruction algorithm here:
-	# http://www.w3.org/TR/html5/syntax.html#tree-construction
-	# FIXME this is just a bit of a hack that makes sense... read spec and do it that way
-	tree_append = (t) ->
-		if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
-			tree_append_point[tree_append_point.length - 1][1] += t[1]
-		else
-			tree_append_point.push t
-			if t[0] is TYPE_OPEN_TAG
-				t[0] = TYPE_TAG
-				tree_append_point = t[3]
+							parse_error() # because no terminating ";"
+							return c
+					parse_error()
+					return '&'
+		return # never reached
 
 	# tree constructor initialization
-	tree = [] # see comments on TYPE_TAG/etc for the structure of this data
-	tree_append_point = tree
-	tree_state = tree_append
+	# see comments on TYPE_TAG/etc for the structure of this data
+	tree = new Node TYPE_TAG, name: 'html'
+	open_tags = [tree]
+	tree_state = tree_in_body
+	flag_frameset_ok = true
+	flag_parsing = true
 
 	# tokenizer initialization
 	tok_state = tok_state_data
 
 	# proccess input
-	while cur < txt.length
+	while flag_parsing
 		t = tok_state()
 		if t?
 			tree_state t
-
-	return tree
+	return tree.children
 
 # everything below is tests on the above
-test_equals = (description, fn, args..., expected_output) ->
-	output = fn.apply this, args
+test_equals = (description, output, expected_output) ->
 	if output is expected_output
-		console.log "passed: #{description}."
+		console.log "passed." # don't say name, so smart consoles can merge all of these
 	else
-		console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
-html_to_json = (html) ->
-	return JSON.stringify parse_html html
-test_equals "empty", html_to_json, "", '[]'
-test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
-test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
-test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
-test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1â¬â¬ Æ"]]'
-test_equals "open_tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
+		console.log "FAILED: \"#{description}\""
+		console.log "   Expected: #{expected_output}"
+		console.log "     Actual: #{output}"
+test_parser = (args) ->
+	parse_errors = []
+	errors_cb = (i) ->
+		parse_errors.push i
+	parsed = parse_html args.html, errors_cb
+	serialized = ''
+	sep = ''
+	for t in parsed
+		serialized += sep
+		sep = ','
+		serialized += t.serialize()
+	if serialized isnt args.expected or parse_errors.length isnt args.errors
+		console.log "test FAILED: \"#{args.name}\""
+	else
+		console.log 'test passed'
+	if serialized isnt args.expected
+		console.log "      Input: #{args.html}"
+		console.log "    Correct: #{args.expected}"
+		console.log "     Output: #{serialized}"
+	if parse_errors.length isnt args.errors
+		console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+
+test_parser name: "empty", \
+	html: "",
+	expected: '',
+	errors: 0
+test_parser name: "just text", \
+	html: "abc",
+	expected: 'text:"abc"',
+	errors: 0
+test_parser name: "named entity", \
+	html: "a&amp;1234",
+	expected: 'text:"a&1234"',
+	errors: 0
+test_parser name: "broken named character references", \
+	html: "1&amp2&&amp;3&aabbcc;",
+	expected: 'text:"1&2&&3&aabbcc;"',
+	errors: 2
+test_parser name: "numbered entity overrides", \
+	html: "1&#X80&#x80; &#x83",
+	expected: 'text:"1â¬â¬ Æ"',
+	errors: 0
+test_parser name: "open tag", \
+	html: "foo<span>bar",
+	expected: 'text:"foo",tag:"span",{},[text:"bar"]',
+	errors: 1 # no close tag
+test_parser name: "open tag with attributes", \
+	html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
+	expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
+	errors: 1 # no close tag
+test_parser name: "open tag with attributes of various quotings", \
+	html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
+	expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
+	errors: 1 # no close tag
+test_parser name: "attribute entity exceptions dq", \
+	html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
+	expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
+	errors: 2 # no close tag, &amp= in attr
+test_parser name: "attribute entity exceptions sq", \
+	html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
+	expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
+	errors: 2 # no close tag, &amp= in attr
+test_parser name: "attribute entity exceptions uq", \
+	html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
+	expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
+	errors: 2 # no close tag, &amp= in attr
+test_parser name: "matching closing tags", \
+	html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
+	expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
+	errors: 0
+test_parser name: "mis-matched closing tags", \
+	html: "foo<div>bar<span>baz</div>qux",
+	expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
+	errors: 1 # close tag mismatch