X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;ds=sidebyside;f=parse-html.coffee;h=fee4202f92205ab9df44d834ffacb3826534bd56;hb=7460e85442ced49600febffff4ac8fe16d7361e3;hp=31a46f413fc2447fd4e1d21b03531d9f5ae71c66;hpb=12e07fdf217eda724e703e32ec5c8b968bb3a727;p=peach-html5-editor.git

diff --git a/parse-html.coffee b/parse-html.coffee
index 31a46f4..fee4202 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -195,8 +195,8 @@ is_space_tok = (t) ->
 	return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 
 is_input_hidden_tok = (t) ->
-	return unless t.type is TYPE_START_TAG
-	for a of t.attrs_a
+	return false unless t.type is TYPE_START_TAG
+	for a in t.attrs_a
 		if a[0] is 'type'
 			if a[1].toLowerCase() is 'hidden'
 				return true
@@ -512,8 +512,9 @@ decode_named_char_ref = (txt) ->
 	return null if decoded is txt
 	return g_dncr.cache[txt] = decoded
 
-parse_html = (txt, parse_error_cb = null) ->
-	cur = 0 # index of next char in txt to be parsed
+parse_html = (args) ->
+	txt = null
+	cur = null # index of next char in txt to be parsed
 	# declare doc and tokenizer variables so they're in scope below
 	doc = null
 	open_els = null # stack of open elements
@@ -538,8 +539,8 @@ parse_html = (txt, parse_error_cb = null) ->
 		flag_parsing = false
 
 	parse_error = ->
-		if parse_error_cb?
-			parse_error_cb cur
+		if args.error_cb?
+			args.error_cb cur
 		else
 			console.log "Parse error at character #{cur} of #{txt.length}"
 
@@ -1191,7 +1192,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				last_template = null
 				last_template_i = null
 				for el, i in open_els
-					if el.name is 'template'
+					if el.name is 'template' and el.namespace is NS_HTML
 						last_template = el
 						last_template_i = i
 						break
@@ -1200,7 +1201,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				last_table = null
 				last_table_i
 				for el, i in open_els
-					if el.name is 'table'
+					if el.name is 'table' and el.namespace is NS_HTML
 						last_table = el
 						last_table_i = i
 						break
@@ -1222,6 +1223,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					# this is odd
 					target = open_els[open_els.length - 1]
 					target_i = target.children.length
+					break
 				# 5. If last table has a parent element, then let adjusted
 				# insertion location be inside last table's parent element,
 				# immediately before last table, and abort these substeps.
@@ -1393,6 +1395,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			el = insert_html_element t
 			head_element_pointer = el
 			ins_mode = ins_mode_in_head
+			return
 		if t.type is TYPE_END_TAG
 			if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
 				# fall through to Anything else below
@@ -1438,7 +1441,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		if t.type is TYPE_START_TAG and t.name is 'title'
 			parse_generic_rcdata_text t
 			return
-		if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
+		if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
 			parse_generic_raw_text t
 			return
 		if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
@@ -1500,14 +1503,14 @@ parse_html = (txt, parse_error_cb = null) ->
 		if t.type is TYPE_DOCTYPE
 			parse_error()
 			return
-		if t.type is TYPE_START_TAG
+		if t.type is TYPE_START_TAG and t.name is 'html'
 			ins_mode_in_body t
 			return
 		if t.type is TYPE_END_TAG and t.name is 'noscript'
 			open_els.shift()
 			ins_mode = ins_mode_in_head
 			return
-		if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
+		if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
 			ins_mode_in_head t
 			return
 		if t.type is TYPE_END_TAG and t.name is 'br'
@@ -2100,19 +2103,6 @@ parse_html = (txt, parse_error_cb = null) ->
 			return
 		return
 
-	ins_mode_in_table_else = (t) ->
-		parse_error()
-		flag_foster_parenting = true # FIXME
-		ins_mode_in_body t
-		flag_foster_parenting = false
-	can_in_table = { # FIXME do this inline like everywhere else
-		'table': true
-		'tbody': true
-		'tfoot': true
-		'thead': true
-		'tr': true
-	}
-
 	# 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
 	ins_mode_text = (t) ->
 		if t.type is TYPE_TEXT
@@ -2142,6 +2132,19 @@ parse_html = (txt, parse_error_cb = null) ->
 	# http://www.w3.org/TR/html5/syntax.html#tokenization
 
 	# 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
+	ins_mode_in_table_else = (t) -> # called from ins_mode_in_table (e.g. for a non-hidden <input>): handle t with the in-body rules
+		parse_error() # every token routed here is reported as a parse error
+		flag_foster_parenting = true # enabled only while ins_mode_in_body handles t
+		ins_mode_in_body t
+		flag_foster_parenting = false
+		return # explicit bare return so the assignment above is not implicitly returned
+	can_in_table = { # FIXME do this inline like everywhere else
+		'table': true
+		'tbody': true
+		'tfoot': true
+		'thead': true
+		'tr': true
+	}
 	ins_mode_in_table = (t) ->
 		switch t.type
 			when TYPE_TEXT
@@ -2192,7 +2195,7 @@ parse_html = (txt, parse_error_cb = null) ->
 					when 'style', 'script', 'template'
 						ins_mode_in_head t
 					when 'input'
-						if is_input_hidden_tok t
+						unless is_input_hidden_tok t
 							ins_mode_in_table_else t
 						else
 							parse_error()
@@ -3040,9 +3043,9 @@ parse_html = (txt, parse_error_cb = null) ->
 	is_appropriate_end_tag = (t) ->
 		# spec says to check against "the tag name of the last start tag to
 		# have been emitted from this tokenizer", but this is only called from
-		# the various "raw" states, which I'm pretty sure all push the start
-		# token onto open_els. TODO: verify this after the script data states
-		# are implemented
+		# the various "raw" states, so it's hopefully ok to assume that
+		# open_els[0].name will work instead.
+		# TODO: verify this after the script data states are implemented
 		debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
 		return t.type is TYPE_END_TAG and t.name is open_els[0].name
 
@@ -3184,6 +3187,11 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_self_closing_start_tag
 				return
 			# fall through
+		if c is '>'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_data
+				return tok_cur_tag
+			# fall through
 		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			temporary_buffer += c
@@ -3336,6 +3344,11 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_self_closing_start_tag
 				return
 			# fall through
+		if c is '>'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_data
+				return tok_cur_tag
+			# fall through
 		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			temporary_buffer += c.toLowerCase()
@@ -3516,16 +3529,16 @@ parse_html = (txt, parse_error_cb = null) ->
 				return tmp
 			when "\u0000"
 				parse_error()
-				tok_cur_tag.attrs_a[0][0] = "\ufffd"
+				tok_cur_tag.attrs_a[0][0] += "\ufffd"
 			when '"', "'", '<'
 				parse_error()
-				tok_cur_tag.attrs_a[0][0] = c
+				tok_cur_tag.attrs_a[0][0] += c
 			when '' # EOF
 				parse_error()
 				tok_state = tok_state_data
 			else
 				if is_uc_alpha(c)
-					tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
+					tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
 				else
 					tok_cur_tag.attrs_a[0][0] += c
 		return null
@@ -3723,7 +3736,7 @@ parse_html = (txt, parse_error_cb = null) ->
 			return
 		# Otherwise
 		parse_error()
-		tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
+		tok_cur_tag = new_comment_token ''
 		tok_state = tok_state_bogus_comment
 		return
 
@@ -3734,6 +3747,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_comment_start_dash
 			when "\u0000"
 				parse_error()
+				tok_state = tok_state_comment
 				return new_character_token "\ufffd"
 			when '>'
 				parse_error()
@@ -3746,6 +3760,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				return tok_cur_tag
 			else
 				tok_cur_tag.text += c
+				tok_state = tok_state_comment
 		return null
 
 	# 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
@@ -4294,7 +4309,9 @@ parse_html = (txt, parse_error_cb = null) ->
 		else
 			val = txt.substr cur, (next_gt - cur)
 			cur = next_gt + 3
-		val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this
+		val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
+		val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+		val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
 		return new_character_token val # fixfull split
 
 	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
@@ -4375,13 +4392,15 @@ parse_html = (txt, parse_error_cb = null) ->
 
 	# tree constructor initialization
 	# see comments on TYPE_TAG/etc for the structure of this data
+	txt = args.html
+	cur = 0
 	doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
 	open_els = []
 	afe = [] # active formatting elements
 	template_ins_modes = []
 	ins_mode = ins_mode_initial
 	original_ins_mode = ins_mode # TODO check spec
-	flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
+	flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
 	flag_frameset_ok = true
 	flag_parsing = true
 	flag_foster_parenting = false
@@ -4395,6 +4414,8 @@ parse_html = (txt, parse_error_cb = null) ->
 	# tokenizer initialization
 	tok_state = tok_state_data
 
+	if args.name is "one_that_breaks #1"
+		throw "hi" # console.log "hi"
 	# proccess input
 	# http://www.w3.org/TR/html5/syntax.html#tree-construction
 	while flag_parsing