X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=d452803d19686026df20e525109c4cd004e21789;hb=0bc074bd5f825a287e00a54ac242ae2949d7f708;hp=55d73a31b03afd40f5cd393a615a6fd670cd5467;hpb=38367e9bad7bcad9f30c2c86d15feba64c71e1ad;p=peach-html5-editor.git

diff --git a/parse-html.coffee b/parse-html.coffee
index 55d73a3..d452803 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -512,8 +512,9 @@ decode_named_char_ref = (txt) ->
 	return null if decoded is txt
 	return g_dncr.cache[txt] = decoded
 
-parse_html = (txt, parse_error_cb = null) ->
-	cur = 0 # index of next char in txt to be parsed
+parse_html = (args) ->
+	txt = null
+	cur = null # index of next char in txt to be parsed
 	# declare doc and tokenizer variables so they're in scope below
 	doc = null
 	open_els = null # stack of open elements
@@ -538,8 +539,8 @@ parse_html = (txt, parse_error_cb = null) ->
 		flag_parsing = false
 
 	parse_error = ->
-		if parse_error_cb?
-			parse_error_cb cur
+		if args.error_cb?
+			args.error_cb cur
 		else
 			console.log "Parse error at character #{cur} of #{txt.length}"
 
@@ -1439,7 +1440,7 @@ parse_html = (txt, parse_error_cb = null) ->
 		if t.type is TYPE_START_TAG and t.name is 'title'
 			parse_generic_rcdata_text t
 			return
-		if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
+		if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
 			parse_generic_raw_text t
 			return
 		if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
@@ -1501,14 +1502,14 @@ parse_html = (txt, parse_error_cb = null) ->
 		if t.type is TYPE_DOCTYPE
 			parse_error()
 			return
-		if t.type is TYPE_START_TAG
+		if t.type is TYPE_START_TAG and t.name is 'html'
 			ins_mode_in_body t
 			return
 		if t.type is TYPE_END_TAG and t.name is 'noscript'
 			open_els.shift()
 			ins_mode = ins_mode_in_head
 			return
-		if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
+		if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
 			ins_mode_in_head t
 			return
 		if t.type is TYPE_END_TAG and t.name is 'br'
@@ -3041,9 +3042,9 @@ parse_html = (txt, parse_error_cb = null) ->
 	is_appropriate_end_tag = (t) ->
 		# spec says to check against "the tag name of the last start tag to
 		# have been emitted from this tokenizer", but this is only called from
-		# the various "raw" states, which I'm pretty sure all push the start
-		# token onto open_els. TODO: verify this after the script data states
-		# are implemented
+		# the various "raw" states, so it's hopefully ok to assume that
+		# open_els[0].name will work instead. TODO: verify this after the
+		# script data states are implemented
 		debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
 		return t.type is TYPE_END_TAG and t.name is open_els[0].name
 
@@ -3185,6 +3186,11 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_self_closing_start_tag
 				return
 			# fall through
+		if c is '>'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_data
+				return tok_cur_tag
+			# fall through
 		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			temporary_buffer += c
@@ -3337,6 +3343,11 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_self_closing_start_tag
 				return
 			# fall through
+		if c is '>'
+			if is_appropriate_end_tag tok_cur_tag
+				tok_state = tok_state_data
+				return tok_cur_tag
+			# fall through
 		if is_uc_alpha(c)
 			tok_cur_tag.name += c.toLowerCase()
 			temporary_buffer += c.toLowerCase()
@@ -3517,16 +3528,16 @@ parse_html = (txt, parse_error_cb = null) ->
 				return tmp
 			when "\u0000"
 				parse_error()
-				tok_cur_tag.attrs_a[0][0] = "\ufffd"
+				tok_cur_tag.attrs_a[0][0] += "\ufffd"
 			when '"', "'", '<'
 				parse_error()
-				tok_cur_tag.attrs_a[0][0] = c
+				tok_cur_tag.attrs_a[0][0] += c
 			when '' # EOF
 				parse_error()
 				tok_state = tok_state_data
 			else
 				if is_uc_alpha(c)
-					tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
+					tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
 				else
 					tok_cur_tag.attrs_a[0][0] += c
 		return null
@@ -3735,6 +3746,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				tok_state = tok_state_comment_start_dash
 			when "\u0000"
 				parse_error()
+				tok_state = tok_state_comment
 				return new_character_token "\ufffd"
 			when '>'
 				parse_error()
@@ -3747,6 +3759,7 @@ parse_html = (txt, parse_error_cb = null) ->
 				return tok_cur_tag
 			else
 				tok_cur_tag.text += c
+				tok_state = tok_state_comment
 		return null
 
 	# 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
@@ -4295,7 +4308,9 @@ parse_html = (txt, parse_error_cb = null) ->
 		else
 			val = txt.substr cur, (next_gt - cur)
 			cur = next_gt + 3
-		val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this
+		val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
+		val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+		val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
 		return new_character_token val # fixfull split
 
 	# 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
@@ -4376,13 +4391,15 @@ parse_html = (txt, parse_error_cb = null) ->
 
 	# tree constructor initialization
 	# see comments on TYPE_TAG/etc for the structure of this data
+	txt = args.html
+	cur = 0
 	doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
 	open_els = []
 	afe = [] # active formatting elements
 	template_ins_modes = []
 	ins_mode = ins_mode_initial
 	original_ins_mode = ins_mode # TODO check spec
-	flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
+	flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
 	flag_frameset_ok = true
 	flag_parsing = true
 	flag_foster_parenting = false