From 3ff49c30096e8e97599b98755157f9a692937f58 Mon Sep 17 00:00:00 2001
From: Jason Woofenden <jason@jasonwoof.com>
Date: Wed, 23 Dec 2015 18:59:41 -0500
Subject: [PATCH] decode &#0098;/etc internally, fix more ns

---
 index.html        |    1 +
 parse-html.coffee |  103 ++++++++++++++++++++++++++++++++++++++++-------------
 2 files changed, 79 insertions(+), 25 deletions(-)
diff --git a/index.html b/index.html
index 5750538..6d8e4f5 100644
--- a/index.html
+++ b/index.html
@@ -1,6 +1,7 @@
 <!DOCTYPE html>
 <html lang="en">
 <head>
+	<meta charset="UTF-8">
 	<title>html parser tester</title>
 	<script src="parse-html.js"></script>
 	<script src="test.js"></script>
diff --git a/parse-html.coffee b/parse-html.coffee
index 60a10f3..06c2a1b 100644
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -54,6 +54,15 @@ unless module?.exports?
 	window.wheic = {}
 	module = exports: window.wheic
 
+from_code_point = (x) ->
+	# Use the runtime's own String.fromCodePoint whenever it is available.
+	return String.fromCodePoint(x) if String.fromCodePoint?
+	# Fallback: values up to 0xffff fit in a single UTF-16 code unit.
+	return String.fromCharCode(x) if x <= 0xffff
+	# Fallback above 0xffff: subtract 0x10000, then emit two code units,
+	# the upper bits offset from 0xd800 and the low 10 bits from 0xdc00.
+	return String.fromCharCode(0xd800 + ((x - 0x10000) >> 10), 0xdc00 + ((x - 0x10000) % 0x400))
+
 # Each node is an obect of the Node class. Here are the Node types:
 TYPE_TAG = 0 # name, {attributes}, [children]
 TYPE_TEXT = 1 # "text"
@@ -206,6 +215,36 @@ is_input_hidden_tok = (t) ->
 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 
+unicode_fixes = # numeric char ref code point -> string returned in its place
+	0x00: "\uFFFD"
+	0x80: "\u20AC"
+	0x82: "\u201A"
+	0x83: "\u0192"
+	0x84: "\u201E"
+	0x85: "\u2026"
+	0x86: "\u2020"
+	0x87: "\u2021"
+	0x88: "\u02C6"
+	0x89: "\u2030"
+	0x8A: "\u0160"
+	0x8B: "\u2039"
+	0x8C: "\u0152"
+	0x8E: "\u017D"
+	0x91: "\u2018"
+	0x92: "\u2019"
+	0x93: "\u201C"
+	0x94: "\u201D"
+	0x95: "\u2022"
+	0x96: "\u2013"
+	0x97: "\u2014"
+	0x98: "\u02DC"
+	0x99: "\u2122"
+	0x9A: "\u0161"
+	0x9B: "\u203A"
+	0x9C: "\u0153"
+	0x9E: "\u017E"
+	0x9F: "\u0178"
+
 # These are the character references that don't need a terminating semicolon
 # min length: 2, max: 6, none are a prefix of any other.
 legacy_char_refs = {
@@ -683,7 +722,7 @@ parse_html = (args) ->
 				# fixfull (fragment case)
 
 			# 4. If node is a select element, run these substeps:
-			if node.name is 'select'
+			if node.name is 'select' and node.namespace is NS_HTML
 				# 1. If last is true, jump to the step below labeled done.
 				unless last
 					# 2. Let ancestor be node.
@@ -700,11 +739,11 @@ parse_html = (args) ->
 						ancestor = open_els[ancestor_i]
 						# 5. If ancestor is a template node, jump to the step below
 						# labeled done.
-						if ancestor.name is 'template'
+						if ancestor.name is 'template' and ancestor.namespace is NS_HTML
 							break
 						# 6. If ancestor is a table node, switch the insertion mode
 						# to "in select in table" and abort these steps.
-						if ancestor.name is 'table'
+						if ancestor.name is 'table' and ancestor.namespace is NS_HTML
 							ins_mode = ins_mode_in_select_in_table
 							return
 						# 7. Jump back to the step labeled loop.
@@ -714,61 +753,62 @@ parse_html = (args) ->
 				return
 			# 5. If node is a td or th element and last is false, then switch
 			# the insertion mode to "in cell" and abort these steps.
-			if (node.name is 'td' or node.name is 'th') and last is false
+			if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
 				ins_mode = ins_mode_in_cell
 				return
 			# 6. If node is a tr element, then switch the insertion mode to "in
 			# row" and abort these steps.
-			if node.name is 'tr'
+			if node.name is 'tr' and node.namespace is NS_HTML
 				ins_mode = ins_mode_in_row
 				return
 			# 7. If node is a tbody, thead, or tfoot element, then switch the
 			# insertion mode to "in table body" and abort these steps.
-			if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
+			if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
 				ins_mode = ins_mode_in_table_body
 				return
 			# 8. If node is a caption element, then switch the insertion mode
 			# to "in caption" and abort these steps.
-			if node.name is 'caption'
+			if node.name is 'caption' and node.namespace is NS_HTML
 				ins_mode = ins_mode_in_caption
 				return
 			# 9. If node is a colgroup element, then switch the insertion mode
 			# to "in column group" and abort these steps.
-			if node.name is 'colgroup'
+			if node.name is 'colgroup' and node.namespace is NS_HTML
 				ins_mode = ins_mode_in_column_group
 				return
 			# 10. If node is a table element, then switch the insertion mode to
 			# "in table" and abort these steps.
-			if node.name is 'table'
+			if node.name is 'table' and node.namespace is NS_HTML
 				ins_mode = ins_mode_in_table
 				return
 			# 11. If node is a template element, then switch the insertion mode
 			# to the current template insertion mode and abort these steps.
-			# fixfull (template insertion mode stack)
-
+			if node.name is 'template' and node.namespace is NS_HTML
+				ins_mode = template_ins_modes[0]
+				return
 			# 12. If node is a head element and last is true, then switch the
 			# insertion mode to "in body" ("in body"! not "in head"!) and abort
 			# these steps. (fragment case)
-			if node.name is 'head' and last
+			if node.name is 'head' and node.namespace is NS_HTML and last
 				ins_mode = ins_mode_in_body
 				return
 			# 13. If node is a head element and last is false, then switch the
 			# insertion mode to "in head" and abort these steps.
-			if node.name is 'head' and last is false
+			if node.name is 'head' and node.namespace is NS_HTML and last is false
 				ins_mode = ins_mode_in_head
 				return
 			# 14. If node is a body element, then switch the insertion mode to
 			# "in body" and abort these steps.
-			if node.name is 'body'
+			if node.name is 'body' and node.namespace is NS_HTML
 				ins_mode = ins_mode_in_body
 				return
 			# 15. If node is a frameset element, then switch the insertion mode
 			# to "in frameset" and abort these steps. (fragment case)
-			if node.name is 'frameset'
+			if node.name is 'frameset' and node.namespace is NS_HTML
 				ins_mode = ins_mode_in_frameset
 				return
 			# 16. If node is an html element, run these substeps:
-			if node.name is 'html'
+			if node.name is 'html' and node.namespace is NS_HTML
 				# 1. If the head element pointer is null, switch the insertion
 				# mode to "before head" and abort these steps. (fragment case)
 				if head_element_pointer is null
@@ -4322,26 +4362,39 @@ parse_html = (args) ->
 				if cur + 1 >= txt.length
 					return '&'
 				if txt.charAt(cur + 1).toLowerCase() is 'x'
-					prefix = '#x'
+					base = 16
 					charset = hex_chars
 					start = cur + 2
 				else
 					charset = digits
 					start = cur + 1
-					prefix = '#'
+					base = 10
 				i = 0
 				while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
 					i += 1
 				if i is 0
 					return '&'
+				cur = start + i
 				if txt.charAt(start + i) is ';'
-					i += 1
-				# FIXME This is supposed to generate parse errors for some chars
-				decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
-				if decoded?
-					cur = start + i
-					return decoded
-				return '&'
+					cur += 1
+				else
+					parse_error()
+				code_point = txt.substr(start, i)
+				while code_point.charAt(0) is '0' and code_point.length > 1
+					code_point = code_point.substr 1
+				code_point = parseInt(code_point, base)
+				if unicode_fixes[code_point]?
+					parse_error()
+					return unicode_fixes[code_point]
+				else
+					if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
+						parse_error()
+						return "\ufffd"
+					else
+						if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
+							parse_error()
+						return from_code_point code_point
+				return
 			else
 				for i in [0...31]
 					if alnum.indexOf(txt.charAt(cur + i)) is -1
-- 
1.7.10.4