# Create an empty object on window.wheic, then a local `module` object whose
# `exports` property refers to that same object.
window.wheic = {}
module = exports: window.wheic
# from_code_point: turn a numeric Unicode code point into a string.
#
# x - the code point, as a number.
#
# Uses the native String.fromCodePoint when it exists. Otherwise values up to
# 0xffff go straight through String.fromCharCode, and larger values are
# encoded by hand as a UTF-16 surrogate pair: after subtracting 0x10000, the
# upper bits (x >> 10) are added to 0xd800 for the first code unit and the low
# 10 bits (x % 0x400) are added to 0xdc00 for the second.
#
# No range checking is done here. NOTE(review): the numeric character
# reference decoder visible later in this file rejects surrogates and values
# above 0x10ffff before calling this; confirm any other callers do the same.
+from_code_point = (x) ->
+ if String.fromCodePoint?
+ return String.fromCodePoint x
+ else
+ if x <= 0xffff
+ return String.fromCharCode x
+ x -= 0x10000
+ return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
+
# Each node is an object of the Node class. Here are the Node types:
TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
# String holding the Unicode whitespace characters, taken from the list at:
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
# unicode_fixes: replacement strings for specific numeric character reference
# values, keyed by the referenced code point number.
#
# 0x00 maps to U+FFFD (the replacement character). The entries in 0x80-0x9F
# map to the characters those byte values have in Windows-1252 (for example
# 0x80 becomes the euro sign U+20AC). 0x81, 0x8D, 0x8F, 0x90 and 0x9D
# have no entry.
#
# The numeric character reference decoder visible later in this file looks the
# parsed number up here; when an entry exists it calls parse_error() and
# returns the replacement string instead of the literal code point.
+unicode_fixes = {}
+unicode_fixes[0x00] = "\uFFFD"
+unicode_fixes[0x80] = "\u20AC"
+unicode_fixes[0x82] = "\u201A"
+unicode_fixes[0x83] = "\u0192"
+unicode_fixes[0x84] = "\u201E"
+unicode_fixes[0x85] = "\u2026"
+unicode_fixes[0x86] = "\u2020"
+unicode_fixes[0x87] = "\u2021"
+unicode_fixes[0x88] = "\u02C6"
+unicode_fixes[0x89] = "\u2030"
+unicode_fixes[0x8A] = "\u0160"
+unicode_fixes[0x8B] = "\u2039"
+unicode_fixes[0x8C] = "\u0152"
+unicode_fixes[0x8E] = "\u017D"
+unicode_fixes[0x91] = "\u2018"
+unicode_fixes[0x92] = "\u2019"
+unicode_fixes[0x93] = "\u201C"
+unicode_fixes[0x94] = "\u201D"
+unicode_fixes[0x95] = "\u2022"
+unicode_fixes[0x96] = "\u2013"
+unicode_fixes[0x97] = "\u2014"
+unicode_fixes[0x98] = "\u02DC"
+unicode_fixes[0x99] = "\u2122"
+unicode_fixes[0x9A] = "\u0161"
+unicode_fixes[0x9B] = "\u203A"
+unicode_fixes[0x9C] = "\u0153"
+unicode_fixes[0x9E] = "\u017E"
+unicode_fixes[0x9F] = "\u0178"
+
# These are the character references that don't need a terminating semicolon
# min length: 2, max: 6, none are a prefix of any other.
legacy_char_refs = {
# fixfull (fragment case)
# 4. If node is a select element, run these substeps:
- if node.name is 'select'
+ if node.name is 'select' and node.namespace is NS_HTML
# 1. If last is true, jump to the step below labeled done.
unless last
# 2. Let ancestor be node.
ancestor = open_els[ancestor_i]
# 5. If ancestor is a template node, jump to the step below
# labeled done.
- if ancestor.name is 'template'
+ if ancestor.name is 'template' and ancestor.namespace is NS_HTML
break
# 6. If ancestor is a table node, switch the insertion mode
# to "in select in table" and abort these steps.
- if ancestor.name is 'table'
+ if ancestor.name is 'table' and ancestor.namespace is NS_HTML
ins_mode = ins_mode_in_select_in_table
return
# 7. Jump back to the step labeled loop.
return
# 5. If node is a td or th element and last is false, then switch
# the insertion mode to "in cell" and abort these steps.
- if (node.name is 'td' or node.name is 'th') and last is false
+ if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
ins_mode = ins_mode_in_cell
return
# 6. If node is a tr element, then switch the insertion mode to "in
# row" and abort these steps.
- if node.name is 'tr'
+ if node.name is 'tr' and node.namespace is NS_HTML
ins_mode = ins_mode_in_row
return
# 7. If node is a tbody, thead, or tfoot element, then switch the
# insertion mode to "in table body" and abort these steps.
- if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
+ if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
ins_mode = ins_mode_in_table_body
return
# 8. If node is a caption element, then switch the insertion mode
# to "in caption" and abort these steps.
- if node.name is 'caption'
+ if node.name is 'caption' and node.namespace is NS_HTML
ins_mode = ins_mode_in_caption
return
# 9. If node is a colgroup element, then switch the insertion mode
# to "in column group" and abort these steps.
- if node.name is 'colgroup'
+ if node.name is 'colgroup' and node.namespace is NS_HTML
ins_mode = ins_mode_in_column_group
return
# 10. If node is a table element, then switch the insertion mode to
# "in table" and abort these steps.
- if node.name is 'table'
+ if node.name is 'table' and node.namespace is NS_HTML
ins_mode = ins_mode_in_table
return
# 11. If node is a template element, then switch the insertion mode
# to the current template insertion mode and abort these steps.
- # fixfull (template insertion mode stack)
-
+ if node.name is 'template' and node.namespace is NS_HTML
+ ins_mode = template_ins_modes[0]
+ return
# 12. If node is a head element and last is true, then switch the
# insertion mode to "in body" ("in body"! not "in head"!) and abort
# these steps. (fragment case)
- if node.name is 'head' and last
+ if node.name is 'head' and node.namespace is NS_HTML and last
ins_mode = ins_mode_in_body
return
# 13. If node is a head element and last is false, then switch the
# insertion mode to "in head" and abort these steps.
- if node.name is 'head' and last is false
+ if node.name is 'head' and node.namespace is NS_HTML and last is false
ins_mode = ins_mode_in_head
return
# 14. If node is a body element, then switch the insertion mode to
# "in body" and abort these steps.
- if node.name is 'body'
+ if node.name is 'body' and node.namespace is NS_HTML
ins_mode = ins_mode_in_body
return
# 15. If node is a frameset element, then switch the insertion mode
# to "in frameset" and abort these steps. (fragment case)
- if node.name is 'frameset'
+ if node.name is 'frameset' and node.namespace is NS_HTML
ins_mode = ins_mode_in_frameset
return
# 16. If node is an html element, run these substeps:
- if node.name is 'html'
+ if node.name is 'html' and node.namespace is NS_HTML
# 1. If the head element pointer is null, switch the insertion
# mode to "before head" and abort these steps. (fragment case)
if head_element_pointer is null
if cur + 1 >= txt.length
return '&'
if txt.charAt(cur + 1).toLowerCase() is 'x'
- prefix = '#x'
+ base = 16
charset = hex_chars
start = cur + 2
else
charset = digits
start = cur + 1
- prefix = '#'
+ base = 10
i = 0
while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
i += 1
if i is 0
return '&'
+ cur = start + i
if txt.charAt(start + i) is ';'
- i += 1
- # FIXME This is supposed to generate parse errors for some chars
- decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
- if decoded?
- cur = start + i
- return decoded
- return '&'
+ cur += 1
+ else
+ parse_error()
+ code_point = txt.substr(start, i)
+ while code_point.charAt(0) is '0' and code_point.length > 1
+ code_point = code_point.substr 1
+ code_point = parseInt(code_point, base)
+ if unicode_fixes[code_point]?
+ parse_error()
+ return unicode_fixes[code_point]
+ else
+ if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
+ parse_error()
+ return "\ufffd"
+ else
+ if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
+ parse_error()
+ return from_code_point code_point
+ return
else
for i in [0...31]
if alnum.indexOf(txt.charAt(cur + i)) is -1