# 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
# & just got consumed
#
# Tries to decode a character reference whose first char (the one right after
# the '&') is at txt[cur]. Always switches tok_state back to tok_state_data and
# always returns a [TYPE_TEXT, string] token:
#   - the decoded value when a reference is recognised (cur is advanced past
#     the reference, including its ';' terminator if present)
#   - a literal '&' otherwise (cur is left untouched, so the chars after the
#     '&' get tokenized as ordinary data)
tok_state_character_reference_in_data = ->
  tok_state = tok_state_data
  if cur >= txt.length
    return [TYPE_TEXT, '&']
  switch c = txt.charAt(cur)
    when ';'
      # "&;" has no name between '&' and ';'
      return [TYPE_TEXT, '&']
    when '#'
      # numeric reference: "#" + decimal digits, or "#x"/"#X" + hex digits
      if cur + 1 >= txt.length
        return [TYPE_TEXT, '&']
      if txt.charAt(cur + 1).toLowerCase() is 'x'
        prefix = '#x'
        charset = hex_chars
        start = cur + 2
      else
        charset = digits
        start = cur + 1
        prefix = '#'
      # count the digits that follow the prefix
      i = 0
      while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
        i += 1
      if i is 0
        return [TYPE_TEXT, '&']
      if txt.charAt(start + i) is ';'
        i += 1 # include ';' terminator in value
      decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
      if decoded?
        cur = start + i
        return [TYPE_TEXT, decoded]
      return [TYPE_TEXT, '&']
    else
      # named reference: measure the run of alphanumerics (at most 31)
      for i in [0...31]
        # Stop at end of input explicitly (same guard as the numeric scan
        # above): charAt() yields '' past the end, and an indexOf('') on a
        # string reports 0 rather than -1, which would keep the scan going.
        if cur + i >= txt.length or alnum.indexOf(txt.charAt(cur + i)) is -1
          break
      if i is 0
        return [TYPE_TEXT, '&']
      if txt.charAt(cur + i) is ';'
        i += 1 # include ';' terminator in value
        decoded = decode_named_char_ref txt.substr(cur, i)
        if decoded?
          cur += i
          return [TYPE_TEXT, decoded]
        return [TYPE_TEXT, '&']
      else
        # no ';' terminator (only legacy char refs)
        # This guard must stay: [2..max] would count DOWN if max were < 2.
        if i < 2
          return [TYPE_TEXT, '&']
        # FIXME: if we're inside an attribute:
        # 1. don't parse refs that are followed by =
        # 2. don't parse refs that are followed by alnum
        # Legacy names are 2 to 6 chars long. A longer alnum run can still
        # start with one (e.g. "&ampfoo"), so cap the search length instead
        # of rejecting the whole run.
        max = Math.min(i, 6)
        for i in [2..max] # no prefix matches, so ok to check shortest first
          c = legacy_char_refs[txt.substr(cur, i)]
          if c?
            cur += i # consume entity chars
            return [TYPE_TEXT, c]
  # Nothing matched. The '&' was already consumed by the caller, so emit it as
  # text (like every other failure path above) rather than dropping it.
  return [TYPE_TEXT, '&']