return [TYPE_TEXT, c]
return null
+ # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+ # & just got consumed
+ #
+ # Tries to decode the character reference whose body starts at txt[cur].
+ # Always switches tok_state back to tok_state_data. On success, advances cur
+ # past the reference and returns [TYPE_TEXT, decoded]. On any failure, leaves
+ # cur where it was and returns [TYPE_TEXT, '&'] so the ampersand that was
+ # already consumed is kept as literal text.
+ tok_state_character_reference_in_data = ->
+   tok_state = tok_state_data
+   if cur >= txt.length
+     return [TYPE_TEXT, '&']
+   switch c = txt.charAt(cur)
+     when ';'
+       return [TYPE_TEXT, '&']
+     when '#'
+       # numeric reference: decimal (&#NNN) or hexadecimal (&#xHHH / &#XHHH)
+       if cur + 1 >= txt.length
+         return [TYPE_TEXT, '&']
+       if txt.charAt(cur + 1).toLowerCase() is 'x'
+         prefix = '#x'
+         charset = hex_chars
+         start = cur + 2
+       else
+         charset = digits
+         start = cur + 1
+         prefix = '#'
+       # count the digits that follow the prefix
+       i = 0
+       while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+         i += 1
+       if i is 0
+         return [TYPE_TEXT, '&']
+       if txt.charAt(start + i) is ';'
+         i += 1 # include ';' terminator in value
+       decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+       if decoded?
+         cur = start + i
+         return [TYPE_TEXT, decoded]
+       return [TYPE_TEXT, '&']
+     else
+       # named reference: measure the run of alphanumerics (at most 31 chars)
+       for i in [0...31]
+         # Stop explicitly at end of input: charAt() past the end yields '',
+         # and (presumably alnum is a string) indexOf('') reports a match at
+         # index 0 rather than -1, which would let the scan run on to 31 and
+         # reject an unterminated legacy reference at the very end of txt.
+         if cur + i >= txt.length or alnum.indexOf(txt.charAt(cur + i)) is -1
+           break
+       if i is 0
+         return [TYPE_TEXT, '&']
+       if txt.charAt(cur + i) is ';'
+         i += 1 # include ';' terminator in value
+         decoded = decode_named_char_ref txt.substr(cur, i)
+         if decoded?
+           cur += i
+           return [TYPE_TEXT, decoded]
+         return [TYPE_TEXT, '&']
+       else
+         # no ';' terminator (only legacy char refs)
+         if i < 2 or i > 6
+           return [TYPE_TEXT, '&']
+         # FIXME: if we're inside an attribute:
+         # 1. don't parse refs that are followed by =
+         # 2. don't parse refs that are followed by alnum
+         max = i
+         for i in [2..max] # no prefix matches, so ok to check shortest first
+           c = legacy_char_refs[txt.substr(cur, i)]
+           if c?
+             cur += i # consume entity chars
+             return [TYPE_TEXT, c]
+   # No legacy reference matched. The '&' has already been consumed, so emit it
+   # as text (as every other failure path above does) instead of dropping it.
+   return [TYPE_TEXT, '&']
+
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
switch c = txt.charAt(cur++)
cur -= 1 # we didn't handle that char
return null
-
- # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
- # & just got consumed
- #
- # Tries to decode the character reference whose body starts at txt[cur].
- # Always switches tok_state back to tok_state_data. On success, advances cur
- # past the reference and returns [TYPE_TEXT, decoded]; most failure paths
- # return [TYPE_TEXT, '&'] without moving cur.
- tok_state_character_reference_in_data = ->
- tok_state = tok_state_data
- if cur >= txt.length
- return [TYPE_TEXT, '&']
- switch c = txt.charAt(cur)
- when ';'
- return [TYPE_TEXT, '&']
- when '#'
- # numeric reference: hexadecimal when followed by x/X, decimal otherwise
- if cur + 1 >= txt.length
- return [TYPE_TEXT, '&']
- if txt.charAt(cur + 1).toLowerCase() is 'x'
- prefix = '#x'
- charset = hex_chars
- start = cur + 2
- else
- charset = digits
- start = cur + 1
- prefix = '#'
- # count the digits that follow the prefix, stopping at end of input
- i = 0
- while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
- i += 1
- if i is 0
- return [TYPE_TEXT, '&']
- if txt.charAt(start + i) is ';'
- i += 1
- decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
- if decoded?
- cur = start + i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
- else
- # named reference: measure the run of alphanumerics (at most 31 chars)
- # NOTE(review): unlike the numeric scan above, this loop has no explicit
- # end-of-input check — confirm how alnum.indexOf treats the '' that
- # charAt() returns past the end of txt
- for i in [0...31]
- if alnum.indexOf(txt.charAt(cur + i)) is -1
- break
- if i is 0
- return [TYPE_TEXT, '&']
- if txt.charAt(cur + i) is ';'
- i += 1 # include ';' terminator in value
- decoded = decode_named_char_ref txt.substr(cur, i)
- if decoded?
- cur += i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
- else
- # no ';' terminator (only legacy char refs)
- if i < 2 or i > 6
- return [TYPE_TEXT, '&']
- # FIXME: if we're inside an attribute:
- # 1. don't parse refs that are followed by =
- # 2. don't parse refs that are followed by alnum
- max = i
- for i in [2..max] # no prefix matches, so ok to check shortest first
- c = legacy_char_refs[txt.substr(cur, i)]
- if c?
- cur += i # consume entity chars
- return [TYPE_TEXT, c]
- # reached only when no legacy reference matched; no token is returned here
- return null
-
# the functions below implement the Tree Construction algorithm here:
# http://www.w3.org/TR/html5/syntax.html#tree-construction
# FIXME this is just a bit of a hack that makes sense... read spec and do it that way