return [TYPE_TEXT, c]
return null
+ # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+ #
+ # Runs right after the data state consumed an '&'; `cur` indexes the character
+ # that follows it. The tokenizer is always switched back to the data state, then
+ # a character reference is looked for starting at `cur`.
+ #
+ # Returns a [TYPE_TEXT, string] token: the decoded text when a reference was
+ # recognised (with `cur` advanced past it), otherwise a literal '&' (with `cur`
+ # left alone so the following characters get tokenized as ordinary data).
+ tok_state_character_reference_in_data = ->
+   tok_state = tok_state_data
+   if cur >= txt.length
+     return [TYPE_TEXT, '&']
+   switch c = txt.charAt(cur)
+     when ';'
+       return [TYPE_TEXT, '&']
+     when '#'
+       # numeric reference: '#' + decimal digits, or '#x'/'#X' + hex digits
+       if cur + 1 >= txt.length
+         return [TYPE_TEXT, '&']
+       if txt.charAt(cur + 1).toLowerCase() is 'x'
+         prefix = '#x'
+         charset = hex_chars
+         start = cur + 2
+       else
+         charset = digits
+         start = cur + 1
+         prefix = '#'
+       # i = number of characters of the reference found after the prefix
+       i = 0
+       while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+         i += 1
+       if i is 0
+         return [TYPE_TEXT, '&']
+       if txt.charAt(start + i) is ';'
+         i += 1 # include ';' terminator in value
+       # the lookup key is the normalised prefix plus the lowercased digits
+       # (and the ';' when one was present)
+       decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+       if decoded?
+         cur = start + i
+         return [TYPE_TEXT, decoded]
+       return [TYPE_TEXT, '&']
+     else
+       # Measure the run of alphanumerics at `cur` (at most 31 are examined).
+       # The explicit bounds check ends the scan at end of input: charAt()
+       # yields '' there, and looking '' up in `alnum` is not guaranteed to
+       # report "not found" (String::indexOf('') is 0).
+       for i in [0...31]
+         if cur + i >= txt.length or alnum.indexOf(txt.charAt(cur + i)) is -1
+           break
+       if i is 0
+         return [TYPE_TEXT, '&']
+       if txt.charAt(cur + i) is ';'
+         # ';'-terminated name: look it up with the terminator included
+         decoded = decode_named_char_ref txt.substr(cur, i + 1)
+         if decoded?
+           cur += i + 1
+           return [TYPE_TEXT, decoded]
+       # No ';' terminator, or the terminated name is unknown: a legacy
+       # reference may still be a prefix of the run (e.g. "&notit;" starts
+       # with "not"). Legacy names are looked up as 2 to 6 characters, so
+       # longer runs are clamped instead of being rejected outright.
+       # FIXME: if we're inside an attribute:
+       #    1. don't parse refs that are followed by =
+       #    2. don't parse refs that are followed by alnum
+       if i >= 2 # guard: [2..max] would count downwards for max < 2
+         max = Math.min(i, 6)
+         for i in [2..max] # no prefix matches, so ok to check shortest first
+           c = legacy_char_refs[txt.substr(cur, i)]
+           if c?
+             cur += i # consume entity chars
+             return [TYPE_TEXT, c]
+       # nothing matched: emit the '&' itself rather than dropping it
+       return [TYPE_TEXT, '&']
+   return null # not reached: every branch above returns a token
+
# 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
switch c = txt.charAt(cur++)
tok_cur_tag = null
return tmp
else
- if uc_alpha.indexOf(c) > -1
- tok_cur_tag[2][0][1] += c.toLowerCase()
- else
- # Parse error if ", ` or < (that's a backtick)
- tok_cur_tag[2][0][1] += c
+ tok_cur_tag[2][0][1] += c
+ tok_state = tok_state_attribute_value_unquoted
return null
# 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
when "\u0000"
# Parse error
tok_cur_tag[2][0][1] += "\ufffd"
- tok_state = tok_state_attribute_value_unquoted
else
tok_cur_tag[2][0][1] += c
return null
+ # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
+ #
+ # Reads the character at `cur` (advancing `cur` past it) and either appends it
+ # to tok_cur_tag[2][0][1] or switches tokenizer state. Always returns null.
+ # NOTE(review): tok_cur_tag[2][0][1] is presumably the value of the attribute
+ # currently being built -- confirm where tok_cur_tag is created.
+ # NOTE(review): at end of input charAt() returns '', which lands in the `else`
+ # branch; there is no explicit EOF case here -- confirm the caller stops at
+ # txt.length.
+ tok_state_attribute_value_single_quoted = ->
+ switch c = txt.charAt(cur++)
+ when "'"
+ # closing quote: leave the value as it is and move on
+ tok_state = tok_state_after_attribute_value_quoted
+ when '&'
+ # hand over to the character-reference state, recording "'" as the
+ # additional allowed character for this quoting style
+ tok_state = tok_state_character_reference_in_attribute_value
+ tok_char_ref_addl_allowed = "'" # FIXME
+ when "\u0000"
+ # Parse error
+ # U+0000 is stored as U+FFFD instead
+ tok_cur_tag[2][0][1] += "\ufffd"
+ else
+ # any other character is appended unchanged
+ tok_cur_tag[2][0][1] += c
+ return null
+
+ # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
+ #
+ # Reads the character at `cur` (advancing `cur` past it) and either appends it
+ # to tok_cur_tag[2][0][1], switches tokenizer state, or finishes the tag.
+ # Returns the finished tag when '>' is read, null otherwise.
+ # NOTE(review): tok_cur_tag[2][0][1] is presumably the value of the attribute
+ # currently being built -- confirm where tok_cur_tag is created.
+ # NOTE(review): at end of input charAt() returns '', which lands in the `else`
+ # branch; there is no explicit EOF case here -- confirm the caller stops at
+ # txt.length.
+ tok_state_attribute_value_unquoted = ->
+ switch c = txt.charAt(cur++)
+ when "\t", "\n", "\u000c", ' '
+ # tab, newline, form feed or space ends the unquoted value
+ tok_state = tok_state_before_attribute_name
+ when '&'
+ # hand over to the character-reference state, recording '>' as the
+ # additional allowed character for unquoted values
+ tok_state = tok_state_character_reference_in_attribute_value
+ tok_char_ref_addl_allowed = '>' # FIXME
+ when '>'
+ # end of tag: go back to the data state, clear tok_cur_tag and return
+ # the tag it held
+ tok_state = tok_state_data
+ tmp = tok_cur_tag
+ tok_cur_tag = null
+ return tmp
+ when "\u0000"
+ # U+0000 is stored as U+FFFD instead
+ tok_cur_tag[2][0][1] += "\ufffd"
+ else
+ # Parse Error if ', <, = or ` (backtick)
+ # (the character is appended to the value regardless)
+ tok_cur_tag[2][0][1] += c
+ return null
+
# 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
tok_state_after_attribute_value_quoted = ->
switch c = txt.charAt(cur++)
cur -= 1 # we didn't handle that char
return null
-
- # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
- # & just got consumed
- tok_state_character_reference_in_data = ->
- tok_state = tok_state_data
- if cur >= txt.length
- return [TYPE_TEXT, '&']
- switch c = txt.charAt(cur)
- when ';'
- return [TYPE_TEXT, '&']
- when '#'
- if cur + 1 >= txt.length
- return [TYPE_TEXT, '&']
- if txt.charAt(cur + 1).toLowerCase() is 'x'
- prefix = '#x'
- charset = hex_chars
- start = cur + 2
- else
- charset = digits
- start = cur + 1
- prefix = '#'
- i = 0
- while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
- i += 1
- if i is 0
- return [TYPE_TEXT, '&']
- if txt.charAt(start + i) is ';'
- i += 1
- decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
- if decoded?
- cur = start + i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
- else
- for i in [0...31]
- if alnum.indexOf(txt.charAt(cur + i)) is -1
- break
- if i is 0
- return [TYPE_TEXT, '&']
- if txt.charAt(cur + i) is ';'
- i += 1 # include ';' terminator in value
- decoded = decode_named_char_ref txt.substr(cur, i)
- if decoded?
- cur += i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
- else
- # no ';' terminator (only legacy char refs)
- if i < 2 or i > 6
- return [TYPE_TEXT, '&']
- # FIXME: if we're inside an attribute:
- # 1. don't parse refs that are followed by =
- # 2. don't parse refs that are followed by alnum
- max = i
- for i in [2..max] # no prefix matches, so ok to check shortest first
- c = legacy_char_refs[txt.substr(cur, i)]
- if c?
- cur += i # consume entity chars
- return [TYPE_TEXT, c]
- return null
-
# the functions below implement the Tree Construction algorithm here:
# http://www.w3.org/TR/html5/syntax.html#tree-construction
# FIXME this is just a bit of a hack that makes sense... read spec and do it that way
test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
-test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]'
+# two double-quoted attributes on the same tag
+test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
+# one tag mixing double-quoted, unquoted, single-quoted (containing a literal ") and valueless attributes
+test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'