TYPE_WHITESPACE = 2
TYPE_COMMENT = 3
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
-TYPE_OPEN_TAG = 4
+TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
lc_alpha = "abcdefghijklmnopqrstuvwxyz" # all 26 lowercase ASCII letters (tag names must start with one of these or an uc_alpha letter)
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # all 26 uppercase ASCII letters (these get lower-cased in tag and attribute names)
tree_append_point = null
tree_state = null
tok_state = null
- tok_cur = null # partially parsed tag
+ tok_cur_tag = null # partially parsed tag
# the functions below implement the tokenizer states described here:
# http://www.w3.org/TR/html5/syntax.html#tokenization
- # http://www.w3.org/TR/html5/syntax.html#data-state
+ # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
tok_state_data = ->
switch c = txt.charAt(cur++)
when '&'
return [TYPE_TEXT, c]
return null
- # http://www.w3.org/TR/html5/syntax.html#tag-open-state
+ # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
tok_state_tag_open = ->
switch c = txt.charAt(cur++)
when '!'
tok_state = tok_state_bogus_comment
else
if lc_alpha.indexOf(c) > -1
- tok_cur = [TYPE_OPEN_TAG, c, {}, []]
+ tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
tok_state = tok_state_tag_name
else if uc_alpha.indexOf(c) > -1
- tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []]
+ tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
tok_state = tok_state_tag_name
else
# Parse error
return [TYPE_TEXT, '<']
return null
- # http://www.w3.org/TR/html5/syntax.html#tag-name-state
+ # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
tok_state_tag_name = -> # consumes one character while inside a tag name; returns the finished tag token on '>', otherwise null
switch c = txt.charAt(cur++)
- when "\t", "\n", ' '
+ when "\t", "\n", "\u000c", ' ' # tab, line feed, form feed or space ends the tag name
tok_state = tok_state_before_attribute_name
when '/'
tok_state = tok_state_self_closing_start_tag
when '>'
tok_state = tok_state_data
- tmp = tok_cur
- tok_cur = null
+ tmp = tok_cur_tag
+ tok_cur_tag = null # clear the partially parsed tag before handing the token out
return tmp
when "\u0000"
# Parse error
- tok_cur[1] += "\ufffd"
+ tok_cur_tag[1] += "\ufffd" # NUL is replaced by U+FFFD in the name
else
if uc_alpha.indexOf(c) > -1
- tok_cur[1] += c.toLowerCase()
+ tok_cur_tag[1] += c.toLowerCase() # characters found in uc_alpha are stored lower-cased
else
- tok_cur[1] += c
+ tok_cur_tag[1] += c
return null
- # http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+ # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+ tok_state_before_attribute_name = ->
+   # Skips whitespace between attributes. Any other character either ends
+   # the tag ('/' or '>') or becomes the first character of a new attribute.
+   # Returns the finished tag token on '>' and null in every other case.
+   c = txt.charAt(cur++)
+   return null if c in ["\t", "\n", "\u000c", ' ']
+   if c is '/'
+     tok_state = tok_state_self_closing_start_tag
+     return null
+   if c is '>'
+     tok_state = tok_state_data
+     tmp = tok_cur_tag
+     tok_cur_tag = null
+     return tmp
+   # Everything else starts an attribute name. NUL and the characters
+   # ", ', < and = are parse errors, but only NUL is rewritten (to U+FFFD).
+   attr_name = if c is "\u0000"
+     "\ufffd"
+   else if uc_alpha.indexOf(c) > -1
+     c.toLowerCase()
+   else
+     c
+   # attributes are stored newest-first, so the one being parsed is always at index 0
+   tok_cur_tag[2].unshift [attr_name, '']
+   tok_state = tok_state_attribute_name
+   return null
+
+ # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
+ tok_state_attribute_name = ->
+   # Extends the name of the attribute currently being parsed (index 0 of
+   # the tag's attribute list) or leaves this state once the name is over.
+   # Returns the finished tag token on '>' and null otherwise.
+   c = txt.charAt(cur++)
+   if c in ["\t", "\n", "\u000c", ' ']
+     tok_state = tok_state_after_attribute_name
+   else if c is '/'
+     tok_state = tok_state_self_closing_start_tag
+   else if c is '='
+     tok_state = tok_state_before_attribute_value
+   else if c is '>'
+     tok_state = tok_state_data
+     tmp = tok_cur_tag
+     tok_cur_tag = null
+     return tmp
+   else
+     # NUL is a parse error and becomes U+FFFD; letters found in uc_alpha
+     # are lower-cased; everything else is appended unchanged
+     # (parse error if it is ", ' or <).
+     tok_cur_tag[2][0][0] += if c is "\u0000" then "\ufffd" else if uc_alpha.indexOf(c) > -1 then c.toLowerCase() else c
+   return null
+
+ # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+ tok_state_before_attribute_value = ->
+   # Runs after the '=' of an attribute and decides how the value of the
+   # current attribute (index 0 of the tag's attribute list) is delimited.
+   # Returns the finished tag token on '>' and null otherwise.
+   switch c = txt.charAt(cur++)
+     when "\t", "\n", "\u000c", ' '
+       return null
+     when '"'
+       tok_state = tok_state_attribute_value_double_quoted
+     when '&'
+       tok_state = tok_state_attribute_value_unquoted
+       cur -= 1 # reconsume the & in the unquoted state
+     when "'"
+       tok_state = tok_state_attribute_value_single_quoted
+     when "\u0000"
+       # Parse error
+       tok_cur_tag[2][0][1] += "\ufffd"
+       tok_state = tok_state_attribute_value_unquoted
+     when '>'
+       # Parse error
+       tok_state = tok_state_data
+       tmp = tok_cur_tag
+       tok_cur_tag = null
+       return tmp
+     else
+       # Parse error if <, = or ` (that's a backtick)
+       # The character is stored verbatim: attribute values are
+       # case-sensitive, so it must not be lower-cased.
+       tok_cur_tag[2][0][1] += c
+       # c was the first character of an unquoted value, so the remaining
+       # characters have to be handled by the unquoted state
+       tok_state = tok_state_attribute_value_unquoted
+   return null
+
+ # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
+ tok_state_attribute_value_double_quoted = ->
+   # Collects the characters of a double-quoted attribute value into the
+   # current attribute (index 0 of the tag's attribute list).
+   # Always returns null; the tag is emitted by a later state.
+   switch c = txt.charAt(cur++)
+     when '"'
+       tok_state = tok_state_after_attribute_value_quoted
+     when '&'
+       tok_state = tok_state_character_reference_in_attribute_value
+       tok_char_ref_addl_allowed = '"' # FIXME
+     when "\u0000"
+       # Parse error. Only the character is replaced; we are still inside
+       # the quotes, so the state stays the same.
+       tok_cur_tag[2][0][1] += "\ufffd"
+     else
+       tok_cur_tag[2][0][1] += c
+   return null
+
+ # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
+ tok_state_after_attribute_value_quoted = ->
+   # Runs right after the closing quote of an attribute value.
+   # Returns the finished tag token on '>' and null otherwise.
+   c = txt.charAt(cur++)
+   if c is '>'
+     tok_state = tok_state_data
+     tmp = tok_cur_tag
+     tok_cur_tag = null
+     return tmp
+   if c is '/'
+     tok_state = tok_state_self_closing_start_tag
+     return null
+   # whitespace and unexpected characters both lead to the next attribute
+   tok_state = tok_state_before_attribute_name
+   unless c in ["\t", "\n", "\u000c", ' ']
+     # Parse Error
+     cur -= 1 # we didn't handle that char
+   return null
+
+
+ # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
# & just got consumed
tok_state_character_reference_in_data = ->
tok_state = tok_state_data
tree_append_point.push t
if t[0] is TYPE_OPEN_TAG
t[0] = TYPE_TAG
+ attrs = {}
+ while t[2].length
+ a = t[2].pop()
+ attrs[a[0]] = a[1]
+ t[2] = attrs
tree_append_point = t[3]
# tree constructor initialization
test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
-test_equals "open_tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
+test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
+test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar"},[[1,"bar"]]]]' # expected JSON carries the attributes as an object in the tag's third slot