TYPE_TEXT = 1 # "text"
TYPE_WHITESPACE = 2
TYPE_COMMENT = 3
+# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
+TYPE_OPEN_TAG = 4
-alnum = "abcdefghijklmnopqrstuvwxqzABCDEFGHIJKLMNOPQRSTUVWXQZ0123456789"
-hex_chars = "0123456789abcdefABCDEF"
+# ASCII letters, split by case so uppercase tag-name characters can be detected
+# and lowercased. Each string must hold all 26 letters exactly once.
+lc_alpha = "abcdefghijklmnopqrstuvwxyz"
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
+alnum = lc_alpha + uc_alpha + digits
+hex_chars = digits + "abcdefABCDEF"
# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
tree_append_point = null
tree_state = null
tok_state = null
+ tok_cur = null # partially parsed tag
# the functions below implement the tokenizer states described here:
# http://www.w3.org/TR/html5/syntax.html#tokenization
+ # http://www.w3.org/TR/html5/syntax.html#data-state
tok_state_data = ->
- if cur >= txt.length
- return null
switch c = txt.charAt(cur++)
when '&'
tok_state = tok_state_character_reference_in_data
return [TYPE_TEXT, c]
return null
+ # http://www.w3.org/TR/html5/syntax.html#tag-open-state
+ # Looks at the character following a '<' to decide what kind of markup starts
+ # here. Starts a new open-tag token in tok_cur when a letter follows.
+ # Returns a token to emit ([TYPE_TEXT, '<'] on a parse error) or null when
+ # nothing is emitted by this step.
+ tok_state_tag_open = ->
+   switch c = txt.charAt(cur++)
+     when '!'
+       tok_state = tok_state_markup_declaration_open
+     when '/'
+       tok_state = tok_state_end_tag_open
+     when '?'
+       # Parse error
+       tok_state = tok_state_bogus_comment
+     else
+       # charAt() yields '' past the end of txt, and indexOf('') is 0 for any
+       # string, so end of input has to be ruled out before the letter tests;
+       # otherwise an open tag with an empty name would be started.
+       if c isnt '' and lc_alpha.indexOf(c) > -1
+         tok_cur = [TYPE_OPEN_TAG, c, {}, []]
+         tok_state = tok_state_tag_name
+       else if c isnt '' and uc_alpha.indexOf(c) > -1
+         # tag names are accumulated in lowercase
+         tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []]
+         tok_state = tok_state_tag_name
+       else
+         # Parse error
+         tok_state = tok_state_data
+         cur -= 1 # we didn't parse/handle the char after <
+         return [TYPE_TEXT, '<']
+   return null
+
+ # http://www.w3.org/TR/html5/syntax.html#tag-name-state
+ # Consumes one character of a tag name, appending it (lowercased when it is an
+ # ASCII uppercase letter) to tok_cur[1].
+ # Returns the finished tag token when '>' is reached, otherwise null.
+ tok_state_tag_name = ->
+   switch c = txt.charAt(cur++)
+     # the spec lists tab, line feed, form feed and space as ending the name
+     when "\t", "\n", "\u000c", ' '
+       tok_state = tok_state_before_attribute_name
+     when '/'
+       tok_state = tok_state_self_closing_start_tag
+     when '>'
+       tok_state = tok_state_data
+       # hand the completed token over and clear the partial-tag slot
+       tmp = tok_cur
+       tok_cur = null
+       return tmp
+     when "\u0000"
+       # Parse error
+       tok_cur[1] += "\ufffd"
+     else
+       if uc_alpha.indexOf(c) > -1
+         tok_cur[1] += c.toLowerCase()
+       else
+         tok_cur[1] += c
+   return null
+
+ # http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
# & just got consumed
tok_state_character_reference_in_data = ->
tok_state = tok_state_data
cur += i # consume entity chars
return [TYPE_TEXT, c]
return null
-
# the functions below implement the Tree Construction algorithm here:
# http://www.w3.org/TR/html5/syntax.html#tree-construction
+ # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
+ # Adds token t to the array currently referenced by tree_append_point.
+ # A text token that directly follows another text entry is merged into it by
+ # concatenating the strings; any other token is pushed as a new entry.
+ # A token of TYPE_OPEN_TAG is retagged as TYPE_TAG and its fourth element
+ # (t[3], presumably the tag's child list -- confirm against the TYPE_TAG
+ # comments) becomes the new append point.
tree_append = (t) ->
if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
tree_append_point[tree_append_point.length - 1][1] += t[1]
else
tree_append_point.push t
+ if t[0] is TYPE_OPEN_TAG
+ t[0] = TYPE_TAG
+ tree_append_point = t[3]
# tree constructor initialization
tree = [] # see comments on TYPE_TAG/etc for the structure of this data
t = tok_state()
if t?
tree_state t
-
+
return tree
# everything below is tests on the above
test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
+test_equals "open_tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'