TYPE_COMMENT = 3
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
-TYPE_EOF = 5
+TYPE_CLOSE_TAG = 5 # name
+TYPE_EOF = 6 # renumbered from 5 so that TYPE_CLOSE_TAG can take that value
# ASCII letters, searched with indexOf to decide whether a character can start a tag name.
# Both strings must contain all 26 letters: a missing letter makes the tokenizer
# treat tags beginning with it as a parse error.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
cur = 0 # index of next char in txt to be parsed
# declare tree and tokenizer variables so they're in scope below
tree = null # root node; assigned in the tree constructor initialization
- tree_append_point = null
+ open_tags = [] # stack of open elements; index 0 is the most recently opened one
tree_state = null # current tree-construction handler, called with each token
tok_state = null # current tokenizer state function
tok_cur_tag = null # partially parsed tag
return [TYPE_TEXT, '<']
return null
+ # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+ tok_state_end_tag_open = -> # tokenizer state for the character following "</"; returns a token to emit, or null
+ switch c = txt.charAt(cur++) # read one character and advance cur
+ when '>' # "</>": report the error and go back to the data state
+ parse_error()
+ tok_state = tok_state_data
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ return [TYPE_TEXT, '</'] # the "</" that was already consumed is emitted as plain text
+ else
+ if uc_alpha.indexOf(c) > -1 # uppercase letter: start a close tag, storing the name lowercased
+ tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []] # same four-slot layout as an open-tag token
+ tok_state = tok_state_tag_name # the rest of the name is read by the tag name state
+ else if lc_alpha.indexOf(c) > -1 # lowercase letter: start a close tag with the name as-is
+ tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
+ tok_state = tok_state_tag_name
+ else # any other character is a parse error handed to the bogus comment state
+ parse_error()
+ tok_state = tok_state_bogus_comment
+ return null # no token to emit yet
+
# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
tok_state_tag_name = ->
switch c = txt.charAt(cur++)
tree_append = (t) -> # tree-construction handler: applies one tokenizer token t to the tree
switch t[0] # t[0] is the token type
when TYPE_TEXT # merge into a trailing text node if there is one, otherwise append
- if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
- tree_append_point[tree_append_point.length - 1][1] += t[1]
+ if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT # open_tags[0][3] is the child list of the innermost open element
+ open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
else
- tree_append_point.push t
+ open_tags[0][3].push t
when TYPE_OPEN_TAG # the token itself is reused as the TYPE_TAG tree node
t[0] = TYPE_TAG
# convert attributes into a hash
a = t[2].pop()
attrs[a[0]] = a[1]
t[2] = attrs
- tree_append_point.push t
- tree_append_point = t[3]
- # TODO implement stack of open elements
+ # FIXME probs have to auto-close things first
+ open_tags[0][3].push t # attach as a child of the current innermost element
+ open_tags.unshift t # then make it the new innermost element
# TODO implement formatting elements thing
+ when TYPE_CLOSE_TAG
+ # FIXME this is just a hack for now
+ if open_tags.length < 2 # only the root element is on the stack: nothing may be closed
+ parse_error()
+ return
+ if open_tags[0][1] isnt t[1] # close tag name differs from the innermost open element
+ parse_error()
+ # fall through and close something anyway
+ open_tags.shift() # pop the innermost open element
when TYPE_EOF
return
# TODO implement close tags
# TODO implement self-closing tags
else
console.log "UNIMPLEMENTED tag type: #{t[0]}"
+ return
# tree constructor initialization
- tree = [] # see comments on TYPE_TAG/etc for the structure of this data
- tree_append_point = tree
+ # see comments on TYPE_TAG/etc for the structure of this data
+ tree = [TYPE_TAG, 'html', {}, []] # root element node; parsed content is collected in its child list tree[3]
+ open_tags = [tree] # the root starts out as the only open element
tree_state = tree_append # handler that receives each token
# tokenizer initialization
if t?
tree_state t
if t[0] is TYPE_EOF
- return tree
+ return tree[3]
return # never reached
# everything below is tests on the above
# NOTE(review): test_equals is defined elsewhere; from these calls its arguments appear to be
# (test name, function under test, HTML input, expected JSON string) — confirm against its definition
test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
+test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'