TYPE_COMMENT = 3
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
-TYPE_EOF = 5
+TYPE_CLOSE_TAG = 5 # name
+TYPE_EOF = 6
# ASCII letter tables. The tokenizer tests membership with indexOf() to decide
# whether a character can start a tag name, so each table must contain all 26
# letters exactly once. (Previously 'q'/'Q' appeared in place of 'y'/'Y', so
# tags beginning with y/Y were not recognised as letters.)
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
return [TYPE_TEXT, '<']
return null
+ # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+ #
+ # Tokenizer state handler for the "end tag open" state. Reads one character
+ # from txt at position cur (advancing cur) and, depending on that character,
+ # selects the next tokenizer state by reassigning tok_state.
+ # Returns a [TYPE_TEXT, '</'] token on end of input; the final statement
+ # returns null.
+ # NOTE(review): presumably entered once "</" has been consumed -- confirm
+ # against the state that switches to this one.
+ tok_state_end_tag_open = ->
+ switch c = txt.charAt(cur++)
+ when '>'
+ # "</>" -- report the error and go back to the data state
+ parse_error()
+ tok_state = tok_state_data
+ when '' # EOF
+ # charAt() yields '' past the end of txt; report the error and hand the
+ # literal "</" back as text
+ parse_error()
+ tok_state = tok_state_data
+ return [TYPE_TEXT, '</']
+ else
+ # A letter starts a close-tag token. The name is stored lowercased and
+ # the tag-name state continues accumulating it.
+ # NOTE(review): the two empty arrays presumably mirror the
+ # attributes/children slots of TYPE_OPEN_TAG so tok_state_tag_name can
+ # be shared -- confirm.
+ if uc_alpha.indexOf(c) > -1
+ tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []]
+ tok_state = tok_state_tag_name
+ else if lc_alpha.indexOf(c) > -1
+ tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
+ tok_state = tok_state_tag_name
+ else
+ # anything else after "</" is an error handled by the bogus comment state
+ parse_error()
+ tok_state = tok_state_bogus_comment
+ # NOTE(review): indentation is not visible in this view; presumably this is
+ # at function level, i.e. "no token produced on this step" -- confirm.
+ return null
+
# 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
tok_state_tag_name = ->
switch c = txt.charAt(cur++)
open_tags[0][3].push t
open_tags.unshift t
# TODO implement formatting elements thing
+ when TYPE_CLOSE_TAG
+ # FIXME this is just a hack for now
+ if open_tags.length < 2
+ parse_error()
+ return
+ if open_tags[0][1] isnt t[1]
+ parse_error()
+ # fall through and close something anyway
+ open_tags.shift()
when TYPE_EOF
return
# TODO implement close tags
# TODO implement self-closing tags
else
console.log "UNIMPLEMENTED tag type: #{t[0]}"
+ return
# tree constructor initialization
# see comments on TYPE_TAG/etc for the structure of this data
# Attribute-value entity handling, exercised once per quoting style:
# double-quoted (dq), single-quoted (sq) and unquoted (uq). All three inputs
# are expected to produce the same tree.
test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&=2&o=3&lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]'
+# Close tags: "</a>" and each "</div>" end the element they name, so text that
+# follows ("2", "bar") is attached to the enclosing element rather than nested
+# inside the one just closed. Also covers a <div> nested inside a <div>.
+test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'