JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
parse end tags, close tags with proper nesting
authorJason Woofenden <jason@jasonwoof.com>
Mon, 14 Dec 2015 20:21:24 +0000 (15:21 -0500)
committerJason Woofenden <jason@jasonwoof.com>
Mon, 14 Dec 2015 20:21:24 +0000 (15:21 -0500)
parse-html.coffee

index a9e1ce0..db1837b 100644 (file)
@@ -32,7 +32,8 @@ TYPE_WHITESPACE = 2
 TYPE_COMMENT = 3
 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
-TYPE_EOF = 5
+TYPE_CLOSE_TAG = 5 # name
+TYPE_EOF = 6
 
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
@@ -210,6 +211,28 @@ parse_html = (txt) ->
                                        return [TYPE_TEXT, '<']
                return null
 
+       # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+       tok_state_end_tag_open = -> # tokenizer step for the character that follows "</"
+               switch c = txt.charAt(cur++) # consume one character; charAt yields '' once cur is past the end of txt
+                       when '>' # "</>": report the error and go back to the data state without returning a token
+                               parse_error()
+                               tok_state = tok_state_data
+                       when '' # EOF: report the error, go back to the data state, and return the pending "</" as text
+                               parse_error()
+                               tok_state = tok_state_data
+                               return [TYPE_TEXT, '</']
+                       else
+                               if uc_alpha.indexOf(c) > -1 # uppercase letter: start a close tag whose name is stored lowercased
+                                       tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []] # NOTE(review): the two empty arrays presumably mirror TYPE_OPEN_TAG's attributes/children slots -- confirm
+                                       tok_state = tok_state_tag_name
+                               else if lc_alpha.indexOf(c) > -1 # lowercase letter: start a close tag with name c
+                                       tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
+                                       tok_state = tok_state_tag_name
+                               else # any other character: report the error and switch to the bogus comment state
+                                       parse_error()
+                                       tok_state = tok_state_bogus_comment # NOTE(review): cur is already past c here -- confirm the bogus comment state accounts for that
+               return null # reached by every branch except EOF, which returns a text token above
+
        # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
        tok_state_tag_name = ->
                switch c = txt.charAt(cur++)
@@ -503,12 +526,22 @@ parse_html = (txt) ->
                                open_tags[0][3].push t
                                open_tags.unshift t
                                # TODO implement formatting elements thing
+                       when TYPE_CLOSE_TAG
+                               # FIXME this is just a hack for now
+                               if open_tags.length < 2
+                                       parse_error()
+                                       return
+                               if open_tags[0][1] isnt t[1]
+                                       parse_error()
+                                       # fall through and close something anyway
+                               open_tags.shift()
                        when TYPE_EOF
                                return
                         # TODO implement close tags properly (the TYPE_CLOSE_TAG case above is only a stopgap)
                        # TODO implement self-closing tags
                        else
                                console.log "UNIMPLEMENTED tag type: #{t[0]}"
+               return
 
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
@@ -550,3 +583,4 @@ test_equals "open tag with attributes of various quotings", html_to_json, "foo<s
 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'