JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
parse end tags, close tags with proper nesting
[peach-html5-editor.git] / parse-html.coffee
index a393dc0..db1837b 100644 (file)
@@ -32,7 +32,8 @@ TYPE_WHITESPACE = 2
 TYPE_COMMENT = 3
 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
-TYPE_EOF = 5
+TYPE_CLOSE_TAG = 5 # name
+TYPE_EOF = 6
 
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
@@ -155,7 +156,7 @@ parse_html = (txt) ->
        cur = 0 # index of next char in txt to be parsed
        # declare tree and tokenizer variables so they're in scope below
        tree = null
-       tree_append_point = null
+       open_tags = [] # stack of open elements
        tree_state = null
        tok_state = null
        tok_cur_tag = null # partially parsed tag
@@ -210,6 +211,28 @@ parse_html = (txt) ->
                                        return [TYPE_TEXT, '<']
                return null
 
+       # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+       tok_state_end_tag_open = -> # consumes one char of txt; returns a token to hand on, or null when this step produces none
+               switch c = txt.charAt(cur++)
+                       when '>' # no tag name at all: report it and resume in the data state, producing no token
+                               parse_error()
+                               tok_state = tok_state_data
+                       when '' # EOF (charAt past the end of txt yields '')
+                               parse_error()
+                               tok_state = tok_state_data
+                               return [TYPE_TEXT, '</'] # hand back the literal '</' as text rather than a tag
+                       else
+                               if uc_alpha.indexOf(c) > -1
+                                       tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []] # name starts lowercased; NOTE(review): the two empty arrays presumably mirror the open-tag token shape - confirm
+                                       tok_state = tok_state_tag_name # remaining chars are handled by the tag name state
+                               else if lc_alpha.indexOf(c) > -1
+                                       tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []] # already lowercase, used as-is
+                                       tok_state = tok_state_tag_name
+                               else
+                                       parse_error() # first char is not in either alphabet string
+                                       tok_state = tok_state_bogus_comment
+               return null # every branch except EOF ends here without a token
+
        # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
        tok_state_tag_name = ->
                switch c = txt.charAt(cur++)
@@ -487,10 +510,10 @@ parse_html = (txt) ->
        tree_append = (t) ->
                switch t[0]
                        when TYPE_TEXT
-                               if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
-                                       tree_append_point[tree_append_point.length - 1][1] += t[1]
+                               if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
+                                       open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
                                else
-                                       tree_append_point.push t
+                                       open_tags[0][3].push t
                        when TYPE_OPEN_TAG
                                t[0] = TYPE_TAG
                                # convert attributes into a hash
@@ -499,20 +522,31 @@ parse_html = (txt) ->
                                        a = t[2].pop()
                                        attrs[a[0]] = a[1]
                                t[2] = attrs
-                               tree_append_point.push t
-                               tree_append_point = t[3]
-                               # TODO implement stack of open elements
+                               # FIXME probs have to auto-close things first
+                               open_tags[0][3].push t
+                               open_tags.unshift t
                                # TODO implement formatting elements thing
+                       when TYPE_CLOSE_TAG
+                               # FIXME this is just a hack for now
+                               if open_tags.length < 2
+                                       parse_error()
+                                       return
+                               if open_tags[0][1] isnt t[1]
+                                       parse_error()
+                                       # fall through and close something anyway
+                               open_tags.shift()
                        when TYPE_EOF
                                return
                        # TODO implement close tags properly (the TYPE_CLOSE_TAG handling above is only a stopgap)
                        # TODO implement self-closing tags
                        else
                                console.log "UNIMPLEMENTED tag type: #{t[0]}"
+               return
 
        # tree constructor initialization
-       tree = [] # see comments on TYPE_TAG/etc for the structure of this data
-       tree_append_point = tree
+       # see comments on TYPE_TAG/etc for the structure of this data
+       tree = [TYPE_TAG, 'html', {}, []]
+       open_tags = [tree]
        tree_state = tree_append
 
        # tokenizer initialization
@@ -524,7 +558,7 @@ parse_html = (txt) ->
                if t?
                        tree_state t
                        if t[0] is TYPE_EOF
-                               return tree
+                               return tree[3]
        return # never reached
 
 # everything below is tests on the above
@@ -549,3 +583,4 @@ test_equals "open tag with attributes of various quotings", html_to_json, "foo<s
 test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
 test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'