parse end tags, close tags with proper nesting

[peach-html5-editor.git] / parse-html.coffee
diff --git a/parse-html.coffee b/parse-html.coffee

index a393dc0..db1837b 100644 (file)
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -32,7 +32,8 @@ TYPE_WHITESPACE = 2
  TYPE_COMMENT = 3
  # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
  TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
-TYPE_EOF = 5
+TYPE_CLOSE_TAG = 5 # name
+TYPE_EOF = 6
  
  lc_alpha = "abcdefghijklmnopqrstuvwxqz"
  uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
@@ -155,7 +156,7 @@ parse_html = (txt) ->
         cur = 0 # index of next char in txt to be parsed
         # declare tree and tokenizer variables so they're in scope below
         tree = null
-       tree_append_point = null
+       open_tags = [] # stack of open elements
         tree_state = null
         tok_state = null
         tok_cur_tag = null # partially parsed tag
@@ -210,6 +211,28 @@ parse_html = (txt) ->
                                         return [TYPE_TEXT, '<']
                 return null
  
+       # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+       tok_state_end_tag_open = -> # returns a token to emit, or null; presumably entered once "</" was consumed (TODO confirm in caller state)
+               switch c = txt.charAt(cur++) # consume one character; charAt yields '' once cur is past the end of txt
+                       when '>' # no tag name at all: report the error and emit nothing
+                               parse_error()
+                               tok_state = tok_state_data
+                       when '' # EOF: report the error and emit the "</" as plain text
+                               parse_error()
+                               tok_state = tok_state_data
+                               return [TYPE_TEXT, '</']
+                       else
+                               if uc_alpha.indexOf(c) > -1 # letter listed in uc_alpha: tag names are stored lowercased
+                                       tok_cur_tag = [TYPE_CLOSE_TAG, c.toLowerCase(), [], []] # same 4-slot layout as TYPE_OPEN_TAG tokens
+                                       tok_state = tok_state_tag_name # remaining characters are handled by the tag-name state
+                               else if lc_alpha.indexOf(c) > -1 # letter listed in lc_alpha: used as-is
+                                       tok_cur_tag = [TYPE_CLOSE_TAG, c, [], []]
+                                       tok_state = tok_state_tag_name
+                               else # any other character cannot start a tag name
+                                       parse_error()
+                                       tok_state = tok_state_bogus_comment
+               return null # every branch without an explicit return emits no token on this step
+
         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
         tok_state_tag_name = ->
                 switch c = txt.charAt(cur++)
@@ -487,10 +510,10 @@ parse_html = (txt) ->
         tree_append = (t) ->
                 switch t[0]
                         when TYPE_TEXT
-                               if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
-                                       tree_append_point[tree_append_point.length - 1][1] += t[1]
+                               if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
+                                       open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
                                 else
-                                       tree_append_point.push t
+                                       open_tags[0][3].push t
                         when TYPE_OPEN_TAG
                                 t[0] = TYPE_TAG
                                 # convert attributes into a hash
@@ -499,20 +522,31 @@ parse_html = (txt) ->
                                         a = t[2].pop()
                                         attrs[a[0]] = a[1]
                                 t[2] = attrs
-                               tree_append_point.push t
-                               tree_append_point = t[3]
-                               # TODO implement stack of open elements
+                               # FIXME probs have to auto-close things first
+                               open_tags[0][3].push t
+                               open_tags.unshift t
                                 # TODO implement formatting elements thing
+                       when TYPE_CLOSE_TAG
+                               # FIXME this is just a hack for now
+                               if open_tags.length < 2
+                                       parse_error()
+                                       return
+                               if open_tags[0][1] isnt t[1]
+                                       parse_error()
+                                       # fall through and close something anyway
+                               open_tags.shift()
                         when TYPE_EOF
                                 return
                         # TODO implement close tags properly (the TYPE_CLOSE_TAG branch above is only a hack)
                         # TODO implement self-closing tags
                         else
                                 console.log "UNIMPLEMENTED tag type: #{t[0]}"
+               return
  
         # tree constructor initialization
-       tree = [] # see comments on TYPE_TAG/etc for the structure of this data
-       tree_append_point = tree
+       # see comments on TYPE_TAG/etc for the structure of this data
+       tree = [TYPE_TAG, 'html', {}, []]
+       open_tags = [tree]
         tree_state = tree_append
  
         # tokenizer initialization
@@ -524,7 +558,7 @@ parse_html = (txt) ->
                 if t?
                         tree_state t
                         if t[0] is TYPE_EOF
-                               return tree
+                               return tree[3]
         return # never reached
  
  # everything below is tests on the above
@@ -549,3 +583,4 @@ test_equals "open tag with attributes of various quotings", html_to_json, "foo<s
  test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
  test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
  test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "matching closing tags", html_to_json, "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar", '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]'