start parsing open tags

author Jason Woofenden <jason@jasonwoof.com>

Mon, 14 Dec 2015 01:49:40 +0000 (20:49 -0500)

committer Jason Woofenden <jason@jasonwoof.com>

Mon, 14 Dec 2015 01:49:40 +0000 (20:49 -0500)
author Jason Woofenden <jason@jasonwoof.com>
Mon, 14 Dec 2015 01:49:40 +0000 (20:49 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Mon, 14 Dec 2015 01:49:40 +0000 (20:49 -0500)
diff --git a/parse-html.coffee b/parse-html.coffee

index 04733fb..318422b 100644 (file)
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -30,10 +30,14 @@ TYPE_TAG = 0 # name, {attributes}, [children]
  TYPE_TEXT = 1 # "text"
  TYPE_WHITESPACE = 2
  TYPE_COMMENT = 3
+# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
+TYPE_OPEN_TAG = 4
  
-alnum = "abcdefghijklmnopqrstuvwxqzABCDEFGHIJKLMNOPQRSTUVWXQZ0123456789"
-hex_chars = "0123456789abcdefABCDEF"
+lc_alpha = "abcdefghijklmnopqrstuvwxyz" # ASCII a-z
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # ASCII A-Z
  digits = "0123456789"
+alnum = lc_alpha + uc_alpha + digits
+hex_chars = digits + "abcdefABCDEF"
  
  # some SVG elements have dashes in them
  tag_name_chars = alnum + "-"
@@ -153,14 +157,14 @@ parse_html = (txt) ->
         tree_append_point = null
         tree_state = null
         tok_state = null
+       tok_cur = null # partially parsed tag
  
  
         # the functions below implement the tokenizer stats described here:
         # http://www.w3.org/TR/html5/syntax.html#tokenization
  
+       # http://www.w3.org/TR/html5/syntax.html#data-state
         tok_state_data = ->
-               if cur >= txt.length
-                       return null
                 switch c = txt.charAt(cur++)
                         when '&'
                                 tok_state = tok_state_character_reference_in_data
@@ -173,6 +177,53 @@ parse_html = (txt) ->
                                 return [TYPE_TEXT, c]
                 return null
  
+       # tag open state (decides what follows a <): http://www.w3.org/TR/html5/syntax.html#tag-open-state
+       tok_state_tag_open = ->
+               switch c = txt.charAt(cur++)
+                       when '!'
+                               tok_state = tok_state_markup_declaration_open
+                       when '/'
+                               tok_state = tok_state_end_tag_open
+                       when '?'
+                               # Parse error: <? is handed to the bogus comment state
+                               tok_state = tok_state_bogus_comment
+                       else
+                               if lc_alpha.indexOf(c) > -1
+                                       tok_cur = [TYPE_OPEN_TAG, c, {}, []] # type, name so far, attributes, children
+                                       tok_state = tok_state_tag_name
+                               else if uc_alpha.indexOf(c) > -1
+                                       tok_cur = [TYPE_OPEN_TAG, c.toLowerCase(), {}, []] # name is stored lowercased
+                                       tok_state = tok_state_tag_name
+                               else
+                                       # Parse error: no tag starts here, so the < is emitted as literal text
+                                       tok_state = tok_state_data
+                                       cur -= 1 # un-consume the char after < so the data state handles it
+                                       return [TYPE_TEXT, '<']
+               return null
+
+       # tag name state (appends to the name in tok_cur[1]): http://www.w3.org/TR/html5/syntax.html#tag-name-state
+       tok_state_tag_name = ->
+               switch c = txt.charAt(cur++)
+                       when "\t", "\n", "\u000c", ' ' # tab, LF, FF and space all end the tag name
+                               tok_state = tok_state_before_attribute_name
+                       when '/'
+                               tok_state = tok_state_self_closing_start_tag
+                       when '>'
+                               tok_state = tok_state_data
+                               tmp = tok_cur # hand the finished tag token to the caller
+                               tok_cur = null
+                               return tmp
+                       when "\u0000"
+                               # Parse error: NUL is replaced with U+FFFD in the name
+                               tok_cur[1] += "\ufffd"
+                       else
+                               if uc_alpha.indexOf(c) > -1
+                                       tok_cur[1] += c.toLowerCase()
+                               else
+                                       tok_cur[1] += c
+               return null # no token is complete yet
+
+       # http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
         # & just got consumed
         tok_state_character_reference_in_data = ->
                 tok_state = tok_state_data
@@ -231,15 +282,18 @@ parse_html = (txt) ->
                                                         cur += i # consume entity chars
                                                         return [TYPE_TEXT, c]
                 return null
-                               
  
         # the functions below impliment the Tree Contstruction algorithm here:
         # http://www.w3.org/TR/html5/syntax.html#tree-construction
+       # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
         tree_append = (t) ->
                 if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
                         tree_append_point[tree_append_point.length - 1][1] += t[1]
                 else
                         tree_append_point.push t
+                       if t[0] is TYPE_OPEN_TAG
+                               t[0] = TYPE_TAG # the open-tag token becomes a tag node in the tree
+                               tree_append_point = t[3] # later tokens are appended to this tag's children
  
         # tree constructor initialization
         tree = [] # see comments on TYPE_TAG/etc for the structure of this data
@@ -254,7 +308,7 @@ parse_html = (txt) ->
                 t = tok_state()
                 if t?
                         tree_state t
-       
+
         return tree
  
  # everything below is tests on the above
@@ -271,3 +325,4 @@ test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
  test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
  test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
  test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
+test_equals "open_tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
author	Jason Woofenden <jason@jasonwoof.com>
	Mon, 14 Dec 2015 01:49:40 +0000 (20:49 -0500)
committer	Jason Woofenden <jason@jasonwoof.com>
	Mon, 14 Dec 2015 01:49:40 +0000 (20:49 -0500)