JasonWoof — Got questions, comments, patches, etc.? Contact Jason Woofenden
parse_errors, EOF, &/etc in attrs
author: Jason Woofenden <jason@jasonwoof.com>
Mon, 14 Dec 2015 19:37:25 +0000 (14:37 -0500)
committer: Jason Woofenden <jason@jasonwoof.com>
Mon, 14 Dec 2015 19:37:25 +0000 (14:37 -0500)
parse-html.coffee

index 1ef077a..a393dc0 100644 (file)
@@ -32,6 +32,7 @@ TYPE_WHITESPACE = 2
 TYPE_COMMENT = 3
 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_EOF = 5
 
 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
@@ -159,6 +160,8 @@ parse_html = (txt) ->
        tok_state = null
        tok_cur_tag = null # partially parsed tag
 
+       parse_error = ->
+               console.log "Parse error at character #{cur} of #{txt.length}"
 
        # the functions below implement the tokenizer states described here:
        # http://www.w3.org/TR/html5/syntax.html#tokenization
@@ -167,75 +170,21 @@ parse_html = (txt) ->
        tok_state_data = ->
                switch c = txt.charAt(cur++)
                        when '&'
-                               tok_state = tok_state_character_reference_in_data
+                               return [TYPE_TEXT, tokenize_character_reference()]
                        when '<'
                                tok_state = tok_state_tag_open
                        when "\u0000"
-                               # Parse error
+                               parse_error()
                                return [TYPE_TEXT, c]
+                       when '' # EOF
+                               return [TYPE_EOF]
                        else
                                return [TYPE_TEXT, c]
                return null
 
        # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
-       # & just got consumed
-       tok_state_character_reference_in_data = ->
-               tok_state = tok_state_data
-               if cur >= txt.length
-                       return [TYPE_TEXT, '&']
-               switch c = txt.charAt(cur)
-                       when ';'
-                               return [TYPE_TEXT, '&']
-                       when '#'
-                               if cur + 1 >= txt.length
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(cur + 1).toLowerCase() is 'x'
-                                       prefix = '#x'
-                                       charset = hex_chars
-                                       start = cur + 2
-                               else
-                                       charset = digits
-                                       start = cur + 1
-                                       prefix = '#'
-                               i = 0
-                               while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
-                                       i += 1
-                               if i is 0
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(start + i) is ';'
-                                       i += 1
-                               decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
-                               if decoded?
-                                       cur = start + i
-                                       return [TYPE_TEXT, decoded]
-                               return [TYPE_TEXT, '&']
-                       else
-                               for i in [0...31]
-                                       if alnum.indexOf(txt.charAt(cur + i)) is -1
-                                               break
-                               if i is 0
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(cur + i) is ';'
-                                       i += 1 # include ';' terminator in value
-                                       decoded = decode_named_char_ref txt.substr(cur, i)
-                                       if decoded?
-                                               cur += i
-                                               return [TYPE_TEXT, decoded]
-                                       return [TYPE_TEXT, '&']
-                               else
-                                       # no ';' terminator (only legacy char refs)
-                                       if i < 2 or i > 6
-                                               return [TYPE_TEXT, '&']
-                                       # FIXME: if we're inside an attribute:
-                                       # 1.    don't parse refs that are followed by =
-                                       # 2.    don't parse refs that are followed by alnum
-                                       max = i
-                                       for i in [2..max] # no prefix matches, so ok to check shortest first
-                                               c = legacy_char_refs[txt.substr(cur, i)]
-                                               if c?
-                                                       cur += i # consume entity chars
-                                                       return [TYPE_TEXT, c]
-               return null
+       # not needed: tok_state_character_reference_in_data = ->
+       # just call tok_state_character_reference_in_data()
 
        # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
        tok_state_tag_open = ->
@@ -245,7 +194,7 @@ parse_html = (txt) ->
                        when '/'
                                tok_state = tok_state_end_tag_open
                        when '?'
-                               # Parse error
+                               parse_error()
                                tok_state = tok_state_bogus_comment
                        else
                                if lc_alpha.indexOf(c) > -1
@@ -255,7 +204,7 @@ parse_html = (txt) ->
                                        tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
                                        tok_state = tok_state_tag_name
                                else
-                                       # Parse error
+                                       parse_error()
                                        tok_state = tok_state_data
                                        cur -= 1 # we didn't parse/handle the char after <
                                        return [TYPE_TEXT, '<']
@@ -274,8 +223,11 @@ parse_html = (txt) ->
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
-                               # Parse error
+                               parse_error()
                                tok_cur_tag[1] += "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
                                        tok_cur_tag[1] += c.toLowerCase()
@@ -298,11 +250,14 @@ parse_html = (txt) ->
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
-                               # Parse error
+                               parse_error()
                                attr_name = "\ufffd"
                        when '"', "'", '<', '='
-                               # Parse error
+                               parse_error()
                                attr_name = c
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
                                        attr_name = c.toLowerCase()
@@ -328,13 +283,18 @@ parse_html = (txt) ->
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
-                               # Parse error
-                               tok_cur_tag[2][0][0] += "\ufffd"
+                               parse_error()
+                               tok_cur_tag[2][0][0] = "\ufffd"
+                       when '"', "'", '<'
+                               parse_error()
+                               tok_cur_tag[2][0][0] = c
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag[2][0][0] += c.toLowerCase()
+                                       tok_cur_tag[2][0][0] = c.toLowerCase()
                                else
-                                       # Parse error if ", ' or <
                                        tok_cur_tag[2][0][0] += c
                return null
 
@@ -360,6 +320,9 @@ parse_html = (txt) ->
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                tok_cur_tag[2][0][1] += c
                                tok_state = tok_state_attribute_value_unquoted
@@ -371,11 +334,13 @@ parse_html = (txt) ->
                        when '"'
                                tok_state = tok_state_after_attribute_value_quoted
                        when '&'
-                               tok_state = tok_state_character_reference_in_attribute_value
-                               tok_char_ref_addl_allowed = '"' # FIXME
+                               tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
                        when "\u0000"
                                # Parse error
                                tok_cur_tag[2][0][1] += "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                tok_cur_tag[2][0][1] += c
                return null
@@ -386,11 +351,13 @@ parse_html = (txt) ->
                        when "'"
                                tok_state = tok_state_after_attribute_value_quoted
                        when '&'
-                               tok_state = tok_state_character_reference_in_attribute_value
-                               tok_char_ref_addl_allowed = "'" # FIXME
+                               tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
                        when "\u0000"
                                # Parse error
                                tok_cur_tag[2][0][1] += "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                tok_cur_tag[2][0][1] += c
                return null
@@ -401,8 +368,7 @@ parse_html = (txt) ->
                        when "\t", "\n", "\u000c", ' '
                                tok_state = tok_state_before_attribute_name
                        when '&'
-                               tok_state = tok_state_character_reference_in_attribute_value
-                               tok_char_ref_addl_allowed = '>' # FIXME
+                               tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
                        when '>'
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
@@ -410,6 +376,9 @@ parse_html = (txt) ->
                                return tmp
                        when "\u0000"
                                tok_cur_tag[2][0][1] += "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                # Parse Error if ', <, = or ` (backtick)
                                tok_cur_tag[2][0][1] += c
@@ -427,28 +396,119 @@ parse_html = (txt) ->
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                # Parse Error
                                tok_state = tok_state_before_attribute_name
                                cur -= 1 # we didn't handle that char
                return null
 
+       # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+       # Don't set this as a state, just call it
+       # returns a string (NOT a text node)
+       tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+               if cur >= txt.length
+                       return '&'
+               switch c = txt.charAt(cur)
+                       when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+                               # explicitly not a parse error
+                               return '&'
+                       when ';'
+                               # there has to be "one or more" alnums between & and ; to be a parse error
+                               return '&'
+                       when '#'
+                               if cur + 1 >= txt.length
+                                       return '&'
+                               if txt.charAt(cur + 1).toLowerCase() is 'x'
+                                       prefix = '#x'
+                                       charset = hex_chars
+                                       start = cur + 2
+                               else
+                                       charset = digits
+                                       start = cur + 1
+                                       prefix = '#'
+                               i = 0
+                               while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+                                       i += 1
+                               if i is 0
+                                       return '&'
+                               if txt.charAt(start + i) is ';'
+                                       i += 1
+                               # FIXME This is supposed to generate parse errors for some chars
+                               decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+                               if decoded?
+                                       cur = start + i
+                                       return decoded
+                               return '&'
+                       else
+                               for i in [0...31]
+                                       if alnum.indexOf(txt.charAt(cur + i)) is -1
+                                               break
+                               if i is 0
+                                       # exit early, because parse_error() below needs at least one alnum
+                                       return '&'
+                               if txt.charAt(cur + i) is ';'
+                                       i += 1 # include ';' terminator in value
+                                       decoded = decode_named_char_ref txt.substr(cur, i)
+                                       if decoded?
+                                               cur += i
+                                               return decoded
+                                       parse_error()
+                                       return '&'
+                               else
+                                       # no ';' terminator (only legacy char refs)
+                                       max = i
+                                       for i in [2..max] # no prefix matches, so ok to check shortest first
+                                               c = legacy_char_refs[txt.substr(cur, i)]
+                                               if c?
+                                                       if in_attr
+                                                               if txt.charAt(cur + i) is '='
+                                                                       # "because some legacy user agents will
+                                                                       # misinterpret the markup in those cases"
+                                                                       parse_error()
+                                                                       return '&'
+                                                               if alnum.indexOf(txt.charAt(cur + i)) > -1
+                                                                       # this makes attributes forgiving about url args
+                                                                       return '&'
+                                                       # ok, and besides the weird exceptions for attributes...
+                                                       # return the matching char
+                                                       cur += i # consume entity chars
+                                                       parse_error() # because no terminating ";"
+                                                       return c
+                                       parse_error()
+                                       return '&'
+               return # never reached
+
        # the functions below implement the Tree Construction algorithm here:
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
        # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
        tree_append = (t) ->
-               if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
-                       tree_append_point[tree_append_point.length - 1][1] += t[1]
-               else
-                       tree_append_point.push t
-                       if t[0] is TYPE_OPEN_TAG
+               switch t[0]
+                       when TYPE_TEXT
+                               if tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
+                                       tree_append_point[tree_append_point.length - 1][1] += t[1]
+                               else
+                                       tree_append_point.push t
+                       when TYPE_OPEN_TAG
                                t[0] = TYPE_TAG
+                               # convert attributes into a hash
                                attrs = {}
                                while t[2].length
                                        a = t[2].pop()
                                        attrs[a[0]] = a[1]
                                t[2] = attrs
+                               tree_append_point.push t
                                tree_append_point = t[3]
+                               # TODO implement stack of open elements
+                               # TODO implement formatting elements thing
+                       when TYPE_EOF
+                               return
+                       # TODO implement close tags
+                       # TODO implement self-closing tags
+                       else
+                               console.log "UNIMPLEMENTED tag type: #{t[0]}"
 
        # tree constructor initialization
        tree = [] # see comments on TYPE_TAG/etc for the structure of this data
@@ -459,12 +519,13 @@ parse_html = (txt) ->
        tok_state = tok_state_data
 
        # process input
-       while cur < txt.length
+       loop
                t = tok_state()
                if t?
                        tree_state t
-
-       return tree
+                       if t[0] is TYPE_EOF
+                               return tree
+       return # never reached
 
 # everything below is tests on the above
 test_equals = (description, fn, args..., expected_output) ->
@@ -472,7 +533,9 @@ test_equals = (description, fn, args..., expected_output) ->
        if output is expected_output
                console.log "passed: #{description}."
        else
-               console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
+               console.log "FAILED: #{description}..."
+               console.log "   Expected: #{expected_output}"
+               console.log "     Actual: #{output}"
 html_to_json = (html) ->
        return JSON.stringify parse_html html
 test_equals "empty", html_to_json, "", '[]'
@@ -483,3 +546,6 @@ test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[
 test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
 test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
 test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions dq", html_to_json, "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions sq", html_to_json, "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'
+test_equals "attribute entity exceptions uq", html_to_json, "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar", '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]'