JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
use Node class for data structures, improve is_in_scope
authorJason Woofenden <jason@jasonwoof.com>
Tue, 15 Dec 2015 18:23:17 +0000 (13:23 -0500)
committerJason Woofenden <jason@jasonwoof.com>
Tue, 15 Dec 2015 18:23:17 +0000 (13:23 -0500)
parse-html.coffee

index 927cb95..ef5545f 100644 (file)
 #
 # Instead, the data structure produced by this parser is an array of nodes.
 #
-# Each node is an array. The first element in the array is an integer (one of
-# the TYPE_* constants below) followed by the appropriate fields for that type
-# (shown below in the comments after the TYPE_* definition.)
-
+# Each node is an obect of the Node class. Here are the Node types:
 TYPE_TAG = 0 # name, {attributes}, [children]
 TYPE_TEXT = 1 # "text"
 TYPE_COMMENT = 2
@@ -35,15 +32,65 @@ TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [child
 TYPE_END_TAG = 5 # name
 TYPE_EOF = 6
 
+class Node
+       constructor: (type, args = {}) ->
+               @type = type # one of the TYPE_* constants above
+               @name = args.name ? '' # tag name
+               @text = args.text ? '' # contents for text/comment nodes
+               @attrs = args.attrs ? {}
+               @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
+               @children = args.children ? []
+       serialize: -> # for unit tests
+               ret = ''
+               switch @type
+                       when TYPE_TAG
+                               ret += 'tag:'
+                               ret += JSON.stringify @name
+                               ret += ','
+                               ret += JSON.stringify @attrs
+                               ret += ','
+                               sep = '['
+                               for c in @children
+                                       ret += sep
+                                       sep = ','
+                                       ret += c.serialize()
+                               ret += ']'
+                       when TYPE_TEXT
+                               ret += 'text:'
+                               ret += JSON.stringify @text
+                       when TYPE_COMMENT
+                               ret += 'comment:'
+                               ret += JSON.stringify @text
+                       when TYPE_DOCTYPE
+                               ret += 'doctype'
+                               # FIXME
+                       else
+                               ret += 'unknown:'
+               return ret
+
+
+# helpers: (only take args that are normally known when parser creates nodes)
+new_open_tag = (name) ->
+       return new Node TYPE_OPEN_TAG, name: name
+new_end_tag = (name) ->
+       return new Node TYPE_END_TAG, name: name
+new_text_node = (txt) ->
+       return new Node TYPE_TEXT, text: txt
+new_comment_node = (txt) ->
+       return new Node TYPE_COMMENT, text: txt
+new_eof_token = ->
+       return new Node TYPE_EOF
+
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
 digits = "0123456789"
 alnum = lc_alpha + uc_alpha + digits
 hex_chars = digits + "abcdefABCDEF"
 scopers = { # FIXME these are supposed to be namespace specific
-       'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
-       'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
-       'foreignObject', 'desc', 'title'
+       'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
+       'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
+       'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
+       'foreignObject': true, 'desc': true, 'title'
 }
 
 # some SVG elements have dashes in them
@@ -213,14 +260,15 @@ parse_html = (txt, parse_error_cb = null) ->
        # But first... the helpers
        template_tag_is_open = ->
                for t of open_tags
-                       if t[0] is TYPE_TAG and t[1] is 'template'
+                       if t.type is TYPE_TAG and t.name is 'template'
                                return true
                return false
        is_in_scope = (tag_name) ->
                for t of open_tags
-                       if t[0] is TYPE_TAG and t[1] is tag_name
+                       if t.name is tag_name
                                return true
-                       # FIXME bail if in scopers
+                       if t.name of scopers
+                               return false
                return false
 
        reconstruct_active_formatting_elements = ->
@@ -229,13 +277,13 @@ parse_html = (txt, parse_error_cb = null) ->
        # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
        # FIXME implement this
        close_p_if_in_button_scope = ->
-               if open_tags[0][1] is 'p' # FIXME
+               if open_tags[0].name is 'p'
                        open_tags.pop()
                return
                #p = find_button_scope 'p'
                #if p?
                        # TODO generate_implied_end_tags except for p tags
-                       # TODO parse_error unless open_tags[0][1] is 'p'
+                       # TODO parse_error unless open_tags[0].name is 'p'
                        # TODO pop stack until 'p' popped
 
 
@@ -243,34 +291,33 @@ parse_html = (txt, parse_error_cb = null) ->
        # http://www.w3.org/TR/html5/syntax.html#insert-a-character
        tree_insert_a_character = (t) ->
                # FIXME read spec for "adjusted insertion location, etc, this might be wrong
-               if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
-                       open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
+               dest = open_tags[0].children
+               if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
+                       dest[dest.length - 1].text += t.text
                else
-                       open_tags[0][3].push t
+                       dest.push t
 
        # FIXME read spec, do this right
        # note: this assumes it's an open tag
        tree_insert_tag = (t) ->
-               t[0] = TYPE_TAG # not TYPE_OPEN_TAG
+               t.type = TYPE_TAG # not TYPE_OPEN_TAG
                # convert attributes into a hash
-               attrs = {}
-               while t[2].length
-                       a = t[2].pop()
-                       attrs[a[0]] = a[1]
-               t[2] = attrs
-               open_tags[0][3].push t
+               while t.attrs_a.length
+                       a = t.attrs_a.pop()
+                       t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
+               open_tags[0].children.push t
                open_tags.unshift t
 
        # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
        tree_insert_a_comment = (t) ->
                # FIXME read spec for "adjusted insertion location, etc, this might be wrong
-               open_tags[0][3].push t
+               open_tags[0].children.push t
 
        # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
        tree_in_body = (t) ->
-               switch t[0]
+               switch t.type
                        when TYPE_TEXT
-                               switch t[1]
+                               switch t.text
                                        when "\u0000"
                                                parse_error()
                                        when "\t", "\u000a", "\u000c", "\u000d", ' '
@@ -285,12 +332,12 @@ parse_html = (txt, parse_error_cb = null) ->
                        when TYPE_DOCTYPE
                                parse_error()
                        when TYPE_OPEN_TAG
-                               switch t[1]
+                               switch t.name
                                        when 'html'
                                                parse_error()
                                                return if template_tag_is_open()
-                                               root_attrs = open_tags[open_tags.length - 1][3]
-                                               for k, v of t[2]
+                                               root_attrs = open_tags[open_tags.length - 1].children
+                                               for k, v of t.attrs
                                                        root_attrs[k] = v unless root_attrs[k]?
                                        when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
                                                # FIXME also do this for </template> (end tag)
@@ -306,7 +353,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                                tree_insert_tag t
                                        when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
                                                close_p_if_in_button_scope()
-                                               if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+                                               if open_tags[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
                                                        parse_error()
                                                        open_tags.shift()
                                                tree_insert_tag t
@@ -320,13 +367,13 @@ parse_html = (txt, parse_error_cb = null) ->
                                        tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
                                }
                                for t in open_tags
-                                       unless ok_tags[t[1]]?
+                                       unless ok_tags[t.name]?
                                                parse_error()
                                                break
                                # TODO stack of template insertion modes thing
                                flag_parsing = false # stop parsing
                        when TYPE_END_TAG
-                               switch t[1]
+                               switch t.name
                                        when 'body'
                                                unless is_in_scope 'body'
                                                        parse_error()
@@ -340,15 +387,15 @@ parse_html = (txt, parse_error_cb = null) ->
                                        # TODO lots more close tags to implement here
                                        else
                                                for node, i in open_tags
-                                                       if node[1] is t[1]
-                                                               # FIXME generate implied end tags except those with name==t[1]
+                                                       if node.name is t.name
+                                                               # FIXME generate implied end tags except those with name==t.name
                                                                parse_error() unless i is 0
                                                                while i > 0
                                                                        open_tags.shift()
                                                                        i -= 1
                                                                open_tags.shift()
                                                                return
-                                                       if special_elements[node[1]]?
+                                                       if special_elements[node.name]?
                                                                parse_error()
                                                                return
 
@@ -360,16 +407,16 @@ parse_html = (txt, parse_error_cb = null) ->
        tok_state_data = ->
                switch c = txt.charAt(cur++)
                        when '&'
-                               return [TYPE_TEXT, tokenize_character_reference()]
+                               return new_text_node tokenize_character_reference()
                        when '<'
                                tok_state = tok_state_tag_open
                        when "\u0000"
                                parse_error()
-                               return [TYPE_TEXT, c]
+                               return new_text_node c
                        when '' # EOF
-                               return [TYPE_EOF]
+                               return new_eof_token()
                        else
-                               return [TYPE_TEXT, c]
+                               return new_text_node c
                return null
 
        # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
@@ -388,16 +435,16 @@ parse_html = (txt, parse_error_cb = null) ->
                                tok_state = tok_state_bogus_comment
                        else
                                if lc_alpha.indexOf(c) > -1
-                                       tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
+                                       tok_cur_tag = new_open_tag c
                                        tok_state = tok_state_tag_name
                                else if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
+                                       tok_cur_tag = new_open_tag c.toLowerCase()
                                        tok_state = tok_state_tag_name
                                else
                                        parse_error()
                                        tok_state = tok_state_data
                                        cur -= 1 # we didn't parse/handle the char after <
-                                       return [TYPE_TEXT, '<']
+                                       return new_text_node '<'
                return null
 
        # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
@@ -409,13 +456,13 @@ parse_html = (txt, parse_error_cb = null) ->
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
-                               return [TYPE_TEXT, '</']
+                               return new_text_node '</'
                        else
                                if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
+                                       tok_cur_tag = new_end_tag c.toLowerCase()
                                        tok_state = tok_state_tag_name
                                else if lc_alpha.indexOf(c) > -1
-                                       tok_cur_tag = [TYPE_END_TAG, c, [], []]
+                                       tok_cur_tag = new_end_tag c
                                        tok_state = tok_state_tag_name
                                else
                                        parse_error()
@@ -436,15 +483,15 @@ parse_html = (txt, parse_error_cb = null) ->
                                return tmp
                        when "\u0000"
                                parse_error()
-                               tok_cur_tag[1] += "\ufffd"
+                               tok_cur_tag.name += "\ufffd"
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag[1] += c.toLowerCase()
+                                       tok_cur_tag.name += c.toLowerCase()
                                else
-                                       tok_cur_tag[1] += c
+                                       tok_cur_tag.name += c
                return null
 
        # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
@@ -476,7 +523,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                else
                                        attr_name = c
                if attr_name?
-                       tok_cur_tag[2].unshift [attr_name, '']
+                       tok_cur_tag.attrs_a.unshift [attr_name, '']
                        tok_state = tok_state_attribute_name
                return null
 
@@ -496,18 +543,18 @@ parse_html = (txt, parse_error_cb = null) ->
                                return tmp
                        when "\u0000"
                                parse_error()
-                               tok_cur_tag[2][0][0] = "\ufffd"
+                               tok_cur_tag.attrs_a[0][0] = "\ufffd"
                        when '"', "'", '<'
                                parse_error()
-                               tok_cur_tag[2][0][0] = c
+                               tok_cur_tag.attrs_a[0][0] = c
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag[2][0][0] = c.toLowerCase()
+                                       tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
                                else
-                                       tok_cur_tag[2][0][0] += c
+                                       tok_cur_tag.attrs_a[0][0] += c
                return null
 
        # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
@@ -524,7 +571,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                tok_state = tok_state_attribute_value_single_quoted
                        when "\u0000"
                                # Parse error
-                               tok_cur_tag[2][0][1] += "\ufffd"
+                               tok_cur_tag.attrs_a[0][1] += "\ufffd"
                                tok_state = tok_state_attribute_value_unquoted
                        when '>'
                                # Parse error
@@ -536,7 +583,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                parse_error()
                                tok_state = tok_state_data
                        else
-                               tok_cur_tag[2][0][1] += c
+                               tok_cur_tag.attrs_a[0][1] += c
                                tok_state = tok_state_attribute_value_unquoted
                return null
 
@@ -546,15 +593,15 @@ parse_html = (txt, parse_error_cb = null) ->
                        when '"'
                                tok_state = tok_state_after_attribute_value_quoted
                        when '&'
-                               tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
+                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
                        when "\u0000"
                                # Parse error
-                               tok_cur_tag[2][0][1] += "\ufffd"
+                               tok_cur_tag.attrs_a[0][1] += "\ufffd"
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
-                               tok_cur_tag[2][0][1] += c
+                               tok_cur_tag.attrs_a[0][1] += c
                return null
 
        # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
@@ -563,15 +610,15 @@ parse_html = (txt, parse_error_cb = null) ->
                        when "'"
                                tok_state = tok_state_after_attribute_value_quoted
                        when '&'
-                               tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
+                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
                        when "\u0000"
                                # Parse error
-                               tok_cur_tag[2][0][1] += "\ufffd"
+                               tok_cur_tag.attrs_a[0][1] += "\ufffd"
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
-                               tok_cur_tag[2][0][1] += c
+                               tok_cur_tag.attrs_a[0][1] += c
                return null
 
        # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
@@ -580,20 +627,20 @@ parse_html = (txt, parse_error_cb = null) ->
                        when "\t", "\n", "\u000c", ' '
                                tok_state = tok_state_before_attribute_name
                        when '&'
-                               tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
+                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
                        when '>'
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
-                               tok_cur_tag[2][0][1] += "\ufffd"
+                               tok_cur_tag.attrs_a[0][1] += "\ufffd"
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                # Parse Error if ', <, = or ` (backtick)
-                               tok_cur_tag[2][0][1] += c
+                               tok_cur_tag.attrs_a[0][1] += c
                return null
 
        # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
@@ -695,7 +742,7 @@ parse_html = (txt, parse_error_cb = null) ->
 
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
-       tree = [TYPE_TAG, 'html', {}, []]
+       tree = new Node TYPE_TAG, name: 'html'
        open_tags = [tree]
        tree_state = tree_in_body
        flag_frameset_ok = true
@@ -709,7 +756,7 @@ parse_html = (txt, parse_error_cb = null) ->
                t = tok_state()
                if t?
                        tree_state t
-       return tree[3]
+       return tree.children
 
 # everything below is tests on the above
 test_equals = (description, output, expected_output) ->
@@ -724,67 +771,72 @@ test_parser = (args) ->
        errors_cb = (i) ->
                parse_errors.push i
        parsed = parse_html args.html, errors_cb
-       parsed = JSON.stringify parsed
-       if parsed isnt args.expected or parse_errors.length isnt args.errors
+       serialized = ''
+       sep = ''
+       for t in parsed
+               serialized += sep
+               sep = ','
+               serialized += t.serialize()
+       if serialized isnt args.expected or parse_errors.length isnt args.errors
                console.log "test FAILED: \"#{args.name}\""
        else
                console.log 'test passed'
-       if parsed isnt args.expected
+       if serialized isnt args.expected
                console.log "      Input: #{args.html}"
                console.log "    Correct: #{args.expected}"
-               console.log "     Output: #{parsed}"
+               console.log "     Output: #{serialized}"
        if parse_errors.length isnt args.errors
                console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
 
 test_parser name: "empty", \
        html: "",
-       expected: '[]',
+       expected: '',
        errors: 0
 test_parser name: "just text", \
        html: "abc",
-       expected: '[[1,"abc"]]',
+       expected: 'text:"abc"',
        errors: 0
 test_parser name: "named entity", \
        html: "a&amp;1234",
-       expected: '[[1,"a&1234"]]',
+       expected: 'text:"a&1234"',
        errors: 0
 test_parser name: "broken named character references", \
        html: "1&amp2&&amp;3&aabbcc;",
-       expected: '[[1,"1&2&&3&aabbcc;"]]',
+       expected: 'text:"1&2&&3&aabbcc;"',
        errors: 2
 test_parser name: "numbered entity overrides", \
        html: "1&#X80&#x80; &#x83",
-       expected: '[[1,"1€€ ƒ"]]',
+       expected: 'text:"1€€ ƒ"',
        errors: 0
 test_parser name: "open tag", \
        html: "foo<span>bar",
-       expected: '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]',
+       expected: 'text:"foo",tag:"span",{},[text:"bar"]',
        errors: 1 # no close tag
 test_parser name: "open tag with attributes", \
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
-       expected: '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]',
+       expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
        errors: 1 # no close tag
 test_parser name: "open tag with attributes of various quotings", \
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
-       expected: '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]',
+       expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
        errors: 1 # no close tag
 test_parser name: "attribute entity exceptions dq", \
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
-       expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
        errors: 2 # no close tag, &amp= in attr
 test_parser name: "attribute entity exceptions sq", \
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
-       expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
        errors: 2 # no close tag, &amp= in attr
 test_parser name: "attribute entity exceptions uq", \
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
-       expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[[1,"bar"]]]]',
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
        errors: 2 # no close tag, &amp= in attr
 test_parser name: "matching closing tags", \
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
-       expected: '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]',
+       expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
        errors: 0
 test_parser name: "mis-matched closing tags", \
        html: "foo<div>bar<span>baz</div>qux",
-       expected: '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]',
+       expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
        errors: 1 # close tag mismatch