JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
rest of insertion modes (untested)
author Jason Woofenden <jason@jasonwoof.com>
Mon, 21 Dec 2015 04:13:18 +0000 (23:13 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Mon, 21 Dec 2015 04:13:18 +0000 (23:13 -0500)
parse-html.coffee

index d35dd88..c6ed9a5 100644 (file)
@@ -85,6 +85,7 @@ class Node
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
+               @token = args.token ? null
                if args.id?
                        @id = "#{args.id}+"
                else
@@ -93,9 +94,12 @@ class Node
                # WARNING this doesn't work right on open tags that are still being parsed
                attrs = {}
                attrs[k] = v for k, v of @attrs
-               return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
+               return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
        acknowledge_self_closing: ->
-               @flag 'did_self_close', true
+               if @token?
+                       @token.flag 'did_self_close'
+               else
+                       @flag 'did_self_close', true
        flag: ->
                # fixfull
        serialize: (shallow = false, show_ids = false) -> # for unit tests
@@ -362,6 +366,10 @@ parse_html = (txt, parse_error_cb = null) ->
        temporary_buffer = null
        pending_table_character_tokens = null
        head_element_pointer = null
+       flag_fragment_parsing = null
+
+       # Ends the parse by clearing flag_parsing.
+       # NOTE(review): CoffeeScript only reuses an outer variable if it was
+       # already assigned in the enclosing scope before this point; otherwise
+       # this assignment creates a local and has no effect -- confirm that
+       # flag_parsing is initialized above this definition.
+       stop_parsing = ->
+               flag_parsing = false
 
        parse_error = ->
                if parse_error_cb?
@@ -1051,7 +1059,7 @@ parse_html = (txt, parse_error_cb = null) ->
                while t.attrs_a.length
                        a = t.attrs_a.pop()
                        attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
-               el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
+               el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
 
                # TODO 2. If the newly created element has an xmlns attribute in the
                # XMLNS namespace whose value is not exactly the same as the element's
@@ -1230,12 +1238,12 @@ parse_html = (txt, parse_error_cb = null) ->
                if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
                        el = insert_html_element t
                        open_els.shift()
-                       el.acknowledge_self_closing()
+                       t.acknowledge_self_closing()
                        return
                if t.type is TYPE_START_TAG and t.name is 'meta'
                        el = insert_html_element t
                        open_els.shift()
-                       el.acknowledge_self_closing()
+                       t.acknowledge_self_closing()
                        # fixfull encoding stuff
                        return
                if t.type is TYPE_START_TAG and t.name is 'title'
@@ -1458,7 +1466,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                                parse_error()
                                                break
                                # TODO stack of template insertion modes thing
-                               flag_parsing = false # stop parsing
+                               stop_parsing()
                        when TYPE_END_TAG
                                switch t.name
                                        when 'body'
@@ -1594,7 +1602,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                                        parse_error()
                                                        el = insert_html_element t
                                                        open_els.shift()
-                                                       el.acknowledge_self_closing()
+                                                       t.acknowledge_self_closing()
                                        when 'form'
                                                parse_error()
                                                if form_element_pointer?
@@ -1705,7 +1713,7 @@ parse_html = (txt, parse_error_cb = null) ->
                if t.type is TYPE_START_TAG and t.name is 'col'
                        el = insert_html_element t
                        open_els.shift()
-                       el.acknowledge_self_closing()
+                       t.acknowledge_self_closing()
                        return
                if t.type is TYPE_END_TAG and t.name is 'colgroup'
                        if open_els[0].name is 'colgroup'
@@ -1982,14 +1990,191 @@ parse_html = (txt, parse_error_cb = null) ->
                ins_mode_in_select t
                return
 
-       # CONTINUE more insertion modes!
-
-
+       # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
+       # Handles one token (t) in the "in template" insertion mode. Most tokens
+       # are delegated to another insertion mode; table-related start tags first
+       # replace the current template insertion mode, then get reprocessed in
+       # the newly selected mode.
+       ins_mode_in_template = (t) ->
+               if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
+                       ins_mode_in_body t
+                       return
+               if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
+                       ins_mode_in_head t
+                       return
+               if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
+                       template_insertion_modes.shift()
+                       template_insertion_modes.unshift ins_mode_in_table
+                       insertion_mode = ins_mode_in_table
+                       insertion_mode t
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'col'
+                       template_insertion_modes.shift()
+                       template_insertion_modes.unshift ins_mode_in_column_group
+                       insertion_mode = ins_mode_in_column_group
+                       insertion_mode t
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'tr'
+                       template_insertion_modes.shift()
+                       template_insertion_modes.unshift ins_mode_in_table_body
+                       insertion_mode = ins_mode_in_table_body
+                       insertion_mode t
+                       return
+               if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
+                       template_insertion_modes.shift()
+                       template_insertion_modes.unshift ins_mode_in_row
+                       insertion_mode = ins_mode_in_row
+                       insertion_mode t
+                       return
+               if t.type is TYPE_START_TAG
+                       # any other start tag
+                       template_insertion_modes.shift()
+                       template_insertion_modes.unshift ins_mode_in_body
+                       insertion_mode = ins_mode_in_body
+                       insertion_mode t
+                       return
+               if t.type is TYPE_END_TAG
+                       # any other end tag: parse error, token is ignored
+                       parse_error()
+                       return
+               # Was "EOF"; every other insertion mode tests TYPE_EOF.
+               if t.type is TYPE_EOF
+                       unless template_tag_is_open()
+                               stop_parsing()
+                               return
+                       parse_error()
+                       # pop elements until a template element has been popped
+                       loop
+                               el = open_els.shift()
+                               if el.name is 'template' # fixfull check namespace
+                                       break
+                       clear_afe_to_marker()
+                       template_insertion_modes.shift()
+                       reset_insertion_mode()
+                       # reprocess the token in whatever mode was just selected
+                       insertion_mode t
+               return
 
+       # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
+       # Handles one token (t) in the "after body" insertion mode.
+       ins_mode_after_body = (t) ->
+               if is_space_tok t
+                       ins_mode_in_body t
+                       return
+               if t.type is TYPE_COMMENT
+                       # The spec attaches the comment to the first element in the stack
+                       # of open elements (the html element). This stack keeps the
+                       # current node at index 0 (elements are popped with shift()), so
+                       # the first element pushed is the last entry.
+                       html_el = open_els[open_els.length - 1]
+                       insert_comment t, [html_el, html_el.children.length]
+                       return
+               if t.type is TYPE_DOCTYPE
+                       parse_error()
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'html'
+                       ins_mode_in_body t
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'html'
+                       if flag_fragment_parsing
+                               # fragment case: parse error, token is ignored
+                               parse_error()
+                               return
+                       insertion_mode = ins_mode_after_after_body
+                       return
+               if t.type is TYPE_EOF
+                       stop_parsing()
+                       return
+               # Anything else: parse error, then reprocess the token "in body"
+               parse_error()
+               insertion_mode = ins_mode_in_body
+               insertion_mode t
+               return
 
+       # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
+       # Handles one token (t) in the "in frameset" insertion mode.
+       ins_mode_in_frameset = (t) ->
+               if is_space_tok t
+                       insert_character t
+                       return
+               if t.type is TYPE_COMMENT
+                       insert_comment t
+                       return
+               if t.type is TYPE_DOCTYPE
+                       parse_error()
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'html'
+                       ins_mode_in_body t
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'frameset'
+                       insert_html_element t
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'frameset'
+                       # TODO ?correct for: "if the current node is the root html element"
+                       if open_els.length is 1
+                               parse_error()
+                               return # fragment case
+                       open_els.shift()
+                       if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
+                               insertion_mode = ins_mode_after_frameset
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'frame'
+                       # frame is a void element: insert it, pop it straight away and
+                       # acknowledge the token's self-closing flag
+                       insert_html_element t
+                       open_els.shift()
+                       t.acknowledge_self_closing()
+                       return
+               # Was "TYPE_START TAG" (space instead of underscore), which compiled
+               # to a call of an undefined TYPE_START.
+               if t.type is TYPE_START_TAG and t.name is 'noframes'
+                       ins_mode_in_head t
+                       return
+               if t.type is TYPE_EOF
+                       # TODO ?correct for: "if the current node is not the root html element"
+                       if open_els.length isnt 1
+                               parse_error()
+                       stop_parsing()
+                       return
+               # Anything else: parse error, token is ignored
+               parse_error()
+               return
 
+       # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
+       # Handles one token (t) in the "after frameset" insertion mode.
+       ins_mode_after_frameset = (t) ->
+               if is_space_tok t
+                       insert_character t
+                       return
+               if t.type is TYPE_COMMENT
+                       insert_comment t
+                       return
+               if t.type is TYPE_DOCTYPE
+                       parse_error()
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'html'
+                       ins_mode_in_body t
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'html'
+                       # Was "insert_mode = ...", which only set an unused local and
+                       # left the parser in this mode.
+                       insertion_mode = ins_mode_after_after_frameset
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'noframes'
+                       ins_mode_in_head t
+                       return
+               if t.type is TYPE_EOF
+                       stop_parsing()
+                       return
+               # Anything else: parse error, token is ignored
+               parse_error()
+               return
 
+       # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
+       # Handles one token (t) in the "after after body" insertion mode.
+       ins_mode_after_after_body = (t) ->
+               if t.type is TYPE_COMMENT
+                       # comments here become the last child of the document
+                       insert_comment t, [doc, doc.children.length]
+                       return
+               if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
+                       ins_mode_in_body t
+                       return
+               if t.type is TYPE_EOF
+                       stop_parsing()
+                       return
+               # Anything else: parse error, switch to "in body" and reprocess the
+               # token there (previously the token was dropped after the switch)
+               parse_error()
+               insertion_mode = ins_mode_in_body
+               insertion_mode t
+               return
 
+       # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
+       # Handles one token (t) in the "after after frameset" insertion mode.
+       ins_mode_after_after_frameset = (t) ->
+               if t.type is TYPE_COMMENT
+                       # comments here become the last child of the document
+                       insert_comment t, [doc, doc.children.length]
+                       return
+               # name of the tag when t is a start tag, null for every other token
+               start_tag_name = if t.type is TYPE_START_TAG then t.name else null
+               if t.type is TYPE_DOCTYPE or is_space_tok(t) or start_tag_name is 'html'
+                       ins_mode_in_body t
+               else if t.type is TYPE_EOF
+                       stop_parsing()
+               else if start_tag_name is 'noframes'
+                       ins_mode_in_head t
+               else
+                       # anything else: parse error, token is ignored
+                       parse_error()
+               return
 
 
 
@@ -2578,6 +2763,7 @@ parse_html = (txt, parse_error_cb = null) ->
        temporary_buffer = null
        pending_table_character_tokens = []
        head_element_pointer = null
+       flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
 
        # tokenizer initialization
        tok_state = tok_state_data
@@ -2587,8 +2773,10 @@ parse_html = (txt, parse_error_cb = null) ->
                t = tok_state()
                if t?
                        insertion_mode t
+                       # fixfull parse error if has self-closing flag, but it wasn't acknowledged
        return doc.children
 
+test_results = passed: 0, failed: 0
 # everything below is tests on the above
 test_equals = (description, output, expected_output) ->
        if output is expected_output
@@ -2625,8 +2813,13 @@ test_parser = (args) ->
                        console.log " parse errs: #{JSON.stringify parse_errors}"
                else
                        console.log "   No parse errors"
+               test_results.failed += 1
        else
-               console.log "passed \"#{args.name}\""
+               #console.log "passed \"#{args.name}\""
+               test_results.passed += 1
+test_summary = ->
+       # print the pass/fail tallies accumulated in test_results
+       {passed, failed} = test_results
+       console.log "Tests passed: #{passed}"
+       console.log "Tests Failed: #{failed}"
 
 test_parser name: "empty", \
        html: "",
@@ -2801,3 +2994,4 @@ test_parser name: "html5lib tables 16", \
 #test_parser name: "html5lib tables 17", \
 #      html: '<table><tr><td><svg><desc><td>',
 #      expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'
+test_summary()