JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
finish parsing tables, pass html5lib's table tests
author Jason Woofenden <jason@jasonwoof.com>
Sun, 20 Dec 2015 16:20:47 +0000 (11:20 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Sun, 20 Dec 2015 16:20:47 +0000 (11:20 -0500)
parse-html.coffee

index c63a42d..c16f690 100644 (file)
@@ -95,6 +95,8 @@ class Node
                attrs[k] = v for k, v of @attrs
                return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
        acknowledge_self_closing: ->
+               @flag 'did_self_close', true
+       flag: ->
                # fixfull
        serialize: (shallow = false, show_ids = false) -> # for unit tests
                ret = ''
@@ -356,6 +358,7 @@ parse_html = (txt, parse_error_cb = null) ->
        flag_foster_parenting = null
        form_element_pointer = null
        temporary_buffer = null
+       pending_table_character_tokens = null
 
        parse_error = ->
                if parse_error_cb?
@@ -1053,7 +1056,7 @@ parse_html = (txt, parse_error_cb = null) ->
 
        # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
        # position should be [node, index_within_children]
-       tree_insert_comment = (t, position = null) ->
+       insert_comment = (t, position = null) ->
                position ?= adjusted_insertion_location()
                position[0].children.splice position[1], 0, t
 
@@ -1086,7 +1089,7 @@ parse_html = (txt, parse_error_cb = null) ->
                        insert_character t
                        return
                if t.type is TYPE_COMMENT
-                       tree_insert_comment t
+                       insert_comment t
                        return
                if t.type is TYPE_DOCTYPE
                        parse_error()
@@ -1187,7 +1190,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                                insert_character t
                                                flag_frameset_ok = false
                        when TYPE_COMMENT
-                               tree_insert_comment t
+                               insert_comment t
                        when TYPE_DOCTYPE
                                parse_error()
                        when TYPE_START_TAG
@@ -1399,7 +1402,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                else
                                        ins_mode_in_table_else t
                        when TYPE_COMMENT
-                               tree_insert_comment t
+                               insert_comment t
                        when TYPE_DOCTYPE
                                parse_error()
                        when TYPE_START_TAG
@@ -1479,22 +1482,116 @@ parse_html = (txt, parse_error_cb = null) ->
                                ins_mode_in_table_else t
 
 
-       # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
+       # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
        ins_mode_in_table_text = (t) ->
-               switch t.type
-                       when TYPE_TEXT
-                               switch t.text
-                                       when "\u0000"
-                                               parse_error()
-                                               return
-               console.log "unimplemented ins_mode_in_table_text"
-               # FIXME CONTINUE
+               if t.type is TYPE_TEXT and t.text is "\u0000"
+                       # huh? I thought the tokenizer didn't emit these
+                       parse_error()
+                       return
+               if t.type is TYPE_TEXT
+                       pending_table_character_tokens.push t
+                       return
+               # Anything else
+               all_space = true
+               for old in pending_table_character_tokens
+                       unless space_chars.indexOf(old.text) > -1
+                               all_space = false
+                               break
+               if all_space
+                       for old in pending_table_character_tokens
+                               insert_character old
+               else
+                       for old in pending_table_character_tokens
+                               ins_mode_table_else old
+               pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
+               insertion_mode = original_insertion_mode
+               insertion_mode t
+
+       # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
+       ins_mode_in_caption = (t) -> # handles one token t while insertion_mode is the caption mode
+               if t.type is TYPE_END_TAG and t.name is 'caption'
+                       if is_in_table_scope 'caption'
+                               generate_implied_end_tags()
+                               if open_els[0].name isnt 'caption' # open_els[0] is the current node
+                                       parse_error()
+                               loop # remove from the front of open_els until a caption has been removed
+                                       el = open_els.shift()
+                                       if el.name is 'caption'
+                                               break
+                               clear_afe_to_marker()
+                               insertion_mode = ins_mode_in_table # was the bare name in_table; handlers are named ins_mode_*
+                       else
+                               parse_error()
+                               # fragment case
+                       return
+               if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
+                       parse_error()
+                       if is_in_table_scope 'caption'
+                               loop # same removal as above, but without generating implied end tags
+                                       el = open_els.shift()
+                                       if el.name is 'caption'
+                                               break
+                               clear_afe_to_marker()
+                               insertion_mode = ins_mode_in_table # was the bare name in_table; handlers are named ins_mode_*
+                               insertion_mode t # reprocess t with the handler just selected
+                       # else fragment case
+                       return
+               if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
+                       parse_error() # these end tags are reported and otherwise ignored
+                       return
+               # Anything else: delegate the token to the in-body handler
+               ins_mode_in_body t
+
+       # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
+       ins_mode_in_column_group = (t) -> # handles one token t while insertion_mode is the column-group mode
+               if t.type is TYPE_TEXT and space_chars.indexOf(t.text) > -1 # text token found in space_chars
+                       insert_character t
+                       return
+               if t.type is TYPE_COMMENT
+                       insert_comment t
+                       return
+               if t.type is TYPE_DOCTYPE
+                       parse_error()
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'html'
+                       ins_mode_in_body t
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'col'
+                       el = insert_html_element t
+                       open_els.shift() # remove the just-inserted col from the stack right away
+                       el.acknowledge_self_closing()
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'colgroup'
+                       if open_els[0].name is 'colgroup'
+                               open_els.shift() # was open_els[0].shift(), which called shift on the node instead of the stack
+                               insertion_mode = ins_mode_in_table
+                       else
+                               parse_error()
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'col'
+                       parse_error()
+                       return
+               if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
+                       ins_mode_in_head t
+                       return
+               if t.type is TYPE_EOF
+                       ins_mode_in_body t
+                       return
+               # Anything else: leave the colgroup (if it is the current node) and reprocess t in the table mode
+               if open_els[0].name isnt 'colgroup'
+                       parse_error() # current node is not a colgroup: report and ignore the token
+                       return
+               open_els.shift()
+               insertion_mode = ins_mode_in_table
+               insertion_mode t
+               return
 
        # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
        ins_mode_in_table_body = (t) ->
                if t.type is TYPE_START_TAG and t.name is 'tr'
                        clear_stack_to_table_body_context()
                        insert_html_element t
+                       insertion_mode = ins_mode_in_row
                        return
                if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
                        parse_error()
@@ -2210,6 +2307,7 @@ parse_html = (txt, parse_error_cb = null) ->
        flag_foster_parenting = false
        form_element_pointer = null
        temporary_buffer = null
+       pending_table_character_tokens = []
 
        # tokenizer initialization
        tok_state = tok_state_data
@@ -2380,3 +2478,55 @@ test_parser name: "html5lib aaa02 1", \
 test_parser name: "html5lib aaa02 2", \
        html: '<a><div><style></style><address><a>',
        expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
+test_parser name: "html5lib tables 1", \
+       html: '<table><th>',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
+test_parser name: "html5lib tables 2", \
+       html: '<table><td>',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
+test_parser name: "html5lib tables 3", \
+       html: "<table><col foo='bar'>",
+       expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
+test_parser name: "html5lib tables 4", \
+       html: '<table><colgroup></html>foo',
+       expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
+test_parser name: "html5lib tables 5", \
+       html: '<table></table><p>foo',
+       expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
+test_parser name: "html5lib tables 6", \
+       html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
+test_parser name: "html5lib tables 7", \
+       html: '<table><select><option>3</select></table>',
+       expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
+test_parser name: "html5lib tables 8", \
+       html: '<table><select><table></table></select></table>',
+       expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
+test_parser name: "html5lib tables 9", \
+       html: '<table><select></table>',
+       expected: 'tag:"select",{},[],tag:"table",{},[]'
+test_parser name: "html5lib tables 10", \
+       html: '<table><select><option>A<tr><td>B</td></tr></table>',
+       expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
+test_parser name: "html5lib tables 11", \
+       html: '<table><td></body></caption></col></colgroup></html>foo',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
+test_parser name: "html5lib tables 12", \
+       html: '<table><td>A</table>B',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
+test_parser name: "html5lib tables 13", \
+       html: '<table><tr><caption>',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
+test_parser name: "html5lib tables 14", \
+       html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
+test_parser name: "html5lib tables 15", \
+       html: '<table><td><tr>',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
+test_parser name: "html5lib tables 16", \
+       html: '<table><td><button><td>',
+       expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
+# TODO implement svg parsing
+#test_parser name: "html5lib tables 17", \
+#      html: '<table><tr><td><svg><desc><td>',
+#      expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'