JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix implied_end_tags and </p>
author Jason Woofenden <jason@jasonwoof.com>
Sat, 19 Dec 2015 14:39:46 +0000 (09:39 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Sat, 19 Dec 2015 14:39:46 +0000 (09:39 -0500)
parse-html.coffee

index c71567d..25fa20d 100644 (file)
@@ -553,7 +553,7 @@ parse_html = (txt, parse_error_cb = null) ->
                        tree_insert_element el
                        afe[i] = el
                        break if i is 0
-                       i -= 1
+                       i -= 1 # Advance
 
        # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
        # adoption agency algorithm
@@ -562,6 +562,10 @@ parse_html = (txt, parse_error_cb = null) ->
        #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
        #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
        adoption_agency = (subject) ->
+               debug_log "adoption_agency()"
+               debug_log "tree: #{serialize_els tree.children, false, true}"
+               debug_log "open_els: #{serialize_els open_els, true, true}"
+               debug_log "afe: #{serialize_els afe, true, true}"
                if open_els[0].name is subject
                        el = open_els[0]
                        open_els.shift()
@@ -570,6 +574,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                if t is el
                                        afe.splice i, 1
                                        break
+                       debug_log "aaa: starting off with subject on top of stack, exiting"
                        return
                outer = 0
                loop
@@ -590,6 +595,7 @@ parse_html = (txt, parse_error_cb = null) ->
                        # If there is no such element, then abort these steps and instead
                        # act as described in the "any other end tag" entry above.
                        if fe is null
+                               debug_log "aaa: fe not found in afe"
                                in_body_any_other_end_tag subject
                                return
                        # 6. If formatting element is not in the stack of open elements,
@@ -601,6 +607,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                        in_open_els = true
                                        break
                        unless in_open_els
+                               debug_log "aaa: fe not found in open_els"
                                parse_error()
                                # "remove it from the list" must mean afe, since it's not in open_els
                                afe.splice fe_of_afe, 1
@@ -609,6 +616,7 @@ parse_html = (txt, parse_error_cb = null) ->
                        # the element is not in scope, then this is a parse error; abort
                        # these steps.
                        unless el_is_in_scope fe
+                               debug_log "aaa: fe not in scope"
                                parse_error()
                                return
                        # 8. If formatting element is not the current node, this is a parse
@@ -634,6 +642,7 @@ parse_html = (txt, parse_error_cb = null) ->
                        # formatting element from the list of active formatting elements,
                        # and finally abort these steps.
                        if fb is null
+                               debug_log "aaa: no fb"
                                loop
                                        t = open_els.shift()
                                        if t is fe
@@ -666,8 +675,8 @@ parse_html = (txt, parse_error_cb = null) ->
                                                break
                                node = node_next ? node_above
                                debug_log "inner loop #{inner}"
-                               debug_log "open_els: #{serialize_els open_els, true, true}"
                                debug_log "tree: #{serialize_els tree.children, false, true}"
+                               debug_log "open_els: #{serialize_els open_els, true, true}"
                                debug_log "afe: #{serialize_els afe, true, true}"
                                debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
                                debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
@@ -845,8 +854,8 @@ parse_html = (txt, parse_error_cb = null) ->
                if open_els[0].name isnt 'p'
                        parse_error()
                while open_els.length > 1 # just in case
-                       t = open_els.shift()
-                       if t.name is 'p'
+                       el = open_els.shift()
+                       if el.name is 'p'
                                return
        close_p_if_in_button_scope = ->
                if is_in_button_scope 'p'
@@ -855,6 +864,7 @@ parse_html = (txt, parse_error_cb = null) ->
        # http://www.w3.org/TR/html5/syntax.html#insert-a-character
        tree_insert_text = (t) ->
                dest = adjusted_insertion_location()
+               # fixfull check for Document node
                if dest[1] > 0
                        prev = dest[0].children[dest[1] - 1]
                        if prev.type is TYPE_TEXT
@@ -1019,7 +1029,7 @@ parse_html = (txt, parse_error_cb = null) ->
        # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
        # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
        generate_implied_end_tags = (except = null) ->
-               while end_tag_implied[open_els[0]] and open_els[0].name isnt except
+               while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
                        open_els.shift()
 
        # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
@@ -1106,11 +1116,11 @@ parse_html = (txt, parse_error_cb = null) ->
                                                                if el is found
                                                                        open_els.splice i, 1
                                                reconstruct_active_formatting_elements()
-                                               el = tree_insert_element t
+                                               el = insert_html_element t
                                                afe.unshift el
                                        when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
                                                reconstruct_active_formatting_elements()
-                                               el = tree_insert_element t
+                                               el = insert_html_element t
                                                afe.unshift el
                                        when 'table'
                                                # fixfull quirksmode thing
@@ -1120,7 +1130,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                        # TODO lots more to implement here
                                        else # any other start tag
                                                reconstruct_active_formatting_elements()
-                                               tree_insert_element t
+                                               insert_html_element t
                        when TYPE_EOF
                                ok_tags = {
                                        dd: true, dt: true, li: true, p: true, tbody: true, td: true,
@@ -1160,7 +1170,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                                unless is_in_button_scope 'p'
                                                        parse_error()
                                                        insert_html_element new_open_tag 'p'
-                                                       close_p_element()
+                                               close_p_element()
                                        # TODO lots more close tags to implement here
                                        when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
                                                adoption_agency t.name
@@ -1844,119 +1854,98 @@ test_parser = (args) ->
        prev_node_id = 0 # reset counter
        parsed = parse_html args.html, errors_cb
        serialized = serialize_els parsed, false, false
-       if serialized isnt args.expected # or parse_errors.length isnt args.errors
+       if serialized isnt args.expected
                debug_log_each (str) ->
                        console.log str
                console.log "FAILED: \"#{args.name}\""
-       else
-               console.log "passed \"#{args.name}\""
-       if serialized isnt args.expected
                console.log "      Input: #{args.html}"
                console.log "    Correct: #{args.expected}"
                console.log "     Output: #{serialized}"
-               if parse_errors.length isnt args.errors
-                       console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+               if parse_errors.length > 0
+                       console.log " parse errs: #{JSON.stringify parse_errors}"
+               else
+                       console.log "   No parse errors"
+       else
+               console.log "passed \"#{args.name}\""
 
 test_parser name: "empty", \
        html: "",
-       expected: '',
-       errors: 0
+       expected: ''
 test_parser name: "just text", \
        html: "abc",
-       expected: 'text:"abc"',
-       errors: 0
+       expected: 'text:"abc"'
 test_parser name: "named entity", \
        html: "a&amp;1234",
-       expected: 'text:"a&1234"',
-       errors: 0
+       expected: 'text:"a&1234"'
 test_parser name: "broken named character references", \
        html: "1&amp2&&amp;3&aabbcc;",
-       expected: 'text:"1&2&&3&aabbcc;"',
-       errors: 2
+       expected: 'text:"1&2&&3&aabbcc;"'
 test_parser name: "numbered entity overrides", \
        html: "1&#X80&#x80; &#x83",
-       expected: 'text:"1€€ ƒ"',
-       errors: 0
+       expected: 'text:"1€€ ƒ"'
 test_parser name: "open tag", \
        html: "foo<span>bar",
-       expected: 'text:"foo",tag:"span",{},[text:"bar"]',
-       errors: 1 # no close tag
+       expected: 'text:"foo",tag:"span",{},[text:"bar"]'
 test_parser name: "open tag with attributes", \
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
-       expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
-       errors: 1 # no close tag
+       expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
 test_parser name: "open tag with attributes of various quotings", \
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
-       expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
-       errors: 1 # no close tag
+       expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
 test_parser name: "attribute entity exceptions dq", \
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
-       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
-       errors: 2 # no close tag, &amp= in attr
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
 test_parser name: "attribute entity exceptions sq", \
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
-       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
-       errors: 2 # no close tag, &amp= in attr
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
 test_parser name: "attribute entity exceptions uq", \
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
-       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
-       errors: 2 # no close tag, &amp= in attr
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
 test_parser name: "matching closing tags", \
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
-       expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
-       errors: 0
+       expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
 test_parser name: "missing closing tag inside", \
        html: "foo<div>bar<span>baz</div>qux",
-       expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
-       errors: 1 # close tag mismatch
+       expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
 test_parser name: "mis-matched closing tags", \
        html: "<span>12<div>34</span>56</div>78",
-       expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
-       errors: 2 # misplaced </span>, no </span> at the end
+       expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
 test_parser name: "mis-matched formatting elements", \
        html: "12<b>34<i>56</b>78</i>90",
-       expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
-       errors: 1 # no idea how many their should be
+       expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
        html: '<p>1<b>2<i>3</b>4</i>5</p>',
-       expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]',
-       errors: 1
+       expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
        html: '<b>1<p>2</b>3</p>',
-       expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]',
-       errors: 1
+       expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
 test_parser name: "crazy formatting elements test", \
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
        # firefox does this:
-       expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
-       errors: 6 # no idea how many there should be
+       expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
 test_parser name: "html5lib aaa 1", \
        html: '<a><p></a></p>',
-       expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
-       errors: 2
+       expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
 test_parser name: "html5lib aaa 2", \
        html: '<a>1<p>2</a>3</p>',
-       expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
-       errors: 2
+       expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
 test_parser name: "html5lib aaa 3", \
        html: '<a>1<button>2</a>3</button>',
-       expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
-       errors: 2
+       expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
 test_parser name: "html5lib aaa 4", \
        html: '<a>1<b>2</a>3</b>',
-       expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
-       errors: 2
+       expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
 test_parser name: "html5lib aaa 5 (two divs deep)", \
        html: '<a>1<div>2<div>3</a>4</div>5</div>',
-       expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]',
-       errors: 3
+       expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
 test_parser name: "html5lib aaa 6 (foster parenting)", \
        html: '<table><a>1<p>2</a>3</p>',
-       expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]',
-       errors: 10
+       expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
+test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
+       html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
+       expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
        html: '<table><a>1<td>2</td>3</table>',
-       expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]',
-       errors: 10
+       expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'