JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implement Noah's Ark clause, junk after attribute name
authorJason Woofenden <jason@jasonwoof.com>
Sat, 19 Dec 2015 17:32:59 +0000 (12:32 -0500)
committerJason Woofenden <jason@jasonwoof.com>
Sat, 19 Dec 2015 17:32:59 +0000 (12:32 -0500)
parse-html.coffee

index aaecff3..86d0136 100644 (file)
@@ -105,8 +105,17 @@ class Node
                                        ret += "##{@id},"
                                if shallow
                                        break
-                               ret += JSON.stringify @attrs
-                               ret += ',['
+                               attr_keys = []
+                               for k of @attrs
+                                       attr_keys.push k
+                               attr_keys.sort()
+                               ret += '{'
+                               sep = ''
+                               for k in attr_keys
+                                       ret += sep
+                                       sep = ','
+                                       ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
+                               ret += '},['
                                sep = ''
                                for c in @children
                                        ret += sep
@@ -347,6 +356,21 @@ parse_html = (txt, parse_error_cb = null) ->
                else
                        console.log "Parse error at character #{cur} of #{txt.length}"
 
+       # Push new_el onto the front of the list of active formatting elements
+       # (afe), applying the "Noah's Ark clause": if three entries already have
+       # the same name, namespace and attributes as new_el, the third such entry
+       # found (scanning from the front of the list) is removed first.
+       #
+       # NOTE(review): the spec only counts entries after the last marker; this
+       # scan walks the whole list and relies on markers never matching by
+       # name/namespace -- confirm against new_afe_marker().
+       afe_push = (new_el) ->
+               matches = 0
+               for el, i in afe
+                       continue unless el.name is new_el.name and el.namespace is new_el.namespace
+                       # Attributes must agree in both directions (same keys, same
+                       # values). A flag is needed because break/continue inside the
+                       # attribute loops only affects those loops, not this one.
+                       same_attrs = true
+                       for k, v of el.attrs
+                               unless new_el.attrs[k] is v
+                                       same_attrs = false
+                                       break
+                       if same_attrs
+                               for k, v of new_el.attrs
+                                       unless el.attrs[k] is v
+                                               same_attrs = false
+                                               break
+                       continue unless same_attrs
+                       matches += 1
+                       if matches is 3
+                               # drop this entry so at most three identical ones remain
+                               afe.splice i, 1
+                               break
+               afe.unshift new_el
+       # Put a fresh marker (from new_afe_marker) at the front of the list of
+       # active formatting elements.
+       afe_push_marker = ->
+               marker = new_afe_marker()
+               afe.unshift marker
 
        # the functions below implement the Tree Construction algorithm
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
@@ -859,7 +883,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                return
        close_p_if_in_button_scope = ->
                if is_in_button_scope 'p'
-                       close_a_p_element()
+                       close_p_element()
 
        # http://www.w3.org/TR/html5/syntax.html#insert-a-character
        tree_insert_text = (t) ->
@@ -1117,11 +1141,11 @@ parse_html = (txt, parse_error_cb = null) ->
                                                                        open_els.splice i, 1
                                                reconstruct_active_formatting_elements()
                                                el = insert_html_element t
-                                               afe.unshift el
+                                               afe_push el
                                        when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
                                                reconstruct_active_formatting_elements()
                                                el = insert_html_element t
-                                               afe.unshift el
+                                               afe_push el
                                        when 'table'
                                                # fixfull quirksmode thing
                                                close_p_if_in_button_scope()
@@ -1248,7 +1272,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                switch t.name
                                        when 'caption'
                                                clear_stack_to_table_context()
-                                               afe.unshift new_afe_marker()
+                                               afe_push_marker()
                                                insert_html_element t
                                                insertion_mode = ins_mode_in_caption
                                        when 'colgroup'
@@ -1378,7 +1402,7 @@ parse_html = (txt, parse_error_cb = null) ->
                        clear_stack_to_table_row_context()
                        insert_html_element t
                        insertion_mode = ins_mode_in_cell
-                       afe.unshift new_afe_marker()
+                       afe_push_marker()
                        return
                if t.type is TYPE_END_TAG and t.name is 'tr'
                        if is_in_table_scope 'tr'
@@ -1626,6 +1650,41 @@ parse_html = (txt, parse_error_cb = null) ->
                                        tok_cur_tag.attrs_a[0][0] += c
                return null
 
+       # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
+       #
+       # Consumes one character of txt (advancing cur) and picks the next
+       # tokenizer state. A new attribute is started by unshifting a
+       # [name, value] pair onto tok_cur_tag.attrs_a.
+       #
+       # Always returns null explicitly, so the value of the final state
+       # assignment (a function) is never handed back to the caller.
+       tok_state_after_attribute_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+                       # whitespace: stay in this state
+                       return null
+               if c is '/'
+                       tok_state = tok_state_self_closing_start_tag
+                       return null
+               if c is '='
+                       tok_state = tok_state_before_attribute_value
+                       return null
+               if c is '>'
+                       # NOTE(review): the spec also says to emit the current tag token
+                       # here; this branch returns no token -- confirm it is emitted
+                       # elsewhere.
+                       tok_state = tok_state_data
+                       return null
+               if uc_alpha.indexOf(c) > -1
+                       # uppercase letter: start a new attribute with the lowercased name
+                       tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
+                       tok_state = tok_state_attribute_name
+                       return null
+               if c is "\u0000"
+                       parse_error()
+                       # NUL is replaced by U+FFFD REPLACEMENT CHARACTER
+                       tok_cur_tag.attrs_a.unshift ["\ufffd", '']
+                       tok_state = tok_state_attribute_name
+                       return null
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       cur -= 1 # reconsume
+                       return null
+               if c is '"' or c is "'" or c is '<'
+                       parse_error()
+                       # fall through to Anything else
+               # Anything else
+               tok_cur_tag.attrs_a.unshift [c, '']
+               tok_state = tok_state_attribute_name
+               return null
+
        # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
        tok_state_before_attribute_value = ->
                switch c = txt.charAt(cur++)
@@ -1971,3 +2030,15 @@ test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
        html: '<div><a><b><u><i><code><div></a>',
        expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
+test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
+       html: '<b><b><b><b>x</b></b></b></b>y',
+       expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
+test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
+       html: '<p><b><b><b><b><p>x',
+       expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
+test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
+       html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
+       expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
+test_parser name: "junk after attribute close-quote", \
+       html: '<p><b c="d", e="f">foo<p>x',
+       expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'