From 12ed8923d45bd72d534bfadb0dc5a644c97f3bc8 Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Sat, 19 Dec 2015 12:32:59 -0500 Subject: [PATCH] implement noa's ark, junk after attribute name --- parse-html.coffee | 85 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 7 deletions(-) diff --git a/parse-html.coffee b/parse-html.coffee index aaecff3..86d0136 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -105,8 +105,17 @@ class Node ret += "##{@id}," if shallow break - ret += JSON.stringify @attrs - ret += ',[' + attr_keys = [] + for k of @attrs + attr_keys.push k + attr_keys.sort() + ret += '{' + sep = '' + for k in attr_keys + ret += sep + sep = ',' + ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}" + ret += '},[' sep = '' for c in @children ret += sep @@ -347,6 +356,21 @@ parse_html = (txt, parse_error_cb = null) -> else console.log "Parse error at character #{cur} of #{txt.length}" + afe_push = (new_el) -> + matches = 0 + for el, i in afe + if el.name is new_el.name and el.namespace is new_el.namespace + for k, v of el.attrs + continue unless new_el.attrs[k] is v + for k, v of new_el.attrs + continue unless el.attrs[k] is v + matches += 1 + if matches is 3 + afe.splice i, 1 + break + afe.unshift new_el + afe_push_marker = -> + afe.unshift new_afe_marker() # the functions below impliment the Tree Contstruction algorithm # http://www.w3.org/TR/html5/syntax.html#tree-construction @@ -859,7 +883,7 @@ parse_html = (txt, parse_error_cb = null) -> return close_p_if_in_button_scope = -> if is_in_button_scope 'p' - close_a_p_element() + close_p_element() # http://www.w3.org/TR/html5/syntax.html#insert-a-character tree_insert_text = (t) -> @@ -1117,11 +1141,11 @@ parse_html = (txt, parse_error_cb = null) -> open_els.splice i, 1 reconstruct_active_formatting_elements() el = insert_html_element t - afe.unshift el + afe_push el when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u' reconstruct_active_formatting_elements() el = insert_html_element t - afe.unshift el + afe_push el when 'table' # fixfull quirksmode thing close_p_if_in_button_scope() @@ -1248,7 +1272,7 @@ parse_html = (txt, parse_error_cb = null) -> switch t.name when 'caption' clear_stack_to_table_context() - afe.unshift new_afe_marker() + afe_push_marker() insert_html_element t insertion_mode = ins_mode_in_caption when 'colgroup' @@ -1378,7 +1402,7 @@ parse_html = (txt, parse_error_cb = null) -> clear_stack_to_table_row_context() insert_html_element t insertion_mode = ins_mode_in_cell - afe.unshift new_afe_marker() + afe_push_marker() return if t.type is TYPE_END_TAG and t.name is 'tr' if is_in_table_scope 'tr' @@ -1626,6 +1650,41 @@ parse_html = (txt, parse_error_cb = null) -> tok_cur_tag.attrs_a[0][0] += c return null + # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state + tok_state_after_attribute_name = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' + return + if c is '/' + tok_state = tok_state_self_closing_start_tag + return + if c is '=' + tok_state = tok_state_before_attribute_value + return + if c is '>' + tok_state = tok_state_data + return + if uc_alpha.indexOf(c) > -1 + tok_cur_tag.attrs_a.unshift [c.toLowerCase(), ''] + tok_state = tok_state_attribute_name + return + if c is "\u0000" + parse_error() + tok_cur_tag.attrs_a.unshift ["\ufffd", ''] + tok_state = tok_state_attribute_name + return + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # reconsume + return + if c is '"' or c is "'" or c is '<' + parse_error() + # fall through to Anything else + # Anything else + tok_cur_tag.attrs_a.unshift [c, ''] + tok_state = tok_state_attribute_name + # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state tok_state_before_attribute_value = -> switch c = txt.charAt(cur++) @@ -1971,3 +2030,15 @@ test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \ test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \ html: '
', expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]' +test_parser name: "html5lib aaa 16 (correctly nested 4b)", \ + html: 'xy', + expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"' +test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \ + html: '

x', + expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]' +test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \ + html: '

x', + expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]' +test_parser name: "junk after attribute close-quote", \ + html: '

foo

x', + expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]' -- 1.7.10.4