From adc7477c34f3a2aa480e7f2af5ea954d2421d000 Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Tue, 22 Dec 2015 17:30:45 -0500 Subject: [PATCH] implement rest of tokenizer states --- parse-html.coffee | 421 +++++++++++++++++++++++++++++++++++++++++++++++++---- test.coffee | 11 +- 2 files changed, 398 insertions(+), 34 deletions(-) diff --git a/parse-html.coffee b/parse-html.coffee index e193118..f64c734 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -181,6 +181,11 @@ digits = "0123456789" alnum = lc_alpha + uc_alpha + digits hex_chars = digits + "abcdefABCDEF" +is_uc_alpha = (str) -> + return str.length is 1 and uc_alpha.indexOf(str) > -1 +is_lc_alpha = (str) -> + return str.length is 1 and lc_alpha.indexOf(str) > -1 + # some SVG elements have dashes in them tag_name_chars = alnum + "-" @@ -191,6 +196,15 @@ is_space = (txt) -> is_space_tok = (t) -> return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1 +is_input_hidden_tok = (t) -> + return unless t.type is TYPE_START_TAG + for a of t.attrs_a + if a[0] is 'type' + if a[1].toLowerCase() is 'hidden' + return true + return false + return false + # https://en.wikipedia.org/wiki/Whitespace_character#Unicode whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000" @@ -450,7 +464,7 @@ parse_html = (txt, parse_error_cb = null) -> for t in open_els if t.name is tag_name and (namespace is null or namespace is t.namespace) return true - if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option' + if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option' return false return false # this checks for a particular element, not by name @@ -499,9 +513,11 @@ parse_html = (txt, parse_error_cb = null) -> return clear_afe_to_marker = -> loop + return unless afe.length > 0 # this happens in fragment case, ?spec error el = afe.shift() if el.type is TYPE_AFE_MARKER return + return # 8.2.3.1 ... # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately @@ -1018,7 +1034,7 @@ parse_html = (txt, parse_error_cb = null) -> # last template's template contents, after its last child (if # any), and abort these substeps. if last_template and (last_table is null or last_template_i < last_table_i) - target = template # fixfull should be it's contents + target = last_template # fixfull should be it's contents target_i = target.children.length break # 4. If there is no last table, then let adjusted insertion @@ -1272,7 +1288,7 @@ parse_html = (txt, parse_error_cb = null) -> return if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false insert_html_element t - insertion_mode = in_head_noscript # FIXME implement + insertion_mode = ins_mode_in_head_noscript # FIXME implement return if t.type is TYPE_START_TAG and t.name is 'script' ail = adjusted_insertion_location() @@ -1614,7 +1630,7 @@ parse_html = (txt, parse_error_cb = null) -> when 'style', 'script', 'template' ins_mode_in_head t when 'input' - if token_is_input_hidden t + if is_input_hidden_tok t ins_mode_in_table_else t else parse_error() @@ -1691,7 +1707,7 @@ parse_html = (txt, parse_error_cb = null) -> if el.name is 'caption' break clear_afe_to_marker() - insertion_mode = in_table + insertion_mode = ins_mode_in_table else parse_error() # fragment case @@ -1704,7 +1720,7 @@ parse_html = (txt, parse_error_cb = null) -> if el.name is 'caption' break clear_afe_to_marker() - insertion_mode = in_table + insertion_mode = ins_mode_in_table insertion_mode t # else fragment case return @@ -1735,7 +1751,7 @@ parse_html = (txt, parse_error_cb = null) -> return if t.type is TYPE_END_TAG and t.name is 'colgroup' if open_els[0].name is 'colgroup' - open_els[0].shift() + open_els.shift() insertion_mode = ins_mode_in_table else parse_error() @@ -2049,7 +2065,7 @@ parse_html = (txt, parse_error_cb = null) -> if t.type is TYPE_END_TAG parse_error() return - if t.type is EOF + if t.type is TYPE_EOF unless template_tag_is_open() stop_parsing() return @@ -2120,7 +2136,7 @@ parse_html = (txt, parse_error_cb = null) -> open_els.shift() t.acknowledge_self_closing() return - if t.type is TYPE_START TAG and t.name is 'noframes' + if t.type is TYPE_START_TAG and t.name is 'noframes' ins_mode_in_head t return if t.type is TYPE_EOF @@ -2291,10 +2307,10 @@ parse_html = (txt, parse_error_cb = null) -> tok_cur_tag = new_comment_token '?' tok_state = tok_state_bogus_comment else - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag = new_open_tag c tok_state = tok_state_tag_name - else if uc_alpha.indexOf(c) > -1 + else if is_uc_alpha(c) tok_cur_tag = new_open_tag c.toLowerCase() tok_state = tok_state_tag_name else @@ -2315,10 +2331,10 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_data return new_text_node ' -1 + if is_uc_alpha(c) tok_cur_tag = new_end_tag c.toLowerCase() tok_state = tok_state_tag_name - else if lc_alpha.indexOf(c) > -1 + else if is_lc_alpha(c) tok_cur_tag = new_end_tag c tok_state = tok_state_tag_name else @@ -2346,7 +2362,7 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() tok_state = tok_state_data else - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() else tok_cur_tag.name += c @@ -2367,12 +2383,12 @@ parse_html = (txt, parse_error_cb = null) -> # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state tok_state_rcdata_end_tag_open = -> c = txt.charAt(cur++) - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag = new_end_tag c.toLowerCase() temporary_buffer += c tok_state = tok_state_rcdata_end_tag_name return null - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag = new_end_tag c temporary_buffer += c tok_state = tok_state_rcdata_end_tag_name @@ -2410,11 +2426,11 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_data return tok_cur_tag # else fall through to "Anything else" - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() temporary_buffer += c return null - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag.name += c temporary_buffer += c return null @@ -2438,12 +2454,12 @@ parse_html = (txt, parse_error_cb = null) -> # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state tok_state_rawtext_end_tag_open = -> c = txt.charAt(cur++) - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag = new_end_tag c.toLowerCase() temporary_buffer += c tok_state = tok_state_rawtext_end_tag_name return null - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag = new_end_tag c temporary_buffer += c tok_state = tok_state_rawtext_end_tag_name @@ -2471,11 +2487,11 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_data return tok_cur_tag # else fall through to "Anything else" - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() temporary_buffer += c return null - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag.name += c temporary_buffer += c return null @@ -2484,7 +2500,334 @@ parse_html = (txt, parse_error_cb = null) -> cur -= 1 # reconsume the input character return new_character_token ' + c = txt.charAt(cur++) + if c is '/' + temporary_buffer = '' + tok_state = tok_state_script_data_end_tag_open + return + if c is '!' + tok_state = tok_state_script_data_escape_start + return new_character_token ' + c = txt.charAt(cur++) + if is_uc_alpha(c) + tok_cur_tag = new_end_tag c.toLowerCase() + temporary_buffer += c + tok_state = tok_state_script_data_end_tag_name + return + if is_lc_alpha(c) + tok_cur_tag = new_end_tag c + temporary_buffer += c + tok_state = tok_state_script_data_end_tag_name + return + # Anything else + tok_state = tok_state_script_data + cur -= 1 # Reconsume + return new_character_token ' + c = txt.charAt(cur++) + if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_before_attribute_name + return + # fall through + if c is '/' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_self_closing_start_tag + return + # fall through + if is_uc_alpha(c) + tok_cur_tag.name += c.toLowerCase() + temporary_buffer += c + return + if is_lc_alpha(c) + tok_cur_tag.name += c + temporary_buffer += c + return + # Anything else + tok_state = tok_state_script_data + cur -= 1 # Reconsume + return new_character_token " + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_escape_start_dash + return new_character_token '-' + # Anything else + tok_state = tok_state_script_data + cur -= 1 # Reconsume + return + + # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state + tok_state_script_data_escape_start_dash = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_escaped_dash_dash + return new_character_token '-' + # Anything else + tok_state = tok_state_script_data + cur -= 1 # Reconsume + return + + # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state + tok_state_script_data_escaped = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_escaped_dash + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_escaped_less_than_sign + return + if c is "\u0000" + parse_error() + return new_character_token "\ufffd" + if c is '' # EOF + tok_state = tok_state_data + parse_error() + cur -= 1 # Reconsume + return + # Anything else + return new_character_token c + + # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state + tok_state_script_data_escaped_dash = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_escaped_dash_dash + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_escaped_less_than_sign + return + if c is "\u0000" + parse_error() + tok_state = tok_state_script_data_escaped + return new_character_token "\ufffd" + if c is '' # EOF + tok_state = tok_state_data + parse_error() + cur -= 1 # Reconsume + return + # Anything else + tok_state = tok_state_script_data_escaped + return new_character_token c + + # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state + tok_state_script_data_escaped_dash_dash = -> + c = txt.charAt(cur++) + if c is '-' + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_escaped_less_than_sign + return + if c is '>' + tok_state = tok_state_script_data + return new_character_token '>' + if c is "\u0000" + parse_error() + tok_state = tok_state_script_data_escaped + return new_character_token "\ufffd" + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + tok_state = tok_state_script_data_escaped + return new_character_token c + + # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state + tok_state_script_data_escaped_less_than_sign = -> + c = txt.charAt(cur++) + if c is '/' + temporary_buffer = '' + tok_state = tok_state_script_data_escaped_end_tag_open + return + if is_uc_alpha(c) + temporary_buffer = c.toLowerCase() # yes, really + tok_state = tok_state_script_data_double_escape_start + return new_character_token "<#{c}" # fixfull split + if is_lc_alpha(c) + temporary_buffer = c + tok_state = tok_state_script_data_double_escape_start + return new_character_token "<#{c}" # fixfull split + # Anything else + tok_state = tok_state_script_data_escaped + cur -= 1 # Reconsume + return new_character_token c + + # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state + tok_state_script_data_escaped_end_tag_open = -> + c = txt.charAt(cur++) + if is_uc_alpha(c) + tok_cur_tag = new_end_tag c.toLowerCase() + temporary_buffer += c + tok_state = tok_state_script_data_escaped_end_tag_name + return + if is_lc_alpha(c) + tok_cur_tag = new_end_tag c + temporary_buffer += c + tok_state = tok_state_script_data_escaped_end_tag_name + return + # Anything else + tok_state = tok_state_script_data_escaped + cur -= 1 # Reconsume + return new_character_token ' + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_before_attribute_name + return + # fall through + if c is '/' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_self_closing_start_tag + return + # fall through + if is_uc_alpha(c) + tok_cur_tag.name += c.toLowerCase() + temporary_buffer += c.toLowerCase() + return + if is_lc_alpha(c) + tok_cur_tag.name += c + temporary_buffer += c.toLowerCase() + return + # Anything else + tok_state = tok_state_script_data_escaped + cur -= 1 # Reconsume + return new_character_token " + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>' + if temporary_buffer is 'script' + tok_state = tok_state_script_data_double_escaped + else + tok_state = tok_state_script_data_escaped + return new_character_token c + if is_uc_alpha(c) + temporary_buffer += c.toLowerCase() # yes, really lowercase + return new_character_token c + if is_lc_alpha(c) + temporary_buffer += c + return new_character_token c + # Anything else + tok_state = tok_state_script_data_escaped + cur -= 1 # Reconsume + return + + # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state + tok_state_script_data_double_escaped = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_double_escaped_dash + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_double_escaped_less_than_sign + return new_character_token '<' + if c is "\u0000" + parse_error() + return new_character_token "\ufffd" + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + return new_character_token c + + # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state + tok_state_script_data_double_escaped_dash = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_double_escaped_dash_dash + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_double_escaped_less_than_sign + return new_character_token '<' + if c is "\u0000" + parse_error() + tok_state = tok_state_script_data_double_escaped + return new_character_token "\ufffd" + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + tok_state = tok_state_script_data_double_escaped + return new_character_token c + + # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state + tok_state_script_data_double_escaped_dash_dash = -> + c = txt.charAt(cur++) + if c is '-' + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_double_escaped_less_than_sign + return new_character_token '<' + if c is '>' + tok_state = tok_state_script_data + return new_character_token '>' + if c is "\u0000" + parse_error() + tok_state = tok_state_script_data_double_escaped + return new_character_token "\ufffd" + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + tok_state = tok_state_script_data_double_escaped + return new_character_token c + + # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state + tok_state_script_data_double_escaped_less_than_sign = -> + c = txt.charAt(cur++) + if c is '/' + temporary_buffer = '' + tok_state = tok_state_script_data_double_escape_end + return new_character_token '/' + # Anything else + tok_state = tok_state_script_data_double_escaped + cur -= 1 # Reconsume + return + + # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state + tok_state_script_data_double_escape_end = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>' + if temporary_buffer is 'script' + tok_state = tok_state_script_data_escaped + else + tok_state = tok_state_script_data_double_escaped + return new_character_token c + if is_uc_alpha(c) + temporary_buffer += c.toLowerCase() # yes, really lowercase + return new_character_token c + if is_lc_alpha(c) + temporary_buffer += c + return new_character_token c + # Anything else + tok_state = tok_state_script_data_double_escaped + cur -= 1 # Reconsume + return # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state tok_state_before_attribute_name = -> @@ -2510,7 +2853,7 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() tok_state = tok_state_data else - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) attr_name = c.toLowerCase() else attr_name = c @@ -2543,7 +2886,7 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() tok_state = tok_state_data else - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.attrs_a[0][0] = c.toLowerCase() else tok_cur_tag.attrs_a[0][0] += c @@ -2563,7 +2906,7 @@ parse_html = (txt, parse_error_cb = null) -> if c is '>' tok_state = tok_state_data return - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.attrs_a.unshift [c.toLowerCase(), ''] tok_state = tok_state_attribute_name return @@ -2691,6 +3034,24 @@ parse_html = (txt, parse_error_cb = null) -> cur -= 1 # we didn't handle that char return null + # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state + tok_state_self_closing_start_tag = -> + c = txt.charAt(cur++) + if c is '>' + tok_cur_tag.flag 'self-closing' + tok_state = tok_state_data + return tok_cur_tag + if c is '' + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + parse_error() + tok_state = tok_state_before_attribute_name + cur -= 1 # Reconsume + return + # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state # WARNING: put a comment token in tok_cur_tag before setting this state tok_state_bogus_comment = -> @@ -2718,7 +3079,7 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_doctype return acn = adjusted_current_node() - if acn and acn.namespace isnt NS_HTML and text.substr(cur, 7) is '[CDATA[' + if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA[' cur += 7 tok_state = tok_state_cdata_section return @@ -2881,7 +3242,7 @@ parse_html = (txt, parse_error_cb = null) -> c = txt.charAt(cur++) if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' return - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag = new_doctype_token c.toLowerCase() tok_state = tok_state_doctype_name return @@ -2917,7 +3278,7 @@ parse_html = (txt, parse_error_cb = null) -> if c is '>' tok_state = tok_state_data return tok_cur_tag - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() return if c is "\u0000" @@ -3379,7 +3740,7 @@ parse_html = (txt, parse_error_cb = null) -> pending_table_character_tokens = [] head_element_pointer = null flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case) - context_element = null # FIXME initialize from args.fragment + context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments # tokenizer initialization tok_state = tok_state_data diff --git a/test.coffee b/test.coffee index 9e82bdd..3098bbd 100644 --- a/test.coffee +++ b/test.coffee @@ -7390,6 +7390,7 @@ tests = [ expected: "| \n" }, { name: "tests_innerHTML_1.dat #85" + html: "" fragment: "html" expected: "| \n| \n" }, { @@ -7832,6 +7833,8 @@ test_parser = (args) -> # console.log str console.log "FAILED: \"#{args.name}\"" console.log " Input: #{args.html}" + if args.fragment? + console.log " Fragment: #{args.fragment}" console.log " Correct: #{args.expected}" console.log " Output: #{serialized}" if parse_errors.length > 0 @@ -7843,8 +7846,7 @@ test_parser = (args) -> console.log "passed \"#{args.name}\"" test_results.passed += 1 test_summary = -> - console.log "Tests passed: #{test_results.passed}" - console.log "Tests Failed: #{test_results.failed}" + console.log "Tests passed: #{test_results.passed}, Tests Failed: #{test_results.failed}" next_test = 0 @@ -7852,11 +7854,12 @@ run_tests_and_breathe = -> start_time = new Date() loop if next_test >= tests.length + test_summary() return test_parser tests[next_test] next_test += 1 now = new Date() if now - start_time > 100 # miliseconds - setTimeout run_tests_and_breathe, 1 + break + setTimeout run_tests_and_breathe, 1 run_tests_and_breathe() -test_summary() -- 1.7.10.4