X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=f64c734e13e7dab898248150aad813cacce83041;hb=adc7477c34f3a2aa480e7f2af5ea954d2421d000;hp=c6ed9a5769cdb683f41dbe747f3fd9b82a4033ba;hpb=ffc91832d8b2c91ddd4407cf4036b6fc0eeca928;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index c6ed9a5..f64c734 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -47,6 +47,12 @@ # 0: a "end of the list", "current node", "bottommost", "last" +# browser +# note: to get this to run outside a browser, you'll have to write a native +# implementation of decode_named_char_ref() +unless module?.exports? + window.wheic = {} + module = exports: window.wheic # Each node is an obect of the Node class. Here are the Node types: TYPE_TAG = 0 # name, {attributes}, [children] @@ -158,8 +164,10 @@ new_element = (name) -> new_text_node = (txt) -> return new Node TYPE_TEXT, text: txt new_character_token = new_text_node -new_comment_node = (txt) -> +new_comment_token = (txt) -> return new Node TYPE_COMMENT, text: txt +new_doctype_token = (name) -> + return new Node TYPE_DOCTYPE, name: name new_eof_token = -> return new Node TYPE_EOF new_afe_marker = -> @@ -173,6 +181,11 @@ digits = "0123456789" alnum = lc_alpha + uc_alpha + digits hex_chars = digits + "abcdefABCDEF" +is_uc_alpha = (str) -> + return str.length is 1 and uc_alpha.indexOf(str) > -1 +is_lc_alpha = (str) -> + return str.length is 1 and lc_alpha.indexOf(str) > -1 + # some SVG elements have dashes in them tag_name_chars = alnum + "-" @@ -183,6 +196,15 @@ is_space = (txt) -> is_space_tok = (t) -> return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1 +is_input_hidden_tok = (t) -> + return unless t.type is TYPE_START_TAG + for a of t.attrs_a + if a[0] is 'type' + if a[1].toLowerCase() is 'hidden' + return true + return false + return false + # https://en.wikipedia.org/wiki/Whitespace_character#Unicode whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000" @@ -367,6 +389,7 @@ parse_html = (txt, parse_error_cb = null) -> pending_table_character_tokens = null head_element_pointer = null flag_fragment_parsing = null + context_element = null stop_parsing = -> flag_parsing = false @@ -441,7 +464,7 @@ parse_html = (txt, parse_error_cb = null) -> for t in open_els if t.name is tag_name and (namespace is null or namespace is t.namespace) return true - if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option' + if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option' return false return false # this checks for a particular element, not by name @@ -490,9 +513,11 @@ parse_html = (txt, parse_error_cb = null) -> return clear_afe_to_marker = -> loop + return unless afe.length > 0 # this happens in fragment case, ?spec error el = afe.shift() if el.type is TYPE_AFE_MARKER return + return # 8.2.3.1 ... # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately @@ -617,6 +642,14 @@ parse_html = (txt, parse_error_cb = null) -> node = open_els[node_i] # 19. Return to the step labeled loop. + # 8.2.3.2 + + # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node + adjusted_current_node = -> + if open_els.length is 1 and flag_fragment_parsing + return context_element + return open_els[0] + # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements # this implementation is structured (mostly) as described at the link above. # capitalized comments are the "labels" described at the link above. @@ -1001,7 +1034,7 @@ parse_html = (txt, parse_error_cb = null) -> # last template's template contents, after its last child (if # any), and abort these substeps. if last_template and (last_table is null or last_template_i < last_table_i) - target = template # fixfull should be it's contents + target = last_template # fixfull should be it's contents target_i = target.children.length break # 4. If there is no last table, then let adjusted insertion @@ -1140,12 +1173,12 @@ parse_html = (txt, parse_error_cb = null) -> if is_space_tok t return if t.type is TYPE_COMMENT - # fixfull this is supposed to be "the last child of the document object" + # ?fixfull doc.children.push t return if t.type is TYPE_DOCTYPE + # FIXME check identifiers, set quirks, etc # fixfull - t.name = 'html' doc.children.push t insertion_mode = ins_mode_before_html return @@ -1167,6 +1200,7 @@ parse_html = (txt, parse_error_cb = null) -> return if t.type is TYPE_START_TAG and t.name is 'html' el = token_to_element t, NS_HTML, doc + doc.children.push el open_els.unshift(el) # fixfull (big paragraph in spec about manifest, fragment, urls, etc) insertion_mode = ins_mode_before_head @@ -1247,19 +1281,19 @@ parse_html = (txt, parse_error_cb = null) -> # fixfull encoding stuff return if t.type is TYPE_START_TAG and t.name is 'title' - parse_generic_rcdata_element t + parse_generic_rcdata_text t return if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style')) parse_generic_raw_text t return if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false insert_html_element t - insertion_mode = in_head_noscript # FIXME implement + insertion_mode = ins_mode_in_head_noscript # FIXME implement return if t.type is TYPE_START_TAG and t.name is 'script' ail = adjusted_insertion_location() el = token_to_element t, NS_HTML, ail - el.flag_parser_inserted true # FIXME implement + el.flag 'parser-inserted', true # FIXME implement # fixfull frament case ail[0].children.splice ail[1], 0, el open_els.unshift el @@ -1300,12 +1334,12 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() return ins_mode_in_head_else t - + # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript ins_mode_in_head_noscript = (t) -> # FIXME ?fixfull console.log "ins_mode_in_head_noscript unimplemented" - + # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode ins_mode_after_head_else = (t) -> body_tok = new_open_tag 'body' @@ -1596,7 +1630,7 @@ parse_html = (txt, parse_error_cb = null) -> when 'style', 'script', 'template' ins_mode_in_head t when 'input' - if token_is_input_hidden t + if is_input_hidden_tok t ins_mode_in_table_else t else parse_error() @@ -1673,7 +1707,7 @@ parse_html = (txt, parse_error_cb = null) -> if el.name is 'caption' break clear_afe_to_marker() - insertion_mode = in_table + insertion_mode = ins_mode_in_table else parse_error() # fragment case @@ -1686,7 +1720,7 @@ parse_html = (txt, parse_error_cb = null) -> if el.name is 'caption' break clear_afe_to_marker() - insertion_mode = in_table + insertion_mode = ins_mode_in_table insertion_mode t # else fragment case return @@ -1717,7 +1751,7 @@ parse_html = (txt, parse_error_cb = null) -> return if t.type is TYPE_END_TAG and t.name is 'colgroup' if open_els[0].name is 'colgroup' - open_els[0].shift() + open_els.shift() insertion_mode = ins_mode_in_table else parse_error() @@ -2031,7 +2065,7 @@ parse_html = (txt, parse_error_cb = null) -> if t.type is TYPE_END_TAG parse_error() return - if t.type is EOF + if t.type is TYPE_EOF unless template_tag_is_open() stop_parsing() return @@ -2102,7 +2136,7 @@ parse_html = (txt, parse_error_cb = null) -> open_els.shift() t.acknowledge_self_closing() return - if t.type is TYPE_START TAG and t.name is 'noframes' + if t.type is TYPE_START_TAG and t.name is 'noframes' ins_mode_in_head t return if t.type is TYPE_EOF @@ -2270,12 +2304,13 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_end_tag_open when '?' parse_error() + tok_cur_tag = new_comment_token '?' tok_state = tok_state_bogus_comment else - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag = new_open_tag c tok_state = tok_state_tag_name - else if uc_alpha.indexOf(c) > -1 + else if is_uc_alpha(c) tok_cur_tag = new_open_tag c.toLowerCase() tok_state = tok_state_tag_name else @@ -2296,14 +2331,15 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_data return new_text_node ' -1 + if is_uc_alpha(c) tok_cur_tag = new_end_tag c.toLowerCase() tok_state = tok_state_tag_name - else if lc_alpha.indexOf(c) > -1 + else if is_lc_alpha(c) tok_cur_tag = new_end_tag c tok_state = tok_state_tag_name else parse_error() + tok_cur_tag = new_comment_token '/' tok_state = tok_state_bogus_comment return null @@ -2326,7 +2362,7 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() tok_state = tok_state_data else - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() else tok_cur_tag.name += c @@ -2347,12 +2383,12 @@ parse_html = (txt, parse_error_cb = null) -> # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state tok_state_rcdata_end_tag_open = -> c = txt.charAt(cur++) - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag = new_end_tag c.toLowerCase() temporary_buffer += c tok_state = tok_state_rcdata_end_tag_name return null - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag = new_end_tag c temporary_buffer += c tok_state = tok_state_rcdata_end_tag_name @@ -2390,11 +2426,11 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_data return tok_cur_tag # else fall through to "Anything else" - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() temporary_buffer += c return null - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag.name += c temporary_buffer += c return null @@ -2418,12 +2454,12 @@ parse_html = (txt, parse_error_cb = null) -> # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state tok_state_rawtext_end_tag_open = -> c = txt.charAt(cur++) - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag = new_end_tag c.toLowerCase() temporary_buffer += c tok_state = tok_state_rawtext_end_tag_name return null - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag = new_end_tag c temporary_buffer += c tok_state = tok_state_rawtext_end_tag_name @@ -2451,11 +2487,11 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_data return tok_cur_tag # else fall through to "Anything else" - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() temporary_buffer += c return null - if lc_alpha.indexOf(c) > -1 + if is_lc_alpha(c) tok_cur_tag.name += c temporary_buffer += c return null @@ -2464,7 +2500,334 @@ parse_html = (txt, parse_error_cb = null) -> cur -= 1 # reconsume the input character return new_character_token ' + c = txt.charAt(cur++) + if c is '/' + temporary_buffer = '' + tok_state = tok_state_script_data_end_tag_open + return + if c is '!' + tok_state = tok_state_script_data_escape_start + return new_character_token ' + c = txt.charAt(cur++) + if is_uc_alpha(c) + tok_cur_tag = new_end_tag c.toLowerCase() + temporary_buffer += c + tok_state = tok_state_script_data_end_tag_name + return + if is_lc_alpha(c) + tok_cur_tag = new_end_tag c + temporary_buffer += c + tok_state = tok_state_script_data_end_tag_name + return + # Anything else + tok_state = tok_state_script_data + cur -= 1 # Reconsume + return new_character_token ' + c = txt.charAt(cur++) + if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_before_attribute_name + return + # fall through + if c is '/' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_self_closing_start_tag + return + # fall through + if is_uc_alpha(c) + tok_cur_tag.name += c.toLowerCase() + temporary_buffer += c + return + if is_lc_alpha(c) + tok_cur_tag.name += c + temporary_buffer += c + return + # Anything else + tok_state = tok_state_script_data + cur -= 1 # Reconsume + return new_character_token " + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_escape_start_dash + return new_character_token '-' + # Anything else + tok_state = tok_state_script_data + cur -= 1 # Reconsume + return + + # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state + tok_state_script_data_escape_start_dash = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_escaped_dash_dash + return new_character_token '-' + # Anything else + tok_state = tok_state_script_data + cur -= 1 # Reconsume + return + + # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state + tok_state_script_data_escaped = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_escaped_dash + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_escaped_less_than_sign + return + if c is "\u0000" + parse_error() + return new_character_token "\ufffd" + if c is '' # EOF + tok_state = tok_state_data + parse_error() + cur -= 1 # Reconsume + return + # Anything else + return new_character_token c + + # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state + tok_state_script_data_escaped_dash = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_escaped_dash_dash + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_escaped_less_than_sign + return + if c is "\u0000" + parse_error() + tok_state = tok_state_script_data_escaped + return new_character_token "\ufffd" + if c is '' # EOF + tok_state = tok_state_data + parse_error() + cur -= 1 # Reconsume + return + # Anything else + tok_state = tok_state_script_data_escaped + return new_character_token c + + # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state + tok_state_script_data_escaped_dash_dash = -> + c = txt.charAt(cur++) + if c is '-' + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_escaped_less_than_sign + return + if c is '>' + tok_state = tok_state_script_data + return new_character_token '>' + if c is "\u0000" + parse_error() + tok_state = tok_state_script_data_escaped + return new_character_token "\ufffd" + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + tok_state = tok_state_script_data_escaped + return new_character_token c + + # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state + tok_state_script_data_escaped_less_than_sign = -> + c = txt.charAt(cur++) + if c is '/' + temporary_buffer = '' + tok_state = tok_state_script_data_escaped_end_tag_open + return + if is_uc_alpha(c) + temporary_buffer = c.toLowerCase() # yes, really + tok_state = tok_state_script_data_double_escape_start + return new_character_token "<#{c}" # fixfull split + if is_lc_alpha(c) + temporary_buffer = c + tok_state = tok_state_script_data_double_escape_start + return new_character_token "<#{c}" # fixfull split + # Anything else + tok_state = tok_state_script_data_escaped + cur -= 1 # Reconsume + return new_character_token c + + # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state + tok_state_script_data_escaped_end_tag_open = -> + c = txt.charAt(cur++) + if is_uc_alpha(c) + tok_cur_tag = new_end_tag c.toLowerCase() + temporary_buffer += c + tok_state = tok_state_script_data_escaped_end_tag_name + return + if is_lc_alpha(c) + tok_cur_tag = new_end_tag c + temporary_buffer += c + tok_state = tok_state_script_data_escaped_end_tag_name + return + # Anything else + tok_state = tok_state_script_data_escaped + cur -= 1 # Reconsume + return new_character_token ' + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_before_attribute_name + return + # fall through + if c is '/' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_self_closing_start_tag + return + # fall through + if is_uc_alpha(c) + tok_cur_tag.name += c.toLowerCase() + temporary_buffer += c.toLowerCase() + return + if is_lc_alpha(c) + tok_cur_tag.name += c + temporary_buffer += c.toLowerCase() + return + # Anything else + tok_state = tok_state_script_data_escaped + cur -= 1 # Reconsume + return new_character_token " + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>' + if temporary_buffer is 'script' + tok_state = tok_state_script_data_double_escaped + else + tok_state = tok_state_script_data_escaped + return new_character_token c + if is_uc_alpha(c) + temporary_buffer += c.toLowerCase() # yes, really lowercase + return new_character_token c + if is_lc_alpha(c) + temporary_buffer += c + return new_character_token c + # Anything else + tok_state = tok_state_script_data_escaped + cur -= 1 # Reconsume + return + + # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state + tok_state_script_data_double_escaped = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_double_escaped_dash + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_double_escaped_less_than_sign + return new_character_token '<' + if c is "\u0000" + parse_error() + return new_character_token "\ufffd" + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + return new_character_token c + + # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state + tok_state_script_data_double_escaped_dash = -> + c = txt.charAt(cur++) + if c is '-' + tok_state = tok_state_script_data_double_escaped_dash_dash + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_double_escaped_less_than_sign + return new_character_token '<' + if c is "\u0000" + parse_error() + tok_state = tok_state_script_data_double_escaped + return new_character_token "\ufffd" + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + tok_state = tok_state_script_data_double_escaped + return new_character_token c + + # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state + tok_state_script_data_double_escaped_dash_dash = -> + c = txt.charAt(cur++) + if c is '-' + return new_character_token '-' + if c is '<' + tok_state = tok_state_script_data_double_escaped_less_than_sign + return new_character_token '<' + if c is '>' + tok_state = tok_state_script_data + return new_character_token '>' + if c is "\u0000" + parse_error() + tok_state = tok_state_script_data_double_escaped + return new_character_token "\ufffd" + if c is '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + tok_state = tok_state_script_data_double_escaped + return new_character_token c + + # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state + tok_state_script_data_double_escaped_less_than_sign = -> + c = txt.charAt(cur++) + if c is '/' + temporary_buffer = '' + tok_state = tok_state_script_data_double_escape_end + return new_character_token '/' + # Anything else + tok_state = tok_state_script_data_double_escaped + cur -= 1 # Reconsume + return + + # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state + tok_state_script_data_double_escape_end = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>' + if temporary_buffer is 'script' + tok_state = tok_state_script_data_escaped + else + tok_state = tok_state_script_data_double_escaped + return new_character_token c + if is_uc_alpha(c) + temporary_buffer += c.toLowerCase() # yes, really lowercase + return new_character_token c + if is_lc_alpha(c) + temporary_buffer += c + return new_character_token c + # Anything else + tok_state = tok_state_script_data_double_escaped + cur -= 1 # Reconsume + return # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state tok_state_before_attribute_name = -> @@ -2490,7 +2853,7 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() tok_state = tok_state_data else - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) attr_name = c.toLowerCase() else attr_name = c @@ -2523,7 +2886,7 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() tok_state = tok_state_data else - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.attrs_a[0][0] = c.toLowerCase() else tok_cur_tag.attrs_a[0][0] += c @@ -2543,7 +2906,7 @@ parse_html = (txt, parse_error_cb = null) -> if c is '>' tok_state = tok_state_data return - if uc_alpha.indexOf(c) > -1 + if is_uc_alpha(c) tok_cur_tag.attrs_a.unshift [c.toLowerCase(), ''] tok_state = tok_state_attribute_name return @@ -2671,6 +3034,619 @@ parse_html = (txt, parse_error_cb = null) -> cur -= 1 # we didn't handle that char return null + # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state + tok_state_self_closing_start_tag = -> + c = txt.charAt(cur++) + if c is '>' + tok_cur_tag.flag 'self-closing' + tok_state = tok_state_data + return tok_cur_tag + if c is '' + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return + # Anything else + parse_error() + tok_state = tok_state_before_attribute_name + cur -= 1 # Reconsume + return + + # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state + # WARNING: put a comment token in tok_cur_tag before setting this state + tok_state_bogus_comment = -> + next_gt = txt.indexOf '>', cur + if next_gt is -1 + val = txt.substr cur + cur = txt.length + else + val = txt.substr cur, (next_gt - cur) + cur = next_gt + 1 + val = val.replace "\u0000", "\ufffd" + tok_cur_tag.text += val + tok_state = tok_state_data + return tok_cur_tag + + # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state + tok_state_markup_declaration_open = -> + if txt.substr(cur, 2) is '--' + cur += 2 + tok_cur_tag = new_comment_token '' + tok_state = tok_state_comment_start + return + if txt.substr(cur, 7).toLowerCase() is 'doctype' + cur += 7 + tok_state = tok_state_doctype + return + acn = adjusted_current_node() + if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA[' + cur += 7 + tok_state = tok_state_cdata_section + return + # Otherwise + parse_error() + tok_cur_tag = new_comment_token '!' # TODO test ("!" right?) + tok_state = tok_state_bogus_comment + return + + # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state + tok_state_comment_start = -> + switch c = txt.charAt(cur++) + when '-' + tok_state = tok_state_comment_start_dash + when "\u0000" + parse_error() + return new_character_token "\ufffd" + when '>' + parse_error() + tok_state = tok_state_data + return tok_cur_tag + when '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return tok_cur_tag + else + tok_cur_tag.text += c + return null + + # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state + tok_state_comment_start_dash = -> + switch c = txt.charAt(cur++) + when '-' + tok_state = tok_state_comment_end + when "\u0000" + parse_error() + tok_cur_tag.text += "-\ufffd" + tok_state = tok_state_comment + when '>' + parse_error() + tok_state = tok_state_data + return tok_cur_tag + when '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return tok_cur_tag + else + tok_cur_tag.text += "-#{c}" + tok_state = tok_state_comment + return null + + # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state + tok_state_comment = -> + switch c = txt.charAt(cur++) + when '-' + tok_state = tok_state_comment_end_dash + when "\u0000" + parse_error() + tok_cur_tag.text += "\ufffd" + when '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return tok_cur_tag + else + tok_cur_tag.text += c + return null + + # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state + tok_state_comment_end_dash = -> + switch c = txt.charAt(cur++) + when '-' + tok_state = tok_state_comment_end + when "\u0000" + parse_error() + tok_cur_tag.text += "-\ufffd" + tok_state = tok_state_comment + when '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return tok_cur_tag + else + tok_cur_tag.text += "-#{c}" + tok_state = tok_state_comment + return null + + # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state + tok_state_comment_end = -> + switch c = txt.charAt(cur++) + when '>' + tok_state = tok_state_data + return tok_cur_tag + when "\u0000" + parse_error() + tok_cur_tag.text += "--\ufffd" + tok_state = tok_state_comment + when '!' + parse_error() + tok_state = tok_state_comment_end_bang + when '-' + parse_error() + tok_cur_tag.text += '-' + when '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return tok_cur_tag + else + parse_error() + tok_cur_tag.text += "--#{c}" + tok_state = tok_state_comment + return null + + # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state + tok_state_comment_end_bang = -> + switch c = txt.charAt(cur++) + when '-' + tok_cur_tag.text += "--!#{c}" + tok_state = tok_state_comment_end_dash + when '>' + tok_state = tok_state_data + return tok_cur_tag + when "\u0000" + parse_error() + tok_cur_tag.text += "--!\ufffd" + tok_state = tok_state_comment + when '' # EOF + parse_error() + tok_state = tok_state_data + cur -= 1 # Reconsume + return tok_cur_tag + else + tok_cur_tag.text += "--!#{c}" + tok_state = tok_state_comment + return null + + # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state + tok_state_doctype = -> + switch c = txt.charAt(cur++) + when "\t", "\u000a", "\u000c", ' ' + tok_state = tok_state_before_doctype_name + when '' # EOF + parse_error() + tok_state = tok_state_data + el = new_doctype_token '' + el.flag 'force-quirks', true + cur -= 1 # Reconsume + return el + else + parse_error() + tok_state = tok_state_before_doctype_name + cur -= 1 # Reconsume + return null + + # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state + tok_state_before_doctype_name = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + return + if is_uc_alpha(c) + tok_cur_tag = new_doctype_token c.toLowerCase() + tok_state = tok_state_doctype_name + return + if c is "\u0000" + parse_error() + tok_cur_tag = new_doctype_token "\ufffd" + tok_state = tok_state_doctype_name + return + if c is '>' + parse_error() + el = new_doctype_token '' + el.flag 'force-quirks', true + tok_state = tok_state_data + return el + if c is '' # EOF + parse_error() + tok_state = tok_state_data + el = new_doctype_token '' + el.flag 'force-quirks', true + cur -= 1 # Reconsume + return el + # Anything else + tok_cur_tag = new_doctype_token c + tok_state = tok_state_doctype_name + return null + + # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state + tok_state_doctype_name = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + tok_state = tok_state_after_doctype_name + return + if c is '>' + tok_state = tok_state_data + return tok_cur_tag + if is_uc_alpha(c) + tok_cur_tag.name += c.toLowerCase() + return + if c is "\u0000" + parse_error() + tok_cur_tag.name += "\ufffd" + return + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + tok_cur_tag.name += c + return null + + # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state + tok_state_after_doctype_name = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + return + if c is '>' + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + if txt.substr(cur - 1, 6).toLowerCase() is 'public' + cur += 5 + tok_state = tok_state_after_doctype_public_keyword + return + if txt.substr(cur - 1, 6).toLowerCase() is 'system' + cur += 5 + tok_state = tok_state_after_doctype_system_keyword + return + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_bogus_doctype + return null + + # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state + tok_state_after_doctype_public_keyword = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + tok_state = tok_state_before_doctype_public_identifier + return + if c is '"' + parse_error() + tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text? + tok_state = tok_state_doctype_public_identifier_double_quoted + return + if c is "'" + parse_error() + tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text? + tok_state = tok_state_doctype_public_identifier_single_quoted + return + if c is '>' + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_bogus_doctype + return null + + # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state + tok_state_before_doctype_public_identifier = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + return + if c is '"' + parse_error() + tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text? + tok_state = tok_state_doctype_public_identifier_double_quoted + return + if c is "'" + parse_error() + tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text? + tok_state = tok_state_doctype_public_identifier_single_quoted + return + if c is '>' + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_bogus_doctype + return null + + + # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state + tok_state_doctype_public_identifier_double_quoted = -> + c = txt.charAt(cur++) + if c is '"' + tok_state = tok_state_after_doctype_public_identifier + return + if c is "\u0000" + parse_error() + tok_cur_tag.public_identifier += "\ufffd" + return + if c is '>' + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + tok_cur_tag.public_identifier += c + return null + + # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state + tok_state_doctype_public_identifier_single_quoted = -> + c = txt.charAt(cur++) + if c is "'" + tok_state = tok_state_after_doctype_public_identifier + return + if c is "\u0000" + parse_error() + tok_cur_tag.public_identifier += "\ufffd" + return + if c is '>' + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + tok_cur_tag.public_identifier += c + return null + + # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state + tok_state_after_doctype_public_identifier = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + tok_state = tok_state_between_doctype_public_and_system_identifiers + return + if c is '>' + tok_state = tok_state_data + return tok_cur_tag + if c is '"' + parse_error() + tok_cur_tag.system_identifier = '' + tok_state = tok_state_doctype_system_identifier_double_quoted + return + if c is "'" + parse_error() + tok_cur_tag.system_identifier = '' + tok_state = tok_state_doctype_system_identifier_single_quoted + return + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_bogus_doctype + return null + + # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state + tok_state_between_doctype_public_and_system_identifiers = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + return + if c is '>' + tok_state = tok_state_data + return tok_cur_tag + if c is '"' + parse_error() + tok_cur_tag.system_identifier = '' + tok_state = tok_state_doctype_system_identifier_double_quoted + return + if c is "'" + parse_error() + tok_cur_tag.system_identifier = '' + tok_state = tok_state_doctype_system_identifier_single_quoted + return + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_bogus_doctype + return null + + # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state + tok_state_after_doctype_system_keyword = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + tok_state = tok_state_before_doctype_system_identifier + return + if c is '"' + parse_error() + tok_cur_tag.system_identifier = '' + tok_state = tok_state_doctype_system_identifier_double_quoted + return + if c is "'" + parse_error() + tok_cur_tag.system_identifier = '' + tok_state = tok_state_doctype_system_identifier_single_quoted + return + if c is '>' + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_bogus_doctype + return null + + # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state + tok_state_before_doctype_system_identifier = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + return + if c is '"' + tok_cur_tag.system_identifier = '' + tok_state = tok_state_doctype_system_identifier_double_quoted + return + if c is "'" + tok_cur_tag.system_identifier = '' + tok_state = tok_state_doctype_system_identifier_single_quoted + return + if c is '>' + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_bogus_doctype + return null + + # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state + tok_state_doctype_system_identifier_double_quoted = -> + c = txt.charAt(cur++) + if c is '"' + tok_state = tok_state_after_doctype_system_identifier + return + if c is "\u0000" + parse_error() + tok_cur_tag.system_identifier += "\ufffd" + return + if c is '>' + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + tok_cur_tag.system_identifier += c + return null + + # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state + tok_state_doctype_system_identifier_single_quoted = -> + c = txt.charAt(cur++) + if c is "'" + tok_state = tok_state_after_doctype_system_identifier + return + if c is "\u0000" + parse_error() + tok_cur_tag.system_identifier += "\ufffd" + return + if c is '>' + parse_error() + tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + tok_cur_tag.system_identifier += c + return null + + # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state + tok_state_after_doctype_system_identifier = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' + return + if c is '>' + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + parse_error() + tok_state = tok_state_data + tok_cur_tag.flag 'force-quirks', true + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + parse_error() + # do _not_ tok_cur_tag.flag 'force-quirks', true + tok_state = tok_state_bogus_doctype + return null + + # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state + tok_state_bogus_doctype = -> + c = txt.charAt(cur++) + if c is '>' + tok_state = tok_state_data + return tok_cur_tag + if c is '' # EOF + tok_state = tok_state_data + cur -= 1 # Reconsume + return tok_cur_tag + # Anything else + return null + + # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference # Don't set this as a state, just call it # returns a string (NOT a text node) @@ -2750,7 +3726,7 @@ parse_html = (txt, parse_error_cb = null) -> # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML - open_els = [doc] + open_els = [] afe = [] # active formatting elements template_insertion_modes = [] insertion_mode = ins_mode_initial @@ -2764,6 +3740,7 @@ parse_html = (txt, parse_error_cb = null) -> pending_table_character_tokens = [] head_element_pointer = null flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case) + context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments # tokenizer initialization tok_state = tok_state_data @@ -2776,15 +3753,6 @@ parse_html = (txt, parse_error_cb = null) -> # fixfull parse error if has self-closing flag, but it wasn't acknolwedged return doc.children -test_results = passed: 0, failed: 0 -# everything below is tests on the above -test_equals = (description, output, expected_output) -> - if output is expected_output - console.log "passed." # don't say name, so smart consoles can merge all of these - else - console.log "FAILED: \"#{description}\"" - console.log " Expected: #{expected_output}" - console.log " Actual: #{output}" serialize_els = (els, shallow, show_ids) -> serialized = '' sep = '' @@ -2793,205 +3761,12 @@ serialize_els = (els, shallow, show_ids) -> sep = ',' serialized += t.serialize shallow, show_ids return serialized -test_parser = (args) -> - debug_log_reset() - parse_errors = [] - errors_cb = (i) -> - parse_errors.push i - prev_node_id = 0 # reset counter - parsed = parse_html args.html, errors_cb - serialized = serialize_els parsed, false, false - expected = 'tag:"html",{},[tag:"head",{},[],tag:"body",{},[' + args.expected + ']]' - if serialized isnt expected - debug_log_each (str) -> - console.log str - console.log "FAILED: \"#{args.name}\"" - console.log " Input: #{args.html}" - console.log " Correct: #{expected}" - console.log " Output: #{serialized}" - if parse_errors.length > 0 - console.log " parse errs: #{JSON.stringify parse_errors}" - else - console.log " No parse errors" - test_results.failed += 1 - else - #console.log "passed \"#{args.name}\"" - test_results.passed += 1 -test_summary = -> - console.log "Tests passed: #{test_results.passed}" - console.log "Tests Failed: #{test_results.failed}" - -test_parser name: "empty", \ - html: "", - expected: '' -test_parser name: "just text", \ - html: "abc", - expected: 'text:"abc"' -test_parser name: "named entity", \ - html: "a&1234", - expected: 'text:"a&1234"' -test_parser name: "broken named character references", \ - html: "1&2&&3&aabbcc;", - expected: 'text:"1&2&&3&aabbcc;"' -test_parser name: "numbered entity overrides", \ - html: "1€€ ƒ", - expected: 'text:"1€€ ƒ"' -test_parser name: "open tag", \ - html: "foobar", - expected: 'text:"foo",tag:"span",{},[text:"bar"]' -test_parser name: "open tag with attributes", \ - html: "foobar", - expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]' -test_parser name: "open tag with attributes of various quotings", \ - html: "foobar", - expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]' -test_parser name: "attribute entity exceptions dq", \ - html: "foobar", - expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]' -test_parser name: "attribute entity exceptions sq", \ - html: "foobar", - expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]' -test_parser name: "attribute entity exceptions uq", \ - html: "foobar", - expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]' -test_parser name: "matching closing tags", \ - html: "foohi
1
foo
2
bar", - expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"' -test_parser name: "missing closing tag inside", \ - html: "foo
barbaz
qux", - expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"' -test_parser name: "mis-matched closing tags", \ - html: "12
3456
78", - expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]' -test_parser name: "mis-matched formatting elements", \ - html: "1234567890", - expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"' -test_parser name: "8.2.8.1 Misnested tags: ", \ - html: '

12345

', - expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]' -test_parser name: "8.2.8.2 Misnested tags:

", \ - html: '1

23

', - expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]' -test_parser name: "crazy formatting elements test", \ - html: "second
first
", - # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]' - # firefox does this: - expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"' -# tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat -test_parser name: "html5lib aaa 1", \ - html: '

', - expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]' -test_parser name: "html5lib aaa 2", \ - html: '1

23

', - expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]' -test_parser name: "html5lib aaa 3", \ - html: '1', - expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]' -test_parser name: "html5lib aaa 4", \ - html: '123', - expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]' -test_parser name: "html5lib aaa 5 (two divs deep)", \ - html: '1
2
34
5
', - expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]' -test_parser name: "html5lib aaa 6 (foster parenting)", \ - html: '1

23

', - expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]' -test_parser name: "html5lib aaa 7 (aaa, eof) 1", \ - html: '

', - expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]' -test_parser name: "html5lib aaa 8 (aaa, eof) 2", \ - html: '

', - expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]' -test_parser name: "html5lib aaa 9 (aaa, eof) 3", \ - html: '

', - expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]' -test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \ - html: '

123

45', - expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]' -test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \ - html: '
13
2
', - expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]' -test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \ - html: 'AC
B
', - expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]' -# TODO implement svg and namespacing -#test_parser name: "html5lib aaa 13 (svg tr input)", \ -# html: '
', -# expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]' -test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \ - html: '
', - expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]' -test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \ - html: '
', - expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]' -test_parser name: "html5lib aaa 16 (correctly nested 4b)", \ - html: 'xy', - expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"' -test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \ - html: '

x', - expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]' -test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \ - html: '

x', - expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]' -test_parser name: "junk after attribute close-quote", \ - html: '

foo

x', - expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]' -test_parser name: "html5lib aaa02 1", \ - html: '12

34', - expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]' -test_parser name: "html5lib aaa02 2", \ - html: '

', - expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]' -test_parser name: "html5lib tables 1", \ - html: '
', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]' -test_parser name: "html5lib tables 2", \ - html: '
', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]' -test_parser name: "html5lib tables 3", \ - html: "", - expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]' -test_parser name: "html5lib tables 4", \ - html: '
foo', - expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]' -test_parser name: "html5lib tables 5", \ - html: '

foo', - expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]' -test_parser name: "html5lib tables 6", \ - html: '
', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]' -test_parser name: "html5lib tables 7", \ - html: '
', - expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]' -test_parser name: "html5lib tables 8", \ - html: '
', - expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]' -test_parser name: "html5lib tables 9", \ - html: '
', - expected: 'tag:"select",{},[],tag:"table",{},[]' -test_parser name: "html5lib tables 10", \ - html: '
B
', - expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]' -test_parser name: "html5lib tables 11", \ - html: '
foo', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]' -test_parser name: "html5lib tables 12", \ - html: '
A
B', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"' -test_parser name: "html5lib tables 13", \ - html: '
', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]' -test_parser name: "html5lib tables 14", \ - html: '
foo', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]' -test_parser name: "html5lib tables 15", \ - html: '', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]' -test_parser name: "html5lib tables 16", \ - html: '
', - expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]' -# TODO implement svg parsing -#test_parser name: "html5lib tables 17", \ -# html: '
', -# expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]' -test_summary() + +# TODO export TYPE_* +module.exports.parse_html = parse_html +module.exports.debug_log_reset = debug_log_reset +module.exports.debug_log_each = debug_log_each +module.exports.TYPE_TAG = TYPE_TAG +module.exports.TYPE_TEXT = TYPE_TEXT +module.exports.TYPE_COMMENT = TYPE_COMMENT +module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE