From: Jason Woofenden Date: Sun, 20 Dec 2015 14:49:10 +0000 (-0500) Subject: implement lots of raw-ish text parsing X-Git-Url: https://jasonwoof.com/gitweb/?a=commitdiff_plain;h=7d7b5713e2ae68559b1cfb1fcb86cfc7b83f7967;p=peach-html5-editor.git implement lots of raw-ish text parsing --- diff --git a/parse-html.coffee b/parse-html.coffee index 86d0136..d271706 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -94,6 +94,8 @@ class Node attrs = {} attrs[k] = v for k, v of @attrs return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id + acknowledge_self_closing: -> + # fixfull serialize: (shallow = false, show_ids = false) -> # for unit tests ret = '' switch @type @@ -149,6 +151,7 @@ new_element = (name) -> return new Node TYPE_TAG, name: name new_text_node = (txt) -> return new Node TYPE_TEXT, text: txt +new_character_token = new_text_node new_comment_node = (txt) -> return new Node TYPE_COMMENT, text: txt new_eof_token = -> @@ -158,8 +161,8 @@ new_afe_marker = -> new_aaa_bookmark = -> return new Node TYPE_AAA_BOOKMARK -lc_alpha = "abcdefghijklmnopqrstuvwxqz" -uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ" +lc_alpha = "abcdefghijklmnopqrstuvwxyz" +uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" digits = "0123456789" alnum = lc_alpha + uc_alpha + digits hex_chars = digits + "abcdefABCDEF" @@ -340,15 +343,19 @@ parse_html = (txt, parse_error_cb = null) -> cur = 0 # index of next char in txt to be parsed # declare tree and tokenizer variables so they're in scope below tree = null - open_els = [] # stack of open elements + open_els = null # stack of open elements + afe = null # active formatting elements + template_insertion_modes = null insertion_mode = null + original_insertion_mode = null tok_state = null tok_cur_tag = null # partially parsed tag + flag_scripting = null flag_frameset_ok = null flag_parsing = null flag_foster_parenting = null form_element_pointer = null - afe = [] # active formatting elements + temporary_buffer = null parse_error = -> if parse_error_cb? @@ -872,7 +879,6 @@ parse_html = (txt, parse_error_cb = null) -> debug_log "AAA DONE" # http://www.w3.org/TR/html5/syntax.html#close-a-p-element - # FIXME test this (particularly emplied end tags) close_p_element = -> generate_implied_end_tags 'p' # arg is exception if open_els[0].name isnt 'p' @@ -886,7 +892,8 @@ parse_html = (txt, parse_error_cb = null) -> close_p_element() # http://www.w3.org/TR/html5/syntax.html#insert-a-character - tree_insert_text = (t) -> + # aka insert_a_character = (t) -> + insert_character = (t) -> dest = adjusted_insertion_location() # fixfull check for Document node if dest[1] > 0 @@ -1046,17 +1053,114 @@ parse_html = (txt, parse_error_cb = null) -> # http://www.w3.org/TR/html5/syntax.html#insert-a-comment # position should be [node, index_within_children] - tree_insert_a_comment = (t, position = null) -> + tree_insert_comment = (t, position = null) -> position ?= adjusted_insertion_location() position[0].children.splice position[1], 0, t + # 8.2.5.2 + # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm + parse_generic_raw_text = (t) -> + insert_html_element t + tok_state = tok_state_rawtext + original_insertion_mode = insertion_mode + insertion_mode = ins_mode_text + parse_generic_rcdata_text = (t) -> + insert_html_element t + tok_state = tok_state_rcdata + original_insertion_mode = insertion_mode + insertion_mode = ins_mode_text + # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags generate_implied_end_tags = (except = null) -> while end_tag_implied[open_els[0].name] and open_els[0].name isnt except open_els.shift() - # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody + # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead + ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control + open_els.shift() # spec says this will be a 'head' node + insertion_mode = ins_mode_after_head + insertion_mode t + ins_mode_in_head = (t) -> + if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ') + insert_character t + return + if t.type is TYPE_COMMENT + tree_insert_comment t + return + if t.type is TYPE_DOCTYPE + parse_error() + return + if t.type is TYPE_START_TAG and t.name is 'html' + ins_mode_in_body t + return + if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link') + el = insert_html_element t + open_els.shift() + el.acknowledge_self_closing() + return + if t.type is TYPE_START_TAG and t.name is 'meta' + el = insert_html_element t + open_els.shift() + el.acknowledge_self_closing() + # fixfull encoding stuff + return + if t.type is TYPE_START_TAG and t.name is 'title' + parse_generic_rcdata_element t + return + if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style')) + parse_generic_raw_text t + return + if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false + insert_html_element t + insertion_mode = in_head_noscript # FIXME implement + return + if t.type is TYPE_START_TAG and t.name is 'script' + ail = adjusted_insertion_location() + el = token_to_element t, NS_HTML, ail + el.flag_parser_inserted true # FIXME implement + # fixfull frament case + ail[0].children.splice ail[1], 0, el + open_els.unshift el + tok_state = tok_state_script_data + original_insertion_mode = insertion_mode # make sure orig... is defined + insertion_mode = ins_mode_text # FIXME implement + return + if t.type is TYPE_END_TAG and t.name is 'head' + open_els.shift() # will be a head element... spec says so + insertion_mode = ins_mode_after_head + return + if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br') + ins_mode_in_head_else t + return + if t.type is TYPE_START_TAG and t.name is 'template' + insert_html_element t + afe_push_marker() + flag_frameset_ok = false + insertion_mode = ins_mode_in_template + template_insertion_modes.unshift ins_mode_in_template # FIXME implement + return + if t.type is TYPE_END_TAG and t.name is 'template' + if template_tag_is_open() + generate_implied_end_tags + if open_els[0].name isnt 'template' + parse_error() + loop + el = open_els.shift() + if el.name is 'template' + break + clear_afe_to_marker() + template_insertion_modes.shift() + reset_insertion_mode() + else + parse_error() + return + if (t.type is TYPE_OPEN_TAG and t.name is 'head') or t.type is TYPE_END_TAG + parse_error() + return + ins_mode_in_head_else t + + # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it for node, i in open_els if node.name is name # FIXME check namespace too @@ -1077,13 +1181,13 @@ parse_html = (txt, parse_error_cb = null) -> parse_error() when "\t", "\u000a", "\u000c", "\u000d", ' ' reconstruct_active_formatting_elements() - tree_insert_text t + insert_character t else reconstruct_active_formatting_elements() - tree_insert_text t + insert_character t flag_frameset_ok = false when TYPE_COMMENT - tree_insert_a_comment t + tree_insert_comment t when TYPE_DOCTYPE parse_error() when TYPE_START_TAG @@ -1096,7 +1200,7 @@ parse_html = (txt, parse_error_cb = null) -> root_attrs[k] = v unless root_attrs[k]? when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title' # FIXME also do this for (end tag) - return tree_in_head t + return ins_mode_in_head t when 'body' parse_error() # TODO @@ -1255,6 +1359,36 @@ parse_html = (txt, parse_error_cb = null) -> el = afe.shift() if el.type is TYPE_AFE_MARKER return + + # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata + ins_mode_text = (t) -> + if t.type is TYPE_TEXT + insert_character t + return + if t.type is TYPE_EOF + parse_error() + if open_els[0].name is 'script' + open_els[0].flag 'already started', true + open_els.shift() + insertion_mode = original_insertion_mode + insertion_mode t + return + if t.type is TYPE_END_TAG and t.name is 'script' + open_els.shift() + insertion_mode = original_insertion_mode + # fixfull the spec seems to assume that I'm going to run the script + # http://www.w3.org/TR/html5/syntax.html#scriptEndTag + return + if t.type is TYPE_END_TAG + open_els.shift() + insertion_mode = original_insertion_mode + return + console.log 'warning: end of ins_mode_text reached' + + # the functions below implement the tokenizer stats described here: + # http://www.w3.org/TR/html5/syntax.html#tokenization + + # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable ins_mode_in_table = (t) -> switch t.type when TYPE_TEXT @@ -1265,7 +1399,7 @@ parse_html = (txt, parse_error_cb = null) -> else ins_mode_in_table_else t when TYPE_COMMENT - tree_insert_a_comment t + tree_insert_comment t when TYPE_DOCTYPE parse_error() when TYPE_START_TAG @@ -1309,9 +1443,9 @@ parse_html = (txt, parse_error_cb = null) -> ins_mode_in_table_else t else parse_error() - insert_html_element t + el = insert_html_element t open_els.shift() - # fixfull acknowledge sef-closing flag + el.acknowledge_self_closing() when 'form' parse_error() if form_element_pointer? @@ -1345,6 +1479,7 @@ parse_html = (txt, parse_error_cb = null) -> ins_mode_in_table_else t + # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext ins_mode_in_table_text = (t) -> switch t.type when TYPE_TEXT @@ -1355,6 +1490,7 @@ parse_html = (txt, parse_error_cb = null) -> console.log "unimplemented ins_mode_in_table_text" # FIXME CONTINUE + # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody ins_mode_in_table_body = (t) -> if t.type is TYPE_START_TAG and t.name is 'tr' clear_stack_to_table_body_context() @@ -1397,6 +1533,7 @@ parse_html = (txt, parse_error_cb = null) -> # Anything else ins_mode_in_table t + # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr ins_mode_in_row = (t) -> if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td') clear_stack_to_table_row_context() @@ -1449,7 +1586,7 @@ parse_html = (txt, parse_error_cb = null) -> clear_afe_to_marker() insertion_mode = ins_mode_in_row - # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd + # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd ins_mode_in_cell = (t) -> if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th') if is_in_table_scope t.name @@ -1492,15 +1629,11 @@ parse_html = (txt, parse_error_cb = null) -> # Anything Else ins_mode_in_body t - - # the functions below implement the tokenizer stats described here: - # http://www.w3.org/TR/html5/syntax.html#tokenization - # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state tok_state_data = -> switch c = txt.charAt(cur++) when '&' - return new_text_node tokenize_character_reference() + return new_text_node parse_character_reference() when '<' tok_state = tok_state_tag_open when "\u0000" @@ -1514,7 +1647,68 @@ parse_html = (txt, parse_error_cb = null) -> # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state # not needed: tok_state_character_reference_in_data = -> - # just call tok_state_character_reference_in_data() + # just call parse_character_reference() + + # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state + tok_state_rcdata = -> + switch c = txt.charAt(cur++) + when '&' + return new_text_node parse_character_reference() + when '<' + tok_state = tok_state_rcdata_less_than_sign + when "\u0000" + parse_error() + return new_character_token "\ufffd" + when '' # EOF + return new_eof_token() + else + return new_character_token c + return null + + # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state + # not needed: tok_state_character_reference_in_rcdata = -> + # just call parse_character_reference() + + # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state + tok_state_rawtext = -> + switch c = txt.charAt(cur++) + when '<' + tok_state = tok_state_rawtext_less_than_sign + when "\u0000" + parse_error() + return new_character_token "\ufffd" + when '' # EOF + return new_eof_token() + else + return new_character_token c + return null + + # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state + tok_state_script_data = -> + switch c = txt.charAt(cur++) + when '<' + tok_state = tok_state_script_data_less_than_sign + when "\u0000" + parse_error() + return new_character_token "\ufffd" + when '' # EOF + return new_eof_token() + else + return new_character_token c + return null + + # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state + tok_state_plaintext = -> + switch c = txt.charAt(cur++) + when "\u0000" + parse_error() + return new_character_token "\ufffd" + when '' # EOF + return new_eof_token() + else + return new_character_token c + return null + # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state tok_state_tag_open = -> @@ -1587,6 +1781,140 @@ parse_html = (txt, parse_error_cb = null) -> tok_cur_tag.name += c return null + # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state + tok_state_rcdata_less_than_sign = -> + c = txt.charAt(cur++) + if c is '/' + temporary_buffer = '' + tok_state = tok_state_rcdata_end_tag_open + return null + # Anything else + tok_state = tok_state_rcdata + cur -= 1 # reconsume the input character + return new_character_token '<' + + # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state + tok_state_rcdata_end_tag_open = -> + c = txt.charAt(cur++) + if uc_alpha.indexOf(c) > -1 + tok_cur_tag = new_end_tag c.toLowerCase() + temporary_buffer += c + tok_state = tok_state_rcdata_end_tag_name + return null + if lc_alpha.indexOf(c) > -1 + tok_cur_tag = new_end_tag c + temporary_buffer += c + tok_state = tok_state_rcdata_end_tag_name + return null + # Anything else + tok_state = tok_state_rcdata + cur -= 1 # reconsume the input character + return new_character_token " + # spec says to check against "the tag name of the last start tag to + # have been emitted from this tokenizer", but this is only called from + # the various "raw" states, which I'm pretty sure all push the start + # token onto open_els. TODO: verify this after the script data states + # are implemented + debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}" + return t.type is TYPE_END_TAG and t.name is open_els[0].name + + # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state + tok_state_rcdata_end_tag_name = -> + c = txt.charAt(cur++) + if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_before_attribute_name + return + # else fall through to "Anything else" + if c is '/' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_self_closing_start_tag # FIXME spec typo? + return + # else fall through to "Anything else" + if c is '>' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_data + return tok_cur_tag + # else fall through to "Anything else" + if uc_alpha.indexOf(c) > -1 + tok_cur_tag.name += c.toLowerCase() + temporary_buffer += c + return null + if lc_alpha.indexOf(c) > -1 + tok_cur_tag.name += c + temporary_buffer += c + return null + # Anything else + tok_state = tok_state_rcdata + cur -= 1 # reconsume the input character + return new_character_token ' + c = txt.charAt(cur++) + if c is '/' + temporary_buffer = '' + tok_state = tok_state_rawtext_end_tag_open + return null + # Anything else + tok_state = tok_state_rawtext + cur -= 1 # reconsume the input character + return new_character_token '<' + + # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state + tok_state_rawtext_end_tag_open = -> + c = txt.charAt(cur++) + if uc_alpha.indexOf(c) > -1 + tok_cur_tag = new_end_tag c.toLowerCase() + temporary_buffer += c + tok_state = tok_state_rawtext_end_tag_name + return null + if lc_alpha.indexOf(c) > -1 + tok_cur_tag = new_end_tag c + temporary_buffer += c + tok_state = tok_state_rawtext_end_tag_name + return null + # Anything else + tok_state = tok_state_rawtext + cur -= 1 # reconsume the input character + return new_character_token " + c = txt.charAt(cur++) + if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_before_attribute_name + return + # else fall through to "Anything else" + if c is '/' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_self_closing_start_tag + return + # else fall through to "Anything else" + if c is '>' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_data + return tok_cur_tag + # else fall through to "Anything else" + if uc_alpha.indexOf(c) > -1 + tok_cur_tag.name += c.toLowerCase() + temporary_buffer += c + return null + if lc_alpha.indexOf(c) > -1 + tok_cur_tag.name += c + temporary_buffer += c + return null + # Anything else + tok_state = tok_state_rawtext + cur -= 1 # reconsume the input character + return new_character_token ' attr_name = null @@ -1721,7 +2049,7 @@ parse_html = (txt, parse_error_cb = null) -> when '"' tok_state = tok_state_after_attribute_value_quoted when '&' - tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true + tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true when "\u0000" # Parse error tok_cur_tag.attrs_a[0][1] += "\ufffd" @@ -1738,7 +2066,7 @@ parse_html = (txt, parse_error_cb = null) -> when "'" tok_state = tok_state_after_attribute_value_quoted when '&' - tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true + tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true when "\u0000" # Parse error tok_cur_tag.attrs_a[0][1] += "\ufffd" @@ -1755,7 +2083,7 @@ parse_html = (txt, parse_error_cb = null) -> when "\t", "\n", "\u000c", ' ' tok_state = tok_state_before_attribute_name when '&' - tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true + tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true when '>' tok_state = tok_state_data tmp = tok_cur_tag @@ -1795,7 +2123,7 @@ parse_html = (txt, parse_error_cb = null) -> # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference # Don't set this as a state, just call it # returns a string (NOT a text node) - tokenize_character_reference = (allowed_char = null, in_attr = false) -> + parse_character_reference = (allowed_char = null, in_attr = false) -> if cur >= txt.length return '&' switch c = txt.charAt(cur) @@ -1872,12 +2200,16 @@ parse_html = (txt, parse_error_cb = null) -> # see comments on TYPE_TAG/etc for the structure of this data tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML open_els = [tree] + afe = [] # active formatting elements + template_insertion_modes = [] insertion_mode = ins_mode_in_body + original_insertion_mode = insertion_mode # TODO check spec + flag_scripting = true # TODO might need an extra flag to get