From a88ccdd930221ffd086134f2e3890602d9e17d9d Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Thu, 24 Dec 2015 13:49:56 -0500 Subject: [PATCH] implement fragment parsing algorithm --- parse-html.coffee | 104 ++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 91 insertions(+), 13 deletions(-) diff --git a/parse-html.coffee b/parse-html.coffee index 20bc99c..425fe3c 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -817,8 +817,8 @@ parse_html = (args) -> loop if node_i is open_els.length - 1 last = true - # fixfull (fragment case) - + if flag_fragment_parsing + node = context_element # 4. If node is a select element, run these substeps: if node.name is 'select' and node.namespace is NS_HTML # 1. If last is true, jump to the step below labeled done. @@ -1561,6 +1561,7 @@ parse_html = (args) -> if t.type is TYPE_START_TAG and t.name is 'html' el = token_to_element t, NS_HTML, doc doc.children.push el + el.document = doc open_els.unshift(el) # fixfull (big paragraph in spec about manifest, fragment, urls, etc) ins_mode = ins_mode_before_head @@ -1574,7 +1575,7 @@ parse_html = (args) -> # Anything else el = token_to_element new_open_tag('html'), NS_HTML, doc doc.children.push el - el.parent = doc + el.document = doc open_els.unshift el # ?fixfull browsing context ins_mode = ins_mode_before_head @@ -4659,8 +4660,9 @@ parse_html = (args) -> # see comments on TYPE_TAG/etc for the structure of this data txt = args.html cur = 0 - doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this + fragment_root = null # fragment parsing algorithm returns children of this open_els = [] afe = [] # active formatting elements template_ins_modes = [] @@ -4674,21 +4676,92 @@ parse_html = (args) -> temporary_buffer = null pending_table_character_tokens = [] head_element_pointer = null - flag_fragment_parsing = false # 
parser originally created as part of the html fragment parsing algorithm (fragment case) - context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments + flag_fragment_parsing = false + context_element = null prev_node_id = 0 # just for debugging # tokenizer initialization tok_state = tok_state_data - # text pre-processing - # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream - txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this - txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this + parse_init = -> + # fragment parsing (text arg) + if args.fragment? + # this handles the fragment from the tests in the format described here: + # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md + f = args.fragment + ns = NS_HTML + if f.substr(0, 5) is 'math ' + f = f.substr 5 + ns = NS_MATHML + else if f.substr(0, 4) is 'svg ' + f = f.substr 4 + ns = NS_SVG + t = new_open_tag f + context_element = token_to_element t, ns + context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML + context_element.document.flag 'quirks mode', QUIRKS_NO + # fragment parsing (Node arg) + if args.context? + context_element = args.context + + # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments + # fragment parsing algorithm + if context_element? + flag_fragment_parsing = true + doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + # search up the tree from context, to try to find its document, + # because this file only puts a "document" property on the root + # element. + old_doc = null + el = context_element + loop + if el.document? 
+ old_doc = el.document + break + if el.parent + el = el.parent + else + break + if old_doc + doc.flag 'quirks mode', old_doc.flag 'quirks mode' + # set tok_state + if context_element.namespace is NS_HTML + switch context_element.name + when 'title', 'textarea' + tok_state = tok_state_rcdata + when 'style', 'xmp', 'iframe', 'noembed', 'noframes' + tok_state = tok_state_rawtext + when 'script' + tok_state = tok_state_script_data + when 'noscript' + if flag_scripting + tok_state = tok_state_rawtext + when 'plaintext' + tok_state = tok_state_plaintext + fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + doc.children.push fragment_root + fragment_root.document = doc + open_els = [fragment_root] + if context_element.name is 'template' and context_element.namespace is NS_HTML + template_ins_modes.unshift ins_mode_in_template + # fixfull create token for context (it should have its original one already) + reset_ins_mode() + # set form_element pointer... in the foreign doc?! + el = context_element + loop + if el.name is 'form' and el.namespace is NS_HTML + form_element_pointer = el + break + if el.parent + el = el.parent + else + break + + # text pre-processing + # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream + txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this + txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this - if args.name is "webkit01.dat #12" - console.log "hi" - # proccess input # http://www.w3.org/TR/html5/syntax.html#tree-construction parse_main_loop = -> while flag_parsing @@ -4696,7 +4769,12 @@ parse_html = (args) -> if t? process_token t # fixfull parse error if has self-closing flag, but it wasn't acknolwedged + return + parse_init() parse_main_loop() + + if flag_fragment_parsing + return fragment_root.children return doc.children serialize_els = (els, shallow, show_ids) -> -- 1.7.10.4