X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=425fe3c6e32b7991ef51daabb137774f11225779;hb=a88ccdd930221ffd086134f2e3890602d9e17d9d;hp=f5437c9ffb07849d241fce35fdcc32c9549b035f;hpb=43cf85a626c514ced655824ec92f7a39178855af;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index f5437c9..425fe3c 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -84,6 +84,11 @@ NS_HTML = 1 NS_MATHML = 2 NS_SVG = 3 +# quirks mode constants +QUIRKS_NO = 1 +QUIRKS_LIMITED = 2 +QUIRKS_YES = 3 + g_debug_log = [] debug_log_reset = -> g_debug_log = [] @@ -249,6 +254,64 @@ unicode_fixes[0x9C] = "\u0153" unicode_fixes[0x9E] = "\u017E" unicode_fixes[0x9F] = "\u0178" +quirks_yes_pi_prefixes = [ + "+//silmaril//dtd html pro v0r11 19970101//" + "-//as//dtd html 3.0 aswedit + extensions//" + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//" + "-//ietf//dtd html 2.0 level 1//" + "-//ietf//dtd html 2.0 level 2//" + "-//ietf//dtd html 2.0 strict level 1//" + "-//ietf//dtd html 2.0 strict level 2//" + "-//ietf//dtd html 2.0 strict//" + "-//ietf//dtd html 2.0//" + "-//ietf//dtd html 2.1e//" + "-//ietf//dtd html 3.0//" + "-//ietf//dtd html 3.2 final//" + "-//ietf//dtd html 3.2//" + "-//ietf//dtd html 3//" + "-//ietf//dtd html level 0//" + "-//ietf//dtd html level 1//" + "-//ietf//dtd html level 2//" + "-//ietf//dtd html level 3//" + "-//ietf//dtd html strict level 0//" + "-//ietf//dtd html strict level 1//" + "-//ietf//dtd html strict level 2//" + "-//ietf//dtd html strict level 3//" + "-//ietf//dtd html strict//" + "-//ietf//dtd html//" + "-//metrius//dtd metrius presentational//" + "-//microsoft//dtd internet explorer 2.0 html strict//" + "-//microsoft//dtd internet explorer 2.0 html//" + "-//microsoft//dtd internet explorer 2.0 tables//" + "-//microsoft//dtd internet explorer 3.0 html strict//" + "-//microsoft//dtd internet explorer 3.0 html//" + "-//microsoft//dtd internet explorer 3.0 tables//" + "-//netscape comm. corp.//dtd html//" + "-//netscape comm. corp.//dtd strict html//" + "-//o'reilly and associates//dtd html 2.0//" + "-//o'reilly and associates//dtd html extended 1.0//" + "-//o'reilly and associates//dtd html extended relaxed 1.0//" + "-//sq//dtd html 2.0 hotmetal + extensions//" + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//" + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//" + "-//spyglass//dtd html 2.0 extended//" + "-//sun microsystems corp.//dtd hotjava html//" + "-//sun microsystems corp.//dtd hotjava strict html//" + "-//w3c//dtd html 3 1995-03-24//" + "-//w3c//dtd html 3.2 draft//" + "-//w3c//dtd html 3.2 final//" + "-//w3c//dtd html 3.2//" + "-//w3c//dtd html 3.2s draft//" + "-//w3c//dtd html 4.0 frameset//" + "-//w3c//dtd html 4.0 transitional//" + "-//w3c//dtd html experimental 19960712//" + "-//w3c//dtd html experimental 970421//" + "-//w3c//dtd w3 html//" + "-//w3o//dtd w3 html 3.0//" + "-//webtechs//dtd mozilla html 2.0//" + "-//webtechs//dtd mozilla html//" +] + # These are the character references that don't need a terminating semicolon # min length: 2, max: 6, none are a prefix of any other. legacy_char_refs = { @@ -605,18 +668,29 @@ parse_html = (args) -> else console.log "Parse error at character #{cur} of #{txt.length}" + # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements + # "Noah's Ark clause" but with three afe_push = (new_el) -> matches = 0 for el, i in afe + if el.type is TYPE_AFE_MARKER + break if el.name is new_el.name and el.namespace is new_el.namespace + attrs_match = true for k, v of el.attrs - continue unless new_el.attrs[k] is v - for k, v of new_el.attrs - continue unless el.attrs[k] is v - matches += 1 - if matches is 3 - afe.splice i, 1 - break + unless new_el.attrs[k] is v + attrs_match = false + break + if attrs_match + for k, v of new_el.attrs + unless el.attrs[k] is v + attrs_match = false + break + if attrs_match + matches += 1 + if matches is 3 + afe.splice i, 1 + break afe.unshift new_el afe_push_marker = -> afe.unshift new_afe_marker() @@ -626,24 +700,24 @@ parse_html = (args) -> # But first... the helpers template_tag_is_open = -> - for t in open_els - if t.name is 'template' and t.namespace is NS_HTML + for el in open_els + if el.name is 'template' and el.namespace is NS_HTML return true return false is_in_scope_x = (tag_name, scope, namespace) -> - for t in open_els - if t.name is tag_name and (namespace is null or namespace is t.namespace) + for el in open_els + if el.name is tag_name and (namespace is null or namespace is el.namespace) return true - if scope[t.name] is t.namespace + if scope[el.name] is el.namespace return false return false is_in_scope_x_y = (tag_name, scope, scope2, namespace) -> - for t in open_els - if t.name is tag_name and (namespace is null or namespace is t.namespace) + for el in open_els + if el.name is tag_name and (namespace is null or namespace is el.namespace) return true - if scope[t.name] is t.namespace + if scope[el.name] is el.namespace return false - if scope2[t.name] is t.namespace + if scope2[el.name] is el.namespace return false return false standard_scopers = { @@ -743,8 +817,8 @@ parse_html = (args) -> loop if node_i is open_els.length - 1 last = true - # fixfull (fragment case) - + if flag_fragment_parsing + node = context_element # 4. If node is a select element, run these substeps: if node.name is 'select' and node.namespace is NS_HTML # 1. If last is true, jump to the step below labeled done. @@ -1419,6 +1493,35 @@ parse_html = (args) -> # 8.2.5.4.1 The "initial" insertion mode # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode + is_quirks_yes_doctype = (t) -> + if t.flag 'force-quirks' + return true + if t.name isnt 'html' + return true + if t.public_identifier? + pi = t.public_identifier.toLowerCase() + for p in quirks_yes_pi_prefixes + if pi.substr(0, p.length) is p + return true + if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html' + return true + if t.system_identifier? + if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' + return true + else if t.public_identifier? + # already did this: pi = t.public_identifier.toLowerCase() + if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//' + return true + return false + is_quirks_limited_doctype = (t) -> + if t.public_identifier? + pi = t.public_identifier.toLowerCase() + if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//' + return true + if t.system_identifier? + if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//' + return true + return false ins_mode_initial = (t) -> if is_space_tok t return @@ -1427,13 +1530,20 @@ parse_html = (args) -> doc.children.push t return if t.type is TYPE_DOCTYPE - # FIXME check identifiers, set quirks, etc - # fixfull + # fixfull syntax error from first paragraph and following bullets + # fixfull set doc.doctype + # fixfull is the "not an iframe srcdoc" thing relevant? + if is_quirks_yes_doctype t + doc.flag 'quirks mode', QUIRKS_YES + else if is_quirks_limited_doctype t + doc.flag 'quirks mode', QUIRKS_LIMITED doc.children.push t ins_mode = ins_mode_before_html return # Anything else - #fixfull (iframe, quirks) + # fixfull not iframe srcdoc? + parse_error() + doc.flag 'quirks mode', QUIRKS_YES ins_mode = ins_mode_before_html process_token t return @@ -1451,6 +1561,7 @@ parse_html = (args) -> if t.type is TYPE_START_TAG and t.name is 'html' el = token_to_element t, NS_HTML, doc doc.children.push el + el.document = doc open_els.unshift(el) # fixfull (big paragraph in spec about manifest, fragment, urls, etc) ins_mode = ins_mode_before_head @@ -1462,9 +1573,9 @@ parse_html = (args) -> parse_error() return # Anything else - html_tok = new_open_tag 'html' - el = token_to_element html_tok, NS_HTML, doc + el = token_to_element new_open_tag('html'), NS_HTML, doc doc.children.push el + el.document = doc open_els.unshift el # ?fixfull browsing context ins_mode = ins_mode_before_head @@ -1496,8 +1607,7 @@ parse_html = (args) -> parse_error() return # Anything else - head_tok = new_open_tag 'head' - el = insert_html_element head_tok + el = insert_html_element new_open_tag 'head' head_element_pointer = el ins_mode = ins_mode_in_head process_token t @@ -1651,7 +1761,7 @@ parse_html = (args) -> parse_error() open_els.unshift head_element_pointer ins_mode_in_head t - for el, i of open_els + for el, i in open_els if el is head_element_pointer open_els.splice i, 1 return @@ -1671,17 +1781,23 @@ parse_html = (args) -> # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it - for el, i in open_els - if el.name is name and el.namespace is NS_HTML + node = open_els[0] + loop + if node.name is name and node.namespace is NS_HTML generate_implied_end_tags name # arg is exception - parse_error() unless i is 0 - while i >= 0 - open_els.shift() - i -= 1 - return - if special_elements[el.name] is el.namespace + unless node is open_els[0] + parse_error() + loop + el = open_els.shift() + if el is node + return + if special_elements[node.name] is node.namespace parse_error() return + for el, i in open_els + if node is el + node = open_els[i + 1] + break return ins_mode_in_body = (t) -> if t.type is TYPE_TEXT and t.text is "\u0000" @@ -1721,7 +1837,7 @@ parse_html = (args) -> return unless second.name is 'body' return if template_tag_is_open() flag_frameset_ok = false - for a of t.attrs_a + for a in t.attrs_a second.attrs[a[0]] = a[1] unless second.attrs[a[0]]? return if t.type is TYPE_START_TAG and t.name is 'frameset' @@ -1809,11 +1925,7 @@ parse_html = (args) -> if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing') close_p_if_in_button_scope() insert_html_element t - # spec: If the next token is a "LF" (U+000A) character token, then - # ignore that token and move on to the next one. (Newlines at the - # start of pre blocks are ignored as an authoring convenience.) - if txt.charAt(cur) is "\u000a" # FIXME check for crlf? - cur += 1 + eat_next_token_if_newline() flag_frameset_ok = false return if t.type is TYPE_START_TAG and t.name is 'form' @@ -2008,6 +2120,10 @@ parse_html = (args) -> return if t.type is TYPE_START_TAG and t.name is 'nobr' reconstruct_afe() + if is_in_scope 'nobr', NS_HTML + parse_error() + adoption_agency 'nobr' + reconstruct_afe() el = insert_html_element t afe_push el return @@ -2034,14 +2150,16 @@ parse_html = (args) -> clear_afe_to_marker() return if t.type is TYPE_START_TAG and t.name is 'table' - close_p_if_in_button_scope() # fixfull quirksmode thing + unless doc.flag('quirks mode') is QUIRKS_YES + close_p_if_in_button_scope() # test insert_html_element t flag_frameset_ok = false ins_mode = ins_mode_in_table return if t.type is TYPE_END_TAG and t.name is 'br' parse_error() - t.type is TYPE_START_TAG + # W3C: t.type = TYPE_START_TAG + t = new_open_tag 'br' # WHATWG # fall through if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr') reconstruct_afe() @@ -2058,7 +2176,8 @@ parse_html = (args) -> unless is_input_hidden_tok t flag_frameset_ok = false return - if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track') + if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track') + # WHATWG adds 'menuitem' for this block insert_html_element t open_els.shift() t.acknowledge_self_closing() @@ -2118,8 +2237,7 @@ parse_html = (args) -> return if t.type is TYPE_START_TAG and t.name is 'textarea' insert_html_element t - if txt.charAt(cur) is "\u000a" # FIXME check for crlf? - cur += 1 + eat_next_token_if_newline() tok_state = tok_state_rcdata original_ins_mode = ins_mode flag_frameset_ok = false @@ -2614,7 +2732,7 @@ parse_html = (args) -> insert_html_element t return if t.type is TYPE_END_TAG and t.name is 'optgroup' - if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML + if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML open_els.shift() if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML @@ -2650,7 +2768,7 @@ parse_html = (args) -> return if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea') parse_error() - if is_in_select_scope 'select', NS_HTML + unless is_in_select_scope 'select', NS_HTML return loop el = open_els.shift() @@ -2976,7 +3094,7 @@ parse_html = (args) -> tok_state = tok_state_tag_open when "\u0000" parse_error() - return new_text_node "\ufffd" + return new_text_node c when '' # EOF return new_eof_token() else @@ -3673,7 +3791,7 @@ parse_html = (args) -> return if c is '>' tok_state = tok_state_data - return + return tok_cur_tag if is_uc_alpha(c) tok_cur_tag.attrs_a.unshift [c.toLowerCase(), ''] tok_state = tok_state_attribute_name @@ -4426,7 +4544,10 @@ parse_html = (args) -> else val = txt.substr cur, (next_gt - cur) cur = next_gt + 3 - return new_character_token val # fixfull split + val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") + if val.length > 0 + return new_character_token val # fixfull split + return null # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference # Don't set this as a state, just call it @@ -4517,11 +4638,31 @@ parse_html = (args) -> return '&' return # never reached + eat_next_token_if_newline = -> + old_cur = cur + t = null + until t? + t = tok_state() + if t.type is TYPE_TEXT + # definition of a newline depends on whether it was a character ref or not + if cur - old_cur is 1 + # not a character reference + if t.text is "\u000d" or t.text is "\u000a" + return + else + if t.text is "\u000a" + return + # not a "newline" + cur = old_cur + return + # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data txt = args.html cur = 0 - doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML + doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this + fragment_root = null # fragment parsing algorithm returns children of this open_els = [] afe = [] # active formatting elements template_ins_modes = [] @@ -4535,28 +4676,105 @@ parse_html = (args) -> temporary_buffer = null pending_table_character_tokens = [] head_element_pointer = null - flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case) - context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments + flag_fragment_parsing = false + context_element = null prev_node_id = 0 # just for debugging # tokenizer initialization tok_state = tok_state_data - # text pre-processing - # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream - txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this - txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this - txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this + parse_init = -> + # fragment parsing (text arg) + if args.fragment? + # this handles the fragment from the tests in the format described here: + # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md + f = args.fragment + ns = NS_HTML + if f.substr(0, 5) is 'math ' + f = f.substr 5 + ns = NS_MATHML + else if f.substr(0, 4) is 'svg ' + f = f.substr 4 + ns = NS_SVG + t = new_open_tag f + context_element = token_to_element t, ns + context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML + context_element.document.flag 'quirks mode', QUIRKS_NO + # fragment parsing (Node arg) + if args.context? + context_element = args.context + + # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments + # fragment parsing algorithm + if context_element? + flag_fragment_parsing = true + doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + # search up the tree from context, to try to find it's document, + # because this file only puts a "document" property on the root + # element. + old_doc = null + el = context_element + loop + if el.document? + old_doc = el.document + break + if el.parent + el = el.parent + else + break + if old_doc + doc.flag 'quirks mode', old_doc.flag 'quirks mode' + # set tok_state + if context_element.namespace is NS_HTML + switch context_element.name + when 'title', 'textarea' + tok_state = tok_state_rcdata + when 'style', 'xmp', 'iframe', 'noembed', 'noframes' + tok_state = tok_state_rawtext + when 'script' + tok_state = tok_state_script_data + when 'noscript' + if flag_scripting + tok_state = tok_state_rawtext + when 'plaintext' + tok_state = tok_state_plaintext + fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + doc.children.push fragment_root + fragment_root.document = doc + open_els = [fragment_root] + if context_element.name is 'template' and context_element.namespace is NS_HTML + template_ins_modes.unshift ins_mode_in_template + # fixfull create token for context (it should have it's original one already) + reset_ins_mode() + # set form_element pointer... in the foreign doc?! + el = context_element + loop + if el.name is 'form' and el.namespace is NS_HTML + form_element_pointer = el + break + if el.parent + el = el.parent + else + break + + # text pre-processing + # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream + txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this + txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this - if args.name is "tests18.dat #17" - console.log "hi" - # proccess input # http://www.w3.org/TR/html5/syntax.html#tree-construction - while flag_parsing - t = tok_state() - if t? - process_token t - # fixfull parse error if has self-closing flag, but it wasn't acknolwedged + parse_main_loop = -> + while flag_parsing + t = tok_state() + if t? + process_token t + # fixfull parse error if has self-closing flag, but it wasn't acknolwedged + return + parse_init() + parse_main_loop() + + if flag_fragment_parsing + return fragment_root.children return doc.children serialize_els = (els, shallow, show_ids) -> @@ -4578,3 +4796,6 @@ module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE module.exports.NS_HTML = NS_HTML module.exports.NS_MATHML = NS_MATHML module.exports.NS_SVG = NS_SVG +module.exports.QUIRKS_NO = QUIRKS_NO +module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED +module.exports.QUIRKS_YES = QUIRKS_YES