X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=adb9babbce8be63161825da256a11085b796191b;hb=06cc39431c9f7b4b4c10ae23be5652aca453238b;hp=0fb97b904a223b84ac374e6f036b4e9883e5afa6;hpb=0ad80cfb335ea76a37690949edec8db94a08b9cb;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 0fb97b9..adb9bab 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -24,7 +24,7 @@ # # Deviations from that spec: # -# Purposeful: search this file for "WTAG" +# Purposeful: search this file for "WHATWG" # # Not finished yet: search this file for "fixfull", "TODO" and "FIXME" @@ -84,6 +84,11 @@ NS_HTML = 1 NS_MATHML = 2 NS_SVG = 3 +# quirks mode constants +QUIRKS_NO = 1 +QUIRKS_LIMITED = 2 +QUIRKS_YES = 3 + g_debug_log = [] debug_log_reset = -> g_debug_log = [] @@ -249,6 +254,64 @@ unicode_fixes[0x9C] = "\u0153" unicode_fixes[0x9E] = "\u017E" unicode_fixes[0x9F] = "\u0178" +quirks_yes_pi_prefixes = [ + "+//silmaril//dtd html pro v0r11 19970101//" + "-//as//dtd html 3.0 aswedit + extensions//" + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//" + "-//ietf//dtd html 2.0 level 1//" + "-//ietf//dtd html 2.0 level 2//" + "-//ietf//dtd html 2.0 strict level 1//" + "-//ietf//dtd html 2.0 strict level 2//" + "-//ietf//dtd html 2.0 strict//" + "-//ietf//dtd html 2.0//" + "-//ietf//dtd html 2.1e//" + "-//ietf//dtd html 3.0//" + "-//ietf//dtd html 3.2 final//" + "-//ietf//dtd html 3.2//" + "-//ietf//dtd html 3//" + "-//ietf//dtd html level 0//" + "-//ietf//dtd html level 1//" + "-//ietf//dtd html level 2//" + "-//ietf//dtd html level 3//" + "-//ietf//dtd html strict level 0//" + "-//ietf//dtd html strict level 1//" + "-//ietf//dtd html strict level 2//" + "-//ietf//dtd html strict level 3//" + "-//ietf//dtd html strict//" + "-//ietf//dtd html//" + "-//metrius//dtd metrius presentational//" + "-//microsoft//dtd internet explorer 2.0 html strict//" + "-//microsoft//dtd internet explorer 2.0 html//" + "-//microsoft//dtd internet explorer 2.0 tables//" + "-//microsoft//dtd internet explorer 3.0 html strict//" + "-//microsoft//dtd internet explorer 3.0 html//" + "-//microsoft//dtd internet explorer 3.0 tables//" + "-//netscape comm. corp.//dtd html//" + "-//netscape comm. corp.//dtd strict html//" + "-//o'reilly and associates//dtd html 2.0//" + "-//o'reilly and associates//dtd html extended 1.0//" + "-//o'reilly and associates//dtd html extended relaxed 1.0//" + "-//sq//dtd html 2.0 hotmetal + extensions//" + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//" + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//" + "-//spyglass//dtd html 2.0 extended//" + "-//sun microsystems corp.//dtd hotjava html//" + "-//sun microsystems corp.//dtd hotjava strict html//" + "-//w3c//dtd html 3 1995-03-24//" + "-//w3c//dtd html 3.2 draft//" + "-//w3c//dtd html 3.2 final//" + "-//w3c//dtd html 3.2//" + "-//w3c//dtd html 3.2s draft//" + "-//w3c//dtd html 4.0 frameset//" + "-//w3c//dtd html 4.0 transitional//" + "-//w3c//dtd html experimental 19960712//" + "-//w3c//dtd html experimental 970421//" + "-//w3c//dtd w3 html//" + "-//w3o//dtd w3 html 3.0//" + "-//webtechs//dtd mozilla html 2.0//" + "-//webtechs//dtd mozilla html//" +] + # These are the character references that don't need a terminating semicolon # min length: 2, max: 6, none are a prefix of any other. legacy_char_refs = { @@ -342,7 +405,7 @@ special_elements = { img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML, listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, - menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these + menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML, @@ -468,7 +531,7 @@ svg_attribute_fixes = { diffuseconstant: 'diffuseConstant' edgemode: 'edgeMode' externalresourcesrequired: 'externalResourcesRequired' - # WTAG removes this: filterres: 'filterRes' + # WHATWG removes this: filterres: 'filterRes' filterunits: 'filterUnits' glyphref: 'glyphRef' gradienttransform: 'gradientTransform' @@ -605,18 +668,29 @@ parse_html = (args) -> else console.log "Parse error at character #{cur} of #{txt.length}" + # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements + # "Noah's Ark clause" but with three afe_push = (new_el) -> matches = 0 for el, i in afe + if el.type is TYPE_AFE_MARKER + break if el.name is new_el.name and el.namespace is new_el.namespace + attrs_match = true for k, v of el.attrs - continue unless new_el.attrs[k] is v - for k, v of new_el.attrs - continue unless el.attrs[k] is v - matches += 1 - if matches is 3 - afe.splice i, 1 - break + unless new_el.attrs[k] is v + attrs_match = false + break + if attrs_match + for k, v of new_el.attrs + unless el.attrs[k] is v + attrs_match = false + break + if attrs_match + matches += 1 + if matches is 3 + afe.splice i, 1 + break afe.unshift new_el afe_push_marker = -> afe.unshift new_afe_marker() @@ -626,24 +700,24 @@ parse_html = (args) -> # But first... the helpers template_tag_is_open = -> - for t in open_els - if t.name is 'template' and t.namespace is NS_HTML + for el in open_els + if el.name is 'template' and el.namespace is NS_HTML return true return false is_in_scope_x = (tag_name, scope, namespace) -> - for t in open_els - if t.name is tag_name and (namespace is null or namespace is t.namespace) + for el in open_els + if el.name is tag_name and (namespace is null or namespace is el.namespace) return true - if scope[t.name] is t.namespace + if scope[el.name] is el.namespace return false return false is_in_scope_x_y = (tag_name, scope, scope2, namespace) -> - for t in open_els - if t.name is tag_name and (namespace is null or namespace is t.namespace) + for el in open_els + if el.name is tag_name and (namespace is null or namespace is el.namespace) return true - if scope[t.name] is t.namespace + if scope[el.name] is el.namespace return false - if scope2[t.name] is t.namespace + if scope2[el.name] is el.namespace return false return false standard_scopers = { @@ -896,16 +970,46 @@ parse_html = (args) -> debug_log "tree: #{serialize_els doc.children, false, true}" debug_log "open_els: #{serialize_els open_els, true, true}" debug_log "afe: #{serialize_els afe, true, true}" +# this block implements tha W3C spec +# # 1. If the current node is an HTML element whose tag name is subject, +# # then run these substeps: +# # +# # 1. Let element be the current node. +# # +# # 2. Pop element off the stack of open elements. +# # +# # 3. If element is also in the list of active formatting elements, +# # remove the element from the list. +# # +# # 4. Abort the adoption agency algorithm. +# if open_els[0].name is subject and open_els[0].namespace is NS_HTML +# el = open_els.shift() +# # remove it from the list of active formatting elements (if found) +# for t, i in afe +# if t is el +# afe.splice i, 1 +# break +# debug_log "aaa: starting off with subject on top of stack, exiting" +# return +# WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm + # If the current node is an HTML element whose tag name is subject, and + # the current node is not in the list of active formatting elements, + # then pop the current node off the stack of open elements, and abort + # these steps. if open_els[0].name is subject and open_els[0].namespace is NS_HTML - el = open_els[0] - open_els.shift() + debug_log "aaa: starting off with subject on top of stack, exiting" # remove it from the list of active formatting elements (if found) - for t, i in afe - if t is el - afe.splice i, 1 + in_afe = false + for el, i in afe + if el is open_els[0] + in_afe = true break - debug_log "aaa: starting off with subject on top of stack, exiting" - return + unless in_afe + debug_log "aaa: ...and not in afe, aaa done" + open_els.shift() + return + # fall through +# END WHATWG outer = 0 loop if outer >= 8 @@ -1389,6 +1493,35 @@ parse_html = (args) -> # 8.2.5.4.1 The "initial" insertion mode # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode + is_quirks_yes_doctype = (t) -> + if t.flag 'force-quirks' + return true + if t.name isnt 'html' + return true + if t.public_identifier? + pi = t.public_identifier.toLowerCase() + for p in quirks_yes_pi_prefixes + if pi.substr(0, p.length) is p + return true + if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html' + return true + if t.system_identifier? + if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd' + return true + else if t.public_identifier? + # already did this: pi = t.public_identifier.toLowerCase() + if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//' + return true + return false + is_quirks_limited_doctype = (t) -> + if t.public_identifier? + pi = t.public_identifier.toLowerCase() + if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//' + return true + if t.system_identifier? + if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//' + return true + return false ins_mode_initial = (t) -> if is_space_tok t return @@ -1397,13 +1530,20 @@ parse_html = (args) -> doc.children.push t return if t.type is TYPE_DOCTYPE - # FIXME check identifiers, set quirks, etc - # fixfull + # fixfull syntax error from first paragraph and following bullets + # fixfull set doc.doctype + # fixfull is the "not an iframe srcdoc" thing relevant? + if is_quirks_yes_doctype t + doc.flag 'quirks mode', QUIRKS_YES + else if is_quirks_limited_doctype t + doc.flag 'quirks mode', QUIRKS_LIMITED doc.children.push t ins_mode = ins_mode_before_html return # Anything else - #fixfull (iframe, quirks) + # fixfull not iframe srcdoc? + parse_error() + doc.flag 'quirks mode', QUIRKS_YES ins_mode = ins_mode_before_html process_token t return @@ -1432,9 +1572,9 @@ parse_html = (args) -> parse_error() return # Anything else - html_tok = new_open_tag 'html' - el = token_to_element html_tok, NS_HTML, doc + el = token_to_element new_open_tag('html'), NS_HTML, doc doc.children.push el + el.parent = doc open_els.unshift el # ?fixfull browsing context ins_mode = ins_mode_before_head @@ -1466,8 +1606,7 @@ parse_html = (args) -> parse_error() return # Anything else - head_tok = new_open_tag 'head' - el = insert_html_element head_tok + el = insert_html_element new_open_tag 'head' head_element_pointer = el ins_mode = ins_mode_in_head process_token t @@ -1621,7 +1760,7 @@ parse_html = (args) -> parse_error() open_els.unshift head_element_pointer ins_mode_in_head t - for el, i of open_els + for el, i in open_els if el is head_element_pointer open_els.splice i, 1 return @@ -1641,17 +1780,23 @@ parse_html = (args) -> # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it - for el, i in open_els - if el.name is name and el.namespace is NS_HTML + node = open_els[0] + loop + if node.name is name and node.namespace is NS_HTML generate_implied_end_tags name # arg is exception - parse_error() unless i is 0 - while i >= 0 - open_els.shift() - i -= 1 - return - if special_elements[el.name] is el.namespace + unless node is open_els[0] + parse_error() + loop + el = open_els.shift() + if el is node + return + if special_elements[node.name] is node.namespace parse_error() return + for el, i in open_els + if node is el + node = open_els[i + 1] + break return ins_mode_in_body = (t) -> if t.type is TYPE_TEXT and t.text is "\u0000" @@ -1691,7 +1836,7 @@ parse_html = (args) -> return unless second.name is 'body' return if template_tag_is_open() flag_frameset_ok = false - for a of t.attrs_a + for a in t.attrs_a second.attrs[a[0]] = a[1] unless second.attrs[a[0]]? return if t.type is TYPE_START_TAG and t.name is 'frameset' @@ -1978,6 +2123,10 @@ parse_html = (args) -> return if t.type is TYPE_START_TAG and t.name is 'nobr' reconstruct_afe() + if is_in_scope 'nobr', NS_HTML + parse_error() + adoption_agency 'nobr' + reconstruct_afe() el = insert_html_element t afe_push el return @@ -2004,14 +2153,15 @@ parse_html = (args) -> clear_afe_to_marker() return if t.type is TYPE_START_TAG and t.name is 'table' - close_p_if_in_button_scope() # fixfull quirksmode thing + unless doc.flag('quirks mode') is QUIRKS_YES + close_p_if_in_button_scope() # test insert_html_element t flag_frameset_ok = false ins_mode = ins_mode_in_table return if t.type is TYPE_END_TAG and t.name is 'br' parse_error() - t.type is TYPE_START_TAG + t.type = TYPE_START_TAG # fall through if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr') reconstruct_afe() @@ -2028,7 +2178,8 @@ parse_html = (args) -> unless is_input_hidden_tok t flag_frameset_ok = false return - if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track') + if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track') + # WHATWG adds 'menuitem' for this block insert_html_element t open_els.shift() t.acknowledge_self_closing() @@ -2138,7 +2289,7 @@ parse_html = (args) -> # parse_error() # insert_html_element t # return -# below implements the WATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody +# below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc') if is_in_scope 'ruby', NS_HTML generate_implied_end_tags() @@ -2153,7 +2304,7 @@ parse_html = (args) -> parse_error() insert_html_element t return -# end WATWG chunk +# end WHATWG chunk if t.type is TYPE_START_TAG and t.name is 'math' reconstruct_afe() adjust_mathml_attributes t @@ -2584,7 +2735,7 @@ parse_html = (args) -> insert_html_element t return if t.type is TYPE_END_TAG and t.name is 'optgroup' - if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML + if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML open_els.shift() if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML @@ -2726,7 +2877,8 @@ parse_html = (args) -> ins_mode_in_body t return if t.type is TYPE_COMMENT - insert_comment t, [open_els[0], open_els[0].children.length] + first = open_els[open_els.length - 1] + insert_comment t, [first, first.children.length] return if t.type is TYPE_DOCTYPE parse_error() @@ -2805,7 +2957,7 @@ parse_html = (args) -> ins_mode_in_body t return if t.type is TYPE_END_TAG and t.name is 'html' - insert_mode = ins_mode_after_after_frameset + ins_mode = ins_mode_after_after_frameset return if t.type is TYPE_START_TAG and t.name is 'noframes' ins_mode_in_head t @@ -3047,25 +3199,27 @@ parse_html = (args) -> # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state tok_state_end_tag_open = -> - switch c = txt.charAt(cur++) - when '>' - parse_error() - tok_state = tok_state_data - when '' # EOF - parse_error() - tok_state = tok_state_data - return new_text_node '' + parse_error() + tok_state = tok_state_data + return + if c is '' # EOF + parse_error() + tok_state = tok_state_data + return new_text_node ' else val = txt.substr cur, (next_gt - cur) cur = next_gt + 3 - return new_character_token val # fixfull split + if val.length > 0 + return new_character_token val # fixfull split + return null # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference # Don't set this as a state, just call it @@ -4489,6 +4645,7 @@ parse_html = (args) -> txt = args.html cur = 0 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML + doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this open_els = [] afe = [] # active formatting elements template_ins_modes = [] @@ -4504,6 +4661,7 @@ parse_html = (args) -> head_element_pointer = null flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case) context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments + prev_node_id = 0 # just for debugging # tokenizer initialization tok_state = tok_state_data @@ -4514,7 +4672,7 @@ parse_html = (args) -> txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this - if args.name is "tests16.dat #25" + if args.name is "tests23.dat #1" console.log "hi" # proccess input # http://www.w3.org/TR/html5/syntax.html#tree-construction @@ -4544,3 +4702,6 @@ module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE module.exports.NS_HTML = NS_HTML module.exports.NS_MATHML = NS_MATHML module.exports.NS_SVG = NS_SVG +module.exports.QUIRKS_NO = QUIRKS_NO +module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED +module.exports.QUIRKS_YES = QUIRKS_YES