X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;f=parse-html.coffee;h=f5437c9ffb07849d241fce35fdcc32c9549b035f;hb=43cf85a626c514ced655824ec92f7a39178855af;hp=06c2a1b04e67c55ad0fbf7e5e1b2b78396efa99f;hpb=3ff49c30096e8e97599b98755157f9a692937f58;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 06c2a1b..f5437c9 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -16,13 +16,17 @@ # This file implements a parser for html snippets, meant to be used by a -# WYSIWYG editor. Hence it does not attempt to parse doctypes, , -# or tags, nor does it produce the top level "document" node in the dom -# tree, nor nodes for html, head or body. Comments containing "fixfull" -# indicate places where additional code is needed for full HTML document -# parsing. +# WYSIWYG editor. + +# The implementation is a pretty direct implementation of the parsing algorithm +# described here: +# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream +# +# Deviations from that spec: +# +# Purposeful: search this file for "WHATWG" # -# Instead, the data structure produced by this parser is an array of Nodes. +# Not finished yet: search this file for "fixfull", "TODO" and "FIXME" # stacks/lists @@ -108,7 +112,7 @@ class Node @id = "#{++prev_node_id}" acknowledge_self_closing: -> if @token? - @token.flag 'did_self_close' + @token.flag 'did_self_close', true else @flag 'did_self_close', true flag: (key, value = null) -> @@ -336,14 +340,17 @@ special_elements = { h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML, header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML, img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML, - listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML, - noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML, - ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML, - script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML, - style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML, - template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, - thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML, - wbr:NS_HTML, xmp:NS_HTML, + listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, + + menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these + + meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML, + noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML, + plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML, + select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML, + table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML, + textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML, + tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML, # MathML: mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML, @@ -461,7 +468,7 @@ svg_attribute_fixes = { diffuseconstant: 'diffuseConstant' edgemode: 'edgeMode' externalresourcesrequired: 'externalResourcesRequired' - filterres: 'filterRes' + # WHATWG removes this: filterres: 'filterRes' filterunits: 'filterUnits' glyphref: 'glyphRef' gradienttransform: 'gradientTransform' @@ -513,6 +520,20 @@ svg_attribute_fixes = { ychannelselector: 'yChannelSelector' zoomandpan: 'zoomAndPan' } +foreign_attr_fixes = { + 'xlink:actuate': 'xlink actuate' + 'xlink:arcrole': 'xlink arcrole' + 'xlink:href': 'xlink href' + 'xlink:role': 'xlink role' + 'xlink:show': 'xlink show' + 'xlink:title': 'xlink title' + 'xlink:type': 'xlink type' + 'xml:base': 'xml base' + 'xml:lang': 'xml lang' + 'xml:space': 'xml space' + 'xmlns': 'xmlns' + 'xmlns:xlink': 'xmlns xlink' +} adjust_mathml_attributes = (t) -> for a in t.attrs_a if a[0] is 'definitionurl' @@ -525,6 +546,9 @@ adjust_svg_attributes = (t) -> return adjust_foreign_attributes = (t) -> # fixfull + for a in t.attrs_a + if foreign_attr_fixes[a[0]]? + a[0] = foreign_attr_fixes[a[0]] return # decode_named_char_ref() @@ -625,10 +649,10 @@ parse_html = (args) -> standard_scopers = { applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML, td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML, - template: NS_HTML, mi: NS_MATHML, + template: NS_HTML, - mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML, - 'annotation-xml': NS_MATHML, + mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, + mtext: NS_MATHML, 'annotation-xml': NS_MATHML, foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG } @@ -648,7 +672,7 @@ parse_html = (args) -> for t in open_els if t.name is tag_name and (namespace is null or namespace is t.namespace) return true - if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option' + if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option' return false return false # this checks for a particular element, not by name @@ -872,16 +896,46 @@ parse_html = (args) -> debug_log "tree: #{serialize_els doc.children, false, true}" debug_log "open_els: #{serialize_els open_els, true, true}" debug_log "afe: #{serialize_els afe, true, true}" +# this block implements tha W3C spec +# # 1. If the current node is an HTML element whose tag name is subject, +# # then run these substeps: +# # +# # 1. Let element be the current node. +# # +# # 2. Pop element off the stack of open elements. +# # +# # 3. If element is also in the list of active formatting elements, +# # remove the element from the list. +# # +# # 4. Abort the adoption agency algorithm. +# if open_els[0].name is subject and open_els[0].namespace is NS_HTML +# el = open_els.shift() +# # remove it from the list of active formatting elements (if found) +# for t, i in afe +# if t is el +# afe.splice i, 1 +# break +# debug_log "aaa: starting off with subject on top of stack, exiting" +# return +# WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm + # If the current node is an HTML element whose tag name is subject, and + # the current node is not in the list of active formatting elements, + # then pop the current node off the stack of open elements, and abort + # these steps. if open_els[0].name is subject and open_els[0].namespace is NS_HTML - el = open_els[0] - open_els.shift() + debug_log "aaa: starting off with subject on top of stack, exiting" # remove it from the list of active formatting elements (if found) - for t, i in afe - if t is el - afe.splice i, 1 + in_afe = false + for el, i in afe + if el is open_els[0] + in_afe = true break - debug_log "aaa: starting off with subject on top of stack, exiting" - return + unless in_afe + debug_log "aaa: ...and not in afe, aaa done" + open_els.shift() + return + # fall through +# END WHATWG outer = 0 loop if outer >= 8 @@ -1189,7 +1243,7 @@ parse_html = (args) -> ins_mode t return if is_mathml_text_integration_point(acn) - if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark') + if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark') ins_mode t return if t.type is TYPE_TEXT @@ -1652,7 +1706,7 @@ parse_html = (args) -> parse_error() return if template_tag_is_open() root_attrs = open_els[open_els.length - 1].attrs - for a of t.attrs_a + for a in t.attrs_a root_attrs[a[0]] = a[1] unless root_attrs[a[0]]? return @@ -1663,10 +1717,10 @@ parse_html = (args) -> parse_error() return if open_els.length < 2 second = open_els[open_els.length - 2] - return unless second.ns is NS_HTML + return unless second.namespace is NS_HTML return unless second.name is 'body' return if template_tag_is_open() - frameset_ok_flag = false + flag_frameset_ok = false for a of t.attrs_a second.attrs[a[0]] = a[1] unless second.attrs[a[0]]? return @@ -1675,9 +1729,10 @@ parse_html = (args) -> return if open_els.length < 2 second_i = open_els.length - 2 second = open_els[second_i] - return unless second.ns is NS_HTML + return unless second.namespace is NS_HTML return unless second.name is 'body' - flag_frameset_ok = false + if flag_frameset_ok is false + return if second.parent? for el, i in second.parent.children if el is second @@ -2098,20 +2153,37 @@ parse_html = (args) -> reconstruct_afe() insert_html_element t return - if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc') +# this comment block implements the W3C spec +# if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc') +# if is_in_scope 'ruby', NS_HTML +# generate_implied_end_tags() +# unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML +# parse_error() +# insert_html_element t +# return +# if t.type is TYPE_START_TAG and t.name is 'rt' +# if is_in_scope 'ruby', NS_HTML +# generate_implied_end_tags 'rtc' # arg is exception +# unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML +# parse_error() +# insert_html_element t +# return +# below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody + if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc') if is_in_scope 'ruby', NS_HTML generate_implied_end_tags() unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML parse_error() insert_html_element t return - if t.type is TYPE_START_TAG and t.name is 'rt' + if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt') if is_in_scope 'ruby', NS_HTML - generate_implied_end_tags 'rtc' # arg is exception + generate_implied_end_tags 'rtc' unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML parse_error() insert_html_element t return +# end WHATWG chunk if t.type is TYPE_START_TAG and t.name is 'math' reconstruct_afe() adjust_mathml_attributes t @@ -2180,7 +2252,8 @@ parse_html = (args) -> ins_mode_in_table = (t) -> switch t.type when TYPE_TEXT - if t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' + if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML + pending_table_character_tokens = [] original_ins_mode = ins_mode ins_mode = ins_mode_in_table_text process_token t @@ -2270,7 +2343,7 @@ parse_html = (args) -> # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext ins_mode_in_table_text = (t) -> if t.type is TYPE_TEXT and t.text is "\u0000" - # huh? I thought the tokenizer didn't emit these + # from javascript? parse_error() return if t.type is TYPE_TEXT @@ -2287,8 +2360,8 @@ parse_html = (args) -> insert_character old else for old in pending_table_character_tokens - ins_mode_table_else old - pending_table_character_tokens = [] # FIXME test (spec doesn't say this) + ins_mode_in_table_else old + pending_table_character_tokens = [] ins_mode = original_ins_mode process_token t @@ -2683,7 +2756,8 @@ parse_html = (args) -> ins_mode_in_body t return if t.type is TYPE_COMMENT - insert_comment t, [open_els[0], open_els[0].children.length] + first = open_els[open_els.length - 1] + insert_comment t, [first, first.children.length] return if t.type is TYPE_DOCTYPE parse_error() @@ -2692,7 +2766,9 @@ parse_html = (args) -> ins_mode_in_body t return if t.type is TYPE_END_TAG and t.name is 'html' - # fixfull fragment case + if flag_fragment_parsing + parse_error() + return ins_mode = ins_mode_after_after_body return if t.type is TYPE_EOF @@ -2760,7 +2836,7 @@ parse_html = (args) -> ins_mode_in_body t return if t.type is TYPE_END_TAG and t.name is 'html' - insert_mode = ins_mode_after_after_frameset + ins_mode = ins_mode_after_after_frameset return if t.type is TYPE_START_TAG and t.name is 'noframes' ins_mode_in_head t @@ -2786,6 +2862,7 @@ parse_html = (args) -> # Anything else parse_error() ins_mode = ins_mode_in_body + process_token t return # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode @@ -2830,6 +2907,7 @@ parse_html = (args) -> if t.name is 'script' t.acknowledge_self_closing() in_foreign_content_end_script() + # fixfull else open_els.shift() t.acknowledge_self_closing() @@ -2859,8 +2937,7 @@ parse_html = (args) -> return loop # is this safe? open_els.shift() - cn = open_els[0] - if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML + if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML break process_token t return @@ -2871,9 +2948,11 @@ parse_html = (args) -> in_foreign_content_end_script() return if t.type is TYPE_END_TAG - if open_els[0].name.toLowerCase() isnt t.name + i = 0 + node = open_els[i] + if node.name.toLowerCase() isnt t.name parse_error() - for node in open_els + loop if node is open_els[open_els.length - 1] return if node.name.toLowerCase() is t.name @@ -2881,6 +2960,8 @@ parse_html = (args) -> el = open_els.shift() if el is node return + i += 1 + node = open_els[i] if node.namespace is NS_HTML break ins_mode t # explicitly call HTML insertion mode @@ -2895,7 +2976,7 @@ parse_html = (args) -> tok_state = tok_state_tag_open when "\u0000" parse_error() - return new_text_node c + return new_text_node "\ufffd" when '' # EOF return new_eof_token() else @@ -2969,50 +3050,55 @@ parse_html = (args) -> # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state tok_state_tag_open = -> - switch c = txt.charAt(cur++) - when '!' - tok_state = tok_state_markup_declaration_open - when '/' - tok_state = tok_state_end_tag_open - when '?' - parse_error() - tok_cur_tag = new_comment_token '?' - tok_state = tok_state_bogus_comment - else - if is_lc_alpha(c) - tok_cur_tag = new_open_tag c - tok_state = tok_state_tag_name - else if is_uc_alpha(c) - tok_cur_tag = new_open_tag c.toLowerCase() - tok_state = tok_state_tag_name - else - parse_error() - tok_state = tok_state_data - cur -= 1 # we didn't parse/handle the char after < - return new_text_node '<' - return null + c = txt.charAt(cur++) + if c is '!' + tok_state = tok_state_markup_declaration_open + return + if c is '/' + tok_state = tok_state_end_tag_open + return + if is_uc_alpha(c) + tok_cur_tag = new_open_tag c.toLowerCase() + tok_state = tok_state_tag_name + return + if is_lc_alpha(c) + tok_cur_tag = new_open_tag c + tok_state = tok_state_tag_name + return + if c is '?' + parse_error() + tok_cur_tag = new_comment_token '?' # FIXME right? + tok_state = tok_state_bogus_comment + return + # Anything else + parse_error() + tok_state = tok_state_data + cur -= 1 # we didn't parse/handle the char after < + return new_text_node '<' # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state tok_state_end_tag_open = -> - switch c = txt.charAt(cur++) - when '>' - parse_error() - tok_state = tok_state_data - when '' # EOF - parse_error() - tok_state = tok_state_data - return new_text_node '' + parse_error() + tok_state = tok_state_data + return + if c is '' # EOF + parse_error() + tok_state = tok_state_data + return new_text_node ' # Anything else tok_state = tok_state_script_data_escaped cur -= 1 # Reconsume - return new_character_token c + return new_character_token '<' # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state tok_state_script_data_escaped_end_tag_open = -> @@ -3720,7 +3806,7 @@ parse_html = (args) -> tok_state_self_closing_start_tag = -> c = txt.charAt(cur++) if c is '>' - tok_cur_tag.flag 'self-closing' + tok_cur_tag.flag 'self-closing', true tok_state = tok_state_data return tok_cur_tag if c is '' @@ -3744,7 +3830,7 @@ parse_html = (args) -> else val = txt.substr cur, (next_gt - cur) cur = next_gt + 1 - val = val.replace "\u0000", "\ufffd" + val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") tok_cur_tag.text += val tok_state = tok_state_data return tok_cur_tag @@ -4340,9 +4426,6 @@ parse_html = (args) -> else val = txt.substr cur, (next_gt - cur) cur = next_gt + 3 - val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this - val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this - val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this return new_character_token val # fixfull split # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference @@ -4454,11 +4537,18 @@ parse_html = (args) -> head_element_pointer = null flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case) context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments + prev_node_id = 0 # just for debugging # tokenizer initialization tok_state = tok_state_data - if args.name is "namespace-sensitivity.dat #1" + # text pre-processing + # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream + txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this + txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this + txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this + + if args.name is "tests18.dat #17" console.log "hi" # proccess input # http://www.w3.org/TR/html5/syntax.html#tree-construction