X-Git-Url: https://jasonwoof.com/gitweb/?p=peach-html5-editor.git;a=blobdiff_plain;f=parse-html.coffee;h=73eda84ffabceecd283782f1e4d7a90a290d10b9;hp=06c2a1b04e67c55ad0fbf7e5e1b2b78396efa99f;hb=af702500dd54507b24184075a7fb7c1f5acf70e5;hpb=3ff49c30096e8e97599b98755157f9a692937f58 diff --git a/parse-html.coffee b/parse-html.coffee index 06c2a1b..73eda84 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -16,13 +16,17 @@ # This file implements a parser for html snippets, meant to be used by a -# WYSIWYG editor. Hence it does not attempt to parse doctypes, , -# or tags, nor does it produce the top level "document" node in the dom -# tree, nor nodes for html, head or body. Comments containing "fixfull" -# indicate places where additional code is needed for full HTML document -# parsing. +# WYSIWYG editor. + +# The implementation is a pretty direct implementation of the parsing algorithm +# described here: +# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream +# +# Deviations from that spec: # -# Instead, the data structure produced by this parser is an array of Nodes. +# Purposeful: search this file for "WTAG" +# +# Not finished yet: search this file for "fixfull", "TODO" and "FIXME" # stacks/lists @@ -648,7 +652,7 @@ parse_html = (args) -> for t in open_els if t.name is tag_name and (namespace is null or namespace is t.namespace) return true - if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option' + if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option' return false return false # this checks for a particular element, not by name @@ -1663,10 +1667,10 @@ parse_html = (args) -> parse_error() return if open_els.length < 2 second = open_els[open_els.length - 2] - return unless second.ns is NS_HTML + return unless second.namespace is NS_HTML return unless second.name is 'body' return if template_tag_is_open() - frameset_ok_flag = false + flag_frameset_ok = false for a of t.attrs_a second.attrs[a[0]] = a[1] unless second.attrs[a[0]]? return @@ -1675,9 +1679,10 @@ parse_html = (args) -> return if open_els.length < 2 second_i = open_els.length - 2 second = open_els[second_i] - return unless second.ns is NS_HTML + return unless second.namespace is NS_HTML return unless second.name is 'body' - flag_frameset_ok = false + if flag_frameset_ok is false + return if second.parent? for el, i in second.parent.children if el is second @@ -2098,20 +2103,37 @@ parse_html = (args) -> reconstruct_afe() insert_html_element t return - if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc') +# this comment block implements the W3C spec +# if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc') +# if is_in_scope 'ruby', NS_HTML +# generate_implied_end_tags() +# unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML +# parse_error() +# insert_html_element t +# return +# if t.type is TYPE_START_TAG and t.name is 'rt' +# if is_in_scope 'ruby', NS_HTML +# generate_implied_end_tags 'rtc' # arg is exception +# unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML +# parse_error() +# insert_html_element t +# return +# below implements the WATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody + if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc') if is_in_scope 'ruby', NS_HTML generate_implied_end_tags() unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML parse_error() insert_html_element t return - if t.type is TYPE_START_TAG and t.name is 'rt' + if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt') if is_in_scope 'ruby', NS_HTML - generate_implied_end_tags 'rtc' # arg is exception + generate_implied_end_tags 'rtc' unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML parse_error() insert_html_element t return +# end WATWG chunk if t.type is TYPE_START_TAG and t.name is 'math' reconstruct_afe() adjust_mathml_attributes t @@ -2895,7 +2917,7 @@ parse_html = (args) -> tok_state = tok_state_tag_open when "\u0000" parse_error() - return new_text_node c + return new_text_node "\ufffd" when '' # EOF return new_eof_token() else @@ -3744,7 +3766,7 @@ parse_html = (args) -> else val = txt.substr cur, (next_gt - cur) cur = next_gt + 1 - val = val.replace "\u0000", "\ufffd" + val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") tok_cur_tag.text += val tok_state = tok_state_data return tok_cur_tag @@ -4340,9 +4362,6 @@ parse_html = (args) -> else val = txt.substr cur, (next_gt - cur) cur = next_gt + 3 - val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this - val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this - val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this return new_character_token val # fixfull split # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference @@ -4458,7 +4477,13 @@ parse_html = (args) -> # tokenizer initialization tok_state = tok_state_data - if args.name is "namespace-sensitivity.dat #1" + # text pre-processing + # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream + txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this + txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this + txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this + + if args.name is "plain-text-unsafe.dat #4" console.log "hi" # proccess input # http://www.w3.org/TR/html5/syntax.html#tree-construction