X-Git-Url: https://jasonwoof.com/gitweb/?a=blobdiff_plain;ds=sidebyside;f=parse-html.coffee;h=fee4202f92205ab9df44d834ffacb3826534bd56;hb=7460e85442ced49600febffff4ac8fe16d7361e3;hp=31a46f413fc2447fd4e1d21b03531d9f5ae71c66;hpb=12e07fdf217eda724e703e32ec5c8b968bb3a727;p=peach-html5-editor.git diff --git a/parse-html.coffee b/parse-html.coffee index 31a46f4..fee4202 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -195,8 +195,8 @@ is_space_tok = (t) -> return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1 is_input_hidden_tok = (t) -> - return unless t.type is TYPE_START_TAG - for a of t.attrs_a + return false unless t.type is TYPE_START_TAG + for a in t.attrs_a if a[0] is 'type' if a[1].toLowerCase() is 'hidden' return true @@ -512,8 +512,9 @@ decode_named_char_ref = (txt) -> return null if decoded is txt return g_dncr.cache[txt] = decoded -parse_html = (txt, parse_error_cb = null) -> - cur = 0 # index of next char in txt to be parsed +parse_html = (args) -> + txt = null + cur = null # index of next char in txt to be parsed # declare doc and tokenizer variables so they're in scope below doc = null open_els = null # stack of open elements @@ -538,8 +539,8 @@ parse_html = (txt, parse_error_cb = null) -> flag_parsing = false parse_error = -> - if parse_error_cb? - parse_error_cb cur + if args.error_cb? + args.error_cb cur else console.log "Parse error at character #{cur} of #{txt.length}" @@ -1191,7 +1192,7 @@ parse_html = (txt, parse_error_cb = null) -> last_template = null last_template_i = null for el, i in open_els - if el.name is 'template' + if el.name is 'template' and el.namespace is NS_HTML last_template = el last_template_i = i break @@ -1200,7 +1201,7 @@ parse_html = (txt, parse_error_cb = null) -> last_table = null last_table_i for el, i in open_els - if el.name is 'table' + if el.name is 'table' and el.namespace is NS_HTML last_table = el last_table_i = i break @@ -1222,6 +1223,7 @@ parse_html = (txt, parse_error_cb = null) -> # this is odd target = open_els[open_els.length - 1] target_i = target.children.length + break # 5. If last table has a parent element, then let adjusted # insertion location be inside last table's parent element, # immediately before last table, and abort these substeps. @@ -1393,6 +1395,7 @@ parse_html = (txt, parse_error_cb = null) -> el = insert_html_element t head_element_pointer = el ins_mode = ins_mode_in_head + return if t.type is TYPE_END_TAG if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br' # fall through to Anything else below @@ -1438,7 +1441,7 @@ parse_html = (txt, parse_error_cb = null) -> if t.type is TYPE_START_TAG and t.name is 'title' parse_generic_rcdata_text t return - if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style')) + if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style') parse_generic_raw_text t return if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false @@ -1500,14 +1503,14 @@ parse_html = (txt, parse_error_cb = null) -> if t.type is TYPE_DOCTYPE parse_error() return - if t.type is TYPE_START_TAG + if t.type is TYPE_START_TAG and t.name is 'html' ins_mode_in_body t return if t.type is TYPE_END_TAG and t.name is 'noscript' open_els.shift() ins_mode = ins_mode_in_head return - if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style')) + if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style')) ins_mode_in_head t return if t.type is TYPE_END_TAG and t.name is 'br' @@ -2100,19 +2103,6 @@ parse_html = (txt, parse_error_cb = null) -> return return - ins_mode_in_table_else = (t) -> - parse_error() - flag_foster_parenting = true # FIXME - ins_mode_in_body t - flag_foster_parenting = false - can_in_table = { # FIXME do this inline like everywhere else - 'table': true - 'tbody': true - 'tfoot': true - 'thead': true - 'tr': true - } - # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata ins_mode_text = (t) -> if t.type is TYPE_TEXT @@ -2142,6 +2132,19 @@ parse_html = (txt, parse_error_cb = null) -> # http://www.w3.org/TR/html5/syntax.html#tokenization # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable + ins_mode_in_table_else = (t) -> + parse_error() + flag_foster_parenting = true + ins_mode_in_body t + flag_foster_parenting = false + return + can_in_table = { # FIXME do this inline like everywhere else + 'table': true + 'tbody': true + 'tfoot': true + 'thead': true + 'tr': true + } ins_mode_in_table = (t) -> switch t.type when TYPE_TEXT @@ -2192,7 +2195,7 @@ parse_html = (txt, parse_error_cb = null) -> when 'style', 'script', 'template' ins_mode_in_head t when 'input' - if is_input_hidden_tok t + unless is_input_hidden_tok t ins_mode_in_table_else t else parse_error() @@ -3040,9 +3043,9 @@ parse_html = (txt, parse_error_cb = null) -> is_appropriate_end_tag = (t) -> # spec says to check against "the tag name of the last start tag to # have been emitted from this tokenizer", but this is only called from - # the various "raw" states, which I'm pretty sure all push the start - # token onto open_els. TODO: verify this after the script data states - # are implemented + # the various "raw" states, so it's hopefully ok to assume that + # open_els[0].name will work instead TODO: verify this after the script + # data states are implemented debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}" return t.type is TYPE_END_TAG and t.name is open_els[0].name @@ -3184,6 +3187,11 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_self_closing_start_tag return # fall through + if c is '>' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_data + return tok_cur_tag + # fall through if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() temporary_buffer += c @@ -3336,6 +3344,11 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_self_closing_start_tag return # fall through + if c is '>' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_data + return tok_cur_tag + # fall through if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() temporary_buffer += c.toLowerCase() @@ -3516,16 +3529,16 @@ parse_html = (txt, parse_error_cb = null) -> return tmp when "\u0000" parse_error() - tok_cur_tag.attrs_a[0][0] = "\ufffd" + tok_cur_tag.attrs_a[0][0] += "\ufffd" when '"', "'", '<' parse_error() - tok_cur_tag.attrs_a[0][0] = c + tok_cur_tag.attrs_a[0][0] += c when '' # EOF parse_error() tok_state = tok_state_data else if is_uc_alpha(c) - tok_cur_tag.attrs_a[0][0] = c.toLowerCase() + tok_cur_tag.attrs_a[0][0] += c.toLowerCase() else tok_cur_tag.attrs_a[0][0] += c return null @@ -3723,7 +3736,7 @@ parse_html = (txt, parse_error_cb = null) -> return # Otherwise parse_error() - tok_cur_tag = new_comment_token '!' # TODO test ("!" right?) + tok_cur_tag = new_comment_token '' tok_state = tok_state_bogus_comment return @@ -3734,6 +3747,7 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_comment_start_dash when "\u0000" parse_error() + tok_state = tok_state_comment return new_character_token "\ufffd" when '>' parse_error() @@ -3746,6 +3760,7 @@ parse_html = (txt, parse_error_cb = null) -> return tok_cur_tag else tok_cur_tag.text += c + tok_state = tok_state_comment return null # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state @@ -4294,7 +4309,9 @@ parse_html = (txt, parse_error_cb = null) -> else val = txt.substr cur, (next_gt - cur) cur = next_gt + 3 - val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this + val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this + val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this + val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this return new_character_token val # fixfull split # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference @@ -4375,13 +4392,15 @@ parse_html = (txt, parse_error_cb = null) -> # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data + txt = args.html + cur = 0 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML open_els = [] afe = [] # active formatting elements template_ins_modes = [] ins_mode = ins_mode_initial original_ins_mode = ins_mode # TODO check spec - flag_scripting = true # TODO might need an extra flag to get