From 5056e7c63784b7c388e514feebdaf3bc296826c4 Mon Sep 17 00:00:00 2001 From: Jason Woofenden Date: Wed, 23 Dec 2015 15:12:57 -0500 Subject: [PATCH] change args, fix script parsing --- parse-html.coffee | 34 ++++++++++++++++++++++------------ test.coffee | 6 ++++-- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/parse-html.coffee b/parse-html.coffee index 55d73a3..cea8fa1 100644 --- a/parse-html.coffee +++ b/parse-html.coffee @@ -512,8 +512,9 @@ decode_named_char_ref = (txt) -> return null if decoded is txt return g_dncr.cache[txt] = decoded -parse_html = (txt, parse_error_cb = null) -> - cur = 0 # index of next char in txt to be parsed +parse_html = (args) -> + txt = null + cur = null # index of next char in txt to be parsed # declare doc and tokenizer variables so they're in scope below doc = null open_els = null # stack of open elements @@ -538,8 +539,8 @@ parse_html = (txt, parse_error_cb = null) -> flag_parsing = false parse_error = -> - if parse_error_cb? - parse_error_cb cur + if args.error_cb? + args.error_cb cur else console.log "Parse error at character #{cur} of #{txt.length}" @@ -3041,9 +3042,9 @@ parse_html = (txt, parse_error_cb = null) -> is_appropriate_end_tag = (t) -> # spec says to check against "the tag name of the last start tag to # have been emitted from this tokenizer", but this is only called from - # the various "raw" states, which I'm pretty sure all push the start - # token onto open_els. TODO: verify this after the script data states - # are implemented + # the various "raw" states, so it's hopefully ok to assume that + # open_els[0].name will work instead TODO: verify this after the script + # data states are implemented debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}" return t.type is TYPE_END_TAG and t.name is open_els[0].name @@ -3185,6 +3186,11 @@ parse_html = (txt, parse_error_cb = null) -> tok_state = tok_state_self_closing_start_tag return # fall through + if c is '>' + if is_appropriate_end_tag tok_cur_tag + tok_state = tok_state_data + return tok_cur_tag + # fall through if is_uc_alpha(c) tok_cur_tag.name += c.toLowerCase() temporary_buffer += c @@ -3517,16 +3523,16 @@ parse_html = (txt, parse_error_cb = null) -> return tmp when "\u0000" parse_error() - tok_cur_tag.attrs_a[0][0] = "\ufffd" + tok_cur_tag.attrs_a[0][0] += "\ufffd" when '"', "'", '<' parse_error() - tok_cur_tag.attrs_a[0][0] = c + tok_cur_tag.attrs_a[0][0] += c when '' # EOF parse_error() tok_state = tok_state_data else if is_uc_alpha(c) - tok_cur_tag.attrs_a[0][0] = c.toLowerCase() + tok_cur_tag.attrs_a[0][0] += c.toLowerCase() else tok_cur_tag.attrs_a[0][0] += c return null @@ -4295,7 +4301,9 @@ parse_html = (txt, parse_error_cb = null) -> else val = txt.substr cur, (next_gt - cur) cur = next_gt + 3 - val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this + val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this + val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this + val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this return new_character_token val # fixfull split # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference @@ -4376,13 +4384,15 @@ parse_html = (txt, parse_error_cb = null) -> # tree constructor initialization # see comments on TYPE_TAG/etc for the structure of this data + txt = args.html + cur = 0 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML open_els = [] afe = [] # active formatting elements template_ins_modes = [] ins_mode = ins_mode_initial original_ins_mode = ins_mode # TODO check spec - flag_scripting = true # TODO might need an extra flag to get