JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
change args, fix script parsing
author Jason Woofenden <jason@jasonwoof.com>
Wed, 23 Dec 2015 20:12:57 +0000 (15:12 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Wed, 23 Dec 2015 20:12:57 +0000 (15:12 -0500)
parse-html.coffee
test.coffee

index 55d73a3..cea8fa1 100644 (file)
@@ -512,8 +512,9 @@ decode_named_char_ref = (txt) ->
        return null if decoded is txt
        return g_dncr.cache[txt] = decoded
 
-parse_html = (txt, parse_error_cb = null) ->
-       cur = 0 # index of next char in txt to be parsed
+parse_html = (args) ->
+       txt = null
+       cur = null # index of next char in txt to be parsed
        # declare doc and tokenizer variables so they're in scope below
        doc = null
        open_els = null # stack of open elements
@@ -538,8 +539,8 @@ parse_html = (txt, parse_error_cb = null) ->
                flag_parsing = false
 
        parse_error = ->
-               if parse_error_cb?
-                       parse_error_cb cur
+               if args.error_cb?
+                       args.error_cb cur
                else
                        console.log "Parse error at character #{cur} of #{txt.length}"
 
@@ -3041,9 +3042,9 @@ parse_html = (txt, parse_error_cb = null) ->
        is_appropriate_end_tag = (t) ->
                # spec says to check against "the tag name of the last start tag to
                # have been emitted from this tokenizer", but this is only called from
-               # the various "raw" states, which I'm pretty sure all push the start
-               # token onto open_els. TODO: verify this after the script data states
-               # are implemented
+               # the various "raw" states, so it's hopefully ok to assume that
+               # open_els[0].name will work instead TODO: verify this after the script
+               # data states are implemented
                debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
                return t.type is TYPE_END_TAG and t.name is open_els[0].name
 
@@ -3185,6 +3186,11 @@ parse_html = (txt, parse_error_cb = null) ->
                                tok_state = tok_state_self_closing_start_tag
                                return
                        # fall through
+               if c is '>'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       # fall through
                if is_uc_alpha(c)
                        tok_cur_tag.name += c.toLowerCase()
                        temporary_buffer += c
@@ -3517,16 +3523,16 @@ parse_html = (txt, parse_error_cb = null) ->
                                return tmp
                        when "\u0000"
                                parse_error()
-                               tok_cur_tag.attrs_a[0][0] = "\ufffd"
+                               tok_cur_tag.attrs_a[0][0] += "\ufffd"
                        when '"', "'", '<'
                                parse_error()
-                               tok_cur_tag.attrs_a[0][0] = c
+                               tok_cur_tag.attrs_a[0][0] += c
                        when '' # EOF
                                parse_error()
                                tok_state = tok_state_data
                        else
                                if is_uc_alpha(c)
-                                       tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
+                                       tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
                                else
                                        tok_cur_tag.attrs_a[0][0] += c
                return null
@@ -4295,7 +4301,9 @@ parse_html = (txt, parse_error_cb = null) ->
                else
                        val = txt.substr cur, (next_gt - cur)
                        cur = next_gt + 3
-               val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this
+               val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
+               val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+               val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
                return new_character_token val # fixfull split
 
        # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
@@ -4376,13 +4384,15 @@ parse_html = (txt, parse_error_cb = null) ->
 
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
+       txt = args.html
+       cur = 0
        doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
        open_els = []
        afe = [] # active formatting elements
        template_ins_modes = []
        ins_mode = ins_mode_initial
        original_ins_mode = ins_mode # TODO check spec
-       flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
+       flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
        flag_frameset_ok = true
        flag_parsing = true
        flag_foster_parenting = false
index 1e27066..58b1ed8 100644 (file)
@@ -7690,11 +7690,13 @@ tests = [
        }, {
                name: "webkit02.dat #2"
                html: "<p id=\"status\"><noscript><strong>A</strong></noscript><span>B</span></p>"
+               scripting: true
                errors: 1
                expected: "| <html>\n|   <head>\n|   <body>\n|     <p>\n|       id=\"status\"\n|       <noscript>\n|         \"<strong>A</strong>\"\n|       <span>\n|         \"B\"\n"
        }, {
                name: "webkit02.dat #3"
                html: "<p id=\"status\"><noscript><strong>A</strong></noscript><span>B</span></p>"
+               scripting: false
                errors: 1
                expected: "| <html>\n|   <head>\n|   <body>\n|     <p>\n|       id=\"status\"\n|       <noscript>\n|         <strong>\n|           \"A\"\n|       <span>\n|         \"B\"\n"
        }, {
@@ -7834,10 +7836,10 @@ test_parser = (args) ->
                return
        wheic.debug_log_reset()
        parse_errors = []
-       errors_cb = (i) ->
+       args.error_cb = (i) ->
                parse_errors.push i
        prev_node_id = 0 # reset counter
-       parsed = wheic.parse_html args.html, errors_cb
+       parsed = wheic.parse_html args
        serialized = serialize_els parsed
        if serialized isnt args.expected
                #wheic.debug_log_each (str) ->