change args, fix script parsing

author Jason Woofenden <jason@jasonwoof.com>

Wed, 23 Dec 2015 20:12:57 +0000 (15:12 -0500)

committer Jason Woofenden <jason@jasonwoof.com>

Wed, 23 Dec 2015 20:12:57 +0000 (15:12 -0500)
author Jason Woofenden <jason@jasonwoof.com>
Wed, 23 Dec 2015 20:12:57 +0000 (15:12 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Wed, 23 Dec 2015 20:12:57 +0000 (15:12 -0500)
diff --git a/parse-html.coffee b/parse-html.coffee

index 55d73a3..cea8fa1 100644 (file)
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -512,8 +512,9 @@ decode_named_char_ref = (txt) ->
         return null if decoded is txt
         return g_dncr.cache[txt] = decoded
  
-parse_html = (txt, parse_error_cb = null) ->
-       cur = 0 # index of next char in txt to be parsed
+parse_html = (args) ->
+       txt = null
+       cur = null # index of next char in txt to be parsed
         # declare doc and tokenizer variables so they're in scope below
         doc = null
         open_els = null # stack of open elements
@@ -538,8 +539,8 @@ parse_html = (txt, parse_error_cb = null) ->
                 flag_parsing = false
  
         parse_error = ->
-               if parse_error_cb?
-                       parse_error_cb cur
+               if args.error_cb?
+                       args.error_cb cur
                 else
                         console.log "Parse error at character #{cur} of #{txt.length}"
  
@@ -3041,9 +3042,9 @@ parse_html = (txt, parse_error_cb = null) ->
         is_appropriate_end_tag = (t) ->
                 # spec says to check against "the tag name of the last start tag to
                 # have been emitted from this tokenizer", but this is only called from
-               # the various "raw" states, which I'm pretty sure all push the start
-               # token onto open_els. TODO: verify this after the script data states
-               # are implemented
+               # the various "raw" states, so it's hopefully ok to assume that
+               # open_els[0].name will work instead. TODO: verify this after the script
+               # data states are implemented
                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
  
@@ -3185,6 +3186,11 @@ parse_html = (txt, parse_error_cb = null) ->
                                 tok_state = tok_state_self_closing_start_tag
                                 return
                         # fall through
+               if c is '>'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       # fall through
                 if is_uc_alpha(c)
                         tok_cur_tag.name += c.toLowerCase()
                         temporary_buffer += c
@@ -3517,16 +3523,16 @@ parse_html = (txt, parse_error_cb = null) ->
                                 return tmp
                         when "\u0000"
                                 parse_error()
-                               tok_cur_tag.attrs_a[0][0] = "\ufffd"
+                               tok_cur_tag.attrs_a[0][0] += "\ufffd"
                         when '"', "'", '<'
                                 parse_error()
-                               tok_cur_tag.attrs_a[0][0] = c
+                               tok_cur_tag.attrs_a[0][0] += c
                         when '' # EOF
                                 parse_error()
                                 tok_state = tok_state_data
                         else
                                 if is_uc_alpha(c)
-                                       tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
+                                       tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
                                 else
                                         tok_cur_tag.attrs_a[0][0] += c
                 return null
@@ -4295,7 +4301,9 @@ parse_html = (txt, parse_error_cb = null) ->
                 else
                         val = txt.substr cur, (next_gt - cur)
                         cur = next_gt + 3
-               val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this
+               val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
+               val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+               val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
                 return new_character_token val # fixfull split
  
         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
@@ -4376,13 +4384,15 @@ parse_html = (txt, parse_error_cb = null) ->
  
         # tree constructor initialization
         # see comments on TYPE_TAG/etc for the structure of this data
+       txt = args.html
+       cur = 0
         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
         open_els = []
         afe = [] # active formatting elements
         template_ins_modes = []
         ins_mode = ins_mode_initial
         original_ins_mode = ins_mode # TODO check spec
-       flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
+       flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
         flag_frameset_ok = true
         flag_parsing = true
         flag_foster_parenting = false
diff --git a/test.coffee b/test.coffee

index 1e27066..58b1ed8 100644 (file)
--- a/test.coffee
+++ b/test.coffee
@@ -7690,11 +7690,13 @@ tests = [
         }, {
                 name: "webkit02.dat #2"
                 html: "<p id=\"status\"><noscript><strong>A</strong></noscript><span>B</span></p>"
+               scripting: true
                 errors: 1
                 expected: "| <html>\n|   <head>\n|   <body>\n|     <p>\n|       id=\"status\"\n|       <noscript>\n|         \"<strong>A</strong>\"\n|       <span>\n|         \"B\"\n"
         }, {
                 name: "webkit02.dat #3"
                 html: "<p id=\"status\"><noscript><strong>A</strong></noscript><span>B</span></p>"
+               scripting: false
                 errors: 1
                 expected: "| <html>\n|   <head>\n|   <body>\n|     <p>\n|       id=\"status\"\n|       <noscript>\n|         <strong>\n|           \"A\"\n|       <span>\n|         \"B\"\n"
         }, {
@@ -7834,10 +7836,10 @@ test_parser = (args) ->
                 return
         wheic.debug_log_reset()
         parse_errors = []
-       errors_cb = (i) ->
+       args.error_cb = (i) ->
                 parse_errors.push i
         prev_node_id = 0 # reset counter
-       parsed = wheic.parse_html args.html, errors_cb
+       parsed = wheic.parse_html args
         serialized = serialize_els parsed
         if serialized isnt args.expected
                 #wheic.debug_log_each (str) ->
author	Jason Woofenden <jason@jasonwoof.com>
	Wed, 23 Dec 2015 20:12:57 +0000 (15:12 -0500)
committer	Jason Woofenden <jason@jasonwoof.com>
	Wed, 23 Dec 2015 20:12:57 +0000 (15:12 -0500)
parse-html.coffee		patch \| blob \| history
test.coffee		patch \| blob \| history