implement rest of tokenizer states

author Jason Woofenden <jason@jasonwoof.com>

Tue, 22 Dec 2015 22:30:45 +0000 (17:30 -0500)

committer Jason Woofenden <jason@jasonwoof.com>

Tue, 22 Dec 2015 22:30:45 +0000 (17:30 -0500)
author Jason Woofenden <jason@jasonwoof.com>
Tue, 22 Dec 2015 22:30:45 +0000 (17:30 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Tue, 22 Dec 2015 22:30:45 +0000 (17:30 -0500)
diff --git a/parse-html.coffee b/parse-html.coffee

index e193118..f64c734 100644 (file)
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -181,6 +181,11 @@ digits = "0123456789"
  alnum = lc_alpha + uc_alpha + digits
  hex_chars = digits + "abcdefABCDEF"
  
+# True only when str is exactly one character and that character occurs in
+# uc_alpha. The length test also rejects '' (what charAt() yields at EOF),
+# which indexOf() alone would report as found at position 0.
+is_uc_alpha = (str) ->
+       return false unless str.length is 1
+       return uc_alpha.indexOf(str) isnt -1
+# True only when str is exactly one character and that character occurs in
+# lc_alpha. The length test also rejects '' (what charAt() yields at EOF),
+# which indexOf() alone would report as found at position 0.
+is_lc_alpha = (str) ->
+       return false unless str.length is 1
+       return lc_alpha.indexOf(str) isnt -1
+
  # some SVG elements have dashes in them
  tag_name_chars = alnum + "-"
  
@@ -191,6 +196,15 @@ is_space = (txt) ->
  is_space_tok = (t) ->
         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
  
+# True when token t is a start tag carrying a 'type' attribute whose value is
+# "hidden" (compared case-insensitively); false otherwise. Only the first
+# attribute named 'type' that the loop meets is considered. The tag name is
+# not checked here.
+is_input_hidden_tok = (t) ->
+       return false unless t.type is TYPE_START_TAG
+       # attrs_a is an array of [name, value] pairs, so walk its elements with
+       # "in"; "of" would yield the index keys ("0", "1", ...) and never match
+       for a in t.attrs_a
+               if a[0] is 'type'
+                       return a[1].toLowerCase() is 'hidden'
+       return false
+
  # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
  whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
  
@@ -450,7 +464,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 for t in open_els
                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
                                 return true
-                       if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
+                       if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
                                 return false
                 return false
         # this checks for a particular element, not by name
@@ -499,9 +513,11 @@ parse_html = (txt, parse_error_cb = null) ->
                 return
         clear_afe_to_marker = ->
                 loop
+                       return unless afe.length > 0 # this happens in fragment case, ?spec error
                         el = afe.shift()
                         if el.type is TYPE_AFE_MARKER
                                 return
+               return
  
         # 8.2.3.1 ...
         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
@@ -1018,7 +1034,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                 # last template's template contents, after its last child (if
                                 # any), and abort these substeps.
                                 if last_template and (last_table is null or last_template_i < last_table_i)
-                                       target = template # fixfull should be it's contents
+                                       target = last_template # fixfull should be it's contents
                                         target_i = target.children.length
                                         break
                                 # 4. If there is no last table, then let adjusted insertion
@@ -1272,7 +1288,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         return
                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
                         insert_html_element t
-                       insertion_mode = in_head_noscript # FIXME implement
+                       insertion_mode = ins_mode_in_head_noscript # FIXME implement
                         return
                 if t.type is TYPE_START_TAG and t.name is 'script'
                         ail = adjusted_insertion_location()
@@ -1614,7 +1630,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                         when 'style', 'script', 'template'
                                                 ins_mode_in_head t
                                         when 'input'
-                                               if token_is_input_hidden t
+                                               if is_input_hidden_tok t
                                                         ins_mode_in_table_else t
                                                 else
                                                         parse_error()
@@ -1691,7 +1707,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                         if el.name is 'caption'
                                                 break
                                 clear_afe_to_marker()
-                               insertion_mode = in_table
+                               insertion_mode = ins_mode_in_table
                         else
                                 parse_error()
                                 # fragment case
@@ -1704,7 +1720,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                         if el.name is 'caption'
                                                 break
                                 clear_afe_to_marker()
-                               insertion_mode = in_table
+                               insertion_mode = ins_mode_in_table
                                 insertion_mode t
                         # else fragment case
                         return
@@ -1735,7 +1751,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         return
                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
                         if open_els[0].name is 'colgroup'
-                               open_els[0].shift()
+                               open_els.shift()
                                 insertion_mode = ins_mode_in_table
                         else
                                 parse_error()
@@ -2049,7 +2065,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 if t.type is TYPE_END_TAG
                         parse_error()
                         return
-               if t.type is EOF
+               if t.type is TYPE_EOF
                         unless template_tag_is_open()
                                 stop_parsing()
                                 return
@@ -2120,7 +2136,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         open_els.shift()
                         t.acknowledge_self_closing()
                         return
-               if t.type is TYPE_START TAG and t.name is 'noframes'
+               if t.type is TYPE_START_TAG and t.name is 'noframes'
                         ins_mode_in_head t
                         return
                 if t.type is TYPE_EOF
@@ -2291,10 +2307,10 @@ parse_html = (txt, parse_error_cb = null) ->
                                 tok_cur_tag = new_comment_token '?'
                                 tok_state = tok_state_bogus_comment
                         else
-                               if lc_alpha.indexOf(c) > -1
+                               if is_lc_alpha(c)
                                         tok_cur_tag = new_open_tag c
                                         tok_state = tok_state_tag_name
-                               else if uc_alpha.indexOf(c) > -1
+                               else if is_uc_alpha(c)
                                         tok_cur_tag = new_open_tag c.toLowerCase()
                                         tok_state = tok_state_tag_name
                                 else
@@ -2315,10 +2331,10 @@ parse_html = (txt, parse_error_cb = null) ->
                                 tok_state = tok_state_data
                                 return new_text_node '</'
                         else
-                               if uc_alpha.indexOf(c) > -1
+                               if is_uc_alpha(c)
                                         tok_cur_tag = new_end_tag c.toLowerCase()
                                         tok_state = tok_state_tag_name
-                               else if lc_alpha.indexOf(c) > -1
+                               else if is_lc_alpha(c)
                                         tok_cur_tag = new_end_tag c
                                         tok_state = tok_state_tag_name
                                 else
@@ -2346,7 +2362,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                 parse_error()
                                 tok_state = tok_state_data
                         else
-                               if uc_alpha.indexOf(c) > -1
+                               if is_uc_alpha(c)
                                         tok_cur_tag.name += c.toLowerCase()
                                 else
                                         tok_cur_tag.name += c
@@ -2367,12 +2383,12 @@ parse_html = (txt, parse_error_cb = null) ->
         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
         tok_state_rcdata_end_tag_open = ->
                 c = txt.charAt(cur++)
-               if uc_alpha.indexOf(c) > -1
+               if is_uc_alpha(c)
                         tok_cur_tag = new_end_tag c.toLowerCase()
                         temporary_buffer += c
                         tok_state = tok_state_rcdata_end_tag_name
                         return null
-               if lc_alpha.indexOf(c) > -1
+               if is_lc_alpha(c)
                         tok_cur_tag = new_end_tag c
                         temporary_buffer += c
                         tok_state = tok_state_rcdata_end_tag_name
@@ -2410,11 +2426,11 @@ parse_html = (txt, parse_error_cb = null) ->
                                 tok_state = tok_state_data
                                 return tok_cur_tag
                         # else fall through to "Anything else"
-               if uc_alpha.indexOf(c) > -1
+               if is_uc_alpha(c)
                         tok_cur_tag.name += c.toLowerCase()
                         temporary_buffer += c
                         return null
-               if lc_alpha.indexOf(c) > -1
+               if is_lc_alpha(c)
                         tok_cur_tag.name += c
                         temporary_buffer += c
                         return null
@@ -2438,12 +2454,12 @@ parse_html = (txt, parse_error_cb = null) ->
         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
         tok_state_rawtext_end_tag_open = ->
                 c = txt.charAt(cur++)
-               if uc_alpha.indexOf(c) > -1
+               if is_uc_alpha(c)
                         tok_cur_tag = new_end_tag c.toLowerCase()
                         temporary_buffer += c
                         tok_state = tok_state_rawtext_end_tag_name
                         return null
-               if lc_alpha.indexOf(c) > -1
+               if is_lc_alpha(c)
                         tok_cur_tag = new_end_tag c
                         temporary_buffer += c
                         tok_state = tok_state_rawtext_end_tag_name
@@ -2471,11 +2487,11 @@ parse_html = (txt, parse_error_cb = null) ->
                                 tok_state = tok_state_data
                                 return tok_cur_tag
                         # else fall through to "Anything else"
-               if uc_alpha.indexOf(c) > -1
+               if is_uc_alpha(c)
                         tok_cur_tag.name += c.toLowerCase()
                         temporary_buffer += c
                         return null
-               if lc_alpha.indexOf(c) > -1
+               if is_lc_alpha(c)
                         tok_cur_tag.name += c
                         temporary_buffer += c
                         return null
@@ -2484,7 +2500,334 @@ parse_html = (txt, parse_error_cb = null) ->
                 cur -= 1 # reconsume the input character
                 return new_character_token '</' + temporary_buffer # fixfull separate these
  
-       # TODO _all_ of the missing states here (17-33) are for parsing script tags
+       # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
+       # Decides what a pending '<' in script data introduces: "</" may start an
+       # end tag, "<!" may start an escaped section, otherwise it is plain text.
+       tok_state_script_data_less_than_sign = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when '/'
+                               # start collecting a candidate end tag name
+                               temporary_buffer = ''
+                               tok_state = tok_state_script_data_end_tag_open
+                               return
+                       when '!'
+                               tok_state = tok_state_script_data_escape_start
+                               return new_character_token '<!' # fixfull split
+               # Anything else: the '<' was ordinary text; c is handled again as script data
+               cur -= 1 # Reconsume
+               tok_state = tok_state_script_data
+               return new_character_token '<'
+
+       # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
+       # Handles the character after "</" in script data. A letter opens a
+       # candidate end tag; any other character means "</" was just text.
+       tok_state_script_data_end_tag_open = ->
+               c = txt.charAt(cur++)
+               unless is_uc_alpha(c) or is_lc_alpha(c)
+                       # Anything else
+                       cur -= 1 # Reconsume
+                       tok_state = tok_state_script_data
+                       return new_character_token '</'
+               # the tag name is stored lowercased, while temporary_buffer keeps the
+               # character exactly as it appeared in the input
+               if is_uc_alpha(c)
+                       tok_cur_tag = new_end_tag c.toLowerCase()
+               else
+                       tok_cur_tag = new_end_tag c
+               temporary_buffer += c
+               tok_state = tok_state_script_data_end_tag_name
+               return
+
+       # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
+       # Accumulates the name of a candidate end tag inside script data. The tag
+       # is only honoured when is_appropriate_end_tag accepts it; otherwise the
+       # characters consumed so far ("</" + temporary_buffer) are emitted as text.
+       # Returns the finished tag token on '>', a text token on bail-out, and
+       # nothing while still accumulating or when handing off to another state.
+       tok_state_script_data_end_tag_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_before_attribute_name
+                               return
+                       # fall through
+               if c is '/'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_self_closing_start_tag
+                               return
+                       # fall through
+               if c is '>'
+                       # the end tag is complete: emit it and resume normal tokenizing
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       # fall through
+               if is_uc_alpha(c)
+                       tok_cur_tag.name += c.toLowerCase()
+                       temporary_buffer += c
+                       return
+               if is_lc_alpha(c)
+                       tok_cur_tag.name += c
+                       temporary_buffer += c
+                       return
+               # Anything else
+               tok_state = tok_state_script_data
+               cur -= 1 # Reconsume
+               return new_character_token "</#{temporary_buffer}" # fixfull split
+
+       # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
+       # A '-' here is emitted as text and moves on to the escape-start-dash
+       # state; any other character is handed back to plain script data.
+       tok_state_script_data_escape_start = ->
+               c = txt.charAt(cur++)
+               unless c is '-'
+                       # Anything else
+                       cur -= 1 # Reconsume
+                       tok_state = tok_state_script_data
+                       return
+               tok_state = tok_state_script_data_escape_start_dash
+               return new_character_token '-'
+
+       # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
+       # A second '-' is emitted as text and enters the escaped dash-dash state;
+       # any other character is handed back to plain script data.
+       tok_state_script_data_escape_start_dash = ->
+               c = txt.charAt(cur++)
+               if c isnt '-'
+                       # Anything else
+                       cur -= 1 # Reconsume
+                       tok_state = tok_state_script_data
+                       return
+               else
+                       tok_state = tok_state_script_data_escaped_dash_dash
+                       return new_character_token '-'
+
+       # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
+       # Main loop of escaped script data: ordinary characters are emitted as
+       # text; '-' and '<' start looking for the end of the escape or a tag.
+       tok_state_script_data_escaped = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when '-'
+                               tok_state = tok_state_script_data_escaped_dash
+                               return new_character_token '-'
+                       when '<'
+                               tok_state = tok_state_script_data_escaped_less_than_sign
+                               return
+                       when "\u0000"
+                               # NUL is replaced by U+FFFD
+                               parse_error()
+                               return new_character_token "\ufffd"
+                       when '' # EOF
+                               tok_state = tok_state_data
+                               parse_error()
+                               cur -= 1 # Reconsume
+                               return
+               # Anything else
+               return new_character_token c
+
+       # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
+       # One '-' has been seen in escaped script data. Another '-' advances to
+       # the dash-dash state; most other input returns to the escaped state.
+       tok_state_script_data_escaped_dash = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when '-'
+                               tok_state = tok_state_script_data_escaped_dash_dash
+                               return new_character_token '-'
+                       when '<'
+                               tok_state = tok_state_script_data_escaped_less_than_sign
+                               return
+                       when "\u0000"
+                               # NUL is replaced by U+FFFD
+                               parse_error()
+                               tok_state = tok_state_script_data_escaped
+                               return new_character_token "\ufffd"
+                       when '' # EOF
+                               tok_state = tok_state_data
+                               parse_error()
+                               cur -= 1 # Reconsume
+                               return
+               # Anything else
+               tok_state = tok_state_script_data_escaped
+               return new_character_token c
+
+       # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
+       # At least two '-' in a row have been seen in escaped script data, so a
+       # '>' here ends the escape and returns to plain script data.
+       tok_state_script_data_escaped_dash_dash = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when '-'
+                               # further dashes keep us in this state
+                               return new_character_token '-'
+                       when '<'
+                               tok_state = tok_state_script_data_escaped_less_than_sign
+                               return
+                       when '>'
+                               tok_state = tok_state_script_data
+                               return new_character_token '>'
+                       when "\u0000"
+                               # NUL is replaced by U+FFFD
+                               parse_error()
+                               tok_state = tok_state_script_data_escaped
+                               return new_character_token "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               cur -= 1 # Reconsume
+                               return
+               # Anything else
+               tok_state = tok_state_script_data_escaped
+               return new_character_token c
+
+       # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
+       # Decides what a pending '<' in escaped script data introduces: "</" may
+       # start an end tag, "<" + letter may start a nested "<script" (double
+       # escape), anything else means the '<' was plain text.
+       # Returns a text token when text is emitted, nothing otherwise.
+       tok_state_script_data_escaped_less_than_sign = ->
+               c = txt.charAt(cur++)
+               if c is '/'
+                       temporary_buffer = ''
+                       tok_state = tok_state_script_data_escaped_end_tag_open
+                       return
+               if is_uc_alpha(c)
+                       # buffer is lowercased for the later comparison with 'script',
+                       # but the emitted text keeps the original case
+                       temporary_buffer = c.toLowerCase() # yes, really
+                       tok_state = tok_state_script_data_double_escape_start
+                       return new_character_token "<#{c}" # fixfull split
+               if is_lc_alpha(c)
+                       temporary_buffer = c
+                       tok_state = tok_state_script_data_double_escape_start
+                       return new_character_token "<#{c}" # fixfull split
+               # Anything else: emit the pending '<' only. c must not be emitted
+               # here because it is reconsumed and handled by the escaped state.
+               tok_state = tok_state_script_data_escaped
+               cur -= 1 # Reconsume
+               return new_character_token '<'
+
+       # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
+       # Handles the character after "</" in escaped script data. A letter opens
+       # a candidate end tag; any other character means "</" was just text.
+       tok_state_script_data_escaped_end_tag_open = ->
+               c = txt.charAt(cur++)
+               unless is_uc_alpha(c) or is_lc_alpha(c)
+                       # Anything else
+                       cur -= 1 # Reconsume
+                       tok_state = tok_state_script_data_escaped
+                       return new_character_token '</' # fixfull split
+               # the tag name is stored lowercased, while temporary_buffer keeps the
+               # character exactly as it appeared in the input
+               if is_uc_alpha(c)
+                       tok_cur_tag = new_end_tag c.toLowerCase()
+               else
+                       tok_cur_tag = new_end_tag c
+               temporary_buffer += c
+               tok_state = tok_state_script_data_escaped_end_tag_name
+               return
+
+       # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
+       # Accumulates the name of a candidate end tag inside escaped script data.
+       # The tag is only honoured when is_appropriate_end_tag accepts it;
+       # otherwise the characters consumed so far ("</" + temporary_buffer) are
+       # emitted as text and tokenizing continues in the escaped state.
+       # Returns the finished tag token on '>', a text token on bail-out, and
+       # nothing while still accumulating or when handing off to another state.
+       tok_state_script_data_escaped_end_tag_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_before_attribute_name
+                               return
+                       # fall through
+               if c is '/'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_self_closing_start_tag
+                               return
+                       # fall through
+               if c is '>'
+                       # the end tag is complete: emit it and resume normal tokenizing
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       # fall through
+               if is_uc_alpha(c)
+                       tok_cur_tag.name += c.toLowerCase()
+                       # keep the original case: this buffer is re-emitted as text below
+                       temporary_buffer += c
+                       return
+               if is_lc_alpha(c)
+                       tok_cur_tag.name += c
+                       temporary_buffer += c
+                       return
+               # Anything else
+               tok_state = tok_state_script_data_escaped
+               cur -= 1 # Reconsume
+               return new_character_token "</#{temporary_buffer}" # fixfull split
+
+       # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
+       # Collects letters into temporary_buffer (lowercased) while echoing them
+       # as text. On a delimiter, enters the double escaped state only if the
+       # collected word is exactly 'script'.
+       tok_state_script_data_double_escape_start = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when "\t", "\u000a", "\u000c", ' ', '/', '>'
+                               tok_state = if temporary_buffer is 'script'
+                                       tok_state_script_data_double_escaped
+                               else
+                                       tok_state_script_data_escaped
+                               return new_character_token c
+               if is_uc_alpha(c) or is_lc_alpha(c)
+                       # buffer gets the lowercase form; the emitted text keeps c as-is
+                       temporary_buffer += (if is_uc_alpha(c) then c.toLowerCase() else c)
+                       return new_character_token c
+               # Anything else
+               cur -= 1 # Reconsume
+               tok_state = tok_state_script_data_escaped
+               return
+
+       # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
+       # Main loop of double escaped script data: every character is emitted as
+       # text; '-' and '<' additionally switch to their follow-up states.
+       tok_state_script_data_double_escaped = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when '-'
+                               tok_state = tok_state_script_data_double_escaped_dash
+                               return new_character_token '-'
+                       when '<'
+                               tok_state = tok_state_script_data_double_escaped_less_than_sign
+                               return new_character_token '<'
+                       when "\u0000"
+                               # NUL is replaced by U+FFFD
+                               parse_error()
+                               return new_character_token "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               cur -= 1 # Reconsume
+                               return
+               # Anything else
+               return new_character_token c
+
+       # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
+       # One '-' has been seen in double escaped script data. Another '-'
+       # advances to the dash-dash state; most other input returns to the
+       # double escaped state.
+       tok_state_script_data_double_escaped_dash = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when '-'
+                               tok_state = tok_state_script_data_double_escaped_dash_dash
+                               return new_character_token '-'
+                       when '<'
+                               tok_state = tok_state_script_data_double_escaped_less_than_sign
+                               return new_character_token '<'
+                       when "\u0000"
+                               # NUL is replaced by U+FFFD
+                               parse_error()
+                               tok_state = tok_state_script_data_double_escaped
+                               return new_character_token "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               cur -= 1 # Reconsume
+                               return
+               # Anything else
+               tok_state = tok_state_script_data_double_escaped
+               return new_character_token c
+
+       # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
+       # At least two '-' in a row have been seen in double escaped script data,
+       # so a '>' here ends the escape and returns to plain script data.
+       tok_state_script_data_double_escaped_dash_dash = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when '-'
+                               # further dashes keep us in this state
+                               return new_character_token '-'
+                       when '<'
+                               tok_state = tok_state_script_data_double_escaped_less_than_sign
+                               return new_character_token '<'
+                       when '>'
+                               tok_state = tok_state_script_data
+                               return new_character_token '>'
+                       when "\u0000"
+                               # NUL is replaced by U+FFFD
+                               parse_error()
+                               tok_state = tok_state_script_data_double_escaped
+                               return new_character_token "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               cur -= 1 # Reconsume
+                               return
+               # Anything else
+               tok_state = tok_state_script_data_double_escaped
+               return new_character_token c
+
+       # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
+       # After '<' in double escaped script data: a '/' is emitted as text and
+       # starts collecting a tag name that may end the double escape; anything
+       # else goes back to the double escaped state.
+       tok_state_script_data_double_escaped_less_than_sign = ->
+               c = txt.charAt(cur++)
+               unless c is '/'
+                       # Anything else
+                       cur -= 1 # Reconsume
+                       tok_state = tok_state_script_data_double_escaped
+                       return
+               temporary_buffer = ''
+               tok_state = tok_state_script_data_double_escape_end
+               return new_character_token '/'
+
+       # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
+       # Collects letters into temporary_buffer (lowercased) while echoing them
+       # as text. On a delimiter, drops back to the (single) escaped state only
+       # if the collected word is exactly 'script'.
+       tok_state_script_data_double_escape_end = ->
+               c = txt.charAt(cur++)
+               switch c
+                       when "\t", "\u000a", "\u000c", ' ', '/', '>'
+                               tok_state = if temporary_buffer is 'script'
+                                       tok_state_script_data_escaped
+                               else
+                                       tok_state_script_data_double_escaped
+                               return new_character_token c
+               if is_uc_alpha(c) or is_lc_alpha(c)
+                       # buffer gets the lowercase form; the emitted text keeps c as-is
+                       temporary_buffer += (if is_uc_alpha(c) then c.toLowerCase() else c)
+                       return new_character_token c
+               # Anything else
+               cur -= 1 # Reconsume
+               tok_state = tok_state_script_data_double_escaped
+               return
  
         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
         tok_state_before_attribute_name = ->
@@ -2510,7 +2853,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                 parse_error()
                                 tok_state = tok_state_data
                         else
-                               if uc_alpha.indexOf(c) > -1
+                               if is_uc_alpha(c)
                                         attr_name = c.toLowerCase()
                                 else
                                         attr_name = c
@@ -2543,7 +2886,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                 parse_error()
                                 tok_state = tok_state_data
                         else
-                               if uc_alpha.indexOf(c) > -1
+                               if is_uc_alpha(c)
                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
                                 else
                                         tok_cur_tag.attrs_a[0][0] += c
@@ -2563,7 +2906,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 if c is '>'
                         tok_state = tok_state_data
                         return
-               if uc_alpha.indexOf(c) > -1
+               if is_uc_alpha(c)
                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
                         tok_state = tok_state_attribute_name
                         return
@@ -2691,6 +3034,24 @@ parse_html = (txt, parse_error_cb = null) ->
                                 cur -= 1 # we didn't handle that char
                 return null
  
+       # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
+       # A '>' completes the tag: it is flagged 'self-closing' and emitted.
+       # Everything else is a parse error and the character is reconsumed.
+       # NOTE(review): relies on the current tag token exposing flag(); its
+       # definition is not visible here - confirm.
+       tok_state_self_closing_start_tag = ->
+               c = txt.charAt(cur++)
+               if c is '>'
+                       tok_cur_tag.flag 'self-closing'
+                       tok_state = tok_state_data
+                       return tok_cur_tag
+               # EOF ('') and "anything else" differ only in the state that
+               # reprocesses the character
+               parse_error()
+               tok_state = if c is '' then tok_state_data else tok_state_before_attribute_name
+               cur -= 1 # Reconsume
+               return
+
         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
         # WARNING: put a comment token in tok_cur_tag before setting this state
         tok_state_bogus_comment = ->
@@ -2718,7 +3079,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         tok_state = tok_state_doctype
                         return
                 acn = adjusted_current_node()
-               if acn and acn.namespace isnt NS_HTML and text.substr(cur, 7) is '[CDATA['
+               if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
                         cur += 7
                         tok_state = tok_state_cdata_section
                         return
@@ -2881,7 +3242,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 c = txt.charAt(cur++)
                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
                         return
-               if uc_alpha.indexOf(c) > -1
+               if is_uc_alpha(c)
                         tok_cur_tag = new_doctype_token c.toLowerCase()
                         tok_state = tok_state_doctype_name
                         return
@@ -2917,7 +3278,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 if c is '>'
                         tok_state = tok_state_data
                         return tok_cur_tag
-               if uc_alpha.indexOf(c) > -1
+               if is_uc_alpha(c)
                         tok_cur_tag.name += c.toLowerCase()
                         return
                 if c is "\u0000"
@@ -3379,7 +3740,7 @@ parse_html = (txt, parse_error_cb = null) ->
         pending_table_character_tokens = []
         head_element_pointer = null
         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
-       context_element = null # FIXME initialize from args.fragment
+       context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
  
         # tokenizer initialization
         tok_state = tok_state_data
diff --git a/test.coffee b/test.coffee

index 9e82bdd..3098bbd 100644 (file)
--- a/test.coffee
+++ b/test.coffee
@@ -7390,6 +7390,7 @@ tests = [
                 expected: "| <frame>\n"
         }, {
                 name: "tests_innerHTML_1.dat #85"
+               html: ""
                 fragment: "html"
                 expected: "| <head>\n| <body>\n"
         }, {
@@ -7832,6 +7833,8 @@ test_parser = (args) ->
                 #       console.log str
                 console.log "FAILED: \"#{args.name}\""
                 console.log "      Input: #{args.html}"
+               if args.fragment?
+                       console.log "   Fragment: #{args.fragment}"
                 console.log "    Correct: #{args.expected}"
                 console.log "     Output: #{serialized}"
                 if parse_errors.length > 0
@@ -7843,8 +7846,7 @@ test_parser = (args) ->
                 console.log "passed \"#{args.name}\""
                 test_results.passed += 1
  test_summary = ->
-       console.log "Tests passed: #{test_results.passed}"
-       console.log "Tests Failed: #{test_results.failed}"
+       console.log "Tests passed: #{test_results.passed}, Tests Failed: #{test_results.failed}"
  
  
  next_test = 0
@@ -7852,11 +7854,12 @@ run_tests_and_breathe = ->
         start_time = new Date()
         loop
                 if next_test >= tests.length
+                       test_summary()
                         return
                 test_parser tests[next_test]
                 next_test += 1
                 now = new Date()
                 if now - start_time > 100 # miliseconds
-                       setTimeout run_tests_and_breathe, 1
+                       break
+       setTimeout run_tests_and_breathe, 1
  run_tests_and_breathe()
-test_summary()
author	Jason Woofenden <jason@jasonwoof.com>
	Tue, 22 Dec 2015 22:30:45 +0000 (17:30 -0500)
committer	Jason Woofenden <jason@jasonwoof.com>
	Tue, 22 Dec 2015 22:30:45 +0000 (17:30 -0500)
parse-html.coffee		patch \| blob \| history
test.coffee		patch \| blob \| history