implement lots of raw-ish text parsing

author Jason Woofenden <jason@jasonwoof.com>

Sun, 20 Dec 2015 14:49:10 +0000 (09:49 -0500)

committer Jason Woofenden <jason@jasonwoof.com>

Sun, 20 Dec 2015 14:49:10 +0000 (09:49 -0500)
author Jason Woofenden <jason@jasonwoof.com>
Sun, 20 Dec 2015 14:49:10 +0000 (09:49 -0500)
committer Jason Woofenden <jason@jasonwoof.com>
Sun, 20 Dec 2015 14:49:10 +0000 (09:49 -0500)
diff --git a/parse-html.coffee b/parse-html.coffee

index 86d0136..d271706 100644 (file)
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -94,6 +94,8 @@ class Node
                 attrs = {}
                 attrs[k] = v for k, v of @attrs
                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
                 attrs = {}
                 attrs[k] = v for k, v of @attrs
                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
+       # Placeholder for the spec step "acknowledge the token's self-closing
+       # flag". The body is empty for now, so calling this has no effect.
+       acknowledge_self_closing: ->
+               # fixfull
         serialize: (shallow = false, show_ids = false) -> # for unit tests
                 ret = ''
                 switch @type
         serialize: (shallow = false, show_ids = false) -> # for unit tests
                 ret = ''
                 switch @type
@@ -149,6 +151,7 @@ new_element = (name) ->
         return new Node TYPE_TAG, name: name
  new_text_node = (txt) ->
         return new Node TYPE_TEXT, text: txt
         return new Node TYPE_TAG, name: name
  new_text_node = (txt) ->
         return new Node TYPE_TEXT, text: txt
+# alias: a character token is created exactly like a text node
+new_character_token = new_text_node
  new_comment_node = (txt) ->
         return new Node TYPE_COMMENT, text: txt
  new_eof_token = ->
  new_comment_node = (txt) ->
         return new Node TYPE_COMMENT, text: txt
  new_eof_token = ->
@@ -158,8 +161,8 @@ new_afe_marker = ->
  new_aaa_bookmark = ->
         return new Node TYPE_AAA_BOOKMARK
  
  new_aaa_bookmark = ->
         return new Node TYPE_AAA_BOOKMARK
  
-lc_alpha = "abcdefghijklmnopqrstuvwxqz"
-uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
+lc_alpha = "abcdefghijklmnopqrstuvwxyz"
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  digits = "0123456789"
  alnum = lc_alpha + uc_alpha + digits
  hex_chars = digits + "abcdefABCDEF"
  digits = "0123456789"
  alnum = lc_alpha + uc_alpha + digits
  hex_chars = digits + "abcdefABCDEF"
@@ -340,15 +343,19 @@ parse_html = (txt, parse_error_cb = null) ->
         cur = 0 # index of next char in txt to be parsed
         # declare tree and tokenizer variables so they're in scope below
         tree = null
         cur = 0 # index of next char in txt to be parsed
         # declare tree and tokenizer variables so they're in scope below
         tree = null
-       open_els = [] # stack of open elements
+       open_els = null # stack of open elements
+       afe = null # active formatting elements
+       template_insertion_modes = null
         insertion_mode = null
         insertion_mode = null
+       original_insertion_mode = null
         tok_state = null
         tok_cur_tag = null # partially parsed tag
         tok_state = null
         tok_cur_tag = null # partially parsed tag
+       flag_scripting = null
         flag_frameset_ok = null
         flag_parsing = null
         flag_foster_parenting = null
         form_element_pointer = null
         flag_frameset_ok = null
         flag_parsing = null
         flag_foster_parenting = null
         form_element_pointer = null
-       afe = [] # active formatting elements
+       temporary_buffer = null
  
         parse_error = ->
                 if parse_error_cb?
  
         parse_error = ->
                 if parse_error_cb?
@@ -872,7 +879,6 @@ parse_html = (txt, parse_error_cb = null) ->
                 debug_log "AAA DONE"
  
         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
                 debug_log "AAA DONE"
  
         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
-       # FIXME test this (particularly emplied end tags)
         close_p_element = ->
                 generate_implied_end_tags 'p' # arg is exception
                 if open_els[0].name isnt 'p'
         close_p_element = ->
                 generate_implied_end_tags 'p' # arg is exception
                 if open_els[0].name isnt 'p'
@@ -886,7 +892,8 @@ parse_html = (txt, parse_error_cb = null) ->
                         close_p_element()
  
         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
                         close_p_element()
  
         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
-       tree_insert_text = (t) ->
+       # aka insert_a_character = (t) ->
+       insert_character = (t) ->
                 dest = adjusted_insertion_location()
                 # fixfull check for Document node
                 if dest[1] > 0
                 dest = adjusted_insertion_location()
                 # fixfull check for Document node
                 if dest[1] > 0
@@ -1046,17 +1053,114 @@ parse_html = (txt, parse_error_cb = null) ->
  
         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
         # position should be [node, index_within_children]
  
         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
         # position should be [node, index_within_children]
-       tree_insert_a_comment = (t, position = null) ->
+       tree_insert_comment = (t, position = null) ->
                 position ?= adjusted_insertion_location()
                 position[0].children.splice position[1], 0, t
  
                 position ?= adjusted_insertion_location()
                 position[0].children.splice position[1], 0, t
  
+       # 8.2.5.2
+       # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
+       # Generic raw text element parsing for start tag token t: insert the
+       # element, remember the insertion mode in effect on entry, then switch
+       # the tokenizer to the RAWTEXT state and tree construction to the
+       # "text" insertion mode.
+       parse_generic_raw_text = (t) ->
+               insert_html_element t
+               original_insertion_mode = insertion_mode # restored when the element ends
+               tok_state = tok_state_rawtext
+               insertion_mode = ins_mode_text
+       # Generic RCDATA element parsing for start tag token t: insert the
+       # element, remember the insertion mode in effect on entry, then switch
+       # the tokenizer to the RCDATA state and tree construction to the
+       # "text" insertion mode.
+       parse_generic_rcdata_text = (t) ->
+               insert_html_element t
+               original_insertion_mode = insertion_mode # restored when the element ends
+               tok_state = tok_state_rcdata
+               insertion_mode = ins_mode_text
+
         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
         generate_implied_end_tags = (except = null) ->
                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
                         open_els.shift()
  
         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
         generate_implied_end_tags = (except = null) ->
                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
                         open_els.shift()
  
-       # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+       # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
+       # "Anything else" case of the "in head" insertion mode. It lives in its
+       # own function so that several branches of ins_mode_in_head can jump
+       # to it, mirroring the flow control in the spec.
+       ins_mode_in_head_else = (t) ->
+               # per the spec the current node is the 'head' element here
+               open_els.shift()
+               # switch to "after head" and reprocess the same token there
+               insertion_mode = ins_mode_after_head
+               ins_mode_after_head t
+       # Tree construction for the "in head" insertion mode (8.2.5.4.4 in the
+       # HTML5 spec). t is the token to process. Each "if" below is one case
+       # from the spec, in spec order; cases that the spec routes to
+       # "anything else" call ins_mode_in_head_else.
+       ins_mode_in_head = (t) ->
+               # whitespace character tokens are inserted as-is
+               if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
+                       insert_character t
+                       return
+               if t.type is TYPE_COMMENT
+                       tree_insert_comment t
+                       return
+               if t.type is TYPE_DOCTYPE
+                       parse_error()
+                       return
+               # <html> start tag: handled by the "in body" rules
+               if t.type is TYPE_START_TAG and t.name is 'html'
+                       ins_mode_in_body t
+                       return
+               # void head elements: insert, then immediately pop them again
+               if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
+                       el = insert_html_element t
+                       open_els.shift()
+                       el.acknowledge_self_closing()
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'meta'
+                       el = insert_html_element t
+                       open_els.shift()
+                       el.acknowledge_self_closing()
+                       # fixfull encoding stuff
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'title'
+                       # the RCDATA helper defined above is parse_generic_rcdata_text
+                       parse_generic_rcdata_text t
+                       return
+               if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
+                       parse_generic_raw_text t
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
+                       insert_html_element t
+                       insertion_mode = in_head_noscript # FIXME implement
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'script'
+                       ail = adjusted_insertion_location()
+                       el = token_to_element t, NS_HTML, ail
+                       el.flag_parser_inserted true # FIXME implement
+                       # fixfull fragment case
+                       ail[0].children.splice ail[1], 0, el
+                       open_els.unshift el
+                       tok_state = tok_state_script_data
+                       original_insertion_mode = insertion_mode # make sure orig... is defined
+                       insertion_mode = ins_mode_text # FIXME implement
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'head'
+                       open_els.shift() # will be a head element... spec says so
+                       insertion_mode = ins_mode_after_head
+                       return
+               if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
+                       ins_mode_in_head_else t
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'template'
+                       insert_html_element t
+                       afe_push_marker()
+                       flag_frameset_ok = false
+                       insertion_mode = ins_mode_in_template
+                       template_insertion_modes.unshift ins_mode_in_template # FIXME implement
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'template'
+                       if template_tag_is_open()
+                               # parentheses are required: a bare name would only
+                               # reference the function without calling it
+                               generate_implied_end_tags()
+                               if open_els[0].name isnt 'template'
+                                       parse_error()
+                               # pop elements up to and including the template
+                               loop
+                                       el = open_els.shift()
+                                       if el.name is 'template'
+                                               break
+                               clear_afe_to_marker()
+                               template_insertion_modes.shift()
+                               reset_insertion_mode()
+                       else
+                               parse_error()
+                       return
+               # a stray <head> start tag, or any other end tag, is ignored
+               if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
+                       parse_error()
+                       return
+               ins_mode_in_head_else t
+
+       # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
                 for node, i in open_els
                         if node.name is name # FIXME check namespace too
         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
                 for node, i in open_els
                         if node.name is name # FIXME check namespace too
@@ -1077,13 +1181,13 @@ parse_html = (txt, parse_error_cb = null) ->
                                                 parse_error()
                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
                                                 reconstruct_active_formatting_elements()
                                                 parse_error()
                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
                                                 reconstruct_active_formatting_elements()
-                                               tree_insert_text t
+                                               insert_character t
                                         else
                                                 reconstruct_active_formatting_elements()
                                         else
                                                 reconstruct_active_formatting_elements()
-                                               tree_insert_text t
+                                               insert_character t
                                                 flag_frameset_ok = false
                         when TYPE_COMMENT
                                                 flag_frameset_ok = false
                         when TYPE_COMMENT
-                               tree_insert_a_comment t
+                               tree_insert_comment t
                         when TYPE_DOCTYPE
                                 parse_error()
                         when TYPE_START_TAG
                         when TYPE_DOCTYPE
                                 parse_error()
                         when TYPE_START_TAG
@@ -1096,7 +1200,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                                         root_attrs[k] = v unless root_attrs[k]?
                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
                                                 # FIXME also do this for </template> (end tag)
                                                         root_attrs[k] = v unless root_attrs[k]?
                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
                                                 # FIXME also do this for </template> (end tag)
-                                               return tree_in_head t
+                                               return ins_mode_in_head t
                                         when 'body'
                                                 parse_error()
                                                 # TODO
                                         when 'body'
                                                 parse_error()
                                                 # TODO
@@ -1255,6 +1359,36 @@ parse_html = (txt, parse_error_cb = null) ->
                         el = afe.shift()
                         if el.type is TYPE_AFE_MARKER
                                 return
                         el = afe.shift()
                         if el.type is TYPE_AFE_MARKER
                                 return
+
+       # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
+       # Tree construction for the "text" insertion mode. Character tokens are
+       # inserted; EOF and end tags pop the current node and restore the
+       # insertion mode saved in original_insertion_mode.
+       ins_mode_text = (t) ->
+               switch t.type
+                       when TYPE_TEXT
+                               insert_character t
+                       when TYPE_EOF
+                               parse_error()
+                               if open_els[0].name is 'script'
+                                       open_els[0].flag 'already started', true
+                               open_els.shift()
+                               insertion_mode = original_insertion_mode
+                               # reprocess the EOF token in the restored mode
+                               insertion_mode t
+                       when TYPE_END_TAG
+                               # </script> is treated like any other end tag here.
+                               # fixfull the spec seems to assume that I'm going to run the script
+                               # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
+                               open_els.shift()
+                               insertion_mode = original_insertion_mode
+                       else
+                               console.log 'warning: end of ins_mode_text reached'
+               return
+
+       # the functions below implement the tokenizer states described here:
+       # http://www.w3.org/TR/html5/syntax.html#tokenization
+
+       # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
         ins_mode_in_table = (t) ->
                 switch t.type
                         when TYPE_TEXT
         ins_mode_in_table = (t) ->
                 switch t.type
                         when TYPE_TEXT
@@ -1265,7 +1399,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                 else
                                         ins_mode_in_table_else t
                         when TYPE_COMMENT
                                 else
                                         ins_mode_in_table_else t
                         when TYPE_COMMENT
-                               tree_insert_a_comment t
+                               tree_insert_comment t
                         when TYPE_DOCTYPE
                                 parse_error()
                         when TYPE_START_TAG
                         when TYPE_DOCTYPE
                                 parse_error()
                         when TYPE_START_TAG
@@ -1309,9 +1443,9 @@ parse_html = (txt, parse_error_cb = null) ->
                                                         ins_mode_in_table_else t
                                                 else
                                                         parse_error()
                                                         ins_mode_in_table_else t
                                                 else
                                                         parse_error()
-                                                       insert_html_element t
+                                                       el = insert_html_element t
                                                         open_els.shift()
                                                         open_els.shift()
-                                                       # fixfull acknowledge sef-closing flag
+                                                       el.acknowledge_self_closing()
                                         when 'form'
                                                 parse_error()
                                                 if form_element_pointer?
                                         when 'form'
                                                 parse_error()
                                                 if form_element_pointer?
@@ -1345,6 +1479,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                 ins_mode_in_table_else t
  
  
                                 ins_mode_in_table_else t
  
  
+       # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
         ins_mode_in_table_text = (t) ->
                 switch t.type
                         when TYPE_TEXT
         ins_mode_in_table_text = (t) ->
                 switch t.type
                         when TYPE_TEXT
@@ -1355,6 +1490,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 console.log "unimplemented ins_mode_in_table_text"
                 # FIXME CONTINUE
  
                 console.log "unimplemented ins_mode_in_table_text"
                 # FIXME CONTINUE
  
+       # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
         ins_mode_in_table_body = (t) ->
                 if t.type is TYPE_START_TAG and t.name is 'tr'
                         clear_stack_to_table_body_context()
         ins_mode_in_table_body = (t) ->
                 if t.type is TYPE_START_TAG and t.name is 'tr'
                         clear_stack_to_table_body_context()
@@ -1397,6 +1533,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 # Anything else
                 ins_mode_in_table t
  
                 # Anything else
                 ins_mode_in_table t
  
+       # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
         ins_mode_in_row = (t) ->
                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
                         clear_stack_to_table_row_context()
         ins_mode_in_row = (t) ->
                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
                         clear_stack_to_table_row_context()
@@ -1449,7 +1586,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 clear_afe_to_marker()
                 insertion_mode = ins_mode_in_row
  
                 clear_afe_to_marker()
                 insertion_mode = ins_mode_in_row
  
-       # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
+       # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
         ins_mode_in_cell = (t) ->
                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
                         if is_in_table_scope t.name
         ins_mode_in_cell = (t) ->
                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
                         if is_in_table_scope t.name
@@ -1492,15 +1629,11 @@ parse_html = (txt, parse_error_cb = null) ->
                 # Anything Else
                 ins_mode_in_body t
  
                 # Anything Else
                 ins_mode_in_body t
  
-
-       # the functions below implement the tokenizer stats described here:
-       # http://www.w3.org/TR/html5/syntax.html#tokenization
-
         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
         tok_state_data = ->
                 switch c = txt.charAt(cur++)
                         when '&'
         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
         tok_state_data = ->
                 switch c = txt.charAt(cur++)
                         when '&'
-                               return new_text_node tokenize_character_reference()
+                               return new_text_node parse_character_reference()
                         when '<'
                                 tok_state = tok_state_tag_open
                         when "\u0000"
                         when '<'
                                 tok_state = tok_state_tag_open
                         when "\u0000"
@@ -1514,7 +1647,68 @@ parse_html = (txt, parse_error_cb = null) ->
  
         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
         # not needed: tok_state_character_reference_in_data = ->
  
         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
         # not needed: tok_state_character_reference_in_data = ->
-       # just call tok_state_character_reference_in_data()
+       # just call parse_character_reference()
+
+       # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
+       # RCDATA tokenizer state. Consumes one character and returns the token
+       # to emit, or null when only the tokenizer state changed.
+       tok_state_rcdata = ->
+               c = txt.charAt(cur++)
+               if c is '&'
+                       return new_text_node parse_character_reference()
+               if c is '<'
+                       tok_state = tok_state_rcdata_less_than_sign
+                       return null
+               if c is "\u0000"
+                       parse_error()
+                       return new_character_token "\ufffd"
+               if c is '' # EOF
+                       return new_eof_token()
+               # any other character is emitted unchanged
+               return new_character_token c
+
+       # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
+       # not needed: tok_state_character_reference_in_rcdata = ->
+       # just call parse_character_reference()
+
+       # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
+       # RAWTEXT tokenizer state. Consumes one character and returns the token
+       # to emit, or null when only the tokenizer state changed.
+       tok_state_rawtext = ->
+               c = txt.charAt(cur++)
+               if c is '<'
+                       tok_state = tok_state_rawtext_less_than_sign
+                       return null
+               if c is "\u0000"
+                       parse_error()
+                       return new_character_token "\ufffd"
+               if c is '' # EOF
+                       return new_eof_token()
+               # any other character is emitted unchanged
+               return new_character_token c
+
+       # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
+       # Script data tokenizer state. Consumes one character and returns the
+       # token to emit, or null when only the tokenizer state changed.
+       tok_state_script_data = ->
+               c = txt.charAt(cur++)
+               if c is '<'
+                       tok_state = tok_state_script_data_less_than_sign
+                       return null
+               if c is "\u0000"
+                       parse_error()
+                       return new_character_token "\ufffd"
+               if c is '' # EOF
+                       return new_eof_token()
+               # any other character is emitted unchanged
+               return new_character_token c
+
+       # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
+       # PLAINTEXT tokenizer state. Consumes one character and always returns
+       # a token: U+FFFD for NUL, an EOF token at end of input, otherwise the
+       # character itself.
+       tok_state_plaintext = ->
+               c = txt.charAt(cur++)
+               if c is "\u0000"
+                       parse_error()
+                       return new_character_token "\ufffd"
+               if c is '' # EOF
+                       return new_eof_token()
+               return new_character_token c
+
  
         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
         tok_state_tag_open = ->
  
         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
         tok_state_tag_open = ->
@@ -1587,6 +1781,140 @@ parse_html = (txt, parse_error_cb = null) ->
                                         tok_cur_tag.name += c
                 return null
  
                                         tok_cur_tag.name += c
                 return null
  
+       # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
+       # RCDATA less-than sign state: entered after '<' was seen in RCDATA.
+       # A following '/' may start an end tag; anything else means the '<'
+       # was plain text.
+       tok_state_rcdata_less_than_sign = ->
+               c = txt.charAt(cur++)
+               unless c is '/'
+                       # Anything else
+                       cur -= 1 # reconsume the input character
+                       tok_state = tok_state_rcdata
+                       return new_character_token '<'
+               temporary_buffer = ''
+               tok_state = tok_state_rcdata_end_tag_open
+               return null
+
+       # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
+       # RCDATA end tag open state: entered after "</" was seen in RCDATA.
+       # An ASCII letter starts a new end tag token (name lowercased, original
+       # character kept in temporary_buffer); anything else, including EOF,
+       # emits "</" as text and goes back to the RCDATA state.
+       # Returns the token to emit, or null when only the state changed.
+       tok_state_rcdata_end_tag_open = ->
+               c = txt.charAt(cur++)
+               # c is '' at EOF, and indexOf('') is 0 for every string, so EOF has
+               # to be excluded explicitly or it would be mistaken for a letter
+               if c isnt '' and (uc_alpha.indexOf(c) > -1 or lc_alpha.indexOf(c) > -1)
+                       tok_cur_tag = new_end_tag c.toLowerCase()
+                       temporary_buffer += c
+                       tok_state = tok_state_rcdata_end_tag_name
+                       return null
+               # Anything else
+               tok_state = tok_state_rcdata
+               cur -= 1 # reconsume the input character
+               return new_character_token "</" # fixfull separate these
+
+       # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
+       # Approximation of the spec's "appropriate end tag token" test. The
+       # spec compares against the tag name of the last start tag emitted by
+       # the tokenizer; this compares against the current node (open_els[0])
+       # instead, on the assumption that every "raw" state pushed its start
+       # tag onto open_els. TODO: verify this after the script data states
+       # are implemented
+       is_appropriate_end_tag = (t) ->
+               debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
+               return false unless t.type is TYPE_END_TAG
+               return t.name is open_els[0].name
+
+       # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
+       # RCDATA end tag name state: accumulates the name of a candidate end
+       # tag. Whitespace, '/' and '>' finish the tag only when it is an
+       # appropriate end tag; otherwise (and for any non-letter, including
+       # EOF) the buffered "</name" is emitted as text and tokenizing resumes
+       # in the RCDATA state.
+       # Returns the token to emit, or null when only the state changed.
+       tok_state_rcdata_end_tag_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_before_attribute_name
+                               return null
+                       # else fall through to "Anything else"
+               if c is '/'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
+                               return null
+                       # else fall through to "Anything else"
+               if c is '>'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       # else fall through to "Anything else"
+               # c is '' at EOF, and indexOf('') is 0 for every string; without the
+               # explicit check EOF would count as a letter and this state would
+               # never be left
+               if c isnt '' and uc_alpha.indexOf(c) > -1
+                       tok_cur_tag.name += c.toLowerCase()
+                       temporary_buffer += c
+                       return null
+               if c isnt '' and lc_alpha.indexOf(c) > -1
+                       tok_cur_tag.name += c
+                       temporary_buffer += c
+                       return null
+               # Anything else
+               tok_state = tok_state_rcdata
+               cur -= 1 # reconsume the input character
+               return new_character_token '</' + temporary_buffer # fixfull separate these
+
+       # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
+       # RAWTEXT less-than sign state: entered after '<' was seen in RAWTEXT.
+       # A following '/' may start an end tag; anything else means the '<'
+       # was plain text.
+       tok_state_rawtext_less_than_sign = ->
+               c = txt.charAt(cur++)
+               unless c is '/'
+                       # Anything else
+                       cur -= 1 # reconsume the input character
+                       tok_state = tok_state_rawtext
+                       return new_character_token '<'
+               temporary_buffer = ''
+               tok_state = tok_state_rawtext_end_tag_open
+               return null
+
+       # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
+       # RAWTEXT end tag open state: entered after "</" was seen in RAWTEXT.
+       # An ASCII letter starts a new end tag token (name lowercased, original
+       # character kept in temporary_buffer); anything else, including EOF,
+       # emits "</" as text and goes back to the RAWTEXT state.
+       # Returns the token to emit, or null when only the state changed.
+       tok_state_rawtext_end_tag_open = ->
+               c = txt.charAt(cur++)
+               # c is '' at EOF, and indexOf('') is 0 for every string, so EOF has
+               # to be excluded explicitly or it would be mistaken for a letter
+               if c isnt '' and (uc_alpha.indexOf(c) > -1 or lc_alpha.indexOf(c) > -1)
+                       tok_cur_tag = new_end_tag c.toLowerCase()
+                       temporary_buffer += c
+                       tok_state = tok_state_rawtext_end_tag_name
+                       return null
+               # Anything else
+               tok_state = tok_state_rawtext
+               cur -= 1 # reconsume the input character
+               return new_character_token "</" # fixfull separate these
+
+       # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
+       # RAWTEXT end tag name state: accumulates the name of a candidate end
+       # tag. Whitespace, '/' and '>' finish the tag only when it is an
+       # appropriate end tag; otherwise (and for any non-letter, including
+       # EOF) the buffered "</name" is emitted as text and tokenizing resumes
+       # in the RAWTEXT state.
+       # Returns the token to emit, or null when only the state changed.
+       tok_state_rawtext_end_tag_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_before_attribute_name
+                               return null
+                       # else fall through to "Anything else"
+               if c is '/'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_self_closing_start_tag
+                               return null
+                       # else fall through to "Anything else"
+               if c is '>'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       # else fall through to "Anything else"
+               # c is '' at EOF, and indexOf('') is 0 for every string; without the
+               # explicit check EOF would count as a letter and this state would
+               # never be left
+               if c isnt '' and uc_alpha.indexOf(c) > -1
+                       tok_cur_tag.name += c.toLowerCase()
+                       temporary_buffer += c
+                       return null
+               if c isnt '' and lc_alpha.indexOf(c) > -1
+                       tok_cur_tag.name += c
+                       temporary_buffer += c
+                       return null
+               # Anything else
+               tok_state = tok_state_rawtext
+               cur -= 1 # reconsume the input character
+               return new_character_token '</' + temporary_buffer # fixfull separate these
+
+       # TODO _all_ of the missing states here (17-33) are for parsing script tags
+
         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
         tok_state_before_attribute_name = ->
                 attr_name = null
         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
         tok_state_before_attribute_name = ->
                 attr_name = null
@@ -1721,7 +2049,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         when '"'
                                 tok_state = tok_state_after_attribute_value_quoted
                         when '&'
                         when '"'
                                 tok_state = tok_state_after_attribute_value_quoted
                         when '&'
-                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
+                               tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
                         when "\u0000"
                                 # Parse error
                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
                         when "\u0000"
                                 # Parse error
                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
@@ -1738,7 +2066,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         when "'"
                                 tok_state = tok_state_after_attribute_value_quoted
                         when '&'
                         when "'"
                                 tok_state = tok_state_after_attribute_value_quoted
                         when '&'
-                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
+                               tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
                         when "\u0000"
                                 # Parse error
                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
                         when "\u0000"
                                 # Parse error
                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
@@ -1755,7 +2083,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         when "\t", "\n", "\u000c", ' '
                                 tok_state = tok_state_before_attribute_name
                         when '&'
                         when "\t", "\n", "\u000c", ' '
                                 tok_state = tok_state_before_attribute_name
                         when '&'
-                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
+                               tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
                         when '>'
                                 tok_state = tok_state_data
                                 tmp = tok_cur_tag
                         when '>'
                                 tok_state = tok_state_data
                                 tmp = tok_cur_tag
@@ -1795,7 +2123,7 @@ parse_html = (txt, parse_error_cb = null) ->
         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
         # Don't set this as a state, just call it
         # returns a string (NOT a text node)
         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
         # Don't set this as a state, just call it
         # returns a string (NOT a text node)
-       tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+       parse_character_reference = (allowed_char = null, in_attr = false) ->
                 if cur >= txt.length
                         return '&'
                 switch c = txt.charAt(cur)
                 if cur >= txt.length
                         return '&'
                 switch c = txt.charAt(cur)
@@ -1872,12 +2200,16 @@ parse_html = (txt, parse_error_cb = null) ->
         # see comments on TYPE_TAG/etc for the structure of this data
         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
         open_els = [tree]
         # see comments on TYPE_TAG/etc for the structure of this data
         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
         open_els = [tree]
+       afe = [] # active formatting elements
+       template_insertion_modes = []
         insertion_mode = ins_mode_in_body
         insertion_mode = ins_mode_in_body
+       original_insertion_mode = insertion_mode # TODO check spec
+       flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
         flag_frameset_ok = true
         flag_parsing = true
         flag_foster_parenting = false
         form_element_pointer = null
         flag_frameset_ok = true
         flag_parsing = true
         flag_foster_parenting = false
         form_element_pointer = null
-       afe = [] # active formatting elements
+       temporary_buffer = null
  
         # tokenizer initialization
         tok_state = tok_state_data
  
         # tokenizer initialization
         tok_state = tok_state_data
@@ -2042,3 +2374,9 @@ test_parser name: "variation on html5lib aaa 17 (with attributes in various orde
  test_parser name: "junk after attribute close-quote", \
         html: '<p><b c="d", e="f">foo<p>x',
         expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
  test_parser name: "junk after attribute close-quote", \
         html: '<p><b c="d", e="f">foo<p>x',
         expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
+# Two more parser cases. The names suggest they were ported from html5lib's
+# "aaa" (presumably adoption agency algorithm) tests -- TODO confirm upstream.
+# Case 1: </b> arrives while <i> and <p> are still open.
+test_parser name: "html5lib aaa02 1", \
+       html: '<b>1<i>2<p>3</b>4',
+       expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
+# Case 2: a second <a> start tag arrives while an earlier <a> is still open.
+test_parser name: "html5lib aaa02 2", \
+       html: '<a><div><style></style><address><a>',
+       expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
author	Jason Woofenden <jason@jasonwoof.com>
	Sun, 20 Dec 2015 14:49:10 +0000 (09:49 -0500)
committer	Jason Woofenden <jason@jasonwoof.com>
	Sun, 20 Dec 2015 14:49:10 +0000 (09:49 -0500)