implement lots of raw-ish text parsing

[peach-html5-editor.git] / parse-html.coffee
diff --git a/parse-html.coffee b/parse-html.coffee

index c71567d..d271706 100644 (file)
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -94,6 +94,8 @@ class Node
                 attrs = {}
                 attrs[k] = v for k, v of @attrs
                 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
+       # Stub for acknowledging a token's self-closing flag. The body is
+       # currently empty (only the "fixfull" marker), so calling it is a no-op.
+       acknowledge_self_closing: ->
+               # fixfull
         serialize: (shallow = false, show_ids = false) -> # for unit tests
                 ret = ''
                 switch @type
@@ -105,8 +107,17 @@ class Node
                                         ret += "##{@id},"
                                 if shallow
                                         break
-                               ret += JSON.stringify @attrs
-                               ret += ',['
+                               attr_keys = []
+                               for k of @attrs
+                                       attr_keys.push k
+                               attr_keys.sort()
+                               ret += '{'
+                               sep = ''
+                               for k in attr_keys
+                                       ret += sep
+                                       sep = ','
+                                       ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
+                               ret += '},['
                                 sep = ''
                                 for c in @children
                                         ret += sep
@@ -140,6 +151,7 @@ new_element = (name) ->
         return new Node TYPE_TAG, name: name
  new_text_node = (txt) ->
         return new Node TYPE_TEXT, text: txt
+new_character_token = new_text_node
  new_comment_node = (txt) ->
         return new Node TYPE_COMMENT, text: txt
  new_eof_token = ->
@@ -149,8 +161,8 @@ new_afe_marker = ->
  new_aaa_bookmark = ->
         return new Node TYPE_AAA_BOOKMARK
  
-lc_alpha = "abcdefghijklmnopqrstuvwxqz"
-uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
+lc_alpha = "abcdefghijklmnopqrstuvwxyz"
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
  digits = "0123456789"
  alnum = lc_alpha + uc_alpha + digits
  hex_chars = digits + "abcdefABCDEF"
@@ -331,15 +343,19 @@ parse_html = (txt, parse_error_cb = null) ->
         cur = 0 # index of next char in txt to be parsed
         # declare tree and tokenizer variables so they're in scope below
         tree = null
-       open_els = [] # stack of open elements
+       open_els = null # stack of open elements
+       afe = null # active formatting elements
+       template_insertion_modes = null
         insertion_mode = null
+       original_insertion_mode = null
         tok_state = null
         tok_cur_tag = null # partially parsed tag
+       flag_scripting = null
         flag_frameset_ok = null
         flag_parsing = null
         flag_foster_parenting = null
         form_element_pointer = null
-       afe = [] # active formatting elements
+       temporary_buffer = null
  
         parse_error = ->
                 if parse_error_cb?
@@ -347,6 +363,21 @@ parse_html = (txt, parse_error_cb = null) ->
                 else
                         console.log "Parse error at character #{cur} of #{txt.length}"
  
+       # Push new_el onto the list of active formatting elements (newest entry
+       # is kept at index 0), applying the spec's "Noah's Ark clause": if three
+       # entries after the last marker already have the same tag name, namespace
+       # and attributes as new_el, the earliest of them is removed first.
+       afe_push = (new_el) ->
+               matches = 0
+               for el, i in afe
+                       # only entries after the last marker (i.e. before it in this
+                       # newest-first array) take part in the comparison
+                       break if el.type is TYPE_AFE_MARKER
+                       continue unless el.name is new_el.name and el.namespace is new_el.namespace
+                       # attributes must be identical in both directions
+                       attrs_match = true
+                       for k, v of el.attrs
+                               unless new_el.attrs[k] is v
+                                       attrs_match = false
+                                       break
+                       if attrs_match
+                               for k, v of new_el.attrs
+                                       unless el.attrs[k] is v
+                                               attrs_match = false
+                                               break
+                       continue unless attrs_match
+                       matches += 1
+                       if matches is 3
+                               # walking newest -> oldest, so the third match is the earliest
+                               afe.splice i, 1
+                               break
+               afe.unshift new_el
+       # Put a fresh marker (from new_afe_marker) at the front of the list of
+       # active formatting elements.
+       afe_push_marker = ->
+               afe.unshift new_afe_marker()
  
         # the functions below impliment the Tree Contstruction algorithm
         # http://www.w3.org/TR/html5/syntax.html#tree-construction
@@ -553,7 +584,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         tree_insert_element el
                         afe[i] = el
                         break if i is 0
-                       i -= 1
+                       i -= 1 # Advance
  
         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
         # adoption agency algorithm
@@ -562,6 +593,10 @@ parse_html = (txt, parse_error_cb = null) ->
         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
         adoption_agency = (subject) ->
+               debug_log "adoption_agency()"
+               debug_log "tree: #{serialize_els tree.children, false, true}"
+               debug_log "open_els: #{serialize_els open_els, true, true}"
+               debug_log "afe: #{serialize_els afe, true, true}"
                 if open_els[0].name is subject
                         el = open_els[0]
                         open_els.shift()
@@ -570,6 +605,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                 if t is el
                                         afe.splice i, 1
                                         break
+                       debug_log "aaa: starting off with subject on top of stack, exiting"
                         return
                 outer = 0
                 loop
@@ -590,6 +626,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         # If there is no such element, then abort these steps and instead
                         # act as described in the "any other end tag" entry above.
                         if fe is null
+                               debug_log "aaa: fe not found in afe"
                                 in_body_any_other_end_tag subject
                                 return
                         # 6. If formatting element is not in the stack of open elements,
@@ -601,6 +638,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                         in_open_els = true
                                         break
                         unless in_open_els
+                               debug_log "aaa: fe not found in open_els"
                                 parse_error()
                                 # "remove it from the list" must mean afe, since it's not in open_els
                                 afe.splice fe_of_afe, 1
@@ -609,6 +647,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         # the element is not in scope, then this is a parse error; abort
                         # these steps.
                         unless el_is_in_scope fe
+                               debug_log "aaa: fe not in scope"
                                 parse_error()
                                 return
                         # 8. If formatting element is not the current node, this is a parse
@@ -634,6 +673,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         # formatting element from the list of active formatting elements,
                         # and finally abort these steps.
                         if fb is null
+                               debug_log "aaa: no fb"
                                 loop
                                         t = open_els.shift()
                                         if t is fe
@@ -666,8 +706,8 @@ parse_html = (txt, parse_error_cb = null) ->
                                                 break
                                 node = node_next ? node_above
                                 debug_log "inner loop #{inner}"
-                               debug_log "open_els: #{serialize_els open_els, true, true}"
                                 debug_log "tree: #{serialize_els tree.children, false, true}"
+                               debug_log "open_els: #{serialize_els open_els, true, true}"
                                 debug_log "afe: #{serialize_els afe, true, true}"
                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
@@ -839,22 +879,23 @@ parse_html = (txt, parse_error_cb = null) ->
                 debug_log "AAA DONE"
  
         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
-       # FIXME test this (particularly emplied end tags)
         close_p_element = ->
                 generate_implied_end_tags 'p' # arg is exception
                 if open_els[0].name isnt 'p'
                         parse_error()
                 while open_els.length > 1 # just in case
-                       t = open_els.shift()
-                       if t.name is 'p'
+                       el = open_els.shift()
+                       if el.name is 'p'
                                 return
         close_p_if_in_button_scope = ->
                 if is_in_button_scope 'p'
-                       close_a_p_element()
+                       close_p_element()
  
         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
-       tree_insert_text = (t) ->
+       # aka insert_a_character = (t) ->
+       insert_character = (t) ->
                 dest = adjusted_insertion_location()
+               # fixfull check for Document node
                 if dest[1] > 0
                         prev = dest[0].children[dest[1] - 1]
                         if prev.type is TYPE_TEXT
@@ -1012,17 +1053,114 @@ parse_html = (txt, parse_error_cb = null) ->
  
         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
         # position should be [node, index_within_children]
-       tree_insert_a_comment = (t, position = null) ->
+       tree_insert_comment = (t, position = null) ->
                 position ?= adjusted_insertion_location()
                 position[0].children.splice position[1], 0, t
  
+       # 8.2.5.2
+       # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
+       parse_generic_raw_text = (t) ->
+               # 1. insert an element for the start tag token
+               insert_html_element t
+               # 2. remember where tree construction was, then hand over to "text" mode
+               original_insertion_mode = insertion_mode
+               insertion_mode = ins_mode_text
+               # 3. tokenizer continues in the RAWTEXT state
+               tok_state = tok_state_rawtext
+               return insertion_mode
+       parse_generic_rcdata_text = (t) ->
+               # 1. insert an element for the start tag token
+               insert_html_element t
+               # 2. remember where tree construction was, then hand over to "text" mode
+               original_insertion_mode = insertion_mode
+               insertion_mode = ins_mode_text
+               # 3. tokenizer continues in the RCDATA state
+               tok_state = tok_state_rcdata
+               return insertion_mode
+
         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
         generate_implied_end_tags = (except = null) ->
-               while end_tag_implied[open_els[0]] and open_els[0].name isnt except
+               while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
                         open_els.shift()
  
-       # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+       # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
+       # "Anything else" rule of the "in head" mode, kept separate so the
+       # caller's control flow can mirror the spec.
+       ins_mode_in_head_else = (t) ->
+               # pop the current node (per the spec this is the 'head' element)
+               open_els.shift()
+               # switch to "after head" and reprocess the same token there
+               insertion_mode = ins_mode_after_head
+               return insertion_mode t
+       # "in head" insertion mode: handle token t. Each rule below handles its
+       # token and returns; tokens matching no rule fall through to
+       # ins_mode_in_head_else (pop 'head', reprocess in "after head").
+       ins_mode_in_head = (t) ->
+               # whitespace characters (tab, LF, FF, CR, space) are inserted as-is
+               if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is "\r" or t.text is ' ')
+                       insert_character t
+                       return
+               if t.type is TYPE_COMMENT
+                       tree_insert_comment t
+                       return
+               if t.type is TYPE_DOCTYPE
+                       parse_error()
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'html'
+                       ins_mode_in_body t
+                       return
+               # void metadata elements: insert, then immediately pop
+               if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
+                       el = insert_html_element t
+                       open_els.shift()
+                       el.acknowledge_self_closing()
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'meta'
+                       el = insert_html_element t
+                       open_els.shift()
+                       el.acknowledge_self_closing()
+                       # fixfull encoding stuff
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'title'
+                       # generic RCDATA element parsing algorithm
+                       parse_generic_rcdata_text t
+                       return
+               if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
+                       parse_generic_raw_text t
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
+                       insert_html_element t
+                       insertion_mode = in_head_noscript # FIXME implement
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'script'
+                       ail = adjusted_insertion_location()
+                       el = token_to_element t, NS_HTML, ail
+                       el.flag_parser_inserted true # FIXME implement
+                       # fixfull fragment case
+                       ail[0].children.splice ail[1], 0, el
+                       open_els.unshift el
+                       tok_state = tok_state_script_data
+                       original_insertion_mode = insertion_mode # make sure orig... is defined
+                       insertion_mode = ins_mode_text # FIXME implement
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'head'
+                       open_els.shift() # will be a head element... spec says so
+                       insertion_mode = ins_mode_after_head
+                       return
+               if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
+                       ins_mode_in_head_else t
+                       return
+               if t.type is TYPE_START_TAG and t.name is 'template'
+                       insert_html_element t
+                       afe_push_marker()
+                       flag_frameset_ok = false
+                       insertion_mode = ins_mode_in_template
+                       template_insertion_modes.unshift ins_mode_in_template # FIXME implement
+                       return
+               if t.type is TYPE_END_TAG and t.name is 'template'
+                       if template_tag_is_open()
+                               # must be an actual call; a bare name is only a reference
+                               generate_implied_end_tags()
+                               if open_els[0].name isnt 'template'
+                                       parse_error()
+                               # pop up to and including the template element
+                               loop
+                                       el = open_els.shift()
+                                       if el.name is 'template'
+                                               break
+                               clear_afe_to_marker()
+                               template_insertion_modes.shift()
+                               reset_insertion_mode()
+                       else
+                               parse_error()
+                       return
+               # a second <head> start tag, or any end tag not handled above
+               if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
+                       parse_error()
+                       return
+               ins_mode_in_head_else t
+
+       # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
                 for node, i in open_els
                         if node.name is name # FIXME check namespace too
@@ -1043,13 +1181,13 @@ parse_html = (txt, parse_error_cb = null) ->
                                                 parse_error()
                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
                                                 reconstruct_active_formatting_elements()
-                                               tree_insert_text t
+                                               insert_character t
                                         else
                                                 reconstruct_active_formatting_elements()
-                                               tree_insert_text t
+                                               insert_character t
                                                 flag_frameset_ok = false
                         when TYPE_COMMENT
-                               tree_insert_a_comment t
+                               tree_insert_comment t
                         when TYPE_DOCTYPE
                                 parse_error()
                         when TYPE_START_TAG
@@ -1062,7 +1200,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                                         root_attrs[k] = v unless root_attrs[k]?
                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
                                                 # FIXME also do this for </template> (end tag)
-                                               return tree_in_head t
+                                               return ins_mode_in_head t
                                         when 'body'
                                                 parse_error()
                                                 # TODO
@@ -1106,12 +1244,12 @@ parse_html = (txt, parse_error_cb = null) ->
                                                                 if el is found
                                                                         open_els.splice i, 1
                                                 reconstruct_active_formatting_elements()
-                                               el = tree_insert_element t
-                                               afe.unshift el
+                                               el = insert_html_element t
+                                               afe_push el
                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
                                                 reconstruct_active_formatting_elements()
-                                               el = tree_insert_element t
-                                               afe.unshift el
+                                               el = insert_html_element t
+                                               afe_push el
                                         when 'table'
                                                 # fixfull quirksmode thing
                                                 close_p_if_in_button_scope()
@@ -1120,7 +1258,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                         # TODO lots more to implement here
                                         else # any other start tag
                                                 reconstruct_active_formatting_elements()
-                                               tree_insert_element t
+                                               insert_html_element t
                         when TYPE_EOF
                                 ok_tags = {
                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
@@ -1160,7 +1298,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                                 unless is_in_button_scope 'p'
                                                         parse_error()
                                                         insert_html_element new_open_tag 'p'
-                                                       close_p_element()
+                                               close_p_element()
                                         # TODO lots more close tags to implement here
                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
                                                 adoption_agency t.name
@@ -1221,6 +1359,36 @@ parse_html = (txt, parse_error_cb = null) ->
                         el = afe.shift()
                         if el.type is TYPE_AFE_MARKER
                                 return
+
+       # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
+       # "text" insertion mode: used while the tokenizer is inside a raw text,
+       # RCDATA or script element.
+       ins_mode_text = (t) ->
+               switch t.type
+                       when TYPE_TEXT
+                               insert_character t
+                       when TYPE_EOF
+                               # input ended inside the element
+                               parse_error()
+                               if open_els[0].name is 'script'
+                                       open_els[0].flag 'already started', true
+                               open_els.shift()
+                               # go back to the saved mode and reprocess the EOF there
+                               insertion_mode = original_insertion_mode
+                               insertion_mode t
+                       when TYPE_END_TAG
+                               # </script> and every other end tag are handled alike for now:
+                               # pop the current node and restore the saved insertion mode.
+                               # fixfull the spec seems to assume that I'm going to run the script
+                               # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
+                               open_els.shift()
+                               insertion_mode = original_insertion_mode
+                       else
+                               console.log 'warning: end of ins_mode_text reached'
+               return
+
+       # NOTE: the tokenizer states described here are implemented further
+       # below, after the remaining insertion modes:
+       # http://www.w3.org/TR/html5/syntax.html#tokenization
+
+       # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
         ins_mode_in_table = (t) ->
                 switch t.type
                         when TYPE_TEXT
@@ -1231,14 +1399,14 @@ parse_html = (txt, parse_error_cb = null) ->
                                 else
                                         ins_mode_in_table_else t
                         when TYPE_COMMENT
-                               tree_insert_a_comment t
+                               tree_insert_comment t
                         when TYPE_DOCTYPE
                                 parse_error()
                         when TYPE_START_TAG
                                 switch t.name
                                         when 'caption'
                                                 clear_stack_to_table_context()
-                                               afe.unshift new_afe_marker()
+                                               afe_push_marker()
                                                 insert_html_element t
                                                 insertion_mode = ins_mode_in_caption
                                         when 'colgroup'
@@ -1275,9 +1443,9 @@ parse_html = (txt, parse_error_cb = null) ->
                                                         ins_mode_in_table_else t
                                                 else
                                                         parse_error()
-                                                       insert_html_element t
+                                                       el = insert_html_element t
                                                         open_els.shift()
-                                                       # fixfull acknowledge sef-closing flag
+                                                       el.acknowledge_self_closing()
                                         when 'form'
                                                 parse_error()
                                                 if form_element_pointer?
@@ -1311,6 +1479,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                 ins_mode_in_table_else t
  
  
+       # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
         ins_mode_in_table_text = (t) ->
                 switch t.type
                         when TYPE_TEXT
@@ -1321,6 +1490,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 console.log "unimplemented ins_mode_in_table_text"
                 # FIXME CONTINUE
  
+       # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
         ins_mode_in_table_body = (t) ->
                 if t.type is TYPE_START_TAG and t.name is 'tr'
                         clear_stack_to_table_body_context()
@@ -1363,12 +1533,13 @@ parse_html = (txt, parse_error_cb = null) ->
                 # Anything else
                 ins_mode_in_table t
  
+       # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
         ins_mode_in_row = (t) ->
                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
                         clear_stack_to_table_row_context()
                         insert_html_element t
                         insertion_mode = ins_mode_in_cell
-                       afe.unshift new_afe_marker()
+                       afe_push_marker()
                         return
                 if t.type is TYPE_END_TAG and t.name is 'tr'
                         if is_in_table_scope 'tr'
@@ -1415,7 +1586,7 @@ parse_html = (txt, parse_error_cb = null) ->
                 clear_afe_to_marker()
                 insertion_mode = ins_mode_in_row
  
-       # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
+       # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
         ins_mode_in_cell = (t) ->
                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
                         if is_in_table_scope t.name
@@ -1458,15 +1629,11 @@ parse_html = (txt, parse_error_cb = null) ->
                 # Anything Else
                 ins_mode_in_body t
  
-
-       # the functions below implement the tokenizer stats described here:
-       # http://www.w3.org/TR/html5/syntax.html#tokenization
-
         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
         tok_state_data = ->
                 switch c = txt.charAt(cur++)
                         when '&'
-                               return new_text_node tokenize_character_reference()
+                               return new_text_node parse_character_reference()
                         when '<'
                                 tok_state = tok_state_tag_open
                         when "\u0000"
@@ -1480,7 +1647,68 @@ parse_html = (txt, parse_error_cb = null) ->
  
         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
         # not needed: tok_state_character_reference_in_data = ->
-       # just call tok_state_character_reference_in_data()
+       # just call parse_character_reference()
+
+       # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
+       tok_state_rcdata = ->
+               # consume one character; returns a token, or null when only the
+               # tokenizer state changed
+               c = txt.charAt(cur++)
+               if c is '&'
+                       return new_text_node parse_character_reference()
+               if c is '<'
+                       tok_state = tok_state_rcdata_less_than_sign
+                       return null
+               if c is "\u0000"
+                       parse_error()
+                       return new_character_token "\ufffd"
+               if c is '' # EOF
+                       return new_eof_token()
+               return new_character_token c
+
+       # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
+       # not needed: tok_state_character_reference_in_rcdata = ->
+       # just call parse_character_reference()
+
+       # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
+       tok_state_rawtext = ->
+               # consume one character; returns a token, or null when only the
+               # tokenizer state changed
+               c = txt.charAt(cur++)
+               if c is '<'
+                       tok_state = tok_state_rawtext_less_than_sign
+                       return null
+               if c is "\u0000"
+                       parse_error()
+                       return new_character_token "\ufffd"
+               if c is '' # EOF
+                       return new_eof_token()
+               return new_character_token c
+
+       # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
+       tok_state_script_data = ->
+               # consume one character; returns a token, or null when only the
+               # tokenizer state changed
+               c = txt.charAt(cur++)
+               if c is '<'
+                       tok_state = tok_state_script_data_less_than_sign
+                       return null
+               if c is "\u0000"
+                       parse_error()
+                       return new_character_token "\ufffd"
+               if c is '' # EOF
+                       return new_eof_token()
+               return new_character_token c
+
+       # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
+       tok_state_plaintext = ->
+               # consume one character; every input yields a token in this state
+               c = txt.charAt(cur++)
+               if c is "\u0000"
+                       parse_error()
+                       return new_character_token "\ufffd"
+               if c is '' # EOF
+                       return new_eof_token()
+               return new_character_token c
+
  
         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
         tok_state_tag_open = ->
@@ -1553,6 +1781,140 @@ parse_html = (txt, parse_error_cb = null) ->
                                         tok_cur_tag.name += c
                 return null
  
+       # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
+       # Called after '<' was seen in RCDATA. A following '/' may start an end
+       # tag (returns null); anything else means the '<' was plain text, so it is
+       # emitted as a character token and the current character is reconsumed.
+       tok_state_rcdata_less_than_sign = ->
+               c = txt.charAt(cur++)
+               unless c is '/'
+                       # Anything else
+                       tok_state = tok_state_rcdata
+                       cur -= 1 # reconsume the input character
+                       return new_character_token '<'
+               # start collecting the candidate end tag name from scratch
+               temporary_buffer = ''
+               tok_state = tok_state_rcdata_end_tag_open
+               return null
+
+       # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
+       # Called after "</" was seen in RCDATA. An ASCII letter starts a candidate
+       # end tag (returns null); anything else, including EOF, means the "</" was
+       # plain text, so it is emitted and the current character is reconsumed.
+       tok_state_rcdata_end_tag_open = ->
+               c = txt.charAt(cur++)
+               # charAt() returns '' at EOF, and looking up '' in a string alphabet
+               # with indexOf() reports a match (index 0), so EOF has to be excluded
+               # explicitly or it would be treated as the first letter of a tag name.
+               if c isnt '' and (uc_alpha.indexOf(c) > -1 or lc_alpha.indexOf(c) > -1)
+                       # tag names are stored lower-cased; the buffer keeps the original
+                       # spelling so the text can be re-emitted if this is not an end tag
+                       tok_cur_tag = new_end_tag c.toLowerCase()
+                       temporary_buffer += c
+                       tok_state = tok_state_rcdata_end_tag_name
+                       return null
+               # Anything else
+               tok_state = tok_state_rcdata
+               cur -= 1 # reconsume the input character
+               return new_character_token "</" # fixfull separate these
+
+       # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
+       # Returns true when t is an end tag token whose name equals the name of
+       # the element at open_els[0].
+       is_appropriate_end_tag = (t) ->
+               # The spec compares against "the tag name of the last start tag to
+               # have been emitted from this tokenizer". This is only called from the
+               # various "raw" states, which (it is believed) all push the start
+               # token onto open_els. TODO: verify this after the script data states
+               # are implemented
+               debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
+               return false unless t.type is TYPE_END_TAG
+               return t.name is open_els[0].name
+
+       # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
+       # Accumulates the name of a candidate end tag inside RCDATA. Returns the
+       # end tag token when '>' completes an appropriate end tag, a character
+       # token holding "</" plus the buffered text when the candidate turns out
+       # not to be an end tag, and null when only the state or buffers changed.
+       tok_state_rcdata_end_tag_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_before_attribute_name
+                               return null
+                       # else fall through to "Anything else"
+               if c is '/'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
+                               return null
+                       # else fall through to "Anything else"
+               if c is '>'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       # else fall through to "Anything else"
+               # charAt() returns '' at EOF, and looking up '' in a string alphabet
+               # with indexOf() reports a match (index 0). Without the guard EOF
+               # would be taken for a letter and this state would never be left.
+               if c isnt '' and (uc_alpha.indexOf(c) > -1 or lc_alpha.indexOf(c) > -1)
+                       # name is lower-cased; the buffer keeps the original spelling
+                       tok_cur_tag.name += c.toLowerCase()
+                       temporary_buffer += c
+                       return null
+               # Anything else
+               tok_state = tok_state_rcdata
+               cur -= 1 # reconsume the input character
+               return new_character_token '</' + temporary_buffer # fixfull separate these
+
+       # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
+       # Called after '<' was seen in RAWTEXT. A following '/' may start an end
+       # tag (returns null); anything else means the '<' was plain text, so it is
+       # emitted as a character token and the current character is reconsumed.
+       tok_state_rawtext_less_than_sign = ->
+               c = txt.charAt(cur++)
+               unless c is '/'
+                       # Anything else
+                       tok_state = tok_state_rawtext
+                       cur -= 1 # reconsume the input character
+                       return new_character_token '<'
+               # start collecting the candidate end tag name from scratch
+               temporary_buffer = ''
+               tok_state = tok_state_rawtext_end_tag_open
+               return null
+
+       # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
+       # Called after "</" was seen in RAWTEXT. An ASCII letter starts a candidate
+       # end tag (returns null); anything else, including EOF, means the "</" was
+       # plain text, so it is emitted and the current character is reconsumed.
+       tok_state_rawtext_end_tag_open = ->
+               c = txt.charAt(cur++)
+               # charAt() returns '' at EOF, and looking up '' in a string alphabet
+               # with indexOf() reports a match (index 0), so EOF has to be excluded
+               # explicitly or it would be treated as the first letter of a tag name.
+               if c isnt '' and (uc_alpha.indexOf(c) > -1 or lc_alpha.indexOf(c) > -1)
+                       # tag names are stored lower-cased; the buffer keeps the original
+                       # spelling so the text can be re-emitted if this is not an end tag
+                       tok_cur_tag = new_end_tag c.toLowerCase()
+                       temporary_buffer += c
+                       tok_state = tok_state_rawtext_end_tag_name
+                       return null
+               # Anything else
+               tok_state = tok_state_rawtext
+               cur -= 1 # reconsume the input character
+               return new_character_token "</" # fixfull separate these
+
+       # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
+       # Accumulates the name of a candidate end tag inside RAWTEXT. Returns the
+       # end tag token when '>' completes an appropriate end tag, a character
+       # token holding "</" plus the buffered text when the candidate turns out
+       # not to be an end tag, and null when only the state or buffers changed.
+       tok_state_rawtext_end_tag_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_before_attribute_name
+                               return null
+                       # else fall through to "Anything else"
+               if c is '/'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_self_closing_start_tag
+                               return null
+                       # else fall through to "Anything else"
+               if c is '>'
+                       if is_appropriate_end_tag tok_cur_tag
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       # else fall through to "Anything else"
+               # charAt() returns '' at EOF, and looking up '' in a string alphabet
+               # with indexOf() reports a match (index 0). Without the guard EOF
+               # would be taken for a letter and this state would never be left.
+               if c isnt '' and (uc_alpha.indexOf(c) > -1 or lc_alpha.indexOf(c) > -1)
+                       # name is lower-cased; the buffer keeps the original spelling
+                       tok_cur_tag.name += c.toLowerCase()
+                       temporary_buffer += c
+                       return null
+               # Anything else
+               tok_state = tok_state_rawtext
+               cur -= 1 # reconsume the input character
+               return new_character_token '</' + temporary_buffer # fixfull separate these
+
+       # TODO _all_ of the missing states here (17-33) are for parsing script tags
+
         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
         tok_state_before_attribute_name = ->
                 attr_name = null
@@ -1616,6 +1978,41 @@ parse_html = (txt, parse_error_cb = null) ->
                                         tok_cur_tag.attrs_a[0][0] += c
                 return null
  
+       # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
+       # Consumes one character after an attribute name (and optional whitespace).
+       # Returns the current tag token when '>' closes the tag; returns null in
+       # every other case, where only the tokenizer state or the attribute list
+       # of the current tag changed.
+       tok_state_after_attribute_name = ->
+               c = txt.charAt(cur++)
+               if c is '' # EOF
+                       # Checked before the alphabet lookup below: charAt() returns ''
+                       # at EOF and indexOf('') on a string alphabet reports a match.
+                       parse_error()
+                       tok_state = tok_state_data
+                       cur -= 1 # reconsume
+                       return null
+               if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
+                       return null
+               if c is '/'
+                       tok_state = tok_state_self_closing_start_tag
+                       return null
+               if c is '='
+                       tok_state = tok_state_before_attribute_value
+                       return null
+               if c is '>'
+                       # the spec says to switch to the data state AND emit the tag
+                       tok_state = tok_state_data
+                       return tok_cur_tag
+               if uc_alpha.indexOf(c) > -1
+                       # start a new attribute; names are stored lower-cased
+                       tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
+                       tok_state = tok_state_attribute_name
+                       return null
+               if c is "\u0000"
+                       parse_error()
+                       tok_cur_tag.attrs_a.unshift ["\ufffd", '']
+                       tok_state = tok_state_attribute_name
+                       return null
+               if c is '"' or c is "'" or c is '<'
+                       parse_error()
+                       # fall through to Anything else
+               # Anything else
+               tok_cur_tag.attrs_a.unshift [c, '']
+               tok_state = tok_state_attribute_name
+               # explicit: otherwise the assignment above would be the implicit
+               # return value and the state function would be handed to the caller
+               return null
+
         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
         tok_state_before_attribute_value = ->
                 switch c = txt.charAt(cur++)
@@ -1652,7 +2049,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         when '"'
                                 tok_state = tok_state_after_attribute_value_quoted
                         when '&'
-                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
+                               tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
                         when "\u0000"
                                 # Parse error
                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
@@ -1669,7 +2066,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         when "'"
                                 tok_state = tok_state_after_attribute_value_quoted
                         when '&'
-                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
+                               tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
                         when "\u0000"
                                 # Parse error
                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
@@ -1686,7 +2083,7 @@ parse_html = (txt, parse_error_cb = null) ->
                         when "\t", "\n", "\u000c", ' '
                                 tok_state = tok_state_before_attribute_name
                         when '&'
-                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
+                               tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
                         when '>'
                                 tok_state = tok_state_data
                                 tmp = tok_cur_tag
@@ -1726,7 +2123,7 @@ parse_html = (txt, parse_error_cb = null) ->
         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
         # Don't set this as a state, just call it
         # returns a string (NOT a text node)
-       tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+       parse_character_reference = (allowed_char = null, in_attr = false) ->
                 if cur >= txt.length
                         return '&'
                 switch c = txt.charAt(cur)
@@ -1803,12 +2200,16 @@ parse_html = (txt, parse_error_cb = null) ->
         # see comments on TYPE_TAG/etc for the structure of this data
         tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
         open_els = [tree]
+       afe = [] # active formatting elements
+       template_insertion_modes = []
         insertion_mode = ins_mode_in_body
+       original_insertion_mode = insertion_mode # TODO check spec
+       flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
         flag_frameset_ok = true
         flag_parsing = true
         flag_foster_parenting = false
         form_element_pointer = null
-       afe = [] # active formatting elements
+       temporary_buffer = null
  
         # tokenizer initialization
         tok_state = tok_state_data
@@ -1844,119 +2245,138 @@ test_parser = (args) ->
         prev_node_id = 0 # reset counter
         parsed = parse_html args.html, errors_cb
         serialized = serialize_els parsed, false, false
-       if serialized isnt args.expected # or parse_errors.length isnt args.errors
+       if serialized isnt args.expected
                 debug_log_each (str) ->
                         console.log str
                 console.log "FAILED: \"#{args.name}\""
-       else
-               console.log "passed \"#{args.name}\""
-       if serialized isnt args.expected
                 console.log "      Input: #{args.html}"
                 console.log "    Correct: #{args.expected}"
                 console.log "     Output: #{serialized}"
-               if parse_errors.length isnt args.errors
-                       console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+               if parse_errors.length > 0
+                       console.log " parse errs: #{JSON.stringify parse_errors}"
+               else
+                       console.log "   No parse errors"
+       else
+               console.log "passed \"#{args.name}\""
  
  test_parser name: "empty", \
         html: "",
-       expected: '',
-       errors: 0
+       expected: ''
  test_parser name: "just text", \
         html: "abc",
-       expected: 'text:"abc"',
-       errors: 0
+       expected: 'text:"abc"'
  test_parser name: "named entity", \
         html: "a&amp;1234",
-       expected: 'text:"a&1234"',
-       errors: 0
+       expected: 'text:"a&1234"'
  test_parser name: "broken named character references", \
         html: "1&amp2&&amp;3&aabbcc;",
-       expected: 'text:"1&2&&3&aabbcc;"',
-       errors: 2
+       expected: 'text:"1&2&&3&aabbcc;"'
  test_parser name: "numbered entity overrides", \
         html: "1&#X80&#x80; &#x83",
-       expected: 'text:"1€€ ƒ"',
-       errors: 0
+       expected: 'text:"1€€ ƒ"'
  test_parser name: "open tag", \
         html: "foo<span>bar",
-       expected: 'text:"foo",tag:"span",{},[text:"bar"]',
-       errors: 1 # no close tag
+       expected: 'text:"foo",tag:"span",{},[text:"bar"]'
  test_parser name: "open tag with attributes", \
         html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
-       expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
-       errors: 1 # no close tag
+       expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
  test_parser name: "open tag with attributes of various quotings", \
         html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
-       expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
-       errors: 1 # no close tag
+       expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
  test_parser name: "attribute entity exceptions dq", \
         html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
-       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
-       errors: 2 # no close tag, &amp= in attr
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
  test_parser name: "attribute entity exceptions sq", \
         html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
-       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
-       errors: 2 # no close tag, &amp= in attr
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
  test_parser name: "attribute entity exceptions uq", \
         html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
-       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
-       errors: 2 # no close tag, &amp= in attr
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
  test_parser name: "matching closing tags", \
         html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
-       expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
-       errors: 0
+       expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
  test_parser name: "missing closing tag inside", \
         html: "foo<div>bar<span>baz</div>qux",
-       expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
-       errors: 1 # close tag mismatch
+       expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
  test_parser name: "mis-matched closing tags", \
         html: "<span>12<div>34</span>56</div>78",
-       expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
-       errors: 2 # misplaced </span>, no </span> at the end
+       expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
  test_parser name: "mis-matched formatting elements", \
         html: "12<b>34<i>56</b>78</i>90",
-       expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
-       errors: 1 # no idea how many their should be
+       expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
  test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
         html: '<p>1<b>2<i>3</b>4</i>5</p>',
-       expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]',
-       errors: 1
+       expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
  test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
         html: '<b>1<p>2</b>3</p>',
-       expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]',
-       errors: 1
+       expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
  test_parser name: "crazy formatting elements test", \
         html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
         # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
         # firefox does this:
-       expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
-       errors: 6 # no idea how many there should be
+       expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
  # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
  test_parser name: "html5lib aaa 1", \
         html: '<a><p></a></p>',
-       expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
-       errors: 2
+       expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
  test_parser name: "html5lib aaa 2", \
         html: '<a>1<p>2</a>3</p>',
-       expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
-       errors: 2
+       expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
  test_parser name: "html5lib aaa 3", \
         html: '<a>1<button>2</a>3</button>',
-       expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
-       errors: 2
+       expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
  test_parser name: "html5lib aaa 4", \
         html: '<a>1<b>2</a>3</b>',
-       expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
-       errors: 2
+       expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
  test_parser name: "html5lib aaa 5 (two divs deep)", \
         html: '<a>1<div>2<div>3</a>4</div>5</div>',
-       expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]',
-       errors: 3
+       expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
  test_parser name: "html5lib aaa 6 (foster parenting)", \
         html: '<table><a>1<p>2</a>3</p>',
-       expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]',
-       errors: 10
+       expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
+test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
+       html: '<b><b><a><p></a>',
+       expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
+test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
+       html: '<b><a><b><p></a>',
+       expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
+test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
+       html: '<a><b><b><p></a>',
+       expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
+test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
+       html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
+       expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
  test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
         html: '<table><a>1<td>2</td>3</table>',
-       expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]',
-       errors: 10
+       expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
+test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
+       html: '<table>A<td>B</td>C</table>',
+       expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
+# TODO implement svg and namespacing
+#test_parser name: "html5lib aaa 13 (svg tr input)", \
+#      html: '<a><svg><tr><input></a>',
+#      expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
+test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
+       html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
+       expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
+test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
+       html: '<div><a><b><u><i><code><div></a>',
+       expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
+test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
+       html: '<b><b><b><b>x</b></b></b></b>y',
+       expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
+test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
+       html: '<p><b><b><b><b><p>x',
+       expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
+test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
+       html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
+       expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
+test_parser name: "junk after attribute close-quote", \
+       html: '<p><b c="d", e="f">foo<p>x',
+       expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
+test_parser name: "html5lib aaa02 1", \
+       html: '<b>1<i>2<p>3</b>4',
+       expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
+test_parser name: "html5lib aaa02 2", \
+       html: '<a><div><style></style><address><a>',
+       expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'