JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix any-other-end-tag, tweaks
[peach-html5-editor.git] / parse-html.coffee
index 5dc05a9..adb9bab 100644 (file)
@@ -24,7 +24,7 @@
 #
 # Deviations from that spec:
 #
-#   Purposeful: search this file for "WTAG"
+#   Purposeful: search this file for "WHATWG"
 #
 #   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
 
@@ -84,6 +84,11 @@ NS_HTML = 1
 NS_MATHML = 2
 NS_SVG = 3
 
+# quirks mode constants
+QUIRKS_NO = 1
+QUIRKS_LIMITED = 2
+QUIRKS_YES = 3
+
 g_debug_log = []
 debug_log_reset = ->
        g_debug_log = []
@@ -112,7 +117,7 @@ class Node
                        @id = "#{++prev_node_id}"
        acknowledge_self_closing: ->
                if @token?
-                       @token.flag 'did_self_close'
+                       @token.flag 'did_self_close', true
                else
                        @flag 'did_self_close', true
        flag: (key, value = null) ->
@@ -249,6 +254,64 @@ unicode_fixes[0x9C] = "\u0153"
 unicode_fixes[0x9E] = "\u017E"
 unicode_fixes[0x9F] = "\u0178"
 
+quirks_yes_pi_prefixes = [
+       "+//silmaril//dtd html pro v0r11 19970101//"
+       "-//as//dtd html 3.0 aswedit + extensions//"
+       "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
+       "-//ietf//dtd html 2.0 level 1//"
+       "-//ietf//dtd html 2.0 level 2//"
+       "-//ietf//dtd html 2.0 strict level 1//"
+       "-//ietf//dtd html 2.0 strict level 2//"
+       "-//ietf//dtd html 2.0 strict//"
+       "-//ietf//dtd html 2.0//"
+       "-//ietf//dtd html 2.1e//"
+       "-//ietf//dtd html 3.0//"
+       "-//ietf//dtd html 3.2 final//"
+       "-//ietf//dtd html 3.2//"
+       "-//ietf//dtd html 3//"
+       "-//ietf//dtd html level 0//"
+       "-//ietf//dtd html level 1//"
+       "-//ietf//dtd html level 2//"
+       "-//ietf//dtd html level 3//"
+       "-//ietf//dtd html strict level 0//"
+       "-//ietf//dtd html strict level 1//"
+       "-//ietf//dtd html strict level 2//"
+       "-//ietf//dtd html strict level 3//"
+       "-//ietf//dtd html strict//"
+       "-//ietf//dtd html//"
+       "-//metrius//dtd metrius presentational//"
+       "-//microsoft//dtd internet explorer 2.0 html strict//"
+       "-//microsoft//dtd internet explorer 2.0 html//"
+       "-//microsoft//dtd internet explorer 2.0 tables//"
+       "-//microsoft//dtd internet explorer 3.0 html strict//"
+       "-//microsoft//dtd internet explorer 3.0 html//"
+       "-//microsoft//dtd internet explorer 3.0 tables//"
+       "-//netscape comm. corp.//dtd html//"
+       "-//netscape comm. corp.//dtd strict html//"
+       "-//o'reilly and associates//dtd html 2.0//"
+       "-//o'reilly and associates//dtd html extended 1.0//"
+       "-//o'reilly and associates//dtd html extended relaxed 1.0//"
+       "-//sq//dtd html 2.0 hotmetal + extensions//"
+       "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
+       "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
+       "-//spyglass//dtd html 2.0 extended//"
+       "-//sun microsystems corp.//dtd hotjava html//"
+       "-//sun microsystems corp.//dtd hotjava strict html//"
+       "-//w3c//dtd html 3 1995-03-24//"
+       "-//w3c//dtd html 3.2 draft//"
+       "-//w3c//dtd html 3.2 final//"
+       "-//w3c//dtd html 3.2//"
+       "-//w3c//dtd html 3.2s draft//"
+       "-//w3c//dtd html 4.0 frameset//"
+       "-//w3c//dtd html 4.0 transitional//"
+       "-//w3c//dtd html experimental 19960712//"
+       "-//w3c//dtd html experimental 970421//"
+       "-//w3c//dtd w3 html//"
+       "-//w3o//dtd w3 html 3.0//"
+       "-//webtechs//dtd mozilla html 2.0//"
+       "-//webtechs//dtd mozilla html//"
+]
+
 # These are the character references that don't need a terminating semicolon
 # min length: 2, max: 6, none are a prefix of any other.
 legacy_char_refs = {
@@ -342,7 +405,7 @@ special_elements = {
        img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
        listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
 
-       menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
+       menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
 
        meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
        noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
@@ -468,7 +531,7 @@ svg_attribute_fixes = {
        diffuseconstant: 'diffuseConstant'
        edgemode: 'edgeMode'
        externalresourcesrequired: 'externalResourcesRequired'
-       filterres: 'filterRes'
+       # WHATWG removes this: filterres: 'filterRes'
        filterunits: 'filterUnits'
        glyphref: 'glyphRef'
        gradienttransform: 'gradientTransform'
@@ -520,6 +583,20 @@ svg_attribute_fixes = {
        ychannelselector: 'yChannelSelector'
        zoomandpan: 'zoomAndPan'
 }
+foreign_attr_fixes = {
+       'xlink:actuate': 'xlink actuate'
+       'xlink:arcrole': 'xlink arcrole'
+       'xlink:href': 'xlink href'
+       'xlink:role': 'xlink role'
+       'xlink:show': 'xlink show'
+       'xlink:title': 'xlink title'
+       'xlink:type': 'xlink type'
+       'xml:base': 'xml base'
+       'xml:lang': 'xml lang'
+       'xml:space': 'xml space'
+       'xmlns': 'xmlns'
+       'xmlns:xlink': 'xmlns xlink'
+}
 adjust_mathml_attributes = (t) ->
        for a in t.attrs_a
                if a[0] is 'definitionurl'
@@ -532,6 +609,9 @@ adjust_svg_attributes = (t) ->
        return
 adjust_foreign_attributes = (t) ->
        # fixfull
+       for a in t.attrs_a
+               if foreign_attr_fixes[a[0]]?
+                       a[0] = foreign_attr_fixes[a[0]]
        return
 
 # decode_named_char_ref()
@@ -588,18 +668,29 @@ parse_html = (args) ->
                else
                        console.log "Parse error at character #{cur} of #{txt.length}"
 
+       # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
+       # "Noah's Ark clause", except up to three matching entries are kept (not two)
        afe_push = (new_el) ->
                matches = 0
                for el, i in afe
+                       if el.type is TYPE_AFE_MARKER
+                               break
                        if el.name is new_el.name and el.namespace is new_el.namespace
+                               attrs_match = true
                                for k, v of el.attrs
-                                       continue unless new_el.attrs[k] is v
-                               for k, v of new_el.attrs
-                                       continue unless el.attrs[k] is v
-                               matches += 1
-                               if matches is 3
-                                       afe.splice i, 1
-                                       break
+                                       unless new_el.attrs[k] is v
+                                               attrs_match = false
+                                               break
+                               if attrs_match
+                                       for k, v of new_el.attrs
+                                               unless el.attrs[k] is v
+                                                       attrs_match = false
+                                                       break
+                               if attrs_match
+                                       matches += 1
+                                       if matches is 3
+                                               afe.splice i, 1
+                                               break
                afe.unshift new_el
        afe_push_marker = ->
                afe.unshift new_afe_marker()
@@ -609,33 +700,33 @@ parse_html = (args) ->
 
        # But first... the helpers
        template_tag_is_open = ->
-               for t in open_els
-                       if t.name is 'template' and t.namespace is NS_HTML
+               for el in open_els
+                       if el.name is 'template' and el.namespace is NS_HTML
                                return true
                return false
        is_in_scope_x = (tag_name, scope, namespace) ->
-               for t in open_els
-                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
+               for el in open_els
+                       if el.name is tag_name and (namespace is null or namespace is el.namespace)
                                return true
-                       if scope[t.name] is t.namespace
+                       if scope[el.name] is el.namespace
                                return false
                return false
        is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
-               for t in open_els
-                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
+               for el in open_els
+                       if el.name is tag_name and (namespace is null or namespace is el.namespace)
                                return true
-                       if scope[t.name] is t.namespace
+                       if scope[el.name] is el.namespace
                                return false
-                       if scope2[t.name] is t.namespace
+                       if scope2[el.name] is el.namespace
                                return false
                return false
        standard_scopers = {
                applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
                td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
-               template: NS_HTML, mi: NS_MATHML,
+               template: NS_HTML,
 
-               mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
-               'annotation-xml': NS_MATHML,
+               mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
+               mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
 
                foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
        }
@@ -879,16 +970,46 @@ parse_html = (args) ->
                debug_log "tree: #{serialize_els doc.children, false, true}"
                debug_log "open_els: #{serialize_els open_els, true, true}"
                debug_log "afe: #{serialize_els afe, true, true}"
+# this block implements the W3C spec
+#              # 1. If the current node is an HTML element whose tag name is subject,
+#              # then run these substeps:
+#              #
+#              # 1. Let element be the current node.
+#              #
+#              # 2. Pop element off the stack of open elements.
+#              #
+#              # 3. If element is also in the list of active formatting elements,
+#              # remove the element from the list.
+#              #
+#              # 4. Abort the adoption agency algorithm.
+#              if open_els[0].name is subject and open_els[0].namespace is NS_HTML
+#                      el = open_els.shift()
+#                      # remove it from the list of active formatting elements (if found)
+#                      for t, i in afe
+#                              if t is el
+#                                      afe.splice i, 1
+#                                      break
+#                      debug_log "aaa: starting off with subject on top of stack, exiting"
+#                      return
+# WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
+               # If the current node is an HTML element whose tag name is subject, and
+               # the current node is not in the list of active formatting elements,
+               # then pop the current node off the stack of open elements, and abort
+               # these steps.
                if open_els[0].name is subject and open_els[0].namespace is NS_HTML
-                       el = open_els[0]
-                       open_els.shift()
+                       debug_log "aaa: starting off with subject on top of stack, exiting"
                        # remove it from the list of active formatting elements (if found)
-                       for t, i in afe
-                               if t is el
-                                       afe.splice i, 1
+                       in_afe = false
+                       for el, i in afe
+                               if el is open_els[0]
+                                       in_afe = true
                                        break
-                       debug_log "aaa: starting off with subject on top of stack, exiting"
-                       return
+                       unless in_afe
+                               debug_log "aaa: ...and not in afe, aaa done"
+                               open_els.shift()
+                               return
+                       # fall through
+# END WHATWG
                outer = 0
                loop
                        if outer >= 8
@@ -1196,7 +1317,7 @@ parse_html = (args) ->
                        ins_mode t
                        return
                if is_mathml_text_integration_point(acn)
-                       if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
+                       if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
                                ins_mode t
                                return
                        if t.type is TYPE_TEXT
@@ -1372,6 +1493,35 @@ parse_html = (args) ->
 
        # 8.2.5.4.1 The "initial" insertion mode
        # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
+       is_quirks_yes_doctype = (t) ->
+               if t.flag 'force-quirks'
+                       return true
+               if t.name isnt 'html'
+                       return true
+               if t.public_identifier?
+                       pi = t.public_identifier.toLowerCase()
+                       for p in quirks_yes_pi_prefixes
+                               if pi.substr(0, p.length) is p
+                                       return true
+                       if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
+                               return true
+               if t.system_identifier?
+                       if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
+                               return true
+               else if t.public_identifier?
+                       # already did this: pi = t.public_identifier.toLowerCase()
+                       if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
+                               return true
+               return false
+       is_quirks_limited_doctype = (t) ->
+               if t.public_identifier?
+                       pi = t.public_identifier.toLowerCase()
+                       if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
+                               return true
+                       if t.system_identifier?
+                               if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
+                                       return true
+               return false
        ins_mode_initial = (t) ->
                if is_space_tok t
                        return
@@ -1380,13 +1530,20 @@ parse_html = (args) ->
                        doc.children.push t
                        return
                if t.type is TYPE_DOCTYPE
-                       # FIXME check identifiers, set quirks, etc
-                       # fixfull
+                       # fixfull syntax error from first paragraph and following bullets
+                       # fixfull set doc.doctype
+                       # fixfull is the "not an iframe srcdoc" thing relevant?
+                       if is_quirks_yes_doctype t
+                               doc.flag 'quirks mode', QUIRKS_YES
+                       else if is_quirks_limited_doctype t
+                               doc.flag 'quirks mode', QUIRKS_LIMITED
                        doc.children.push t
                        ins_mode = ins_mode_before_html
                        return
                # Anything else
-               #fixfull (iframe, quirks)
+               # fixfull not iframe srcdoc?
+               parse_error()
+               doc.flag 'quirks mode', QUIRKS_YES
                ins_mode = ins_mode_before_html
                process_token t
                return
@@ -1415,9 +1572,9 @@ parse_html = (args) ->
                                parse_error()
                                return
                # Anything else
-               html_tok = new_open_tag 'html'
-               el = token_to_element html_tok, NS_HTML, doc
+               el = token_to_element new_open_tag('html'), NS_HTML, doc
                doc.children.push el
+               el.parent = doc
                open_els.unshift el
                # ?fixfull browsing context
                ins_mode = ins_mode_before_head
@@ -1449,8 +1606,7 @@ parse_html = (args) ->
                                parse_error()
                                return
                # Anything else
-               head_tok = new_open_tag 'head'
-               el = insert_html_element head_tok
+               el = insert_html_element new_open_tag 'head'
                head_element_pointer = el
                ins_mode = ins_mode_in_head
                process_token t
@@ -1604,7 +1760,7 @@ parse_html = (args) ->
                        parse_error()
                        open_els.unshift head_element_pointer
                        ins_mode_in_head t
-                       for el, i of open_els
+                       for el, i in open_els
                                if el is head_element_pointer
                                        open_els.splice i, 1
                                        return
@@ -1624,17 +1780,23 @@ parse_html = (args) ->
 
        # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
        in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
-               for el, i in open_els
-                       if el.name is name and el.namespace is NS_HTML
+               node = open_els[0]
+               loop
+                       if node.name is name and node.namespace is NS_HTML
                                generate_implied_end_tags name # arg is exception
-                               parse_error() unless i is 0
-                               while i >= 0
-                                       open_els.shift()
-                                       i -= 1
-                               return
-                       if special_elements[el.name] is el.namespace
+                               unless node is open_els[0]
+                                       parse_error()
+                               loop
+                                       el = open_els.shift()
+                                       if el is node
+                                               return
+                       if special_elements[node.name] is node.namespace
                                parse_error()
                                return
+                       for el, i in open_els
+                               if node is el
+                                       node = open_els[i + 1]
+                                       break
                return
        ins_mode_in_body = (t) ->
                if t.type is TYPE_TEXT and t.text is "\u0000"
@@ -1659,7 +1821,7 @@ parse_html = (args) ->
                        parse_error()
                        return if template_tag_is_open()
                        root_attrs = open_els[open_els.length - 1].attrs
-                       for a of t.attrs_a
+                       for a in t.attrs_a
                                root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
                        return
 
@@ -1674,7 +1836,7 @@ parse_html = (args) ->
                        return unless second.name is 'body'
                        return if template_tag_is_open()
                        flag_frameset_ok = false
-                       for a of t.attrs_a
+                       for a in t.attrs_a
                                second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
                        return
                if t.type is TYPE_START_TAG and t.name is 'frameset'
@@ -1961,6 +2123,10 @@ parse_html = (args) ->
                        return
                if t.type is TYPE_START_TAG and t.name is 'nobr'
                        reconstruct_afe()
+                       if is_in_scope 'nobr', NS_HTML
+                               parse_error()
+                               adoption_agency 'nobr'
+                               reconstruct_afe()
                        el = insert_html_element t
                        afe_push el
                        return
@@ -1987,14 +2153,15 @@ parse_html = (args) ->
                        clear_afe_to_marker()
                        return
                if t.type is TYPE_START_TAG and t.name is 'table'
-                       close_p_if_in_button_scope() # fixfull quirksmode thing
+                       unless doc.flag('quirks mode') is QUIRKS_YES
+                               close_p_if_in_button_scope() # test
                        insert_html_element t
                        flag_frameset_ok = false
                        ins_mode = ins_mode_in_table
                        return
                if t.type is TYPE_END_TAG and t.name is 'br'
                        parse_error()
-                       t.type is TYPE_START_TAG
+                       t.type = TYPE_START_TAG
                        # fall through
                if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
                        reconstruct_afe()
@@ -2011,7 +2178,8 @@ parse_html = (args) ->
                        unless is_input_hidden_tok t
                                flag_frameset_ok = false
                        return
-               if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
+               if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
+                       # WHATWG adds 'menuitem' for this block
                        insert_html_element t
                        open_els.shift()
                        t.acknowledge_self_closing()
@@ -2121,7 +2289,7 @@ parse_html = (args) ->
 #                                      parse_error()
 #                      insert_html_element t
 #                      return
-# below implements the WATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
+# below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
                if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
                        if is_in_scope 'ruby', NS_HTML
                                generate_implied_end_tags()
@@ -2136,7 +2304,7 @@ parse_html = (args) ->
                                        parse_error()
                        insert_html_element t
                        return
-# end WATWG chunk
+# end WHATWG chunk
                if t.type is TYPE_START_TAG and t.name is 'math'
                        reconstruct_afe()
                        adjust_mathml_attributes t
@@ -2567,7 +2735,7 @@ parse_html = (args) ->
                        insert_html_element t
                        return
                if t.type is TYPE_END_TAG and t.name is 'optgroup'
-                       if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
+                       if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
                                if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
                                        open_els.shift()
                        if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
@@ -2709,7 +2877,8 @@ parse_html = (args) ->
                        ins_mode_in_body t
                        return
                if t.type is TYPE_COMMENT
-                       insert_comment t, [open_els[0], open_els[0].children.length]
+                       first = open_els[open_els.length - 1]
+                       insert_comment t, [first, first.children.length]
                        return
                if t.type is TYPE_DOCTYPE
                        parse_error()
@@ -2718,7 +2887,9 @@ parse_html = (args) ->
                        ins_mode_in_body t
                        return
                if t.type is TYPE_END_TAG and t.name is 'html'
-                       # fixfull fragment case
+                       if flag_fragment_parsing
+                               parse_error()
+                               return
                        ins_mode = ins_mode_after_after_body
                        return
                if t.type is TYPE_EOF
@@ -2786,7 +2957,7 @@ parse_html = (args) ->
                        ins_mode_in_body t
                        return
                if t.type is TYPE_END_TAG and t.name is 'html'
-                       insert_mode = ins_mode_after_after_frameset
+                       ins_mode = ins_mode_after_after_frameset
                        return
                if t.type is TYPE_START_TAG and t.name is 'noframes'
                        ins_mode_in_head t
@@ -2812,6 +2983,7 @@ parse_html = (args) ->
                # Anything else
                parse_error()
                ins_mode = ins_mode_in_body
+               process_token t
                return
 
        # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
@@ -2856,6 +3028,7 @@ parse_html = (args) ->
                        if t.name is 'script'
                                t.acknowledge_self_closing()
                                in_foreign_content_end_script()
+                               # fixfull
                        else
                                open_els.shift()
                                t.acknowledge_self_closing()
@@ -2885,8 +3058,7 @@ parse_html = (args) ->
                                return
                        loop # is this safe?
                                open_els.shift()
-                               cn = open_els[0]
-                               if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
+                               if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
                                        break
                        process_token t
                        return
@@ -2897,9 +3069,11 @@ parse_html = (args) ->
                        in_foreign_content_end_script()
                        return
                if t.type is TYPE_END_TAG
-                       if open_els[0].name.toLowerCase() isnt t.name
+                       i = 0
+                       node = open_els[i]
+                       if node.name.toLowerCase() isnt t.name
                                parse_error()
-                       for node in open_els
+                       loop
                                if node is open_els[open_els.length - 1]
                                        return
                                if node.name.toLowerCase() is t.name
@@ -2907,6 +3081,8 @@ parse_html = (args) ->
                                                el = open_els.shift()
                                                if el is node
                                                        return
+                               i += 1
+                               node = open_els[i]
                                if node.namespace is NS_HTML
                                        break
                        ins_mode t # explicitly call HTML insertion mode
@@ -2995,50 +3171,55 @@ parse_html = (args) ->
 
        # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
        tok_state_tag_open = ->
-               switch c = txt.charAt(cur++)
-                       when '!'
-                               tok_state = tok_state_markup_declaration_open
-                       when '/'
-                               tok_state = tok_state_end_tag_open
-                       when '?'
-                               parse_error()
-                               tok_cur_tag = new_comment_token '?'
-                               tok_state = tok_state_bogus_comment
-                       else
-                               if is_lc_alpha(c)
-                                       tok_cur_tag = new_open_tag c
-                                       tok_state = tok_state_tag_name
-                               else if is_uc_alpha(c)
-                                       tok_cur_tag = new_open_tag c.toLowerCase()
-                                       tok_state = tok_state_tag_name
-                               else
-                                       parse_error()
-                                       tok_state = tok_state_data
-                                       cur -= 1 # we didn't parse/handle the char after <
-                                       return new_text_node '<'
-               return null
+               c = txt.charAt(cur++)
+               if c is '!'
+                       tok_state = tok_state_markup_declaration_open
+                       return
+               if c is '/'
+                       tok_state = tok_state_end_tag_open
+                       return
+               if is_uc_alpha(c)
+                       tok_cur_tag = new_open_tag c.toLowerCase()
+                       tok_state = tok_state_tag_name
+                       return
+               if is_lc_alpha(c)
+                       tok_cur_tag = new_open_tag c
+                       tok_state = tok_state_tag_name
+                       return
+               if c is '?'
+                       parse_error()
+                       tok_cur_tag = new_comment_token '?' # FIXME right?
+                       tok_state = tok_state_bogus_comment
+                       return
+               # Anything else
+               parse_error()
+               tok_state = tok_state_data
+               cur -= 1 # we didn't parse/handle the char after <
+               return new_text_node '<'
 
        # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
        tok_state_end_tag_open = ->
-               switch c = txt.charAt(cur++)
-                       when '>'
-                               parse_error()
-                               tok_state = tok_state_data
-                       when '' # EOF
-                               parse_error()
-                               tok_state = tok_state_data
-                               return new_text_node '</'
-                       else
-                               if is_uc_alpha(c)
-                                       tok_cur_tag = new_end_tag c.toLowerCase()
-                                       tok_state = tok_state_tag_name
-                               else if is_lc_alpha(c)
-                                       tok_cur_tag = new_end_tag c
-                                       tok_state = tok_state_tag_name
-                               else
-                                       parse_error()
-                                       tok_cur_tag = new_comment_token '/'
-                                       tok_state = tok_state_bogus_comment
+               c = txt.charAt(cur++)
+               if is_uc_alpha(c)
+                       tok_cur_tag = new_end_tag c.toLowerCase()
+                       tok_state = tok_state_tag_name
+                       return
+               if is_lc_alpha(c)
+                       tok_cur_tag = new_end_tag c
+                       tok_state = tok_state_tag_name
+                       return
+               if c is '>'
+                       parse_error()
+                       tok_state = tok_state_data
+                       return
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       return new_text_node '</'
+               # Anything else
+               parse_error()
+               tok_cur_tag = new_comment_token c
+               tok_state = tok_state_bogus_comment
                return null
 
        # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
@@ -3368,7 +3549,7 @@ parse_html = (args) ->
                # Anything else
                tok_state = tok_state_script_data_escaped
                cur -= 1 # Reconsume
-               return new_character_token c
+               return new_character_token '<'
 
        # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
        tok_state_script_data_escaped_end_tag_open = ->
@@ -3746,7 +3927,7 @@ parse_html = (args) ->
        tok_state_self_closing_start_tag = ->
                c = txt.charAt(cur++)
                if c is '>'
-                       tok_cur_tag.flag 'self-closing'
+                       tok_cur_tag.flag 'self-closing', true
                        tok_state = tok_state_data
                        return tok_cur_tag
                if c is ''
@@ -4366,7 +4547,9 @@ parse_html = (args) ->
                else
                        val = txt.substr cur, (next_gt - cur)
                        cur = next_gt + 3
-               return new_character_token val # fixfull split
+               if val.length > 0
+                       return new_character_token val # fixfull split
+               return null
 
        # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
        # Don't set this as a state, just call it
@@ -4462,6 +4645,7 @@ parse_html = (args) ->
        txt = args.html
        cur = 0
        doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+       doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
        open_els = []
        afe = [] # active formatting elements
        template_ins_modes = []
@@ -4477,6 +4661,7 @@ parse_html = (args) ->
        head_element_pointer = null
        flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
        context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+       prev_node_id = 0 # just for debugging
 
        # tokenizer initialization
        tok_state = tok_state_data
@@ -4487,7 +4672,7 @@ parse_html = (args) ->
        txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
        txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
 
-       if args.name is "plain-text-unsafe.dat #4"
+       if args.name is "tests23.dat #1"
                console.log "hi"
        # proccess input
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
@@ -4517,3 +4702,6 @@ module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
 module.exports.NS_HTML = NS_HTML
 module.exports.NS_MATHML = NS_MATHML
 module.exports.NS_SVG = NS_SVG
+module.exports.QUIRKS_NO = QUIRKS_NO
+module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+module.exports.QUIRKS_YES = QUIRKS_YES