JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
fix <html> at start
[peach-html5-editor.git] / parse-html.coffee
index 385a2a0..e193118 100644 (file)
@@ -164,8 +164,10 @@ new_element = (name) ->
 new_text_node = (txt) ->
        return new Node TYPE_TEXT, text: txt
 new_character_token = new_text_node
-new_comment_node = (txt) ->
+new_comment_token = (txt) ->
        return new Node TYPE_COMMENT, text: txt
+# Build a TYPE_DOCTYPE Node whose name property is the given name.
+new_doctype_token = (name) ->
+       new Node TYPE_DOCTYPE, {name}
 new_eof_token = ->
        return new Node TYPE_EOF
 new_afe_marker = ->
@@ -373,6 +375,7 @@ parse_html = (txt, parse_error_cb = null) ->
        pending_table_character_tokens = null
        head_element_pointer = null
        flag_fragment_parsing = null
+       context_element = null
 
        stop_parsing = ->
                flag_parsing = false
@@ -623,6 +626,14 @@ parse_html = (txt, parse_error_cb = null) ->
                        node = open_els[node_i]
                        # 19. Return to the step labeled loop.
 
+       # 8.2.3.2
+
+       # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
+       # Yields context_element when flag_fragment_parsing is set and exactly
+       # one element is open; in every other case yields open_els[0].
+       adjusted_current_node = ->
+               if open_els.length is 1 and flag_fragment_parsing then context_element else open_els[0]
+
        # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
        # this implementation is structured (mostly) as described at the link above.
        # capitalized comments are the "labels" described at the link above.
@@ -1146,12 +1157,12 @@ parse_html = (txt, parse_error_cb = null) ->
                if is_space_tok t
                        return
                if t.type is TYPE_COMMENT
-                       # fixfull this is supposed to be "the last child of the document object"
+                       # ?fixfull
                        doc.children.push t
                        return
                if t.type is TYPE_DOCTYPE
+                       # FIXME check identifiers, set quirks, etc
                        # fixfull
-                       t.name = 'html'
                        doc.children.push t
                        insertion_mode = ins_mode_before_html
                        return
@@ -1173,6 +1184,7 @@ parse_html = (txt, parse_error_cb = null) ->
                        return
                if t.type is TYPE_START_TAG and t.name is 'html'
                        el = token_to_element t, NS_HTML, doc
+                       doc.children.push el
                        open_els.unshift(el)
                        # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
                        insertion_mode = ins_mode_before_head
@@ -1253,7 +1265,7 @@ parse_html = (txt, parse_error_cb = null) ->
                        # fixfull encoding stuff
                        return
                if t.type is TYPE_START_TAG and t.name is 'title'
-                       parse_generic_rcdata_element t
+                       parse_generic_rcdata_text t
                        return
                if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
                        parse_generic_raw_text t
@@ -1265,7 +1277,7 @@ parse_html = (txt, parse_error_cb = null) ->
                if t.type is TYPE_START_TAG and t.name is 'script'
                        ail = adjusted_insertion_location()
                        el = token_to_element t, NS_HTML, ail
-                       el.flag_parser_inserted true # FIXME implement
+                       el.flag 'parser-inserted', true # FIXME implement
                        # fixfull frament case
                        ail[0].children.splice ail[1], 0, el
                        open_els.unshift el
@@ -1306,12 +1318,12 @@ parse_html = (txt, parse_error_cb = null) ->
                        parse_error()
                        return
                ins_mode_in_head_else t
-       
+
        # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
        ins_mode_in_head_noscript = (t) ->
                # FIXME ?fixfull
                console.log "ins_mode_in_head_noscript unimplemented"
-       
+
        # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
        ins_mode_after_head_else = (t) ->
                body_tok = new_open_tag 'body'
@@ -2276,6 +2288,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                tok_state = tok_state_end_tag_open
                        when '?'
                                parse_error()
+                               tok_cur_tag = new_comment_token '?'
                                tok_state = tok_state_bogus_comment
                        else
                                if lc_alpha.indexOf(c) > -1
@@ -2310,6 +2323,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                        tok_state = tok_state_tag_name
                                else
                                        parse_error()
+                                       tok_cur_tag = new_comment_token '/'
                                        tok_state = tok_state_bogus_comment
                return null
 
@@ -2677,6 +2691,601 @@ parse_html = (txt, parse_error_cb = null) ->
                                cur -= 1 # we didn't handle that char
                return null
 
+       # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
+       # WARNING: put a comment token in tok_cur_tag before setting this state
+       # Appends everything from cur up to (not including) the next '>' -- or
+       # up to the end of txt when there is no '>' -- to tok_cur_tag.text, with
+       # every NUL replaced by U+FFFD. Moves cur past the '>' (or to the end),
+       # switches back to the data state and returns tok_cur_tag.
+       tok_state_bogus_comment = ->
+               next_gt = txt.indexOf '>', cur
+               if next_gt is -1
+                       val = txt.substr cur
+                       cur = txt.length
+               else
+                       val = txt.substr cur, (next_gt - cur)
+                       cur = next_gt + 1
+               # A global regex is required: replace() with a string pattern only
+               # substitutes the first occurrence.
+               val = val.replace(/\u0000/g, "\ufffd")
+               tok_cur_tag.text += val
+               tok_state = tok_state_data
+               return tok_cur_tag
+
+       # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
+       # Looks ahead in txt at cur (no per-character consumption) and picks the
+       # next tokenizer state:
+       #   "--"                  -> comment start (fresh empty comment token)
+       #   "doctype" (any case)  -> doctype
+       #   "[CDATA[" while the adjusted current node is outside NS_HTML
+       #                         -> cdata section
+       #   anything else         -> parse error, bogus comment
+       # Never produces a token itself; every path returns undefined.
+       tok_state_markup_declaration_open = ->
+               if txt.substr(cur, 2) is '--'
+                       cur += 2
+                       tok_cur_tag = new_comment_token ''
+                       tok_state = tok_state_comment_start
+                       return
+               if txt.substr(cur, 7).toLowerCase() is 'doctype'
+                       cur += 7
+                       tok_state = tok_state_doctype
+                       return
+               acn = adjusted_current_node()
+               # The look-ahead must read from txt, the parser input.
+               if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
+                       cur += 7
+                       tok_state = tok_state_cdata_section
+                       return
+               # Otherwise
+               parse_error()
+               # Start with empty data: per the spec, the next character consumed
+               # is the first character of the bogus comment, so "!" is not in it.
+               tok_cur_tag = new_comment_token ''
+               tok_state = tok_state_bogus_comment
+               return
+
+       # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
+       # Consumes one character of a comment whose data is still empty.
+       # Returns tok_cur_tag when the comment ends here ('>' or EOF), otherwise
+       # null after updating tok_cur_tag.text and/or tok_state.
+       tok_state_comment_start = ->
+               switch c = txt.charAt(cur++)
+                       when '-'
+                               tok_state = tok_state_comment_start_dash
+                       when "\u0000"
+                               parse_error()
+                               # NUL belongs to the comment data (as U+FFFD); it must
+                               # not be emitted as a separate character token.
+                               tok_cur_tag.text += "\ufffd"
+                               tok_state = tok_state_comment
+                       when '>'
+                               parse_error()
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+                       else
+                               tok_cur_tag.text += c
+                               # Leave the start state, otherwise a later '>' would be
+                               # handled by the "empty comment" branch above.
+                               tok_state = tok_state_comment
+               return null
+
+       # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
+       # Consumes the character that follows a single '-' seen in the comment
+       # start state. Returns tok_cur_tag when the comment ends here ('>' or
+       # EOF, both parse errors); otherwise returns null.
+       tok_state_comment_start_dash = ->
+               c = txt.charAt(cur++)
+               if c is '-'
+                       tok_state = tok_state_comment_end
+               else if c is "\u0000"
+                       parse_error()
+                       tok_cur_tag.text += "-\ufffd"
+                       tok_state = tok_state_comment
+               else if c is '>' or c is ''
+                       parse_error()
+                       tok_state = tok_state_data
+                       cur -= 1 if c is '' # EOF: reconsume
+                       return tok_cur_tag
+               else
+                       # the pending dash was ordinary data after all
+                       tok_cur_tag.text += "-#{c}"
+                       tok_state = tok_state_comment
+               return null
+
+       # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
+       # Consumes one character of comment data. Returns tok_cur_tag at EOF
+       # (a parse error); otherwise returns null.
+       tok_state_comment = ->
+               c = txt.charAt(cur++)
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       cur -= 1 # Reconsume
+                       return tok_cur_tag
+               if c is '-'
+                       tok_state = tok_state_comment_end_dash
+               else if c is "\u0000"
+                       parse_error()
+                       tok_cur_tag.text += "\ufffd"
+               else
+                       tok_cur_tag.text += c
+               return null
+
+       # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
+       # Consumes the character that follows a '-' inside comment data.
+       # Returns tok_cur_tag at EOF (a parse error); otherwise returns null.
+       tok_state_comment_end_dash = ->
+               c = txt.charAt(cur++)
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       cur -= 1 # Reconsume
+                       return tok_cur_tag
+               if c is '-'
+                       tok_state = tok_state_comment_end
+               else if c is "\u0000"
+                       parse_error()
+                       tok_cur_tag.text += "-\ufffd"
+                       tok_state = tok_state_comment
+               else
+                       # the pending dash was ordinary data after all
+                       tok_cur_tag.text += "-#{c}"
+                       tok_state = tok_state_comment
+               return null
+
+       # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
+       # Consumes the character that follows "--" inside a comment.
+       # '>' finishes the comment cleanly; every other input is a parse error.
+       # Returns tok_cur_tag on '>' or EOF, null otherwise.
+       tok_state_comment_end = ->
+               c = txt.charAt(cur++)
+               if c is '>'
+                       tok_state = tok_state_data
+                       return tok_cur_tag
+               # all remaining cases report a parse error before anything else
+               parse_error()
+               if c is "\u0000"
+                       tok_cur_tag.text += "--\ufffd"
+                       tok_state = tok_state_comment
+               else if c is '!'
+                       tok_state = tok_state_comment_end_bang
+               else if c is '-'
+                       # stay in this state; one more dash becomes data
+                       tok_cur_tag.text += '-'
+               else if c is '' # EOF
+                       tok_state = tok_state_data
+                       cur -= 1 # Reconsume
+                       return tok_cur_tag
+               else
+                       tok_cur_tag.text += "--#{c}"
+                       tok_state = tok_state_comment
+               return null
+
+       # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
+       # Consumes the character that follows "--!" inside a comment.
+       # Returns tok_cur_tag when the comment ends here ('>' or EOF), otherwise
+       # null after appending the pending "--!" to tok_cur_tag.text.
+       tok_state_comment_end_bang = ->
+               switch c = txt.charAt(cur++)
+                       when '-'
+                               # Append only "--!": the dash just consumed is accounted
+                               # for by the comment-end-dash state (it is either added
+                               # there later or is part of the closing "-->").
+                               tok_cur_tag.text += "--!"
+                               tok_state = tok_state_comment_end_dash
+                       when '>'
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when "\u0000"
+                               parse_error()
+                               tok_cur_tag.text += "--!\ufffd"
+                               tok_state = tok_state_comment
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+                       else
+                               tok_cur_tag.text += "--!#{c}"
+                               tok_state = tok_state_comment
+               return null
+
+       # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
+       # Consumes the character after the "doctype" keyword. At EOF returns a
+       # new nameless doctype token flagged force-quirks; otherwise moves to
+       # the before-doctype-name state and returns null.
+       tok_state_doctype = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+                       tok_state = tok_state_before_doctype_name
+                       return null
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       el = new_doctype_token ''
+                       el.flag 'force-quirks', true
+                       cur -= 1 # Reconsume
+                       return el
+               # Anything else: missing whitespace, let the next state re-read c
+               parse_error()
+               tok_state = tok_state_before_doctype_name
+               cur -= 1 # Reconsume
+               return null
+
+       # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
+       # Skips whitespace before the doctype name and creates the doctype
+       # token in tok_cur_tag from the first name character (upper-case letters
+       # are lower-cased, NUL becomes U+FFFD).
+       # Returns a nameless force-quirks doctype token on '>' or EOF, undefined
+       # on the early-return paths, and null after the "anything else" case.
+       tok_state_before_doctype_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+                       return
+               # EOF is tested before the uc_alpha lookup. uc_alpha is defined
+               # outside this view; if it is a string, indexOf('') is 0, so the
+               # empty EOF marker would otherwise be treated as an upper-case letter.
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       el = new_doctype_token ''
+                       el.flag 'force-quirks', true
+                       cur -= 1 # Reconsume
+                       return el
+               if uc_alpha.indexOf(c) > -1
+                       tok_cur_tag = new_doctype_token c.toLowerCase()
+                       tok_state = tok_state_doctype_name
+                       return
+               if c is "\u0000"
+                       parse_error()
+                       tok_cur_tag = new_doctype_token "\ufffd"
+                       tok_state = tok_state_doctype_name
+                       return
+               if c is '>'
+                       parse_error()
+                       el = new_doctype_token ''
+                       el.flag 'force-quirks', true
+                       tok_state = tok_state_data
+                       return el
+               # Anything else
+               tok_cur_tag = new_doctype_token c
+               tok_state = tok_state_doctype_name
+               return null
+
+       # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
+       # Accumulates the doctype name in tok_cur_tag.name one character at a
+       # time (upper-case letters are lower-cased, NUL becomes U+FFFD).
+       # Returns tok_cur_tag on '>' or EOF (EOF also sets force-quirks),
+       # undefined on the other early-return paths, null after "anything else".
+       tok_state_doctype_name = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+                       tok_state = tok_state_after_doctype_name
+                       return
+               if c is '>'
+                       tok_state = tok_state_data
+                       return tok_cur_tag
+               # EOF is tested before the uc_alpha lookup. uc_alpha is defined
+               # outside this view; if it is a string, indexOf('') is 0, so at EOF
+               # the letter branch would match forever and the token would never
+               # be returned.
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       tok_cur_tag.flag 'force-quirks', true
+                       cur -= 1 # Reconsume
+                       return tok_cur_tag
+               if uc_alpha.indexOf(c) > -1
+                       tok_cur_tag.name += c.toLowerCase()
+                       return
+               if c is "\u0000"
+                       parse_error()
+                       tok_cur_tag.name += "\ufffd"
+                       return
+               # Anything else
+               tok_cur_tag.name += c
+               return null
+
+       # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
+       # Runs after the doctype name: skips whitespace, finishes the token on
+       # '>' or EOF, and otherwise expects the keyword "public" or "system"
+       # (case-insensitive) starting at the character just consumed.
+       # Returns tok_cur_tag on '>' / EOF, undefined on the whitespace and
+       # keyword paths, null when falling into the bogus doctype state.
+       tok_state_after_doctype_name = ->
+               switch c = txt.charAt(cur++)
+                       when "\t", "\u000a", "\u000c", ' '
+                               return
+                       when '>'
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+               # Anything else: c is already consumed, so the keyword starts at
+               # cur - 1 and only its remaining 5 characters are skipped.
+               switch txt.substr(cur - 1, 6).toLowerCase()
+                       when 'public'
+                               cur += 5
+                               tok_state = tok_state_after_doctype_public_keyword
+                               return
+                       when 'system'
+                               cur += 5
+                               tok_state = tok_state_after_doctype_system_keyword
+                               return
+               parse_error()
+               tok_cur_tag.flag 'force-quirks', true
+               tok_state = tok_state_bogus_doctype
+               return null
+
+       # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
+       # Consumes the character right after the "public" keyword.
+       # A quote directly after the keyword is a parse error but still starts
+       # the public identifier. Returns tok_cur_tag (force-quirks set) on '>'
+       # or EOF, undefined on the whitespace/quote paths, null otherwise.
+       tok_state_after_doctype_public_keyword = ->
+               switch c = txt.charAt(cur++)
+                       when "\t", "\u000a", "\u000c", ' '
+                               tok_state = tok_state_before_doctype_public_identifier
+                               return
+                       when '"', "'"
+                               parse_error()
+                               tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+                               tok_state = if c is '"' then tok_state_doctype_public_identifier_double_quoted else tok_state_doctype_public_identifier_single_quoted
+                               return
+                       when '>'
+                               parse_error()
+                               tok_cur_tag.flag 'force-quirks', true
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+               # Anything else
+               parse_error()
+               tok_cur_tag.flag 'force-quirks', true
+               tok_state = tok_state_bogus_doctype
+               return null
+
+       # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
+       # Skips whitespace between the "public" keyword and its identifier.
+       # An opening quote starts an (initially empty) public identifier.
+       # Returns tok_cur_tag (force-quirks set) on '>' or EOF, undefined on the
+       # whitespace/quote paths, null when falling into the bogus doctype state.
+       tok_state_before_doctype_public_identifier = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+                       return
+               # A quote after whitespace is the well-formed case, so no parse
+               # error is reported here (same as the before-system-identifier state).
+               if c is '"'
+                       tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+                       tok_state = tok_state_doctype_public_identifier_double_quoted
+                       return
+               if c is "'"
+                       tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
+                       tok_state = tok_state_doctype_public_identifier_single_quoted
+                       return
+               if c is '>'
+                       parse_error()
+                       tok_cur_tag.flag 'force-quirks', true
+                       tok_state = tok_state_data
+                       return tok_cur_tag
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       tok_cur_tag.flag 'force-quirks', true
+                       cur -= 1 # Reconsume
+                       return tok_cur_tag
+               # Anything else
+               parse_error()
+               tok_cur_tag.flag 'force-quirks', true
+               tok_state = tok_state_bogus_doctype
+               return null
+
+
+       # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
+       # Accumulates a double-quoted public identifier in
+       # tok_cur_tag.public_identifier until the closing '"'.
+       # Returns tok_cur_tag (force-quirks set) on '>' or EOF, undefined on the
+       # closing-quote and NUL paths, null after appending an ordinary character.
+       tok_state_doctype_public_identifier_double_quoted = ->
+               switch c = txt.charAt(cur++)
+                       when '"'
+                               tok_state = tok_state_after_doctype_public_identifier
+                               return
+                       when "\u0000"
+                               parse_error()
+                               tok_cur_tag.public_identifier += "\ufffd"
+                               return
+                       when '>'
+                               parse_error()
+                               tok_cur_tag.flag 'force-quirks', true
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+                       else
+                               tok_cur_tag.public_identifier += c
+               return null
+
+       # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
+       # Accumulates a single-quoted public identifier in
+       # tok_cur_tag.public_identifier until the closing "'".
+       # Returns tok_cur_tag (force-quirks set) on '>' or EOF, undefined on the
+       # closing-quote and NUL paths, null after appending an ordinary character.
+       tok_state_doctype_public_identifier_single_quoted = ->
+               switch c = txt.charAt(cur++)
+                       when "'"
+                               tok_state = tok_state_after_doctype_public_identifier
+                               return
+                       when "\u0000"
+                               parse_error()
+                               tok_cur_tag.public_identifier += "\ufffd"
+                               return
+                       when '>'
+                               parse_error()
+                               tok_cur_tag.flag 'force-quirks', true
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+                       else
+                               tok_cur_tag.public_identifier += c
+               return null
+
+       # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
+       # Consumes the character right after the closing quote of the public
+       # identifier. A quote here (no separating whitespace) is a parse error
+       # but still starts the system identifier.
+       # Returns tok_cur_tag on '>' or EOF (EOF also sets force-quirks),
+       # undefined on the whitespace/quote paths, null otherwise.
+       tok_state_after_doctype_public_identifier = ->
+               switch c = txt.charAt(cur++)
+                       when "\t", "\u000a", "\u000c", ' '
+                               tok_state = tok_state_between_doctype_public_and_system_identifiers
+                               return
+                       when '>'
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '"', "'"
+                               parse_error()
+                               tok_cur_tag.system_identifier = ''
+                               tok_state = if c is '"' then tok_state_doctype_system_identifier_double_quoted else tok_state_doctype_system_identifier_single_quoted
+                               return
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+               # Anything else
+               parse_error()
+               tok_cur_tag.flag 'force-quirks', true
+               tok_state = tok_state_bogus_doctype
+               return null
+
+       # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
+       # Skips whitespace between the public and system identifiers.
+       # An opening quote starts an (initially empty) system identifier.
+       # Returns tok_cur_tag on '>' or EOF (EOF also sets force-quirks),
+       # undefined on the whitespace/quote paths, null when falling into the
+       # bogus doctype state.
+       tok_state_between_doctype_public_and_system_identifiers = ->
+               c = txt.charAt(cur++)
+               if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
+                       return
+               if c is '>'
+                       tok_state = tok_state_data
+                       return tok_cur_tag
+               # A quote after whitespace is the well-formed case, so no parse
+               # error is reported here (same as the before-system-identifier state).
+               if c is '"'
+                       tok_cur_tag.system_identifier = ''
+                       tok_state = tok_state_doctype_system_identifier_double_quoted
+                       return
+               if c is "'"
+                       tok_cur_tag.system_identifier = ''
+                       tok_state = tok_state_doctype_system_identifier_single_quoted
+                       return
+               if c is '' # EOF
+                       parse_error()
+                       tok_state = tok_state_data
+                       tok_cur_tag.flag 'force-quirks', true
+                       cur -= 1 # Reconsume
+                       return tok_cur_tag
+               # Anything else
+               parse_error()
+               tok_cur_tag.flag 'force-quirks', true
+               tok_state = tok_state_bogus_doctype
+               return null
+
+       # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
+       # Consumes the character right after the "system" keyword.
+       # A quote directly after the keyword is a parse error but still starts
+       # the system identifier. Returns tok_cur_tag (force-quirks set) on '>'
+       # or EOF, undefined on the whitespace/quote paths, null otherwise.
+       tok_state_after_doctype_system_keyword = ->
+               switch c = txt.charAt(cur++)
+                       when "\t", "\u000a", "\u000c", ' '
+                               tok_state = tok_state_before_doctype_system_identifier
+                               return
+                       when '"', "'"
+                               parse_error()
+                               tok_cur_tag.system_identifier = ''
+                               tok_state = if c is '"' then tok_state_doctype_system_identifier_double_quoted else tok_state_doctype_system_identifier_single_quoted
+                               return
+                       when '>'
+                               parse_error()
+                               tok_cur_tag.flag 'force-quirks', true
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+               # Anything else
+               parse_error()
+               tok_cur_tag.flag 'force-quirks', true
+               tok_state = tok_state_bogus_doctype
+               return null
+
+       # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
+       # Skips whitespace between the "system" keyword and its identifier.
+       # An opening quote starts an (initially empty) system identifier with no
+       # parse error. Returns tok_cur_tag (force-quirks set) on '>' or EOF,
+       # undefined on the whitespace/quote paths, null otherwise.
+       tok_state_before_doctype_system_identifier = ->
+               switch c = txt.charAt(cur++)
+                       when "\t", "\u000a", "\u000c", ' '
+                               return
+                       when '"', "'"
+                               tok_cur_tag.system_identifier = ''
+                               tok_state = if c is '"' then tok_state_doctype_system_identifier_double_quoted else tok_state_doctype_system_identifier_single_quoted
+                               return
+                       when '>'
+                               parse_error()
+                               tok_cur_tag.flag 'force-quirks', true
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+               # Anything else
+               parse_error()
+               tok_cur_tag.flag 'force-quirks', true
+               tok_state = tok_state_bogus_doctype
+               return null
+
+       # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
+       # Accumulates a double-quoted system identifier in
+       # tok_cur_tag.system_identifier until the closing '"'.
+       # Returns tok_cur_tag (force-quirks set) on '>' or EOF, undefined on the
+       # closing-quote and NUL paths, null after appending an ordinary character.
+       tok_state_doctype_system_identifier_double_quoted = ->
+               switch c = txt.charAt(cur++)
+                       when '"'
+                               tok_state = tok_state_after_doctype_system_identifier
+                               return
+                       when "\u0000"
+                               parse_error()
+                               tok_cur_tag.system_identifier += "\ufffd"
+                               return
+                       when '>'
+                               parse_error()
+                               tok_cur_tag.flag 'force-quirks', true
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+                       else
+                               tok_cur_tag.system_identifier += c
+               return null
+
+       # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
+       # Accumulates a single-quoted system identifier in
+       # tok_cur_tag.system_identifier until the closing "'".
+       # Returns tok_cur_tag (force-quirks set) on '>' or EOF, undefined on the
+       # closing-quote and NUL paths, null after appending an ordinary character.
+       tok_state_doctype_system_identifier_single_quoted = ->
+               switch c = txt.charAt(cur++)
+                       when "'"
+                               tok_state = tok_state_after_doctype_system_identifier
+                               return
+                       when "\u0000"
+                               parse_error()
+                               tok_cur_tag.system_identifier += "\ufffd"
+                               return
+                       when '>'
+                               parse_error()
+                               tok_cur_tag.flag 'force-quirks', true
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+                       else
+                               tok_cur_tag.system_identifier += c
+               return null
+
+       # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
+       # Consumes characters after the closing quote of the system identifier.
+       # Returns tok_cur_tag on '>' or EOF (EOF also sets force-quirks),
+       # undefined while skipping whitespace, null when any other character
+       # sends the tokenizer to the bogus doctype state.
+       tok_state_after_doctype_system_identifier = ->
+               switch c = txt.charAt(cur++)
+                       when "\t", "\u000a", "\u000c", ' '
+                               return
+                       when '>'
+                               tok_state = tok_state_data
+                               return tok_cur_tag
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               tok_cur_tag.flag 'force-quirks', true
+                               cur -= 1 # Reconsume
+                               return tok_cur_tag
+               # Anything else
+               parse_error()
+               # do _not_ tok_cur_tag.flag 'force-quirks', true
+               tok_state = tok_state_bogus_doctype
+               return null
+
+       # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
+       # Discards characters of a malformed doctype one at a time. Returns
+       # tok_cur_tag once '>' or EOF is reached (no parse error is reported
+       # here), null for every skipped character.
+       tok_state_bogus_doctype = ->
+               c = txt.charAt(cur++)
+               return null unless c is '>' or c is ''
+               tok_state = tok_state_data
+               cur -= 1 if c is '' # EOF: reconsume
+               return tok_cur_tag
+
+
        # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
        # Don't set this as a state, just call it
        # returns a string (NOT a text node)
@@ -2756,7 +3365,7 @@ parse_html = (txt, parse_error_cb = null) ->
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
        doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
-       open_els = [doc]
+       open_els = []
        afe = [] # active formatting elements
        template_insertion_modes = []
        insertion_mode = ins_mode_initial
@@ -2770,6 +3379,7 @@ parse_html = (txt, parse_error_cb = null) ->
        pending_table_character_tokens = []
        head_element_pointer = null
        flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
+       context_element = null # FIXME initialize from args.fragment
 
        # tokenizer initialization
        tok_state = tok_state_data