JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implement fragment parsing algorithm
[peach-html5-editor.git] / parse-html.coffee
index f5437c9..425fe3c 100644 (file)
@@ -84,6 +84,11 @@ NS_HTML = 1
 NS_MATHML = 2
 NS_SVG = 3
 
+# Quirks mode constants: the possible values of a document's 'quirks mode'
+# flag. The parser initializes the flag to QUIRKS_NO; the "initial" insertion
+# mode may change it to QUIRKS_LIMITED or QUIRKS_YES based on the DOCTYPE.
+QUIRKS_NO = 1
+QUIRKS_LIMITED = 2
+QUIRKS_YES = 3
+
 g_debug_log = []
 debug_log_reset = ->
        g_debug_log = []
@@ -249,6 +254,64 @@ unicode_fixes[0x9C] = "\u0153"
 unicode_fixes[0x9E] = "\u017E"
 unicode_fixes[0x9F] = "\u0178"
 
+# DOCTYPE public identifier prefixes that select full quirks mode.
+# All entries are lowercase: is_quirks_yes_doctype lowercases the token's
+# public identifier and checks whether it starts with any of these strings.
+quirks_yes_pi_prefixes = [
+       "+//silmaril//dtd html pro v0r11 19970101//"
+       "-//as//dtd html 3.0 aswedit + extensions//"
+       "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
+       "-//ietf//dtd html 2.0 level 1//"
+       "-//ietf//dtd html 2.0 level 2//"
+       "-//ietf//dtd html 2.0 strict level 1//"
+       "-//ietf//dtd html 2.0 strict level 2//"
+       "-//ietf//dtd html 2.0 strict//"
+       "-//ietf//dtd html 2.0//"
+       "-//ietf//dtd html 2.1e//"
+       "-//ietf//dtd html 3.0//"
+       "-//ietf//dtd html 3.2 final//"
+       "-//ietf//dtd html 3.2//"
+       "-//ietf//dtd html 3//"
+       "-//ietf//dtd html level 0//"
+       "-//ietf//dtd html level 1//"
+       "-//ietf//dtd html level 2//"
+       "-//ietf//dtd html level 3//"
+       "-//ietf//dtd html strict level 0//"
+       "-//ietf//dtd html strict level 1//"
+       "-//ietf//dtd html strict level 2//"
+       "-//ietf//dtd html strict level 3//"
+       "-//ietf//dtd html strict//"
+       "-//ietf//dtd html//"
+       "-//metrius//dtd metrius presentational//"
+       "-//microsoft//dtd internet explorer 2.0 html strict//"
+       "-//microsoft//dtd internet explorer 2.0 html//"
+       "-//microsoft//dtd internet explorer 2.0 tables//"
+       "-//microsoft//dtd internet explorer 3.0 html strict//"
+       "-//microsoft//dtd internet explorer 3.0 html//"
+       "-//microsoft//dtd internet explorer 3.0 tables//"
+       "-//netscape comm. corp.//dtd html//"
+       "-//netscape comm. corp.//dtd strict html//"
+       "-//o'reilly and associates//dtd html 2.0//"
+       "-//o'reilly and associates//dtd html extended 1.0//"
+       "-//o'reilly and associates//dtd html extended relaxed 1.0//"
+       "-//sq//dtd html 2.0 hotmetal + extensions//"
+       "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
+       "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
+       "-//spyglass//dtd html 2.0 extended//"
+       "-//sun microsystems corp.//dtd hotjava html//"
+       "-//sun microsystems corp.//dtd hotjava strict html//"
+       "-//w3c//dtd html 3 1995-03-24//"
+       "-//w3c//dtd html 3.2 draft//"
+       "-//w3c//dtd html 3.2 final//"
+       "-//w3c//dtd html 3.2//"
+       "-//w3c//dtd html 3.2s draft//"
+       "-//w3c//dtd html 4.0 frameset//"
+       "-//w3c//dtd html 4.0 transitional//"
+       "-//w3c//dtd html experimental 19960712//"
+       "-//w3c//dtd html experimental 970421//"
+       "-//w3c//dtd w3 html//"
+       "-//w3o//dtd w3 html 3.0//"
+       "-//webtechs//dtd mozilla html 2.0//"
+       "-//webtechs//dtd mozilla html//"
+]
+
 # These are the character references that don't need a terminating semicolon
 # min length: 2, max: 6, none are a prefix of any other.
 legacy_char_refs = {
@@ -605,18 +668,29 @@ parse_html = (args) ->
                else
                        console.log "Parse error at character #{cur} of #{txt.length}"
 
+       # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
+       # "Noah's Ark clause" but with three
        afe_push = (new_el) ->
                matches = 0
                for el, i in afe
+                       if el.type is TYPE_AFE_MARKER
+                               break
                        if el.name is new_el.name and el.namespace is new_el.namespace
+                               attrs_match = true
                                for k, v of el.attrs
-                                       continue unless new_el.attrs[k] is v
-                               for k, v of new_el.attrs
-                                       continue unless el.attrs[k] is v
-                               matches += 1
-                               if matches is 3
-                                       afe.splice i, 1
-                                       break
+                                       unless new_el.attrs[k] is v
+                                               attrs_match = false
+                                               break
+                               if attrs_match
+                                       for k, v of new_el.attrs
+                                               unless el.attrs[k] is v
+                                                       attrs_match = false
+                                                       break
+                               if attrs_match
+                                       matches += 1
+                                       if matches is 3
+                                               afe.splice i, 1
+                                               break
                afe.unshift new_el
        afe_push_marker = ->
                afe.unshift new_afe_marker()
@@ -626,24 +700,24 @@ parse_html = (args) ->
 
        # But first... the helpers
        template_tag_is_open = ->
-               for t in open_els
-                       if t.name is 'template' and t.namespace is NS_HTML
+               for el in open_els
+                       if el.name is 'template' and el.namespace is NS_HTML
                                return true
                return false
        is_in_scope_x = (tag_name, scope, namespace) ->
-               for t in open_els
-                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
+               for el in open_els
+                       if el.name is tag_name and (namespace is null or namespace is el.namespace)
                                return true
-                       if scope[t.name] is t.namespace
+                       if scope[el.name] is el.namespace
                                return false
                return false
        is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
-               for t in open_els
-                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
+               for el in open_els
+                       if el.name is tag_name and (namespace is null or namespace is el.namespace)
                                return true
-                       if scope[t.name] is t.namespace
+                       if scope[el.name] is el.namespace
                                return false
-                       if scope2[t.name] is t.namespace
+                       if scope2[el.name] is el.namespace
                                return false
                return false
        standard_scopers = {
@@ -743,8 +817,8 @@ parse_html = (args) ->
                loop
                        if node_i is open_els.length - 1
                                last = true
-                               # fixfull (fragment case)
-
+                               if flag_fragment_parsing
+                                       node = context_element
                        # 4. If node is a select element, run these substeps:
                        if node.name is 'select' and node.namespace is NS_HTML
                                # 1. If last is true, jump to the step below labeled done.
@@ -1419,6 +1493,35 @@ parse_html = (args) ->
 
        # 8.2.5.4.1 The "initial" insertion mode
        # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
+       # Decide whether DOCTYPE token t selects (full) quirks mode.
+       # Returns true when any of the following holds, false otherwise:
+       #   * the token's 'force-quirks' flag is set
+       #   * the doctype name is not 'html'
+       #   * the lowercased public identifier starts with an entry of
+       #     quirks_yes_pi_prefixes, or equals one of three legacy identifiers
+       #   * the lowercased system identifier is the IBM transitional DTD URL
+       #   * there is no system identifier and the public identifier starts
+       #     with the HTML 4.01 frameset/transitional prefix
+       is_quirks_yes_doctype = (t) ->
+               return true if t.flag('force-quirks')
+               return true unless t.name is 'html'
+               if t.public_identifier?
+                       pi = t.public_identifier.toLowerCase()
+                       for p in quirks_yes_pi_prefixes
+                               # lastIndexOf(p, 0) is 0 exactly when pi starts with p
+                               return true if pi.lastIndexOf(p, 0) is 0
+                       return true if pi in ['-//w3o//dtd w3 html strict 3.0//en//', '-/w3c/dtd html 4.0 transitional/en', 'html']
+               if t.system_identifier?
+                       # with a system identifier present, only the IBM DTD means quirks
+                       return t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
+               return false unless t.public_identifier?
+               # system identifier missing: pi was already lowercased above
+               return pi.lastIndexOf('-//w3c//dtd html 4.01 frameset//', 0) is 0 or pi.lastIndexOf('-//w3c//dtd html 4.01 transitional//', 0) is 0
+       # Decide whether DOCTYPE token t selects limited-quirks mode.
+       # Only the lowercased public identifier prefix is inspected:
+       #   * XHTML 1.0 frameset/transitional always qualify
+       #   * HTML 4.01 frameset/transitional qualify only when the token also
+       #     has a system identifier
+       # Returns true or false; a token without a public identifier yields false.
+       is_quirks_limited_doctype = (t) ->
+               return false unless t.public_identifier?
+               pi = t.public_identifier.toLowerCase()
+               # lastIndexOf(prefix, 0) is 0 exactly when pi starts with prefix
+               if pi.lastIndexOf('-//w3c//dtd xhtml 1.0 frameset//', 0) is 0
+                       return true
+               if pi.lastIndexOf('-//w3c//dtd xhtml 1.0 transitional//', 0) is 0
+                       return true
+               return false unless t.system_identifier?
+               return pi.lastIndexOf('-//w3c//dtd html 4.01 frameset//', 0) is 0 or pi.lastIndexOf('-//w3c//dtd html 4.01 transitional//', 0) is 0
        ins_mode_initial = (t) ->
                if is_space_tok t
                        return
@@ -1427,13 +1530,20 @@ parse_html = (args) ->
                        doc.children.push t
                        return
                if t.type is TYPE_DOCTYPE
-                       # FIXME check identifiers, set quirks, etc
-                       # fixfull
+                       # fixfull syntax error from first paragraph and following bullets
+                       # fixfull set doc.doctype
+                       # fixfull is the "not an iframe srcdoc" thing relevant?
+                       if is_quirks_yes_doctype t
+                               doc.flag 'quirks mode', QUIRKS_YES
+                       else if is_quirks_limited_doctype t
+                               doc.flag 'quirks mode', QUIRKS_LIMITED
                        doc.children.push t
                        ins_mode = ins_mode_before_html
                        return
                # Anything else
-               #fixfull (iframe, quirks)
+               # fixfull not iframe srcdoc?
+               parse_error()
+               doc.flag 'quirks mode', QUIRKS_YES
                ins_mode = ins_mode_before_html
                process_token t
                return
@@ -1451,6 +1561,7 @@ parse_html = (args) ->
                if t.type is TYPE_START_TAG and t.name is 'html'
                        el = token_to_element t, NS_HTML, doc
                        doc.children.push el
+                       el.document = doc
                        open_els.unshift(el)
                        # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
                        ins_mode = ins_mode_before_head
@@ -1462,9 +1573,9 @@ parse_html = (args) ->
                                parse_error()
                                return
                # Anything else
-               html_tok = new_open_tag 'html'
-               el = token_to_element html_tok, NS_HTML, doc
+               el = token_to_element new_open_tag('html'), NS_HTML, doc
                doc.children.push el
+               el.document = doc
                open_els.unshift el
                # ?fixfull browsing context
                ins_mode = ins_mode_before_head
@@ -1496,8 +1607,7 @@ parse_html = (args) ->
                                parse_error()
                                return
                # Anything else
-               head_tok = new_open_tag 'head'
-               el = insert_html_element head_tok
+               el = insert_html_element new_open_tag 'head'
                head_element_pointer = el
                ins_mode = ins_mode_in_head
                process_token t
@@ -1651,7 +1761,7 @@ parse_html = (args) ->
                        parse_error()
                        open_els.unshift head_element_pointer
                        ins_mode_in_head t
-                       for el, i of open_els
+                       for el, i in open_els
                                if el is head_element_pointer
                                        open_els.splice i, 1
                                        return
@@ -1671,17 +1781,23 @@ parse_html = (args) ->
 
        # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
        in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
-               for el, i in open_els
-                       if el.name is name and el.namespace is NS_HTML
+               node = open_els[0]
+               loop
+                       if node.name is name and node.namespace is NS_HTML
                                generate_implied_end_tags name # arg is exception
-                               parse_error() unless i is 0
-                               while i >= 0
-                                       open_els.shift()
-                                       i -= 1
-                               return
-                       if special_elements[el.name] is el.namespace
+                               unless node is open_els[0]
+                                       parse_error()
+                               loop
+                                       el = open_els.shift()
+                                       if el is node
+                                               return
+                       if special_elements[node.name] is node.namespace
                                parse_error()
                                return
+                       for el, i in open_els
+                               if node is el
+                                       node = open_els[i + 1]
+                                       break
                return
        ins_mode_in_body = (t) ->
                if t.type is TYPE_TEXT and t.text is "\u0000"
@@ -1721,7 +1837,7 @@ parse_html = (args) ->
                        return unless second.name is 'body'
                        return if template_tag_is_open()
                        flag_frameset_ok = false
-                       for a of t.attrs_a
+                       for a in t.attrs_a
                                second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
                        return
                if t.type is TYPE_START_TAG and t.name is 'frameset'
@@ -1809,11 +1925,7 @@ parse_html = (args) ->
                if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
                        close_p_if_in_button_scope()
                        insert_html_element t
-                       # spec: If the next token is a "LF" (U+000A) character token, then
-                       # ignore that token and move on to the next one. (Newlines at the
-                       # start of pre blocks are ignored as an authoring convenience.)
-                       if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
-                               cur += 1
+                       eat_next_token_if_newline()
                        flag_frameset_ok = false
                        return
                if t.type is TYPE_START_TAG and t.name is 'form'
@@ -2008,6 +2120,10 @@ parse_html = (args) ->
                        return
                if t.type is TYPE_START_TAG and t.name is 'nobr'
                        reconstruct_afe()
+                       if is_in_scope 'nobr', NS_HTML
+                               parse_error()
+                               adoption_agency 'nobr'
+                               reconstruct_afe()
                        el = insert_html_element t
                        afe_push el
                        return
@@ -2034,14 +2150,16 @@ parse_html = (args) ->
                        clear_afe_to_marker()
                        return
                if t.type is TYPE_START_TAG and t.name is 'table'
-                       close_p_if_in_button_scope() # fixfull quirksmode thing
+                       unless doc.flag('quirks mode') is QUIRKS_YES
+                               close_p_if_in_button_scope() # test
                        insert_html_element t
                        flag_frameset_ok = false
                        ins_mode = ins_mode_in_table
                        return
                if t.type is TYPE_END_TAG and t.name is 'br'
                        parse_error()
-                       t.type is TYPE_START_TAG
+                       # W3C: t.type = TYPE_START_TAG
+                       t = new_open_tag 'br' # WHATWG
                        # fall through
                if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
                        reconstruct_afe()
@@ -2058,7 +2176,8 @@ parse_html = (args) ->
                        unless is_input_hidden_tok t
                                flag_frameset_ok = false
                        return
-               if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
+               if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
+                       # WHATWG adds 'menuitem' for this block
                        insert_html_element t
                        open_els.shift()
                        t.acknowledge_self_closing()
@@ -2118,8 +2237,7 @@ parse_html = (args) ->
                        return
                if t.type is TYPE_START_TAG and t.name is 'textarea'
                        insert_html_element t
-                       if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
-                               cur += 1
+                       eat_next_token_if_newline()
                        tok_state = tok_state_rcdata
                        original_ins_mode = ins_mode
                        flag_frameset_ok = false
@@ -2614,7 +2732,7 @@ parse_html = (args) ->
                        insert_html_element t
                        return
                if t.type is TYPE_END_TAG and t.name is 'optgroup'
-                       if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
+                       if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
                                if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
                                        open_els.shift()
                        if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
@@ -2650,7 +2768,7 @@ parse_html = (args) ->
                        return
                if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
                        parse_error()
-                       if is_in_select_scope 'select', NS_HTML
+                       unless is_in_select_scope 'select', NS_HTML
                                return
                        loop
                                el = open_els.shift()
@@ -2976,7 +3094,7 @@ parse_html = (args) ->
                                tok_state = tok_state_tag_open
                        when "\u0000"
                                parse_error()
-                               return new_text_node "\ufffd"
+                               return new_text_node c
                        when '' # EOF
                                return new_eof_token()
                        else
@@ -3673,7 +3791,7 @@ parse_html = (args) ->
                        return
                if c is '>'
                        tok_state = tok_state_data
-                       return
+                       return tok_cur_tag
                if is_uc_alpha(c)
                        tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
                        tok_state = tok_state_attribute_name
@@ -4426,7 +4544,10 @@ parse_html = (args) ->
                else
                        val = txt.substr cur, (next_gt - cur)
                        cur = next_gt + 3
-               return new_character_token val # fixfull split
+               val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
+               if val.length > 0
+                       return new_character_token val # fixfull split
+               return null
 
        # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
        # Don't set this as a state, just call it
@@ -4517,11 +4638,31 @@ parse_html = (args) ->
                                        return '&'
                return # never reached
 
+       # Look ahead one token and swallow it if it is a "newline" text token.
+       # Runs the current tokenizer state until it emits a token. If that token
+       # counts as a newline, the input position (cur) stays advanced past it, so
+       # the token is dropped. Otherwise cur is rewound to where it was and the
+       # token will be produced again by the normal main loop.
+       # NOTE(review): only cur is restored on rewind; tok_state and any other
+       # tokenizer state touched by the look-ahead are left as they are -- confirm
+       # that this is safe for every caller.
+       eat_next_token_if_newline = ->
+               old_cur = cur
+               t = null
+               # a tokenizer state may return nothing; keep going until a token appears
+               until t?
+                       t = tok_state()
+               if t.type is TYPE_TEXT
+                       # definition of a newline depends on whether it was a character ref or not
+                       if cur - old_cur is 1
+                               # exactly one input character was consumed, so it was
+                               # not a character reference
+                               # NOTE(review): parse_init rewrites every "\r" in txt to "\n",
+                               # so the "\u000d" comparison presumably never matches -- confirm
+                               if t.text is "\u000d" or t.text is "\u000a"
+                                       return
+                       else
+                               # more than one character consumed (e.g. a character
+                               # reference): only a line feed counts
+                               if t.text is "\u000a"
+                                       return
+               # not a "newline": rewind so the token gets tokenized again
+               cur = old_cur
+               return
+
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
        txt = args.html
        cur = 0
-       doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+       doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+       doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
+       fragment_root = null # fragment parsing algorithm returns children of this
        open_els = []
        afe = [] # active formatting elements
        template_ins_modes = []
@@ -4535,28 +4676,105 @@ parse_html = (args) ->
        temporary_buffer = null
        pending_table_character_tokens = []
        head_element_pointer = null
-       flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
-       context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+       flag_fragment_parsing = false
+       context_element = null
        prev_node_id = 0 # just for debugging
 
        # tokenizer initialization
        tok_state = tok_state_data
 
-       # text pre-processing
-       # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
-       txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
-       txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
-       txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+       # Prepare parser state before the main loop runs:
+       #   1. build or accept the context element for fragment parsing
+       #      (args.fragment is a string, args.context is a Node)
+       #   2. if there is a context element, set up the fragment parsing algorithm
+       #      (fresh doc, inherited quirks mode, tokenizer state, root element,
+       #      insertion mode, form element pointer)
+       #   3. normalize newlines in the input text
+       parse_init = ->
+               # fragment parsing (text arg)
+               if args.fragment?
+                       # this handles the fragment from the tests in the format described here:
+                       # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
+                       f = args.fragment
+                       ns = NS_HTML
+                       # an optional "math " or "svg " prefix selects the namespace
+                       if f.substr(0, 5) is 'math '
+                               f = f.substr 5
+                               ns = NS_MATHML
+                       else if f.substr(0, 4) is 'svg '
+                               f = f.substr 4
+                               ns = NS_SVG
+                       t = new_open_tag f
+                       context_element = token_to_element t, ns
+                       # give the synthetic context element its own no-quirks document
+                       context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+                       context_element.document.flag 'quirks mode', QUIRKS_NO
+               # fragment parsing (Node arg)
+               # (when both are given, args.context replaces the element built above)
+               if args.context?
+                       context_element = args.context
+
+               # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+               # fragment parsing algorithm
+               if context_element?
+                       flag_fragment_parsing = true
+                       doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+                       # search up the tree from context, to try to find its document,
+                       # because this file only puts a "document" property on the root
+                       # element.
+                       old_doc = null
+                       el = context_element
+                       loop
+                               if el.document?
+                                       old_doc = el.document
+                                       break
+                               if el.parent
+                                       el = el.parent
+                               else
+                                       break
+                       # the new document inherits the quirks mode of the context's document
+                       if old_doc
+                               doc.flag 'quirks mode', old_doc.flag 'quirks mode'
+                       # set tok_state
+                       if context_element.namespace is NS_HTML
+                               switch context_element.name
+                                       when 'title', 'textarea'
+                                               tok_state = tok_state_rcdata
+                                       when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
+                                               tok_state = tok_state_rawtext
+                                       when 'script'
+                                               tok_state = tok_state_script_data
+                                       when 'noscript'
+                                               if flag_scripting
+                                                       tok_state = tok_state_rawtext
+                                       when 'plaintext'
+                                               tok_state = tok_state_plaintext
+                       # root 'html' element: the parsed fragment ends up as its children
+                       fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+                       doc.children.push fragment_root
+                       fragment_root.document = doc
+                       open_els = [fragment_root]
+                       if context_element.name is 'template' and context_element.namespace is NS_HTML
+                               template_ins_modes.unshift ins_mode_in_template
+                       # fixfull create token for context (it should have its original one already)
+                       reset_ins_mode()
+                       # set form_element pointer... in the foreign doc?!
+                       # (nearest 'form' element, starting at the context element and walking up)
+                       el = context_element
+                       loop
+                               if el.name is 'form' and el.namespace is NS_HTML
+                                       form_element_pointer = el
+                                       break
+                               if el.parent
+                                       el = el.parent
+                               else
+                                       break
+
+               # text pre-processing
+               # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+               # normalize CRLF and lone CR to LF
+               txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+               txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
 
-       if args.name is "tests18.dat #17"
-               console.log "hi"
-       # proccess input
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
-       while flag_parsing
-               t = tok_state()
-               if t?
-                       process_token t
-                       # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
+       # Drive the parser: while flag_parsing is set, run the current tokenizer
+       # state and hand every token it emits to process_token.
+       parse_main_loop = ->
+               while flag_parsing
+                       t = tok_state()
+                       # a tokenizer state can return nothing when no token is ready yet
+                       continue unless t?
+                       process_token t
+                       # fixfull parse error if has self-closing flag, but it wasn't acknowledged
+               return
+       parse_init()
+       parse_main_loop()
+
+       if flag_fragment_parsing
+               return fragment_root.children
        return doc.children
 
 serialize_els = (els, shallow, show_ids) ->
@@ -4578,3 +4796,6 @@ module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
 module.exports.NS_HTML = NS_HTML
 module.exports.NS_MATHML = NS_MATHML
 module.exports.NS_SVG = NS_SVG
+module.exports.QUIRKS_NO = QUIRKS_NO
+module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+module.exports.QUIRKS_YES = QUIRKS_YES