JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
update .gitignore
[peach-html5-editor.git] / parse-html.coffee
index 4a93f05..a6d501f 100644 (file)
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
 
-# This file implements a parser for html snippets, meant to be used by a
+# This file implements a thorough parser for html5, meant to be used by a
 # WYSIWYG editor.
 
 # The implementation is a pretty direct implementation of the parsing algorithm
 # described here:
-# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
 #
-# Deviations from that spec:
+#     http://www.w3.org/TR/html5/syntax.html
 #
-#   Purposeful: search this file for "WHATWG"
+# except for some places marked "WHATWG" that are implemented as described here:
 #
-#   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
+#     https://html.spec.whatwg.org/multipage/syntax.html
+#
+# This code passes all of the tests in the .dat files at:
+#
+#     https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
+
+
+##################################
+## how to use this code
+##################################
+#
+# See README.md for how to run this file in the browser or in node.js.
+#
+# This file exports a single useful function: parse_html, and some constants
+# (see the bottom of this file for those.)
+#
+# Call it like this:
+#
+#     wheic.parse_html("<p><b>hi</p>")
+#
+# Or, if you don't want <html><head><body>/etc, do this:
+#
+#     wheic.parse_html("<p><b>hi</p>", {fragment: "body"})
+#
+# return value is an array of Nodes, see "class Node" below.
+
+# This code is a work in progress, e.g. try searching this file for "fixfull",
+# "TODO" and "FIXME"
 
 
-# stacks/lists
+# Notes:  stacks/lists
 #
-# the spec uses a many different words do indicate which ends of lists/stacks
-# they are talking about (and relative movement within the lists/stacks). This
-# section splains. I'm implementing "lists" (afe and open_els) the same way
-# (both as stacks)
+# Jason was frequently confused by the terminology used to refer to different
+# parts of the stacks and lists in the spec, so he made this chart to help keep
+# his head straight:
 #
 # stacks grow downward (current element is index=0)
 #
 # example: open_els = [a, b, c, d, e, f, g]
 #
-# "grows downwards" means it's visualized like this: (index: el, names)
+# "grows downwards" means it's visualized like this: (index: el "names")
 #
 #   6: g "start of the list", "topmost", "first"
 #   5: f
 #   1: b
 #   0: a "end of the list", "current node", "bottommost", "last"
 
-
-# browser
-# note: to get this to run outside a browser, you'll have to write a native
-# implementation of decode_named_char_ref()
-unless module?.exports?
+if (typeof module) isnt 'undefined' and module.exports?
+       context = 'module'
+       exports = module.exports
+else
+       context = 'browser'
        window.wheic = {}
-       module = exports: window.wheic
+       exports = window.wheic
 
 from_code_point = (x) ->
        if String.fromCodePoint?
@@ -89,14 +114,18 @@ QUIRKS_NO = 1
 QUIRKS_LIMITED = 2
 QUIRKS_YES = 3
 
+# queue up debug logs, so eg they can be shown only for tests that fail
 g_debug_log = []
 debug_log_reset = ->
        g_debug_log = []
+       return
 debug_log = (str) ->
        g_debug_log.push str
+       return
 debug_log_each = (cb) ->
        for str in g_debug_log
                cb str
+       return
 
 prev_node_id = 0
 class Node
@@ -120,55 +149,13 @@ class Node
                        @token.flag 'did_self_close', true
                else
                        @flag 'did_self_close', true
+               return
        flag: (key, value = null) ->
                if value?
                        @flags[key] = value
                else
                        return @flags[key]
-       serialize: (shallow = false, show_ids = false) -> # for unit tests
-               ret = ''
-               switch @type
-                       when TYPE_TAG
-                               ret += 'tag:'
-                               ret += JSON.stringify @name
-                               ret += ','
-                               if show_ids
-                                       ret += "##{@id},"
-                               if shallow
-                                       break
-                               attr_keys = []
-                               for k of @attrs
-                                       attr_keys.push k
-                               attr_keys.sort()
-                               ret += '{'
-                               sep = ''
-                               for k in attr_keys
-                                       ret += sep
-                                       sep = ','
-                                       ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
-                               ret += '},['
-                               sep = ''
-                               for c in @children
-                                       ret += sep
-                                       sep = ','
-                                       ret += c.serialize shallow, show_ids
-                               ret += ']'
-                       when TYPE_TEXT
-                               ret += 'text:'
-                               ret += JSON.stringify @text
-                       when TYPE_COMMENT
-                               ret += 'comment:'
-                               ret += JSON.stringify @text
-                       when TYPE_DOCTYPE
-                               ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
-                       when TYPE_AFE_MARKER
-                               ret += 'marker'
-                       when TYPE_AAA_BOOKMARK
-                               ret += 'aaa_bookmark'
-                       else
-                               ret += 'unknown:'
-                               console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
-               return ret
+               return
 
 # helpers: (only take args that are normally known when parser creates nodes)
 new_open_tag = (name) ->
@@ -616,27 +603,31 @@ adjust_foreign_attributes = (t) ->
 
 # decode_named_char_ref()
 #
-# The list of named character references is _huge_ so ask the browser to decode
-# for us instead of wasting bandwidth/space on including the table here.
-#
-# Pass without the "&" but with the ";" examples:
-#    for "&amp" pass "amp;"
-#    for "&#x2032" pass "x2032;"
-g_dncr = {
-       cache: {}
-       textarea: document.createElement('textarea')
-}
-# TODO test this in IE8
+# The list of named character references is _huge_ so if we're running in a
+# browser, we get the browser to decode them, rather than increasing the code
+# size to include the table.
+if context is 'module'
+       _decode_named_char_ref = require './html5-named-entities.coffee'
+else
+       # TODO test this in IE8
+       decode_named_char_ref_el = document.createElement('textarea')
+       _decode_named_char_ref = (txt) ->
+               txt = "&#{txt};"
+               decode_named_char_ref_el.innerHTML = txt
+               decoded = decode_named_char_ref_el.value
+               return null if decoded is txt
+               return decoded
+# Pass the name of a named entity _that has a terminating semicolon_
+# Entities without terminating semicolons should use legacy_char_refs[]
+# Do not include the "&" or ";" in your argument, eg pass "alpha"
+decode_named_char_ref_cache = {}
 decode_named_char_ref = (txt) ->
-       txt = "&#{txt}"
-       decoded = g_dncr.cache[txt]
+       decoded = decode_named_char_ref_cache[txt]
        return decoded if decoded?
-       g_dncr.textarea.innerHTML = txt
-       decoded = g_dncr.textarea.value
-       return null if decoded is txt
-       return g_dncr.cache[txt] = decoded
+       decoded = _decode_named_char_ref txt
+       return decode_named_char_ref_cache[txt] = decoded
 
-parse_html = (args) ->
+parse_html = (args_html, args = {}) ->
        txt = null
        cur = null # index of next char in txt to be parsed
        # declare doc and tokenizer variables so they're in scope below
@@ -661,12 +652,14 @@ parse_html = (args) ->
 
        stop_parsing = ->
                flag_parsing = false
+               return
 
        parse_error = ->
                if args.error_cb?
                        args.error_cb cur
                else
                        console.log "Parse error at character #{cur} of #{txt.length}"
+               return
 
        # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
        # "Noah's Ark clause" but with three
@@ -692,8 +685,11 @@ parse_html = (args) ->
                                                afe.splice i, 1
                                                break
                afe.unshift new_el
+               return
+
        afe_push_marker = ->
                afe.unshift new_afe_marker()
+               return
 
        # the functions below implement the Tree Construction algorithm
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
@@ -817,8 +813,8 @@ parse_html = (args) ->
                loop
                        if node_i is open_els.length - 1
                                last = true
-                               # fixfull (fragment case)
-
+                               if flag_fragment_parsing
+                                       node = context_element
                        # 4. If node is a select element, run these substeps:
                        if node.name is 'select' and node.namespace is NS_HTML
                                # 1. If last is true, jump to the step below labeled done.
@@ -927,6 +923,7 @@ parse_html = (args) ->
                        node_i += 1
                        node = open_els[node_i]
                        # 19. Return to the step labeled loop.
+               return
 
        # 8.2.3.2
 
@@ -958,6 +955,7 @@ parse_html = (args) ->
                        afe[i] = el
                        break if i is 0
                        i -= 1 # Advance
+               return
 
        # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
        # adoption agency algorithm
@@ -966,10 +964,6 @@ parse_html = (args) ->
        #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
        #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
        adoption_agency = (subject) ->
-               debug_log "adoption_agency()"
-               debug_log "tree: #{serialize_els doc.children, false, true}"
-               debug_log "open_els: #{serialize_els open_els, true, true}"
-               debug_log "afe: #{serialize_els afe, true, true}"
 # this block implements the W3C spec
 #              # 1. If the current node is an HTML element whose tag name is subject,
 #              # then run these substeps:
@@ -989,7 +983,6 @@ parse_html = (args) ->
 #                              if t is el
 #                                      afe.splice i, 1
 #                                      break
-#                      debug_log "aaa: starting off with subject on top of stack, exiting"
 #                      return
 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
                # If the current node is an HTML element whose tag name is subject, and
@@ -997,7 +990,6 @@ parse_html = (args) ->
                # then pop the current node off the stack of open elements, and abort
                # these steps.
                if open_els[0].name is subject and open_els[0].namespace is NS_HTML
-                       debug_log "aaa: starting off with subject on top of stack, exiting"
                        # remove it from the list of active formatting elements (if found)
                        in_afe = false
                        for el, i in afe
@@ -1005,7 +997,6 @@ parse_html = (args) ->
                                        in_afe = true
                                        break
                        unless in_afe
-                               debug_log "aaa: ...and not in afe, aaa done"
                                open_els.shift()
                                return
                        # fall through
@@ -1029,7 +1020,6 @@ parse_html = (args) ->
                        # If there is no such element, then abort these steps and instead
                        # act as described in the "any other end tag" entry above.
                        if fe is null
-                               debug_log "aaa: fe not found in afe"
                                in_body_any_other_end_tag subject
                                return
                        # 6. If formatting element is not in the stack of open elements,
@@ -1041,7 +1031,6 @@ parse_html = (args) ->
                                        in_open_els = true
                                        break
                        unless in_open_els
-                               debug_log "aaa: fe not found in open_els"
                                parse_error()
                                # "remove it from the list" must mean afe, since it's not in open_els
                                afe.splice fe_of_afe, 1
@@ -1050,7 +1039,6 @@ parse_html = (args) ->
                        # the element is not in scope, then this is a parse error; abort
                        # these steps.
                        unless el_is_in_scope fe
-                               debug_log "aaa: fe not in scope"
                                parse_error()
                                return
                        # 8. If formatting element is not the current node, this is a parse
@@ -1076,7 +1064,6 @@ parse_html = (args) ->
                        # formatting element from the list of active formatting elements,
                        # and finally abort these steps.
                        if fb is null
-                               debug_log "aaa: no fb"
                                loop
                                        t = open_els.shift()
                                        if t is fe
@@ -1108,21 +1095,12 @@ parse_html = (args) ->
                                                node_next = open_els[i + 1]
                                                break
                                node = node_next ? node_above
-                               debug_log "inner loop #{inner}"
-                               debug_log "tree: #{serialize_els doc.children, false, true}"
-                               debug_log "open_els: #{serialize_els open_els, true, true}"
-                               debug_log "afe: #{serialize_els afe, true, true}"
-                               debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
-                               debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
-                               debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
-                               debug_log "node: #{node.serialize true, true}"
                                # TODO make sure node_above gets re-set if/when node is removed from open_els
 
                                # 4. If node is formatting element, then go to the next step in
                                # the overall algorithm.
                                if node is fe
                                        break
-                               debug_log "the meat"
                                # 5. If inner loop counter is greater than three and node is in
                                # the list of active formatting elements, then remove node from
                                # the list of active formatting elements.
@@ -1131,23 +1109,19 @@ parse_html = (args) ->
                                        if t is node
                                                if inner > 3
                                                        afe.splice i, 1
-                                                       debug_log "max out inner"
                                                else
                                                        node_in_afe = true
-                                                       debug_log "in afe"
                                                break
                                # 6. If node is not in the list of active formatting elements,
                                # then remove node from the stack of open elements and then go
                                # back to the step labeled inner loop.
                                unless node_in_afe
-                                       debug_log "not in afe"
                                        for t, i in open_els
                                                if t is node
                                                        node_above = open_els[i + 1]
                                                        open_els.splice i, 1
                                                        break
                                        continue
-                               debug_log "the bones"
                                # 7. create an element for the token for which the element node
                                # was created, in the HTML namespace, with common ancestor as
                                # the intended parent; replace the entry for node in the list
@@ -1159,13 +1133,11 @@ parse_html = (args) ->
                                for t, i in afe
                                        if t is node
                                                afe[i] = new_node
-                                               debug_log "replaced in afe"
                                                break
                                for t, i in open_els
                                        if t is node
                                                node_above = open_els[i + 1]
                                                open_els[i] = new_node
-                                               debug_log "replaced in open_els"
                                                break
                                node = new_node
                                # 8. If last node is furthest block, then move the
@@ -1175,29 +1147,23 @@ parse_html = (args) ->
                                        for t, i in afe
                                                if t is bookmark
                                                        afe.splice i, 1
-                                                       debug_log "removed bookmark"
                                                        break
                                        for t, i in afe
                                                if t is node
                                                        # "after" means lower
                                                        afe.splice i, 0, bookmark # "after as <-
-                                                       debug_log "placed bookmark after node"
-                                                       debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
                                                        break
                                # 9. Insert last node into node, first removing it from its
                                # previous parent node if any.
                                if last_node.parent?
-                                       debug_log "last_node has parent"
                                        for c, i in last_node.parent.children
                                                if c is last_node
-                                                       debug_log "removing last_node from parent"
                                                        last_node.parent.children.splice i, 1
                                                        break
                                node.children.push last_node
                                last_node.parent = node
                                # 10. Let last node be node.
                                last_node = node
-                               debug_log "at last"
                                # 11. Return to the step labeled inner loop.
                        # 14. Insert whatever last node ended up being in the previous step
                        # at the appropriate place for inserting a node, but using common
@@ -1208,36 +1174,15 @@ parse_html = (args) ->
                        #   * last_node is fb
                        #   * last_node is still in the tree (not a duplicate)
                        if last_node.parent?
-                               debug_log "FEFIRST? last_node has parent"
                                for c, i in last_node.parent.children
                                        if c is last_node
-                                               debug_log "removing last_node from parent"
                                                last_node.parent.children.splice i, 1
                                                break
-
-                       debug_log "after aaa inner loop"
-                       debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
-                       debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
-                       debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
-                       debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
-                       debug_log "tree: #{serialize_els doc.children, false, true}"
-
-                       debug_log "insert"
-
-
                        # can't use standard insert token thing, because it's already in
                        # open_els and must stay at its current position in open_els
                        dest = adjusted_insertion_location ca
                        dest[0].children.splice dest[1], 0, last_node
                        last_node.parent = dest[0]
-
-
-                       debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
-                       debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
-                       debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
-                       debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
-                       debug_log "tree: #{serialize_els doc.children, false, true}"
-
                        # 15. Create an element for the token for which formatting element
                        # was created, in the HTML namespace, with furthest block as the
                        # intended parent.
@@ -1275,11 +1220,7 @@ parse_html = (args) ->
                                        open_els.splice i, 0, new_element
                                        break
                        # 20. Jump back to the step labeled outer loop.
-                       debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
-                       debug_log "tree: #{serialize_els doc.children, false, true}"
-                       debug_log "open_els: #{serialize_els open_els, true, true}"
-                       debug_log "afe: #{serialize_els afe, true, true}"
-               debug_log "AAA DONE"
+               return
 
        # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
        close_p_element = ->
@@ -1290,9 +1231,11 @@ parse_html = (args) ->
                        el = open_els.shift()
                        if el.name is 'p' and el.namespace is NS_HTML
                                return
+               return
        close_p_if_in_button_scope = ->
                if is_in_button_scope 'p', NS_HTML
                        close_p_element()
+               return
 
        # http://www.w3.org/TR/html5/syntax.html#insert-a-character
        # aka insert_a_character = (t) ->
@@ -1305,7 +1248,7 @@ parse_html = (args) ->
                                prev.text += t.text
                                return
                dest[0].children.splice dest[1], 0, t
-
+               return
 
        # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
        process_token = (t) ->
@@ -1461,13 +1404,14 @@ parse_html = (args) ->
                return el
        # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
        insert_html_element = (token) ->
-               insert_foreign_element token, NS_HTML
+               return insert_foreign_element token, NS_HTML
 
        # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
        # position should be [node, index_within_children]
        insert_comment = (t, position = null) ->
                position ?= adjusted_insertion_location()
                position[0].children.splice position[1], 0, t
+               return
 
        # 8.2.5.2
        # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
@@ -1476,17 +1420,20 @@ parse_html = (args) ->
                tok_state = tok_state_rawtext
                original_ins_mode = ins_mode
                ins_mode = ins_mode_text
+               return
        parse_generic_rcdata_text = (t) ->
                insert_html_element t
                tok_state = tok_state_rcdata
                original_ins_mode = ins_mode
                ins_mode = ins_mode_text
+               return
 
        # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
        # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
        generate_implied_end_tags = (except = null) ->
                while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
                        open_els.shift()
+               return
 
        # 8.2.5.4 The rules for parsing tokens in HTML content
        # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
@@ -1561,6 +1508,7 @@ parse_html = (args) ->
                if t.type is TYPE_START_TAG and t.name is 'html'
                        el = token_to_element t, NS_HTML, doc
                        doc.children.push el
+                       el.document = doc
                        open_els.unshift(el)
                        # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
                        ins_mode = ins_mode_before_head
@@ -1574,7 +1522,7 @@ parse_html = (args) ->
                # Anything else
                el = token_to_element new_open_tag('html'), NS_HTML, doc
                doc.children.push el
-               el.parent = doc
+               el.document = doc
                open_els.unshift el
                # ?fixfull browsing context
                ins_mode = ins_mode_before_head
@@ -1610,12 +1558,14 @@ parse_html = (args) ->
                head_element_pointer = el
                ins_mode = ins_mode_in_head
                process_token t
+               return
 
        # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
        ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
                open_els.shift() # spec says this will be a 'head' node
                ins_mode = ins_mode_after_head
                process_token t
+               return
        ins_mode_in_head = (t) ->
                if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
                        insert_character t
@@ -1694,6 +1644,7 @@ parse_html = (args) ->
                        parse_error()
                        return
                ins_mode_in_head_else t
+               return
 
        # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
        ins_mode_in_head_noscript_else = (t) ->
@@ -1701,6 +1652,7 @@ parse_html = (args) ->
                open_els.shift()
                ins_mode = ins_mode_in_head
                process_token t
+               return
        ins_mode_in_head_noscript = (t) ->
                if t.type is TYPE_DOCTYPE
                        parse_error()
@@ -1725,8 +1677,6 @@ parse_html = (args) ->
                ins_mode_in_head_noscript_else t
                return
 
-
-
        # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
        ins_mode_after_head_else = (t) ->
                body_tok = new_open_tag 'body'
@@ -1764,7 +1714,6 @@ parse_html = (args) ->
                                if el is head_element_pointer
                                        open_els.splice i, 1
                                        return
-                       console.log "warning: 23904 couldn't find head element in open_els"
                        return
                if t.type is TYPE_END_TAG and t.name is 'template'
                        ins_mode_in_head t
@@ -1777,6 +1726,7 @@ parse_html = (args) ->
                        return
                # Anything else
                ins_mode_after_head_else t
+               return
 
        # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
        in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
@@ -2157,7 +2107,8 @@ parse_html = (args) ->
                        return
                if t.type is TYPE_END_TAG and t.name is 'br'
                        parse_error()
-                       t.type = TYPE_START_TAG
+                       # W3C: t.type = TYPE_START_TAG
+                       t = new_open_tag 'br' # WHATWG
                        # fall through
                if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
                        reconstruct_afe()
@@ -2353,7 +2304,7 @@ parse_html = (args) ->
                        open_els.shift()
                        ins_mode = original_ins_mode
                        return
-               console.log 'warning: end of ins_mode_text reached'
+               return
 
        # the functions below implement the tokenizer states described here:
        # http://www.w3.org/TR/html5/syntax.html#tokenization
@@ -2454,6 +2405,7 @@ parse_html = (args) ->
                                ins_mode_in_body t
                        else
                                ins_mode_in_table_else t
+               return
 
 
        # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
@@ -2480,6 +2432,7 @@ parse_html = (args) ->
                pending_table_character_tokens = []
                ins_mode = original_ins_mode
                process_token t
+               return
 
        # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
        ins_mode_in_caption = (t) ->
@@ -2515,6 +2468,7 @@ parse_html = (args) ->
                        return
                # Anything else
                ins_mode_in_body t
+               return
 
        # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
        ins_mode_in_column_group = (t) ->
@@ -2603,6 +2557,7 @@ parse_html = (args) ->
                        return
                # Anything else
                ins_mode_in_table t
+               return
 
        # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
        ins_mode_in_row = (t) ->
@@ -2644,6 +2599,7 @@ parse_html = (args) ->
                        return
                # Anything else
                ins_mode_in_table t
+               return
 
        # http://www.w3.org/TR/html5/syntax.html#close-the-cell
        close_the_cell = ->
@@ -2656,6 +2612,7 @@ parse_html = (args) ->
                                break
                clear_afe_to_marker()
                ins_mode = ins_mode_in_row
+               return
 
        # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
        ins_mode_in_cell = (t) ->
@@ -2699,6 +2656,7 @@ parse_html = (args) ->
                        return
                # Anything Else
                ins_mode_in_body t
+               return
 
        # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
        ins_mode_in_select = (t) ->
@@ -2865,6 +2823,7 @@ parse_html = (args) ->
                        template_ins_modes.shift()
                        reset_ins_mode()
                        process_token t
+               return
 
        # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
        ins_mode_after_body = (t) ->
@@ -2894,6 +2853,7 @@ parse_html = (args) ->
                parse_error()
                ins_mode = ins_mode_in_body
                process_token t
+               return
 
        # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
        ins_mode_in_frameset = (t) ->
@@ -3081,6 +3041,7 @@ parse_html = (args) ->
                                if node.namespace is NS_HTML
                                        break
                        ins_mode t # explicitly call HTML insertion mode
+               return
 
 
        # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
@@ -3092,7 +3053,7 @@ parse_html = (args) ->
                                tok_state = tok_state_tag_open
                        when "\u0000"
                                parse_error()
-                               return new_text_node "\ufffd"
+                               return new_text_node c
                        when '' # EOF
                                return new_eof_token()
                        else
@@ -3274,12 +3235,8 @@ parse_html = (args) ->
 
        # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
        is_appropriate_end_tag = (t) ->
-               # spec says to check against "the tag name of the last start tag to
-               # have been emitted from this tokenizer", but this is only called from
-               # the various "raw" states, so it's hopefully ok to assume that
-               # open_els[0].name will work instead TODO: verify this after the script
-               # data states are implemented
-               debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
+               # fixfull: this assumes that open_els[0].name is "the tag name of the last
+               # start tag to have been emitted from this tokenizer"
                return t.type is TYPE_END_TAG and t.name is open_els[0].name
 
        # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
@@ -3810,6 +3767,7 @@ parse_html = (args) ->
                # Anything else
                tok_cur_tag.attrs_a.unshift [c, '']
                tok_state = tok_state_attribute_name
+               return
 
        # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
        tok_state_before_attribute_value = ->
@@ -4542,6 +4500,7 @@ parse_html = (args) ->
                else
                        val = txt.substr cur, (next_gt - cur)
                        cur = next_gt + 3
+               val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
                if val.length > 0
                        return new_character_token val # fixfull split
                return null
@@ -4604,35 +4563,33 @@ parse_html = (args) ->
                                        # exit early, because parse_error() below needs at least one alnum
                                        return '&'
                                if txt.charAt(cur + i) is ';'
-                                       i += 1 # include ';' terminator in value
                                        decoded = decode_named_char_ref txt.substr(cur, i)
+                                       i += 1 # scan past the ';' (after, so we don't pass it to decode)
                                        if decoded?
                                                cur += i
                                                return decoded
-                                       parse_error()
-                                       return '&'
-                               else
-                                       # no ';' terminator (only legacy char refs)
-                                       max = i
-                                       for i in [2..max] # no prefix matches, so ok to check shortest first
-                                               c = legacy_char_refs[txt.substr(cur, i)]
-                                               if c?
-                                                       if in_attr
-                                                               if txt.charAt(cur + i) is '='
-                                                                       # "because some legacy user agents will
-                                                                       # misinterpret the markup in those cases"
-                                                                       parse_error()
-                                                                       return '&'
-                                                               if alnum.indexOf(txt.charAt(cur + i)) > -1
-                                                                       # this makes attributes forgiving about url args
-                                                                       return '&'
-                                                       # ok, and besides the weird exceptions for attributes...
-                                                       # return the matching char
-                                                       cur += i # consume entity chars
-                                                       parse_error() # because no terminating ";"
-                                                       return c
-                                       parse_error()
-                                       return '&'
+                                       # else FALL THROUGH (check for match without last char(s) or ";")
+                               # no ';' terminator (only legacy char refs)
+                               max = i
+                               for i in [2..max] # no prefix matches, so ok to check shortest first
+                                       c = legacy_char_refs[txt.substr(cur, i)]
+                                       if c?
+                                               if in_attr
+                                                       if txt.charAt(cur + i) is '='
+                                                               # "because some legacy user agents will
+                                                               # misinterpret the markup in those cases"
+                                                               parse_error()
+                                                               return '&'
+                                                       if alnum.indexOf(txt.charAt(cur + i)) > -1
+                                                               # this makes attributes forgiving about url args
+                                                               return '&'
+                                               # ok, and besides the weird exceptions for attributes...
+                                               # return the matching char
+                                               cur += i # consume entity chars
+                                               parse_error() # because no terminating ";"
+                                               return c
+                               parse_error()
+                               return '&'
                return # never reached
 
        eat_next_token_if_newline = ->
@@ -4655,10 +4612,11 @@ parse_html = (args) ->
 
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
-       txt = args.html
+       txt = args_html
        cur = 0
-       doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+       doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
        doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
+       fragment_root = null # fragment parsing algorithm returns children of this
        open_els = []
        afe = [] # active formatting elements
        template_ins_modes = []
@@ -4672,22 +4630,94 @@ parse_html = (args) ->
        temporary_buffer = null
        pending_table_character_tokens = []
        head_element_pointer = null
-       flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
-       context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+       flag_fragment_parsing = false
+       context_element = null
        prev_node_id = 0 # just for debugging
 
        # tokenizer initialization
        tok_state = tok_state_data
 
-       # text pre-processing
-       # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
-       txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
-       txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
-       txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+       parse_init = ->
+               # fragment parsing (text arg)
+               if args.fragment?
+                       # this handles the fragment from the tests in the format described here:
+                       # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
+                       f = args.fragment
+                       ns = NS_HTML
+                       if f.substr(0, 5) is 'math '
+                               f = f.substr 5
+                               ns = NS_MATHML
+                       else if f.substr(0, 4) is 'svg '
+                               f = f.substr 4
+                               ns = NS_SVG
+                       t = new_open_tag f
+                       context_element = token_to_element t, ns
+                       context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+                       context_element.document.flag 'quirks mode', QUIRKS_NO
+               # fragment parsing (Node arg)
+               if args.context?
+                       context_element = args.context
+
+               # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+               # fragment parsing algorithm
+               if context_element?
+                       flag_fragment_parsing = true
+                       doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+                       # search up the tree from context, to try to find its document,
+                       # because this file only puts a "document" property on the root
+                       # element.
+                       old_doc = null
+                       el = context_element
+                       loop
+                               if el.document?
+                                       old_doc = el.document
+                                       break
+                               if el.parent
+                                       el = el.parent
+                               else
+                                       break
+                       if old_doc
+                               doc.flag 'quirks mode', old_doc.flag 'quirks mode'
+                       # set tok_state
+                       if context_element.namespace is NS_HTML
+                               switch context_element.name
+                                       when 'title', 'textarea'
+                                               tok_state = tok_state_rcdata
+                                       when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
+                                               tok_state = tok_state_rawtext
+                                       when 'script'
+                                               tok_state = tok_state_script_data
+                                       when 'noscript'
+                                               if flag_scripting
+                                                       tok_state = tok_state_rawtext
+                                       when 'plaintext'
+                                               tok_state = tok_state_plaintext
+                       fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+                       doc.children.push fragment_root
+                       fragment_root.document = doc
+                       open_els = [fragment_root]
+                       if context_element.name is 'template' and context_element.namespace is NS_HTML
+                               template_ins_modes.unshift ins_mode_in_template
+                       # fixfull create token for context (it should have its original one already)
+                       reset_ins_mode()
+                       # set form_element pointer... in the foreign doc?!
+                       el = context_element
+                       loop
+                               if el.name is 'form' and el.namespace is NS_HTML
+                                       form_element_pointer = el
+                                       break
+                               if el.parent
+                                       el = el.parent
+                               else
+                                       break
+
+               # text pre-processing
+               # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+               txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+               txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+
+               return
 
-       if args.name is "webkit01.dat #12"
-               console.log "hi"
-       # proccess input
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
        parse_main_loop = ->
                while flag_parsing
@@ -4695,28 +4725,24 @@ parse_html = (args) ->
                        if t?
                                process_token t
                                # fixfull parse error if has self-closing flag, but it wasn't acknowledged
+               return
+       parse_init()
        parse_main_loop()
+
+       if flag_fragment_parsing
+               return fragment_root.children
        return doc.children
 
-serialize_els = (els, shallow, show_ids) ->
-       serialized = ''
-       sep = ''
-       for t in els
-               serialized += sep
-               sep = ','
-               serialized += t.serialize shallow, show_ids
-       return serialized
-
-module.exports.parse_html = parse_html
-module.exports.debug_log_reset = debug_log_reset
-module.exports.debug_log_each = debug_log_each
-module.exports.TYPE_TAG = TYPE_TAG
-module.exports.TYPE_TEXT = TYPE_TEXT
-module.exports.TYPE_COMMENT = TYPE_COMMENT
-module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
-module.exports.NS_HTML = NS_HTML
-module.exports.NS_MATHML = NS_MATHML
-module.exports.NS_SVG = NS_SVG
-module.exports.QUIRKS_NO = QUIRKS_NO
-module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
-module.exports.QUIRKS_YES = QUIRKS_YES
+exports.parse_html = parse_html
+exports.debug_log_reset = debug_log_reset
+exports.debug_log_each = debug_log_each
+exports.TYPE_TAG = TYPE_TAG
+exports.TYPE_TEXT = TYPE_TEXT
+exports.TYPE_COMMENT = TYPE_COMMENT
+exports.TYPE_DOCTYPE = TYPE_DOCTYPE
+exports.NS_HTML = NS_HTML
+exports.NS_MATHML = NS_MATHML
+exports.NS_SVG = NS_SVG
+exports.QUIRKS_NO = QUIRKS_NO
+exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+exports.QUIRKS_YES = QUIRKS_YES