JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implemented adoption agency algorithm
authorJason Woofenden <jason@jasonwoof.com>
Fri, 18 Dec 2015 05:25:21 +0000 (00:25 -0500)
committerJason Woofenden <jason@jasonwoof.com>
Fri, 18 Dec 2015 18:25:41 +0000 (13:25 -0500)
parse-html.coffee

index 6db8ab0..f0f5e91 100644 (file)
 # indicate places where additional code is needed for full HTML document
 # parsing.
 #
-# Instead, the data structure produced by this parser is an array of nodes.
+# Instead, the data structure produced by this parser is an array of Nodes.
+
+
+# stacks/lists
+#
+# the spec uses a many different words do indicate which ends of lists/stacks
+# they are talking about (and relative movement within the lists/stacks). This
+# section splains. I'm implementing "lists" (afe and open_els) the same way
+# (both as stacks)
+#
+# stacks grow downward (current element is index=0)
 #
+# example: open_els = [a, b, c, d, e, f, g]
+#
+# "grows downwards" means it's visualized like this: (index: el, names)
+#
+#   6: g "start of the list", "topmost"
+#   5: f
+#   4: e "previous" (to d), "above"
+#   3: d   (previous/next are relative to this element)
+#   2: c "next", "after", "lower", "below"
+#   1: b
+#   0: a "end of the list", "current node", "bottommost", "last"
+
+
+
 # Each node is an obect of the Node class. Here are the Node types:
 TYPE_TAG = 0 # name, {attributes}, [children]
 TYPE_TEXT = 1 # "text"
@@ -41,6 +65,16 @@ NS_HTML = 1
 NS_MATHML = 2
 NS_SVG = 3
 
+g_debug_log = []
+debug_log_reset = ->
+       g_debug_log = []
+debug_log = (str) ->
+       g_debug_log.push str
+debug_log_each = (cb) ->
+       for str in g_debug_log
+               cb str
+
+prev_node_id = 0
 class Node
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
@@ -51,25 +85,33 @@ class Node
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
+               if args.id?
+                       @id = "#{args.id}+"
+               else
+                       @id = "#{++prev_node_id}"
        shallow_clone: -> # return a new node that's the same except without the children or parent
                # WARNING this doesn't work right on open tags that are still being parsed
                attrs = {}
                attrs[k] = v for k, v of @attrs
-               return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
-       serialize: -> # for unit tests
+               return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
+       serialize: (shallow = false, show_ids = false) -> # for unit tests
                ret = ''
                switch @type
                        when TYPE_TAG
                                ret += 'tag:'
                                ret += JSON.stringify @name
                                ret += ','
+                               if show_ids
+                                       ret += "##{@id},"
+                               if shallow
+                                       break
                                ret += JSON.stringify @attrs
                                ret += ',['
                                sep = ''
                                for c in @children
                                        ret += sep
                                        sep = ','
-                                       ret += c.serialize()
+                                       ret += c.serialize shallow, show_ids
                                ret += ']'
                        when TYPE_TEXT
                                ret += 'text:'
@@ -80,6 +122,10 @@ class Node
                        when TYPE_DOCTYPE
                                ret += 'doctype'
                                # FIXME
+                       when TYPE_MARKER
+                               ret += 'marker'
+                       when TYPE_AAA_BOOKMARK
+                               ret += 'aaa_bookmark'
                        else
                                ret += 'unknown:'
                                console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
@@ -90,6 +136,8 @@ new_open_tag = (name) ->
        return new Node TYPE_OPEN_TAG, name: name
 new_end_tag = (name) ->
        return new Node TYPE_END_TAG, name: name
+new_element = (name) ->
+       return new Node TYPE_TAG, name: name
 new_text_node = (txt) ->
        return new Node TYPE_TEXT, text: txt
 new_comment_node = (txt) ->
@@ -228,8 +276,24 @@ formatting_elements = {
         u: true
 }
 
+# all html I presume
+end_tag_implied = {
+       dd: true
+       dt: true
+       li: true
+       option: true
+       optgroup: true
+       p: true
+       rb: true
+       rp: true
+       rt: true
+       rtc: true
+}
+
 el_is_special = (e) ->
-       return special_elements[e] is e.namespace
+       return special_elements[e.name]?
+       # FIXME it should really be:
+       #return special_elements[e.name] is e.namespace
 
 # decode_named_char_ref()
 #
@@ -279,45 +343,49 @@ parse_html = (txt, parse_error_cb = null) ->
        # But first... the helpers
        template_tag_is_open = ->
                for t in open_els
-                       if t.type is TYPE_TAG and t.name is 'template'
+                       if t.name is 'template' # maybe should also check: and t.namespace is 'html'
                                return true
                return false
-       is_in_scope_x = (tag_name, scope) ->
+       is_in_scope_x = (tag_name, scope, namespace) ->
                for t in open_els
-                       if t.name is tag_name
+                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
                                return true
-                       if t.name of scope
+                       if scope[t.name] is t.namespace
                                return false
                return false
-       is_in_scope_x_y = (tag_name, scope, scope2) ->
+       is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
                for t in open_els
-                       if t.name is tag_name
+                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
                                return true
-                       if t.name of scope
+                       if scope[t.name] is t.namespace
                                return false
-                       if t.name of scope2
+                       if scope2[t.name] is t.namespace
                                return false
                return false
        standard_scopers = { # FIXME these are supposed to be namespace specific
-               'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
-               'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
-               'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
-               'foreignObject': true, 'desc': true, 'title'
+               applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
+               td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
+               template: NS_HTML, mi: NS_MATHML,
+
+               mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
+               'annotation-xml': NS_MATHML,
+
+               foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
        }
-       button_scopers = button: true
-       li_scopers = ol: true, ul: true
-       table_scopers = html: true, table: true, template: true
-       is_in_scope = (tag_name) ->
-               return is_in_scope_x tag_name, standard_scopers
-       is_in_button_scope = (tag_name) ->
-               return is_in_scope_x_y tag_name, standard_scopers, button_scopers
-       is_in_table_scope = (tag_name) ->
-               return is_in_scope_x tag_name, table_scopers
-       is_in_select_scope = (tag_name) ->
+       button_scopers = button: NS_HTML
+       li_scopers = ol: NS_HTML, ul: NS_HTML
+       table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
+       is_in_scope = (tag_name, namespace = null) ->
+               return is_in_scope_x tag_name, standard_scopers, namespace
+       is_in_button_scope = (tag_name, namespace = null) ->
+               return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
+       is_in_table_scope = (tag_name, namespace = null) ->
+               return is_in_scope_x tag_name, table_scopers, namespace
+       is_in_select_scope = (tag_name, namespace = null) ->
                for t in open_els
-                       if t.name is tag_name
+                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
                                return true
-                       if t.name isnt 'optgroup' and t.name isnt 'option'
+                       if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
                                return false
                return false
        # this checks for a particular element, not by name
@@ -325,7 +393,7 @@ parse_html = (txt, parse_error_cb = null) ->
                for t in open_els
                        if t is el
                                return true
-                       if t.name of standard_scopers
+                       if standard_scopers[t.name] is t.namespace
                                return false
                return false
 
@@ -355,6 +423,10 @@ parse_html = (txt, parse_error_cb = null) ->
 
        # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
        # adoption agency algorithm
+       # overview here:
+       #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
+       #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
+       #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
        adoption_agency = (subject) ->
                if open_els[0].name is subject
                        el = open_els[0]
@@ -370,82 +442,136 @@ parse_html = (txt, parse_error_cb = null) ->
                        if outer >= 8
                                return
                        outer += 1
+                       # 5. Let formatting element be the last element in the list of
+                       # active formatting elements that: is between the end of the list
+                       # and the last scope marker in the list, if any, or the start of
+                       # the list otherwise, and  has the tag name subject.
                        fe = null
-                       for t, fe_index in afe
+                       for t, fe_of_afe in afe
                                if t.type is TYPE_MARKER
                                        break
                                if t.name is subject
                                        fe = t
                                        break
+                       # If there is no such element, then abort these steps and instead
+                       # act as described in the "any other end tag" entry above.
                        if fe is null
                                in_body_any_other_end_tag subject
                                return
+                       # 6. If formatting element is not in the stack of open elements,
+                       # then this is a parse error; remove the element from the list, and
+                       # abort these steps.
                        in_open_els = false
-                       for t in open_els
+                       for t, fe_of_open_els in open_els
                                if t is fe
                                        in_open_els = true
                                        break
                        unless in_open_els
                                parse_error()
                                # "remove it from the list" must mean afe, since it's not in open_els
-                               afe.splice fe_index, 1
+                               afe.splice fe_of_afe, 1
                                return
+                       # 7. If formatting element is in the stack of open elements, but
+                       # the element is not in scope, then this is a parse error; abort
+                       # these steps.
                        unless el_is_in_scope fe
                                parse_error()
                                return
+                       # 8. If formatting element is not the current node, this is a parse
+                       # error. (But do not abort these steps.)
                        unless open_els[0] is fe
                                parse_error()
                                # continue
+                       # 9. Let furthest block be the topmost node in the stack of open
+                       # elements that is lower in the stack than formatting element, and
+                       # is an element in the special category. There might not be one.
                        fb = null
-                       fb_index
+                       fb_of_open_els = null
                        for t, i in open_els
                                if t is fe
                                        break
                                if el_is_special t
                                        fb = t
-                                       fb_index = i
+                                       fb_of_open_els = i
+                                       # and continue, to see if there's one that's more "topmost"
+                       # 10. If there is no furthest block, then the UA must first pop all
+                       # the nodes from the bottom of the stack of open elements, from the
+                       # current node up to and including formatting element, then remove
+                       # formatting element from the list of active formatting elements,
+                       # and finally abort these steps.
                        if fb is null
                                loop
                                        t = open_els.shift()
                                        if t is fe
-                                               afe.splice fe_index, 1
+                                               afe.splice fe_of_afe, 1
                                                return
-                       ca = open_els[fe_index + 1] # common ancestor
-                       node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
+                       # 11. Let common ancestor be the element immediately above
+                       # formatting element in the stack of open elements.
+                       ca = open_els[fe_of_open_els + 1] # common ancestor
+
+                       node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
                        # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
                        bookmark = new_aaa_bookmark()
                        for t, i in afe
                                if t is fe
                                        afe.splice i, 0, bookmark
+                                       break
                        node = last_node = fb
                        inner = 0
                        loop
                                inner += 1
+                               # 3. Let node be the element immediately above node in the
+                               # stack of open elements, or if node is no longer in the stack
+                               # of open elements (e.g. because it got removed by this
+                               # algorithm), the element that was immediately above node in
+                               # the stack of open elements before node was removed.
                                node_next = null
                                for t, i in open_els
                                        if t is node
                                                node_next = open_els[i + 1]
                                                break
                                node = node_next ? node_above
+                               debug_log "inner loop #{inner}"
+                               debug_log "open_els: #{serialize_els open_els, true, true}"
+                               debug_log "tree: #{serialize_els tree.children, false, true}"
+                               debug_log "afe: #{serialize_els afe, true, true}"
+                               debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
+                               debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
+                               debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
+                               debug_log "node: #{node.serialize true, true}"
                                # TODO make sure node_above gets re-set if/when node is removed from open_els
+
+                               # 4. If node is formatting element, then go to the next step in
+                               # the overall algorithm.
                                if node is fe
                                        break
+                               debug_log "the meat"
+                               # 5. If inner loop counter is greater than three and node is in
+                               # the list of active formatting elements, then remove node from
+                               # the list of active formatting elements.
                                node_in_afe = false
-                               for t, i of afe
+                               for t, i in afe
                                        if t is node
                                                if inner > 3
                                                        afe.splice i, 1
+                                                       debug_log "max out inner"
                                                else
                                                        node_in_afe = true
+                                                       debug_log "in afe"
                                                break
+                               # 6. If node is not in the list of active formatting elements,
+                               # then remove node from the stack of open elements and then go
+                               # back to the step labeled inner loop.
                                unless node_in_afe
+                                       debug_log "not in afe"
                                        for t, i in open_els
                                                if t is node
                                                        node_above = open_els[i + 1]
                                                        open_els.splice i, 1
                                                        break
                                        continue
-                               # 7. reate an element for the token for which the element node
+                               debug_log "the bones"
+                               # 7. create an element for the token for which the element node
                                # was created, in the HTML namespace, with common ancestor as
                                # the intended parent; replace the entry for node in the list
                                # of active formatting elements with an entry for the new
@@ -456,10 +582,13 @@ parse_html = (txt, parse_error_cb = null) ->
                                for t, i in afe
                                        if t is node
                                                afe[i] = new_node
+                                               debug_log "replaced in afe"
                                                break
                                for t, i in open_els
                                        if t is node
+                                               node_above = open_els[i + 1]
                                                open_els[i] = new_node
+                                               debug_log "replaced in open_els"
                                                break
                                node = new_node
                                # 8. If last node is furthest block, then move the
@@ -469,29 +598,73 @@ parse_html = (txt, parse_error_cb = null) ->
                                        for t, i in afe
                                                if t is bookmark
                                                        afe.splice i, 1
+                                                       debug_log "removed bookmark"
+                                                       break
                                        for t, i in afe
                                                if t is node
-                                                       # TODO test: position i gets you "after"?
-                                                       afe.splice i, 0, new_aaa_bookmark()
+                                                       # "after" means lower
+                                                       afe.splice i, 0, bookmark # "after as <-
+                                                       debug_log "placed bookmark after node"
+                                                       debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
+                                                       break
                                # 9. Insert last node into node, first removing it from its
                                # previous parent node if any.
                                if last_node.parent?
-                                       for c, i of last_node.parent.children
+                                       debug_log "last_node has parent"
+                                       for c, i in last_node.parent.children
                                                if c is last_node
+                                                       debug_log "removing last_node from parent"
                                                        last_node.parent.children.splice i, 1
+                                                       break
                                node.children.push last_node
                                last_node.parent = node
                                # 10. Let last node be node.
                                last_node = node
+                               debug_log "at last"
                                # 11. Return to the step labeled inner loop.
                        # 14. Insert whatever last node ended up being in the previous step
                        # at the appropriate place for inserting a node, but using common
                        # ancestor as the override target.
-                       tree_insert_element last_node, ca
+
+                       # JASON: In the case where fe is immediately followed by fb:
+                       #   * inner loop exits out early (node==fe)
+                       #   * last_node is fb
+                       #   * last_node is still in the tree (not a duplicate)
+                       if last_node.parent?
+                               debug_log "FEFIRST? last_node has parent"
+                               for c, i in last_node.parent.children
+                                       if c is last_node
+                                               debug_log "removing last_node from parent"
+                                               last_node.parent.children.splice i, 1
+                                               break
+
+                       debug_log "after aaa inner loop"
+                       debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
+                       debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
+                       debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
+                       debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
+                       debug_log "tree: #{serialize_els tree.children, false, true}"
+
+                       debug_log "insert"
+
+
+                       # can't use standard insert token thing, because it's already in
+                       # open_els and must stay at it's current position in open_els
+                       dest = adjusted_insertion_location ca
+                       dest[0].children.splice dest[1], 0, last_node
+                       last_node.parent = dest[0]
+
+
+                       debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
+                       debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
+                       debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
+                       debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
+                       debug_log "tree: #{serialize_els tree.children, false, true}"
+
                        # 15. Create an element for the token for which formatting element
                        # was created, in the HTML namespace, with furthest block as the
                        # intended parent.
-                       new_element = fe.shallow_clone()
+                       new_element = fe.shallow_clone() # FIXME intended parent thing
                        # 16. Take all of the child nodes of furthest block and append them
                        # to the element created in the last step.
                        while fb.children.length
@@ -511,32 +684,39 @@ parse_html = (txt, parse_error_cb = null) ->
                                        break
                        for t, i in afe
                                if t is bookmark
-                                       afe[i] = node
+                                       afe[i] = new_element
                                        break
                        # 19. Remove formatting element from the stack of open elements,
                        # and insert the new element into the stack of open elements
                        # immediately below the position of furthest block in that stack.
-                       for t, i of open_els
+                       for t, i in open_els
                                if t is fe
                                        open_els.splice i, 1
                                        break
-                       for t, i of open_els
+                       for t, i in open_els
                                if t is fb
                                        open_els.splice i, 0, new_element
                                        break
                        # 20. Jump back to the step labeled outer loop.
+                       debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
+                       debug_log "tree: #{serialize_els tree.children, false, true}"
+                       debug_log "open_els: #{serialize_els open_els, true, true}"
+                       debug_log "afe: #{serialize_els afe, true, true}"
+               debug_log "AAA DONE"
 
        # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
-       # FIXME implement this
+       # FIXME test this (particularly emplied end tags)
+       close_p_element = ->
+               generate_implied_end_tags 'p' # arg is exception
+               if open_els[0].name isnt 'p'
+                       parse_error()
+               while open_els.length > 1 # just in case
+                       t = open_els.shift()
+                       if t.name is 'p'
+                               return
        close_p_if_in_button_scope = ->
-               if open_els[0].name is 'p'
-                       open_els.pop()
-               return
-               #p = find_button_scope 'p'
-               #if p?
-                       # TODO generate_implied_end_tags except for p tags
-                       # TODO parse_error unless open_els[0].name is 'p'
-                       # TODO pop stack until 'p' popped
+               if is_in_button_scope 'p'
+                       close_a_p_element()
 
        # http://www.w3.org/TR/html5/syntax.html#insert-a-character
        tree_insert_text = (t) ->
@@ -629,15 +809,35 @@ parse_html = (txt, parse_error_cb = null) ->
 
                return el
 
+       # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
+       insert_foreign_element = (token, namespace) ->
+               ail = adjusted_insertion_location()
+               ail_el = ail[0]
+               ail_i = ail[1]
+               el = token_to_element token, namespace, ail_el
+               # TODO skip this next step if it's broken (eg ail_el is document with child already)
+               el.parent = ail_el
+               ail_el.children.splice ail_i, 0, el
+               open_els.unshift el
+               return el
+       # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
+       insert_html_element = insert_foreign_element # (token, namespace) ->
+
        # FIXME read implement "foster parenting" part
        # FIXME read spec, do this right
        # FIXME implement the override target thing
        # note: this assumes it's an open tag
-       # TODO tree_insert_html_element = (t, ...
+       # FIXME what part of the spec is this?
+       # TODO look through all callers of this, and see what they should really be doing.
+       #   eg probably insert_html_element for tokens
        tree_insert_element = (el, override_target = null, namespace = null) ->
+               if namespace?
+                       el.namespace = namespace
                dest = adjusted_insertion_location override_target
                if el.type is TYPE_OPEN_TAG # means it's a "token"
                        el = token_to_element el, namespace, dest[0]
+               unless el.namespace?
+                       namespace = dest.namespace
                # fixfull: Document nodes sometimes can't accept more chidren
                dest[0].children.splice dest[1], 0, el
                el.parent = dest[0]
@@ -649,18 +849,23 @@ parse_html = (txt, parse_error_cb = null) ->
                # FIXME read spec for "adjusted insertion location, etc, this might be wrong
                open_els[0].children.push t
 
+       # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
+       # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
+       generate_implied_end_tags = (except = null) ->
+               while end_tag_implied[open_els[0]] and open_els[0].name isnt except
+                       open_els.shift()
+
        # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
        in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
                for node, i in open_els
-                       if node.name is name
-                               # FIXME generate implied end tags except those with name==name
+                       if node.name is name # FIXME check namespace too
+                               generate_implied_end_tags name # arg is exception
                                parse_error() unless i is 0
-                               while i > 0
+                               while i >= 0
                                        open_els.shift()
                                        i -= 1
-                               open_els.shift()
                                return
-                       if special_elements[node.name]?
+                       if special_elements[node.name]? # FIXME check namespac too
                                parse_error()
                                return
        tree_in_body = (t) ->
@@ -685,7 +890,7 @@ parse_html = (txt, parse_error_cb = null) ->
                                        when 'html'
                                                parse_error()
                                                return if template_tag_is_open()
-                                               root_attrs = open_els[open_els.length - 1].children
+                                               root_attrs = open_els[open_els.length - 1].attrs
                                                for k, v of t.attrs
                                                        root_attrs[k] = v unless root_attrs[k]?
                                        when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
@@ -699,18 +904,47 @@ parse_html = (txt, parse_error_cb = null) ->
                                                # TODO
                                        when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
                                                close_p_if_in_button_scope()
-                                               tree_insert_element t
+                                               insert_html_element t
                                        when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
                                                close_p_if_in_button_scope()
                                                if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
                                                        parse_error()
                                                        open_els.shift()
-                                               tree_insert_element t
+                                               insert_html_element t
                                        # TODO lots more to implement here
+                                       when 'a'
+                                               # If the list of active formatting elements
+                                               # contains an a element between the end of the list and
+                                               # the last marker on the list (or the start of the list
+                                               # if there is no marker on the list), then this is a
+                                               # parse error; run the adoption agency algorithm for
+                                               # the tag name "a", then remove that element from the
+                                               # list of active formatting elements and the stack of
+                                               # open elements if the adoption agency algorithm didn't
+                                               # already remove it (it might not have if the element
+                                               # is not in table scope).
+                                               found = false
+                                               for el in afe
+                                                       if el.type is TYPE_MARKER
+                                                               break
+                                                       if el.name is 'a'
+                                                               found = el
+                                               if found?
+                                                       parse_error()
+                                                       adoption_agency 'a'
+                                                       for el, i in afe
+                                                               if el is found
+                                                                       afe.splice i, 1
+                                                       for el, i in open_els
+                                                               if el is found
+                                                                       open_els.splice i, 1
+                                               reconstruct_active_formatting_elements()
+                                               el = tree_insert_element t
+                                               afe.unshift el
                                        when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
                                                reconstruct_active_formatting_elements()
                                                el = tree_insert_element t
-                                               afe.push el
+                                               afe.unshift el
                                        # TODO lots more to implement here
                                        else # any other start tag
                                                reconstruct_active_formatting_elements()
@@ -738,6 +972,23 @@ parse_html = (txt, parse_error_cb = null) ->
                                                        parse_error()
                                                        return
                                                # TODO implement parse error and move to tree_after_body, reprocess
+                                       when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
+                                               unless is_in_scope t.name, NS_HTML
+                                                       parse_error()
+                                                       return
+                                               generate_implied_end_tags()
+                                               unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
+                                                       parse_error()
+                                               loop
+                                                       el = open_els.shift()
+                                                       if el.name is t.name and el.namespace is NS_HTML
+                                                               return
+                                       # TODO lots more close tags to implement here
+                                       when 'p'
+                                               unless is_in_button_scope 'p'
+                                                       parse_error()
+                                                       insert_html_element new_open_tag 'p'
+                                                       close_p_element()
                                        # TODO lots more close tags to implement here
                                        when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
                                                adoption_agency t.name
@@ -1089,7 +1340,7 @@ parse_html = (txt, parse_error_cb = null) ->
 
        # tree constructor initialization
        # see comments on TYPE_TAG/etc for the structure of this data
-       tree = new Node TYPE_TAG, name: 'html'
+       tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
        open_els = [tree]
        tree_state = tree_in_body
        flag_frameset_ok = true
@@ -1115,18 +1366,25 @@ test_equals = (description, output, expected_output) ->
                console.log "FAILED: \"#{description}\""
                console.log "   Expected: #{expected_output}"
                console.log "     Actual: #{output}"
+serialize_els = (els, shallow, show_ids) ->
+       serialized = ''
+       sep = ''
+       for t in els
+               serialized += sep
+               sep = ','
+               serialized += t.serialize shallow, show_ids
+       return serialized
 test_parser = (args) ->
+       debug_log_reset()
        parse_errors = []
        errors_cb = (i) ->
                parse_errors.push i
+       prev_node_id = 0 # reset counter
        parsed = parse_html args.html, errors_cb
-       serialized = ''
-       sep = ''
-       for t in parsed
-               serialized += sep
-               sep = ','
-               serialized += t.serialize()
-       if serialized isnt args.expected or parse_errors.length isnt args.errors
+       serialized = serialize_els parsed, false, false
+       if serialized isnt args.expected # or parse_errors.length isnt args.errors
+               debug_log_each (str) ->
+                       console.log str
                console.log "FAILED: \"#{args.name}\""
        else
                console.log "passed \"#{args.name}\""
@@ -1134,8 +1392,8 @@ test_parser = (args) ->
                console.log "      Input: #{args.html}"
                console.log "    Correct: #{args.expected}"
                console.log "     Output: #{serialized}"
-       if parse_errors.length isnt args.errors
-               console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+               if parse_errors.length isnt args.errors
+                       console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
 
 test_parser name: "empty", \
        html: "",
@@ -1197,6 +1455,14 @@ test_parser name: "mis-matched formatting elements", \
        html: "12<b>34<i>56</b>78</i>90",
        expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
        errors: 1 # no idea how many their should be
+test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
+       html: '<p>1<b>2<i>3</b>4</i>5</p>',
+       expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]',
+       errors: 1
+test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
+       html: '<b>1<p>2</b>3</p>',
+       expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]',
+       errors: 1
 test_parser name: "crazy formatting elements test", \
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
@@ -1222,5 +1488,5 @@ test_parser name: "html5lib aaa 4", \
        errors: 2
 test_parser name: "html5lib aaa 5", \
        html: '<a>1<div>2<div>3</a>4</div>5</div>',
-       expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"]text:"5"]',
+       expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]',
        errors: 3