JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implement most details about where to insert nodes
[peach-html5-editor.git] / parse-html.coffee
index 1ef077a..08dd98f 100644 (file)
 # This file implements a parser for html snippets, meant to be used by a
 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
 # or <body> tags, nor does it produce the top level "document" node in the dom
-# tree, nor nodes for html, head or body.
+# tree, nor nodes for html, head or body. Comments containing "fixfull"
+# indicate places where additional code is needed for full HTML document
+# parsing.
 #
 # Instead, the data structure produced by this parser is an array of nodes.
 #
-# Each node is an array. The first element in the array is an integer (one of
-# the TYPE_* constants below) followed by the appropriate fields for that type
-# (shown below in the comments after the TYPE_* definition.)
-
+# Each node is an obect of the Node class. Here are the Node types:
 TYPE_TAG = 0 # name, {attributes}, [children]
 TYPE_TEXT = 1 # "text"
-TYPE_WHITESPACE = 2
-TYPE_COMMENT = 3
+TYPE_COMMENT = 2
+TYPE_DOCTYPE = 3
 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_END_TAG = 5 # name
+TYPE_EOF = 6
+TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
+TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
+
+# namespace constants
+NS_HTML = 1
+NS_MATHML = 2
+NS_SVG = 3
+
+class Node
+       constructor: (type, args = {}) ->
+               @type = type # one of the TYPE_* constants above
+               @name = args.name ? '' # tag name
+               @text = args.text ? '' # contents for text/comment nodes
+               @attrs = args.attrs ? {}
+               @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
+               @children = args.children ? []
+               @namespace = args.namespace ? NS_HTML
+               @parent = args.parent ? null
+       shallow_clone: -> # return a new node that's the same except without the children or parent
+               # WARNING this doesn't work right on open tags that are still being parsed
+               attrs = {}
+               attrs[k] = v for k, v of @attrs
+               return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
+       serialize: -> # for unit tests
+               ret = ''
+               switch @type
+                       when TYPE_TAG
+                               ret += 'tag:'
+                               ret += JSON.stringify @name
+                               ret += ','
+                               ret += JSON.stringify @attrs
+                               ret += ',['
+                               sep = ''
+                               for c in @children
+                                       ret += sep
+                                       sep = ','
+                                       ret += c.serialize()
+                               ret += ']'
+                       when TYPE_TEXT
+                               ret += 'text:'
+                               ret += JSON.stringify @text
+                       when TYPE_COMMENT
+                               ret += 'comment:'
+                               ret += JSON.stringify @text
+                       when TYPE_DOCTYPE
+                               ret += 'doctype'
+                               # FIXME
+                       else
+                               ret += 'unknown:'
+                               console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
+               return ret
+
+# helpers: (only take args that are normally known when parser creates nodes)
+new_open_tag = (name) ->
+       return new Node TYPE_OPEN_TAG, name: name
+new_end_tag = (name) ->
+       return new Node TYPE_END_TAG, name: name
+new_text_node = (txt) ->
+       return new Node TYPE_TEXT, text: txt
+new_comment_node = (txt) ->
+       return new Node TYPE_COMMENT, text: txt
+new_eof_token = ->
+       return new Node TYPE_EOF
+new_aaa_bookmark = ->
+       return new Node TYPE_AAA_BOOKMARK
 
 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
@@ -127,6 +193,43 @@ mathml_elements = [
 # foreign_elements = [svg_elements..., mathml_elements...]
 #normal_elements = All other allowed HTML elements are normal elements.
 
+special_elements = {
+       # HTML:
+       address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
+       aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
+       blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
+       caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
+       details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
+       embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
+       footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
+       h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
+       header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
+       img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
+       listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
+       noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
+       ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
+       script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
+       style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
+       template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
+       thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
+       wbr:NS_HTML, xmp:NS_HTML,
+
+       # MathML:
+       mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
+       'annotation-xml':NS_MATHML,
+
+       # SVG:
+       foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
+}
+
+formatting_elements = {
+        a: true, b: true, big: true, code: true, em: true, font: true, i: true,
+        nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
+        u: true
+}
+
+el_is_special = (e) ->
+       return special_elements[e] is e.namespace
 
 # decode_named_char_ref()
 #
@@ -150,14 +253,498 @@ decode_named_char_ref = (txt) ->
        return null if decoded is txt
        return g_dncr.cache[txt] = decoded
 
-parse_html = (txt) ->
+parse_html = (txt, parse_error_cb = null) ->
        cur = 0 # index of next char in txt to be parsed
        # declare tree and tokenizer variables so they're in scope below
        tree = null
-       tree_append_point = null
+       open_els = [] # stack of open elements
        tree_state = null
        tok_state = null
        tok_cur_tag = null # partially parsed tag
+       flag_frameset_ok = null
+       flag_parsing = null
+       flag_foster_parenting = null
+       afe = [] # active formatting elements
+
+       parse_error = ->
+               if parse_error_cb?
+                       parse_error_cb cur
+               else
+                       console.log "Parse error at character #{cur} of #{txt.length}"
+
+
+       # the functions below impliment the Tree Contstruction algorithm
+       # http://www.w3.org/TR/html5/syntax.html#tree-construction
+
+       # But first... the helpers
+       template_tag_is_open = ->
+               for t in open_els
+                       if t.type is TYPE_TAG and t.name is 'template'
+                               return true
+               return false
+       is_in_scope_x = (tag_name, scope) ->
+               for t in open_els
+                       if t.name is tag_name
+                               return true
+                       if t.name of scope
+                               return false
+               return false
+       is_in_scope_x_y = (tag_name, scope, scope2) ->
+               for t in open_els
+                       if t.name is tag_name
+                               return true
+                       if t.name of scope
+                               return false
+                       if t.name of scope2
+                               return false
+               return false
+       standard_scopers = { # FIXME these are supposed to be namespace specific
+               'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
+               'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
+               'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
+               'foreignObject': true, 'desc': true, 'title'
+       }
+       button_scopers = button: true
+       li_scopers = ol: true, ul: true
+       table_scopers = html: true, table: true, template: true
+       is_in_scope = (tag_name) ->
+               return is_in_scope_x tag_name, standard_scopers
+       is_in_button_scope = (tag_name) ->
+               return is_in_scope_x_y tag_name, standard_scopers, button_scopers
+       is_in_table_scope = (tag_name) ->
+               return is_in_scope_x tag_name, table_scopers
+       is_in_select_scope = (tag_name) ->
+               for t in open_els
+                       if t.name is tag_name
+                               return true
+                       if t.name isnt 'optgroup' and t.name isnt 'option'
+                               return false
+               return false
+       # this checks for a particular element, not by name
+       el_is_in_scope = (el) ->
+               for t in open_els
+                       if t is el
+                               return true
+                       if t.name of standard_scopers
+                               return false
+               return false
+
+       # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
+       # this implementation is structured (mostly) as described at the link above.
+       # capitalized comments are the "labels" described at the link above.
+       reconstruct_active_formatting_elements = ->
+               return if afe.length is 0
+               if afe[0].type is TYPE_MARKER or afe[0] in open_els
+                       return
+               # Rewind
+               i = 0
+               loop
+                       if i is afe.length - 1
+                               break
+                       i += 1
+                       if afe[i].type is TYPE_MARKER or afe[i] in open_els
+                               i -= 1 # Advance
+                               break
+               # Create
+               loop
+                       el = afe[i].shallow_clone()
+                       tree_insert_element el
+                       afe[i] = el
+                       break if i is 0
+                       i -= 1
+
+       # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
+       # adoption agency algorithm
+       adoption_agency = (subject) ->
+               if open_els[0].name is subject
+                       el = open_els[0]
+                       open_els.shift()
+                       # remove it from the list of active formatting elements (if found)
+                       for t, i in afe
+                               if t is el
+                                       afe.splice i, 1
+                                       break
+                       return
+               outer = 0
+               loop
+                       if outer >= 8
+                               return
+                       outer += 1
+                       fe = null
+                       for t, fe_index in afe
+                               if t.type is TYPE_MARKER
+                                       break
+                               if t.name is subject
+                                       fe = t
+                                       break
+                       if fe is null
+                               in_body_any_other_end_tag subject
+                               return
+                       in_open_els = false
+                       for t in open_els
+                               if t is fe
+                                       in_open_els = true
+                                       break
+                       unless in_open_els
+                               parse_error()
+                               # "remove it from the list" must mean afe, since it's not in open_els
+                               afe.splice fe_index, 1
+                               return
+                       unless el_is_in_scope fe
+                               parse_error()
+                               return
+                       unless open_els[0] is fe
+                               parse_error()
+                               # continue
+                       fb = null
+                       fb_index
+                       for t, i in open_els
+                               if t is fe
+                                       break
+                               if el_is_special t
+                                       fb = t
+                                       fb_index = i
+                       if fb is null
+                               loop
+                                       t = open_els.shift()
+                                       if t is fe
+                                               afe.splice fe_index, 1
+                                               return
+                       ca = open_els[fe_index + 1] # common ancestor
+                       node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
+                       # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
+                       bookmark = new_aaa_bookmark()
+                       for t, i in afe
+                               if t is fe
+                                       afe.splice i, 0, bookmark
+                       node = last_node = fb
+                       inner = 0
+                       loop
+                               inner += 1
+                               node_next = null
+                               for t, i in open_els
+                                       if t is node
+                                               node_next = open_els[i + 1]
+                                               break
+                               node = node_next ? node_above
+                               # TODO make sure node_above gets re-set if/when node is removed from open_els
+                               if node is fe
+                                       break
+                               node_in_afe = false
+                               for t, i of afe
+                                       if t is node
+                                               if inner > 3
+                                                       afe.splice i, 1
+                                               else
+                                                       node_in_afe = true
+                                               break
+                               unless node_in_afe
+                                       for t, i in open_els
+                                               if t is node
+                                                       node_above = open_els[i + 1]
+                                                       open_els.splice i, 1
+                                                       break
+                                       continue
+                               # 7. reate an element for the token for which the element node
+                               # was created, in the HTML namespace, with common ancestor as
+                               # the intended parent; replace the entry for node in the list
+                               # of active formatting elements with an entry for the new
+                               # element, replace the entry for node in the stack of open
+                               # elements with an entry for the new element, and let node be
+                               # the new element.
+                               new_node = node.shallow_clone()
+                               for t, i in afe
+                                       if t is node
+                                               afe[i] = new_node
+                                               break
+                               for t, i in open_els
+                                       if t is node
+                                               open_els[i] = new_node
+                                               break
+                               node = new_node
+                               # 8. If last node is furthest block, then move the
+                               # aforementioned bookmark to be immediately after the new node
+                               # in the list of active formatting elements.
+                               if last_node is fb
+                                       for t, i in afe
+                                               if t is bookmark
+                                                       afe.splice i, 1
+                                       for t, i in afe
+                                               if t is node
+                                                       # TODO test: position i gets you "after"?
+                                                       afe.splice i, 0, new_aaa_bookmark()
+                               # 9. Insert last node into node, first removing it from its
+                               # previous parent node if any.
+                               if last_node.parent?
+                                       for c, i of last_node.parent.children
+                                               if c is last_node
+                                                       last_node.parent.children.splice i, 1
+                               node.children.push last_node
+                               last_node.parent = node
+                               # 10. Let last node be node.
+                               last_node = node
+                               # 11. Return to the step labeled inner loop.
+                       # 14. Insert whatever last node ended up being in the previous step
+                       # at the appropriate place for inserting a node, but using common
+                       # ancestor as the override target.
+                       tree_insert_element last_node, ca
+                       # 15. Create an element for the token for which formatting element
+                       # was created, in the HTML namespace, with furthest block as the
+                       # intended parent.
+                       new_element = fe.shallow_clone()
+                       # 16. Take all of the child nodes of furthest block and append them
+                       # to the element created in the last step.
+                       while fb.children.length
+                               t = fb.children.shift()
+                               t.parent = new_element
+                               new_element.children.push t
+                       # 17. Append that new element to furthest block.
+                       new_element.parent = fb
+                       fb.children.push new_element
+                       # 18. Remove formatting element from the list of active formatting
+                       # elements, and insert the new element into the list of active
+                       # formatting elements at the position of the aforementioned
+                       # bookmark.
+                       for t, i in afe
+                               if t is fe
+                                       afe.splice i, 1
+                                       break
+                       for t, i in afe
+                               if t is bookmark
+                                       afe[i] = node
+                                       break
+                       # 19. Remove formatting element from the stack of open elements,
+                       # and insert the new element into the stack of open elements
+                       # immediately below the position of furthest block in that stack.
+                       for t, i of open_els
+                               if t is fe
+                                       open_els.splice i, 1
+                                       break
+                       for t, i of open_els
+                               if t is fb
+                                       open_els.splice i, 0, new_element
+                                       break
+                       # 20. Jump back to the step labeled outer loop.
+
+       # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
+       # FIXME implement this
+       close_p_if_in_button_scope = ->
+               if open_els[0].name is 'p'
+                       open_els.pop()
+               return
+               #p = find_button_scope 'p'
+               #if p?
+                       # TODO generate_implied_end_tags except for p tags
+                       # TODO parse_error unless open_els[0].name is 'p'
+                       # TODO pop stack until 'p' popped
+
+       # http://www.w3.org/TR/html5/syntax.html#insert-a-character
+       tree_insert_text = (t) ->
+               dest = adjusted_insertion_location()
+               if dest[1] > 0
+                       prev = dest[0].children[dest[1] - 1]
+                       if prev.type is TYPE_TEXT
+                               prev.text += t.text
+                               return
+               dest[0].children.splice dest[1], 0, t
+
+       # 8.2.5.1
+       # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
+       # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
+       adjusted_insertion_location = (override_target = null) ->
+               # 1. If there was an override target specified, then let target be the
+               # override target.
+               if override_target?
+                       target = override_target
+               else # Otherwise, let target be the current node.
+                       target = open_els[0]
+               # 2. Determine the adjusted insertion location using the first matching
+               # steps from the following list:
+               #
+               # If foster parenting is enabled and target is a table, tbody, tfoot,
+               # thead, or tr element Foster parenting happens when content is
+               # misnested in tables.
+               if flag_foster_parenting and target.name in foster_parenting_targets
+                       console.log "foster parenting isn't implemented yet" # TODO
+                       # 1. Let last template be the last template element in the stack of
+                       # open elements, if any.
+                       # 2. Let last table be the last table element in the stack of open
+                       # elements, if any.
+
+                       # 3. If there is a last template and either there is no last table,
+                       # or there is one, but last template is lower (more recently added)
+                       # than last table in the stack of open elements, then: let adjusted
+                       # insertion location be inside last template's template contents,
+                       # after its last child (if any), and abort these substeps.
+
+                       # 4. If there is no last table, then let adjusted insertion
+                       # location be inside the first element in the stack of open
+                       # elements (the html element), after its last child (if any), and
+                       # abort these substeps. (fragment case)
+
+                       # 5. If last table has a parent element, then let adjusted
+                       # insertion location be inside last table's parent element,
+                       # immediately before last table, and abort these substeps.
+
+                       # 6. Let previous element be the element immediately above last
+                       # table in the stack of open elements.
+
+                       # 7. Let adjusted insertion location be inside previous element,
+                       # after its last child (if any).
+
+                       # Note: These steps are involved in part because it's possible for
+                       # elements, the table element in this case in particular, to have
+                       # been moved by a script around in the DOM, or indeed removed from
+                       # the DOM entirely, after the element was inserted by the parser.
+               else
+                       # Otherwise Let adjusted insertion location be inside target, after
+                       # its last child (if any).
+                       target_i = target.children.length
+
+               # 3. If the adjusted insertion location is inside a template element,
+               # let it instead be inside the template element's template contents,
+               # after its last child (if any). TODO
+
+               # 4. Return the adjusted insertion location.
+               return [target, target_i]
+
+       # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
+       # aka create_an_element_for_token
+       token_to_element = (t, namespace, intended_parent) ->
+               t.type = TYPE_TAG # not TYPE_OPEN_TAG
+               # convert attributes into a hash
+               attrs = {}
+               while t.attrs_a.length
+                       a = t.attrs_a.pop()
+                       attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
+               el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
+
+               # TODO 2. If the newly created element has an xmlns attribute in the
+               # XMLNS namespace whose value is not exactly the same as the element's
+               # namespace, that is a parse error. Similarly, if the newly created
+               # element has an xmlns:xlink attribute in the XMLNS namespace whose
+               # value is not the XLink Namespace, that is a parse error.
+
+               # fixfull: the spec says stuff about form pointers and ownerDocument
+
+               return el
+
+       # FIXME read implement "foster parenting" part
+       # FIXME read spec, do this right
+       # FIXME implement the override target thing
+       # note: this assumes it's an open tag
+       # TODO tree_insert_html_element = (t, ...
+       tree_insert_element = (el, override_target = null, namespace = null) ->
+               dest = adjusted_insertion_location override_target
+               if el.type is TYPE_OPEN_TAG # means it's a "token"
+                       el = token_to_element el, namespace, dest[0]
+               # fixfull: Document nodes sometimes can't accept more chidren
+               dest[0].children.splice dest[1], 0, el
+               el.parent = dest[0]
+               open_els.unshift el
+               return el
+
+       # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
+       tree_insert_a_comment = (t) ->
+               # FIXME read spec for "adjusted insertion location, etc, this might be wrong
+               open_els[0].children.push t
+
+       # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+       in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
+               for node, i in open_els
+                       if node.name is name
+                               # FIXME generate implied end tags except those with name==name
+                               parse_error() unless i is 0
+                               while i > 0
+                                       open_els.shift()
+                                       i -= 1
+                               open_els.shift()
+                               return
+                       if special_elements[node.name]?
+                               parse_error()
+                               return
+       tree_in_body = (t) ->
+               switch t.type
+                       when TYPE_TEXT
+                               switch t.text
+                                       when "\u0000"
+                                               parse_error()
+                                       when "\t", "\u000a", "\u000c", "\u000d", ' '
+                                               reconstruct_active_formatting_elements()
+                                               tree_insert_text t
+                                       else
+                                               reconstruct_active_formatting_elements()
+                                               tree_insert_text t
+                                               flag_frameset_ok = false
+                       when TYPE_COMMENT
+                               tree_insert_a_comment t
+                       when TYPE_DOCTYPE
+                               parse_error()
+                       when TYPE_OPEN_TAG
+                               switch t.name
+                                       when 'html'
+                                               parse_error()
+                                               return if template_tag_is_open()
+                                               root_attrs = open_els[open_els.length - 1].children
+                                               for k, v of t.attrs
+                                                       root_attrs[k] = v unless root_attrs[k]?
+                                       when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
+                                               # FIXME also do this for </template> (end tag)
+                                               return tree_in_head t
+                                       when 'body'
+                                               parse_error()
+                                               # TODO
+                                       when 'frameset'
+                                               parse_error()
+                                               # TODO
+                                       when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
+                                               close_p_if_in_button_scope()
+                                               tree_insert_element t
+                                       when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
+                                               close_p_if_in_button_scope()
+                                               if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+                                                       parse_error()
+                                                       open_els.shift()
+                                               tree_insert_element t
+                                       # TODO lots more to implement here
+                                       when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
+                                               reconstruct_active_formatting_elements()
+                                               el = tree_insert_element t
+                                               afe.push el
+                                       # TODO lots more to implement here
+                                       else # any other start tag
+                                               reconstruct_active_formatting_elements()
+                                               tree_insert_element t
+                       when TYPE_EOF
+                               ok_tags = {
+                                       dd: true, dt: true, li: true, p: true, tbody: true, td: true,
+                                       tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
+                               }
+                               for t in open_els
+                                       unless ok_tags[t.name]?
+                                               parse_error()
+                                               break
+                               # TODO stack of template insertion modes thing
+                               flag_parsing = false # stop parsing
+                       when TYPE_END_TAG
+                               switch t.name
+                                       when 'body'
+                                               unless is_in_scope 'body'
+                                                       parse_error()
+                                                       return
+                                               # TODO implement parse error and move to tree_after_body
+                                       when 'html'
+                                               unless is_in_scope 'body' # weird, but it's what the spec says
+                                                       parse_error()
+                                                       return
+                                               # TODO implement parse error and move to tree_after_body, reprocess
+                                       # TODO lots more close tags to implement here
+                                       when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
+                                               adoption_agency t.name
+                                       # TODO lots more close tags to implement here
+                                       else
+                                               in_body_any_other_end_tag t.name
+               return
 
 
        # the functions below implement the tokenizer stats described here:
@@ -167,75 +754,21 @@ parse_html = (txt) ->
        tok_state_data = ->
                switch c = txt.charAt(cur++)
                        when '&'
-                               tok_state = tok_state_character_reference_in_data
+                               return new_text_node tokenize_character_reference()
                        when '<'
                                tok_state = tok_state_tag_open
                        when "\u0000"
-                               # Parse error
-                               return [TYPE_TEXT, c]
+                               parse_error()
+                               return new_text_node c
+                       when '' # EOF
+                               return new_eof_token()
                        else
-                               return [TYPE_TEXT, c]
+                               return new_text_node c
                return null
 
        # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
-       # & just got consumed
-       tok_state_character_reference_in_data = ->
-               tok_state = tok_state_data
-               if cur >= txt.length
-                       return [TYPE_TEXT, '&']
-               switch c = txt.charAt(cur)
-                       when ';'
-                               return [TYPE_TEXT, '&']
-                       when '#'
-                               if cur + 1 >= txt.length
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(cur + 1).toLowerCase() is 'x'
-                                       prefix = '#x'
-                                       charset = hex_chars
-                                       start = cur + 2
-                               else
-                                       charset = digits
-                                       start = cur + 1
-                                       prefix = '#'
-                               i = 0
-                               while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
-                                       i += 1
-                               if i is 0
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(start + i) is ';'
-                                       i += 1
-                               decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
-                               if decoded?
-                                       cur = start + i
-                                       return [TYPE_TEXT, decoded]
-                               return [TYPE_TEXT, '&']
-                       else
-                               for i in [0...31]
-                                       if alnum.indexOf(txt.charAt(cur + i)) is -1
-                                               break
-                               if i is 0
-                                       return [TYPE_TEXT, '&']
-                               if txt.charAt(cur + i) is ';'
-                                       i += 1 # include ';' terminator in value
-                                       decoded = decode_named_char_ref txt.substr(cur, i)
-                                       if decoded?
-                                               cur += i
-                                               return [TYPE_TEXT, decoded]
-                                       return [TYPE_TEXT, '&']
-                               else
-                                       # no ';' terminator (only legacy char refs)
-                                       if i < 2 or i > 6
-                                               return [TYPE_TEXT, '&']
-                                       # FIXME: if we're inside an attribute:
-                                       # 1.    don't parse refs that are followed by =
-                                       # 2.    don't parse refs that are followed by alnum
-                                       max = i
-                                       for i in [2..max] # no prefix matches, so ok to check shortest first
-                                               c = legacy_char_refs[txt.substr(cur, i)]
-                                               if c?
-                                                       cur += i # consume entity chars
-                                                       return [TYPE_TEXT, c]
-               return null
+       # not needed: tok_state_character_reference_in_data = ->
+       # just call tok_state_character_reference_in_data()
 
        # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
        tok_state_tag_open = ->
@@ -245,20 +778,42 @@ parse_html = (txt) ->
                        when '/'
                                tok_state = tok_state_end_tag_open
                        when '?'
-                               # Parse error
+                               parse_error()
                                tok_state = tok_state_bogus_comment
                        else
                                if lc_alpha.indexOf(c) > -1
-                                       tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
+                                       tok_cur_tag = new_open_tag c
                                        tok_state = tok_state_tag_name
                                else if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
+                                       tok_cur_tag = new_open_tag c.toLowerCase()
                                        tok_state = tok_state_tag_name
                                else
-                                       # Parse error
+                                       parse_error()
                                        tok_state = tok_state_data
                                        cur -= 1 # we didn't parse/handle the char after <
-                                       return [TYPE_TEXT, '<']
+                                       return new_text_node '<'
+               return null
+
+       # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+       tok_state_end_tag_open = ->
+               switch c = txt.charAt(cur++)
+                       when '>'
+                               parse_error()
+                               tok_state = tok_state_data
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
+                               return new_text_node '</'
+                       else
+                               if uc_alpha.indexOf(c) > -1
+                                       tok_cur_tag = new_end_tag c.toLowerCase()
+                                       tok_state = tok_state_tag_name
+                               else if lc_alpha.indexOf(c) > -1
+                                       tok_cur_tag = new_end_tag c
+                                       tok_state = tok_state_tag_name
+                               else
+                                       parse_error()
+                                       tok_state = tok_state_bogus_comment
                return null
 
        # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
@@ -274,13 +829,16 @@ parse_html = (txt) ->
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
-                               # Parse error
-                               tok_cur_tag[1] += "\ufffd"
+                               parse_error()
+                               tok_cur_tag.name += "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag[1] += c.toLowerCase()
+                                       tok_cur_tag.name += c.toLowerCase()
                                else
-                                       tok_cur_tag[1] += c
+                                       tok_cur_tag.name += c
                return null
 
        # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
@@ -298,18 +856,21 @@ parse_html = (txt) ->
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
-                               # Parse error
+                               parse_error()
                                attr_name = "\ufffd"
                        when '"', "'", '<', '='
-                               # Parse error
+                               parse_error()
                                attr_name = c
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
                                        attr_name = c.toLowerCase()
                                else
                                        attr_name = c
                if attr_name?
-                       tok_cur_tag[2].unshift [attr_name, '']
+                       tok_cur_tag.attrs_a.unshift [attr_name, '']
                        tok_state = tok_state_attribute_name
                return null
 
@@ -328,14 +889,19 @@ parse_html = (txt) ->
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
-                               # Parse error
-                               tok_cur_tag[2][0][0] += "\ufffd"
+                               parse_error()
+                               tok_cur_tag.attrs_a[0][0] = "\ufffd"
+                       when '"', "'", '<'
+                               parse_error()
+                               tok_cur_tag.attrs_a[0][0] = c
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                if uc_alpha.indexOf(c) > -1
-                                       tok_cur_tag[2][0][0] += c.toLowerCase()
+                                       tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
                                else
-                                       # Parse error if ", ' or <
-                                       tok_cur_tag[2][0][0] += c
+                                       tok_cur_tag.attrs_a[0][0] += c
                return null
 
        # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
@@ -352,7 +918,7 @@ parse_html = (txt) ->
                                tok_state = tok_state_attribute_value_single_quoted
                        when "\u0000"
                                # Parse error
-                               tok_cur_tag[2][0][1] += "\ufffd"
+                               tok_cur_tag.attrs_a[0][1] += "\ufffd"
                                tok_state = tok_state_attribute_value_unquoted
                        when '>'
                                # Parse error
@@ -360,8 +926,11 @@ parse_html = (txt) ->
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
-                               tok_cur_tag[2][0][1] += c
+                               tok_cur_tag.attrs_a[0][1] += c
                                tok_state = tok_state_attribute_value_unquoted
                return null
 
@@ -371,13 +940,15 @@ parse_html = (txt) ->
                        when '"'
                                tok_state = tok_state_after_attribute_value_quoted
                        when '&'
-                               tok_state = tok_state_character_reference_in_attribute_value
-                               tok_char_ref_addl_allowed = '"' # FIXME
+                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
                        when "\u0000"
                                # Parse error
-                               tok_cur_tag[2][0][1] += "\ufffd"
+                               tok_cur_tag.attrs_a[0][1] += "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
-                               tok_cur_tag[2][0][1] += c
+                               tok_cur_tag.attrs_a[0][1] += c
                return null
 
        # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
@@ -386,13 +957,15 @@ parse_html = (txt) ->
                        when "'"
                                tok_state = tok_state_after_attribute_value_quoted
                        when '&'
-                               tok_state = tok_state_character_reference_in_attribute_value
-                               tok_char_ref_addl_allowed = "'" # FIXME
+                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
                        when "\u0000"
                                # Parse error
-                               tok_cur_tag[2][0][1] += "\ufffd"
+                               tok_cur_tag.attrs_a[0][1] += "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
-                               tok_cur_tag[2][0][1] += c
+                               tok_cur_tag.attrs_a[0][1] += c
                return null
 
        # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
@@ -401,18 +974,20 @@ parse_html = (txt) ->
                        when "\t", "\n", "\u000c", ' '
                                tok_state = tok_state_before_attribute_name
                        when '&'
-                               tok_state = tok_state_character_reference_in_attribute_value
-                               tok_char_ref_addl_allowed = '>' # FIXME
+                               tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
                        when '>'
                                tok_state = tok_state_data
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
                        when "\u0000"
-                               tok_cur_tag[2][0][1] += "\ufffd"
+                               tok_cur_tag.attrs_a[0][1] += "\ufffd"
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                # Parse Error if ', <, = or ` (backtick)
-                               tok_cur_tag[2][0][1] += c
+                               tok_cur_tag.attrs_a[0][1] += c
                return null
 
        # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
@@ -427,59 +1002,204 @@ parse_html = (txt) ->
                                tmp = tok_cur_tag
                                tok_cur_tag = null
                                return tmp
+                       when '' # EOF
+                               parse_error()
+                               tok_state = tok_state_data
                        else
                                # Parse Error
                                tok_state = tok_state_before_attribute_name
                                cur -= 1 # we didn't handle that char
                return null
 
-       # the functions below impliment the Tree Contstruction algorithm here:
-       # http://www.w3.org/TR/html5/syntax.html#tree-construction
-       # FIXME this is just a bit of a hack that makes sense... read spec and do it that way
-       tree_append = (t) ->
-               if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
-                       tree_append_point[tree_append_point.length - 1][1] += t[1]
-               else
-                       tree_append_point.push t
-                       if t[0] is TYPE_OPEN_TAG
-                               t[0] = TYPE_TAG
-                               attrs = {}
-                               while t[2].length
-                                       a = t[2].pop()
-                                       attrs[a[0]] = a[1]
-                               t[2] = attrs
-                               tree_append_point = t[3]
+       # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+       # Don't set this as a state, just call it
+       # returns a string (NOT a text node)
+       tokenize_character_reference = (allowed_char = null, in_attr = false) ->
+               if cur >= txt.length
+                       return '&'
+               switch c = txt.charAt(cur)
+                       when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+                               # explicitly not a parse error
+                               return '&'
+                       when ';'
+                               # there has to be "one or more" alnums between & and ; to be a parse error
+                               return '&'
+                       when '#'
+                               if cur + 1 >= txt.length
+                                       return '&'
+                               if txt.charAt(cur + 1).toLowerCase() is 'x'
+                                       prefix = '#x'
+                                       charset = hex_chars
+                                       start = cur + 2
+                               else
+                                       charset = digits
+                                       start = cur + 1
+                                       prefix = '#'
+                               i = 0
+                               while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
+                                       i += 1
+                               if i is 0
+                                       return '&'
+                               if txt.charAt(start + i) is ';'
+                                       i += 1
+                               # FIXME This is supposed to generate parse errors for some chars
+                               decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
+                               if decoded?
+                                       cur = start + i
+                                       return decoded
+                               return '&'
+                       else
+                               for i in [0...31]
+                                       if alnum.indexOf(txt.charAt(cur + i)) is -1
+                                               break
+                               if i is 0
+                                       # exit early, because parse_error() below needs at least one alnum
+                                       return '&'
+                               if txt.charAt(cur + i) is ';'
+                                       i += 1 # include ';' terminator in value
+                                       decoded = decode_named_char_ref txt.substr(cur, i)
+                                       if decoded?
+                                               cur += i
+                                               return decoded
+                                       parse_error()
+                                       return '&'
+                               else
+                                       # no ';' terminator (only legacy char refs)
+                                       max = i
+                                       for i in [2..max] # no prefix matches, so ok to check shortest first
+                                               c = legacy_char_refs[txt.substr(cur, i)]
+                                               if c?
+                                                       if in_attr
+                                                               if txt.charAt(cur + i) is '='
+                                                                       # "because some legacy user agents will
+                                                                       # misinterpret the markup in those cases"
+                                                                       parse_error()
+                                                                       return '&'
+                                                               if alnum.indexOf(txt.charAt(cur + i)) > -1
+                                                                       # this makes attributes forgiving about url args
+                                                                       return '&'
+                                                       # ok, and besides the weird exceptions for attributes...
+                                                       # return the matching char
+                                                       cur += i # consume entity chars
+                                                       parse_error() # because no terminating ";"
+                                                       return c
+                                       parse_error()
+                                       return '&'
+               return # never reached
 
        # tree constructor initialization
-       tree = [] # see comments on TYPE_TAG/etc for the structure of this data
-       tree_append_point = tree
-       tree_state = tree_append
+       # see comments on TYPE_TAG/etc for the structure of this data
+       tree = new Node TYPE_TAG, name: 'html'
+       open_els = [tree]
+       tree_state = tree_in_body
+       flag_frameset_ok = true
+       flag_parsing = true
+       flag_foster_parenting = false
+       afe = [] # active formatting elements
 
        # tokenizer initialization
        tok_state = tok_state_data
 
        # proccess input
-       while cur < txt.length
+       while flag_parsing
                t = tok_state()
                if t?
                        tree_state t
-
-       return tree
+       return tree.children
 
 # everything below is tests on the above
-test_equals = (description, fn, args..., expected_output) ->
-       output = fn.apply this, args
+test_equals = (description, output, expected_output) ->
        if output is expected_output
-               console.log "passed: #{description}."
+               console.log "passed." # don't say name, so smart consoles can merge all of these
        else
-               console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
-html_to_json = (html) ->
-       return JSON.stringify parse_html html
-test_equals "empty", html_to_json, "", '[]'
-test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
-test_equals "named entity", html_to_json, "a&amp;1234", '[[1,"a&1234"]]'
-test_equals "broken named character references", html_to_json, "1&amp2&&amp;3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
-test_equals "numbered entity overrides", html_to_json, "1&#X80&#x80; &#x83", '[[1,"1€€ ƒ"]]'
-test_equals "open tag", html_to_json, "foo<span>bar", '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]'
-test_equals "open tag with attributes", html_to_json, "foo<span style=\"foo: bar\" title=\"hi\">bar", '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]'
-test_equals "open tag with attributes of various quotings", html_to_json, "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar", '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]'
+               console.log "FAILED: \"#{description}\""
+               console.log "   Expected: #{expected_output}"
+               console.log "     Actual: #{output}"
+test_parser = (args) ->
+       parse_errors = []
+       errors_cb = (i) ->
+               parse_errors.push i
+       parsed = parse_html args.html, errors_cb
+       serialized = ''
+       sep = ''
+       for t in parsed
+               serialized += sep
+               sep = ','
+               serialized += t.serialize()
+       if serialized isnt args.expected or parse_errors.length isnt args.errors
+               console.log "FAILED: \"#{args.name}\""
+       else
+               console.log "passed \"#{args.name}\""
+       if serialized isnt args.expected
+               console.log "      Input: #{args.html}"
+               console.log "    Correct: #{args.expected}"
+               console.log "     Output: #{serialized}"
+       if parse_errors.length isnt args.errors
+               console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+
+test_parser name: "empty", \
+       html: "",
+       expected: '',
+       errors: 0
+test_parser name: "just text", \
+       html: "abc",
+       expected: 'text:"abc"',
+       errors: 0
+test_parser name: "named entity", \
+       html: "a&amp;1234",
+       expected: 'text:"a&1234"',
+       errors: 0
+test_parser name: "broken named character references", \
+       html: "1&amp2&&amp;3&aabbcc;",
+       expected: 'text:"1&2&&3&aabbcc;"',
+       errors: 2
+test_parser name: "numbered entity overrides", \
+       html: "1&#X80&#x80; &#x83",
+       expected: 'text:"1€€ ƒ"',
+       errors: 0
+test_parser name: "open tag", \
+       html: "foo<span>bar",
+       expected: 'text:"foo",tag:"span",{},[text:"bar"]',
+       errors: 1 # no close tag
+test_parser name: "open tag with attributes", \
+       html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
+       expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
+       errors: 1 # no close tag
+test_parser name: "open tag with attributes of various quotings", \
+       html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
+       expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
+       errors: 1 # no close tag
+test_parser name: "attribute entity exceptions dq", \
+       html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
+       errors: 2 # no close tag, &amp= in attr
+test_parser name: "attribute entity exceptions sq", \
+       html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
+       errors: 2 # no close tag, &amp= in attr
+test_parser name: "attribute entity exceptions uq", \
+       html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
+       expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
+       errors: 2 # no close tag, &amp= in attr
+test_parser name: "matching closing tags", \
+       html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
+       expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
+       errors: 0
+test_parser name: "missing closing tag inside", \
+       html: "foo<div>bar<span>baz</div>qux",
+       expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
+       errors: 1 # close tag mismatch
+test_parser name: "mis-matched closing tags", \
+       html: "<span>12<div>34</span>56</div>78",
+       expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
+       errors: 2 # misplaced </span>, no </span> at the end
+test_parser name: "mis-matched formatting elements", \
+       html: "12<b>34<i>56</b>78</i>90",
+       expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
+       errors: 1 # no idea how many their should be
+test_parser name: "crazy formatting elements test", \
+       html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
+       # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
+       # firefox does this:
+       expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
+       errors: 6 # no idea how many there should be