fix comment

[peach-html5-editor.git] / parse-html.coffee
diff --git a/parse-html.coffee b/parse-html.coffee

index 3c861ba..8404f1a 100644 (file)
--- a/parse-html.coffee
+++ b/parse-html.coffee
@@ -15,26 +15,57 @@
  # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  
  
-# This file implements a parser for html snippets, meant to be used by a
+# This file implements a thorough parser for html5, meant to be used by a
  # WYSIWYG editor.
  
  # The implementation is a pretty direct implementation of the parsing algorithm
  # described here:
-# http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
  #
-# Deviations from that spec:
+#     http://www.w3.org/TR/html5/syntax.html
  #
-#   Purposeful: search this file for "WHATWG"
+# except for some places marked "WHATWG" that are implemented as described here:
  #
-#   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
+#     https://html.spec.whatwg.org/multipage/syntax.html
+#
+# This code passes all of the tests in the .dat files at:
+#
+#     https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
+
+
+##################################
+## how to use this code
+##################################
+#
+# See README.md for how to pre-compile this file, or compile it in the browser.
+#
+# This file exports a single useful function: parse_tml
+#
+# Once you include this file in a page (see index.html for an example) you'll
+# have window.wheic
+#
+# Call it like this:
+#
+#     wheic.parse_html({html: "<p><b>hi</p>"})
+#
+# Or, if you don't want <html><head><body>/etc, do this:
+#
+#     wheic.parse_html({fragment: "body", html: "<p><b>hi</p>"})
+#
+# This code can _almost_ run outside the browser (eg under node.js). To get it
+# to run without the browser would require native implementation of
+# decode_named_char_ref(). The current implementation of that function uses the
+# browser's DOM api, to save space (the list of valid named characters is
+# massive.)
  
+# This code is a work in progress, eg try search this file for "fixfull",
+# "TODO" and "FIXME"
  
-# stacks/lists
+
+# Notes:  stacks/lists
  #
-# the spec uses a many different words do indicate which ends of lists/stacks
-# they are talking about (and relative movement within the lists/stacks). This
-# section splains. I'm implementing "lists" (afe and open_els) the same way
-# (both as stacks)
+# Jason was frequently confused by the terminology used to refer to different
+# parts of the stacks and lists in the spec, so he made this chart to help keep
+# his head straight:
  #
  # stacks grow downward (current element is index=0)
  #
@@ -50,10 +81,6 @@
  #   1: b
  #   0: a "end of the list", "current node", "bottommost", "last"
  
-
-# browser
-# note: to get this to run outside a browser, you'll have to write a native
-# implementation of decode_named_char_ref()
  unless module?.exports?
         window.wheic = {}
         module = exports: window.wheic
@@ -84,14 +111,23 @@ NS_HTML = 1
  NS_MATHML = 2
  NS_SVG = 3
  
+# quirks mode constants
+QUIRKS_NO = 1
+QUIRKS_LIMITED = 2
+QUIRKS_YES = 3
+
+# queue up debug logs, so eg they can be shown only for tests that fail
  g_debug_log = []
  debug_log_reset = ->
         g_debug_log = []
+       return
  debug_log = (str) ->
         g_debug_log.push str
+       return
  debug_log_each = (cb) ->
         for str in g_debug_log
                 cb str
+       return
  
  prev_node_id = 0
  class Node
@@ -115,55 +151,13 @@ class Node
                         @token.flag 'did_self_close', true
                 else
                         @flag 'did_self_close', true
+               return
         flag: (key, value = null) ->
                 if value?
                         @flags[key] = value
                 else
                         return @flags[key]
-       serialize: (shallow = false, show_ids = false) -> # for unit tests
-               ret = ''
-               switch @type
-                       when TYPE_TAG
-                               ret += 'tag:'
-                               ret += JSON.stringify @name
-                               ret += ','
-                               if show_ids
-                                       ret += "##{@id},"
-                               if shallow
-                                       break
-                               attr_keys = []
-                               for k of @attrs
-                                       attr_keys.push k
-                               attr_keys.sort()
-                               ret += '{'
-                               sep = ''
-                               for k in attr_keys
-                                       ret += sep
-                                       sep = ','
-                                       ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
-                               ret += '},['
-                               sep = ''
-                               for c in @children
-                                       ret += sep
-                                       sep = ','
-                                       ret += c.serialize shallow, show_ids
-                               ret += ']'
-                       when TYPE_TEXT
-                               ret += 'text:'
-                               ret += JSON.stringify @text
-                       when TYPE_COMMENT
-                               ret += 'comment:'
-                               ret += JSON.stringify @text
-                       when TYPE_DOCTYPE
-                               ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
-                       when TYPE_AFE_MARKER
-                               ret += 'marker'
-                       when TYPE_AAA_BOOKMARK
-                               ret += 'aaa_bookmark'
-                       else
-                               ret += 'unknown:'
-                               console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
-               return ret
+               return
  
  # helpers: (only take args that are normally known when parser creates nodes)
  new_open_tag = (name) ->
@@ -249,6 +243,64 @@ unicode_fixes[0x9C] = "\u0153"
  unicode_fixes[0x9E] = "\u017E"
  unicode_fixes[0x9F] = "\u0178"
  
+quirks_yes_pi_prefixes = [
+       "+//silmaril//dtd html pro v0r11 19970101//"
+       "-//as//dtd html 3.0 aswedit + extensions//"
+       "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
+       "-//ietf//dtd html 2.0 level 1//"
+       "-//ietf//dtd html 2.0 level 2//"
+       "-//ietf//dtd html 2.0 strict level 1//"
+       "-//ietf//dtd html 2.0 strict level 2//"
+       "-//ietf//dtd html 2.0 strict//"
+       "-//ietf//dtd html 2.0//"
+       "-//ietf//dtd html 2.1e//"
+       "-//ietf//dtd html 3.0//"
+       "-//ietf//dtd html 3.2 final//"
+       "-//ietf//dtd html 3.2//"
+       "-//ietf//dtd html 3//"
+       "-//ietf//dtd html level 0//"
+       "-//ietf//dtd html level 1//"
+       "-//ietf//dtd html level 2//"
+       "-//ietf//dtd html level 3//"
+       "-//ietf//dtd html strict level 0//"
+       "-//ietf//dtd html strict level 1//"
+       "-//ietf//dtd html strict level 2//"
+       "-//ietf//dtd html strict level 3//"
+       "-//ietf//dtd html strict//"
+       "-//ietf//dtd html//"
+       "-//metrius//dtd metrius presentational//"
+       "-//microsoft//dtd internet explorer 2.0 html strict//"
+       "-//microsoft//dtd internet explorer 2.0 html//"
+       "-//microsoft//dtd internet explorer 2.0 tables//"
+       "-//microsoft//dtd internet explorer 3.0 html strict//"
+       "-//microsoft//dtd internet explorer 3.0 html//"
+       "-//microsoft//dtd internet explorer 3.0 tables//"
+       "-//netscape comm. corp.//dtd html//"
+       "-//netscape comm. corp.//dtd strict html//"
+       "-//o'reilly and associates//dtd html 2.0//"
+       "-//o'reilly and associates//dtd html extended 1.0//"
+       "-//o'reilly and associates//dtd html extended relaxed 1.0//"
+       "-//sq//dtd html 2.0 hotmetal + extensions//"
+       "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
+       "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
+       "-//spyglass//dtd html 2.0 extended//"
+       "-//sun microsystems corp.//dtd hotjava html//"
+       "-//sun microsystems corp.//dtd hotjava strict html//"
+       "-//w3c//dtd html 3 1995-03-24//"
+       "-//w3c//dtd html 3.2 draft//"
+       "-//w3c//dtd html 3.2 final//"
+       "-//w3c//dtd html 3.2//"
+       "-//w3c//dtd html 3.2s draft//"
+       "-//w3c//dtd html 4.0 frameset//"
+       "-//w3c//dtd html 4.0 transitional//"
+       "-//w3c//dtd html experimental 19960712//"
+       "-//w3c//dtd html experimental 970421//"
+       "-//w3c//dtd w3 html//"
+       "-//w3o//dtd w3 html 3.0//"
+       "-//webtechs//dtd mozilla html 2.0//"
+       "-//webtechs//dtd mozilla html//"
+]
+
  # These are the character references that don't need a terminating semicolon
  # min length: 2, max: 6, none are a prefix of any other.
  legacy_char_refs = {
@@ -598,52 +650,68 @@ parse_html = (args) ->
  
         stop_parsing = ->
                 flag_parsing = false
+               return
  
         parse_error = ->
                 if args.error_cb?
                         args.error_cb cur
                 else
                         console.log "Parse error at character #{cur} of #{txt.length}"
+               return
  
+       # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
+       # "Noah's Ark clause" but with three
         afe_push = (new_el) ->
                 matches = 0
                 for el, i in afe
+                       if el.type is TYPE_AFE_MARKER
+                               break
                         if el.name is new_el.name and el.namespace is new_el.namespace
+                               attrs_match = true
                                 for k, v of el.attrs
-                                       continue unless new_el.attrs[k] is v
-                               for k, v of new_el.attrs
-                                       continue unless el.attrs[k] is v
-                               matches += 1
-                               if matches is 3
-                                       afe.splice i, 1
-                                       break
+                                       unless new_el.attrs[k] is v
+                                               attrs_match = false
+                                               break
+                               if attrs_match
+                                       for k, v of new_el.attrs
+                                               unless el.attrs[k] is v
+                                                       attrs_match = false
+                                                       break
+                               if attrs_match
+                                       matches += 1
+                                       if matches is 3
+                                               afe.splice i, 1
+                                               break
                 afe.unshift new_el
+               return
+
         afe_push_marker = ->
                 afe.unshift new_afe_marker()
+               return
  
         # the functions below impliment the Tree Contstruction algorithm
         # http://www.w3.org/TR/html5/syntax.html#tree-construction
  
         # But first... the helpers
         template_tag_is_open = ->
-               for t in open_els
-                       if t.name is 'template' and t.namespace is NS_HTML
+               for el in open_els
+                       if el.name is 'template' and el.namespace is NS_HTML
                                 return true
                 return false
         is_in_scope_x = (tag_name, scope, namespace) ->
-               for t in open_els
-                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
+               for el in open_els
+                       if el.name is tag_name and (namespace is null or namespace is el.namespace)
                                 return true
-                       if scope[t.name] is t.namespace
+                       if scope[el.name] is el.namespace
                                 return false
                 return false
         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
-               for t in open_els
-                       if t.name is tag_name and (namespace is null or namespace is t.namespace)
+               for el in open_els
+                       if el.name is tag_name and (namespace is null or namespace is el.namespace)
                                 return true
-                       if scope[t.name] is t.namespace
+                       if scope[el.name] is el.namespace
                                 return false
-                       if scope2[t.name] is t.namespace
+                       if scope2[el.name] is el.namespace
                                 return false
                 return false
         standard_scopers = {
@@ -743,8 +811,8 @@ parse_html = (args) ->
                 loop
                         if node_i is open_els.length - 1
                                 last = true
-                               # fixfull (fragment case)
-
+                               if flag_fragment_parsing
+                                       node = context_element
                         # 4. If node is a select element, run these substeps:
                         if node.name is 'select' and node.namespace is NS_HTML
                                 # 1. If last is true, jump to the step below labeled done.
@@ -853,6 +921,7 @@ parse_html = (args) ->
                         node_i += 1
                         node = open_els[node_i]
                         # 19. Return to the step labeled loop.
+               return
  
         # 8.2.3.2
  
@@ -884,6 +953,7 @@ parse_html = (args) ->
                         afe[i] = el
                         break if i is 0
                         i -= 1 # Advance
+               return
  
         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
         # adoption agency algorithm
@@ -892,10 +962,6 @@ parse_html = (args) ->
         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
         adoption_agency = (subject) ->
-               debug_log "adoption_agency()"
-               debug_log "tree: #{serialize_els doc.children, false, true}"
-               debug_log "open_els: #{serialize_els open_els, true, true}"
-               debug_log "afe: #{serialize_els afe, true, true}"
  # this block implements tha W3C spec
  #              # 1. If the current node is an HTML element whose tag name is subject,
  #              # then run these substeps:
@@ -915,7 +981,6 @@ parse_html = (args) ->
  #                              if t is el
  #                                      afe.splice i, 1
  #                                      break
-#                      debug_log "aaa: starting off with subject on top of stack, exiting"
  #                      return
  # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
                 # If the current node is an HTML element whose tag name is subject, and
@@ -923,7 +988,6 @@ parse_html = (args) ->
                 # then pop the current node off the stack of open elements, and abort
                 # these steps.
                 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
-                       debug_log "aaa: starting off with subject on top of stack, exiting"
                         # remove it from the list of active formatting elements (if found)
                         in_afe = false
                         for el, i in afe
@@ -931,7 +995,6 @@ parse_html = (args) ->
                                         in_afe = true
                                         break
                         unless in_afe
-                               debug_log "aaa: ...and not in afe, aaa done"
                                 open_els.shift()
                                 return
                         # fall through
@@ -955,7 +1018,6 @@ parse_html = (args) ->
                         # If there is no such element, then abort these steps and instead
                         # act as described in the "any other end tag" entry above.
                         if fe is null
-                               debug_log "aaa: fe not found in afe"
                                 in_body_any_other_end_tag subject
                                 return
                         # 6. If formatting element is not in the stack of open elements,
@@ -967,7 +1029,6 @@ parse_html = (args) ->
                                         in_open_els = true
                                         break
                         unless in_open_els
-                               debug_log "aaa: fe not found in open_els"
                                 parse_error()
                                 # "remove it from the list" must mean afe, since it's not in open_els
                                 afe.splice fe_of_afe, 1
@@ -976,7 +1037,6 @@ parse_html = (args) ->
                         # the element is not in scope, then this is a parse error; abort
                         # these steps.
                         unless el_is_in_scope fe
-                               debug_log "aaa: fe not in scope"
                                 parse_error()
                                 return
                         # 8. If formatting element is not the current node, this is a parse
@@ -1002,7 +1062,6 @@ parse_html = (args) ->
                         # formatting element from the list of active formatting elements,
                         # and finally abort these steps.
                         if fb is null
-                               debug_log "aaa: no fb"
                                 loop
                                         t = open_els.shift()
                                         if t is fe
@@ -1034,21 +1093,12 @@ parse_html = (args) ->
                                                 node_next = open_els[i + 1]
                                                 break
                                 node = node_next ? node_above
-                               debug_log "inner loop #{inner}"
-                               debug_log "tree: #{serialize_els doc.children, false, true}"
-                               debug_log "open_els: #{serialize_els open_els, true, true}"
-                               debug_log "afe: #{serialize_els afe, true, true}"
-                               debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
-                               debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
-                               debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
-                               debug_log "node: #{node.serialize true, true}"
                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
  
                                 # 4. If node is formatting element, then go to the next step in
                                 # the overall algorithm.
                                 if node is fe
                                         break
-                               debug_log "the meat"
                                 # 5. If inner loop counter is greater than three and node is in
                                 # the list of active formatting elements, then remove node from
                                 # the list of active formatting elements.
@@ -1057,23 +1107,19 @@ parse_html = (args) ->
                                         if t is node
                                                 if inner > 3
                                                         afe.splice i, 1
-                                                       debug_log "max out inner"
                                                 else
                                                         node_in_afe = true
-                                                       debug_log "in afe"
                                                 break
                                 # 6. If node is not in the list of active formatting elements,
                                 # then remove node from the stack of open elements and then go
                                 # back to the step labeled inner loop.
                                 unless node_in_afe
-                                       debug_log "not in afe"
                                         for t, i in open_els
                                                 if t is node
                                                         node_above = open_els[i + 1]
                                                         open_els.splice i, 1
                                                         break
                                         continue
-                               debug_log "the bones"
                                 # 7. create an element for the token for which the element node
                                 # was created, in the HTML namespace, with common ancestor as
                                 # the intended parent; replace the entry for node in the list
@@ -1085,13 +1131,11 @@ parse_html = (args) ->
                                 for t, i in afe
                                         if t is node
                                                 afe[i] = new_node
-                                               debug_log "replaced in afe"
                                                 break
                                 for t, i in open_els
                                         if t is node
                                                 node_above = open_els[i + 1]
                                                 open_els[i] = new_node
-                                               debug_log "replaced in open_els"
                                                 break
                                 node = new_node
                                 # 8. If last node is furthest block, then move the
@@ -1101,29 +1145,23 @@ parse_html = (args) ->
                                         for t, i in afe
                                                 if t is bookmark
                                                         afe.splice i, 1
-                                                       debug_log "removed bookmark"
                                                         break
                                         for t, i in afe
                                                 if t is node
                                                         # "after" means lower
                                                         afe.splice i, 0, bookmark # "after as <-
-                                                       debug_log "placed bookmark after node"
-                                                       debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
                                                         break
                                 # 9. Insert last node into node, first removing it from its
                                 # previous parent node if any.
                                 if last_node.parent?
-                                       debug_log "last_node has parent"
                                         for c, i in last_node.parent.children
                                                 if c is last_node
-                                                       debug_log "removing last_node from parent"
                                                         last_node.parent.children.splice i, 1
                                                         break
                                 node.children.push last_node
                                 last_node.parent = node
                                 # 10. Let last node be node.
                                 last_node = node
-                               debug_log "at last"
                                 # 11. Return to the step labeled inner loop.
                         # 14. Insert whatever last node ended up being in the previous step
                         # at the appropriate place for inserting a node, but using common
@@ -1134,36 +1172,15 @@ parse_html = (args) ->
                         #   * last_node is fb
                         #   * last_node is still in the tree (not a duplicate)
                         if last_node.parent?
-                               debug_log "FEFIRST? last_node has parent"
                                 for c, i in last_node.parent.children
                                         if c is last_node
-                                               debug_log "removing last_node from parent"
                                                 last_node.parent.children.splice i, 1
                                                 break
-
-                       debug_log "after aaa inner loop"
-                       debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
-                       debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
-                       debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
-                       debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
-                       debug_log "tree: #{serialize_els doc.children, false, true}"
-
-                       debug_log "insert"
-
-
                         # can't use standard insert token thing, because it's already in
                         # open_els and must stay at it's current position in open_els
                         dest = adjusted_insertion_location ca
                         dest[0].children.splice dest[1], 0, last_node
                         last_node.parent = dest[0]
-
-
-                       debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
-                       debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
-                       debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
-                       debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
-                       debug_log "tree: #{serialize_els doc.children, false, true}"
-
                         # 15. Create an element for the token for which formatting element
                         # was created, in the HTML namespace, with furthest block as the
                         # intended parent.
@@ -1201,11 +1218,7 @@ parse_html = (args) ->
                                         open_els.splice i, 0, new_element
                                         break
                         # 20. Jump back to the step labeled outer loop.
-                       debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
-                       debug_log "tree: #{serialize_els doc.children, false, true}"
-                       debug_log "open_els: #{serialize_els open_els, true, true}"
-                       debug_log "afe: #{serialize_els afe, true, true}"
-               debug_log "AAA DONE"
+               return
  
         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
         close_p_element = ->
@@ -1216,9 +1229,11 @@ parse_html = (args) ->
                         el = open_els.shift()
                         if el.name is 'p' and el.namespace is NS_HTML
                                 return
+               return
         close_p_if_in_button_scope = ->
                 if is_in_button_scope 'p', NS_HTML
                         close_p_element()
+               return
  
         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
         # aka insert_a_character = (t) ->
@@ -1231,7 +1246,7 @@ parse_html = (args) ->
                                 prev.text += t.text
                                 return
                 dest[0].children.splice dest[1], 0, t
-
+               return
  
         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
         process_token = (t) ->
@@ -1387,13 +1402,14 @@ parse_html = (args) ->
                 return el
         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
         insert_html_element = (token) ->
-               insert_foreign_element token, NS_HTML
+               return insert_foreign_element token, NS_HTML
  
         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
         # position should be [node, index_within_children]
         insert_comment = (t, position = null) ->
                 position ?= adjusted_insertion_location()
                 position[0].children.splice position[1], 0, t
+               return
  
         # 8.2.5.2
         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
@@ -1402,23 +1418,55 @@ parse_html = (args) ->
                 tok_state = tok_state_rawtext
                 original_ins_mode = ins_mode
                 ins_mode = ins_mode_text
+               return
         parse_generic_rcdata_text = (t) ->
                 insert_html_element t
                 tok_state = tok_state_rcdata
                 original_ins_mode = ins_mode
                 ins_mode = ins_mode_text
+               return
  
         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
         generate_implied_end_tags = (except = null) ->
                 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
                         open_els.shift()
+               return
  
         # 8.2.5.4 The rules for parsing tokens in HTML content
         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
  
         # 8.2.5.4.1 The "initial" insertion mode
         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
+       is_quirks_yes_doctype = (t) ->
+               if t.flag 'force-quirks'
+                       return true
+               if t.name isnt 'html'
+                       return true
+               if t.public_identifier?
+                       pi = t.public_identifier.toLowerCase()
+                       for p in quirks_yes_pi_prefixes
+                               if pi.substr(0, p.length) is p
+                                       return true
+                       if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
+                               return true
+               if t.system_identifier?
+                       if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
+                               return true
+               else if t.public_identifier?
+                       # already did this: pi = t.public_identifier.toLowerCase()
+                       if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
+                               return true
+               return false
+       is_quirks_limited_doctype = (t) ->
+               if t.public_identifier?
+                       pi = t.public_identifier.toLowerCase()
+                       if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
+                               return true
+                       if t.system_identifier?
+                               if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
+                                       return true
+               return false
         ins_mode_initial = (t) ->
                 if is_space_tok t
                         return
@@ -1427,13 +1475,20 @@ parse_html = (args) ->
                         doc.children.push t
                         return
                 if t.type is TYPE_DOCTYPE
-                       # FIXME check identifiers, set quirks, etc
-                       # fixfull
+                       # fixfull syntax error from first paragraph and following bullets
+                       # fixfull set doc.doctype
+                       # fixfull is the "not an iframe srcdoc" thing relevant?
+                       if is_quirks_yes_doctype t
+                               doc.flag 'quirks mode', QUIRKS_YES
+                       else if is_quirks_limited_doctype t
+                               doc.flag 'quirks mode', QUIRKS_LIMITED
                         doc.children.push t
                         ins_mode = ins_mode_before_html
                         return
                 # Anything else
-               #fixfull (iframe, quirks)
+               # fixfull not iframe srcdoc?
+               parse_error()
+               doc.flag 'quirks mode', QUIRKS_YES
                 ins_mode = ins_mode_before_html
                 process_token t
                 return
@@ -1451,6 +1506,7 @@ parse_html = (args) ->
                 if t.type is TYPE_START_TAG and t.name is 'html'
                         el = token_to_element t, NS_HTML, doc
                         doc.children.push el
+                       el.document = doc
                         open_els.unshift(el)
                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
                         ins_mode = ins_mode_before_head
@@ -1462,9 +1518,9 @@ parse_html = (args) ->
                                 parse_error()
                                 return
                 # Anything else
-               html_tok = new_open_tag 'html'
-               el = token_to_element html_tok, NS_HTML, doc
+               el = token_to_element new_open_tag('html'), NS_HTML, doc
                 doc.children.push el
+               el.document = doc
                 open_els.unshift el
                 # ?fixfull browsing context
                 ins_mode = ins_mode_before_head
@@ -1496,17 +1552,18 @@ parse_html = (args) ->
                                 parse_error()
                                 return
                 # Anything else
-               head_tok = new_open_tag 'head'
-               el = insert_html_element head_tok
+               el = insert_html_element new_open_tag 'head'
                 head_element_pointer = el
                 ins_mode = ins_mode_in_head
                 process_token t
+               return
  
         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
                 open_els.shift() # spec says this will be a 'head' node
                 ins_mode = ins_mode_after_head
                 process_token t
+               return
         ins_mode_in_head = (t) ->
                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
                         insert_character t
@@ -1585,6 +1642,7 @@ parse_html = (args) ->
                         parse_error()
                         return
                 ins_mode_in_head_else t
+               return
  
         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
         ins_mode_in_head_noscript_else = (t) ->
@@ -1592,6 +1650,7 @@ parse_html = (args) ->
                 open_els.shift()
                 ins_mode = ins_mode_in_head
                 process_token t
+               return
         ins_mode_in_head_noscript = (t) ->
                 if t.type is TYPE_DOCTYPE
                         parse_error()
@@ -1616,8 +1675,6 @@ parse_html = (args) ->
                 ins_mode_in_head_noscript_else t
                 return
  
-
-
         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
         ins_mode_after_head_else = (t) ->
                 body_tok = new_open_tag 'body'
@@ -1655,7 +1712,6 @@ parse_html = (args) ->
                                 if el is head_element_pointer
                                         open_els.splice i, 1
                                         return
-                       console.log "warning: 23904 couldn't find head element in open_els"
                         return
                 if t.type is TYPE_END_TAG and t.name is 'template'
                         ins_mode_in_head t
@@ -1668,20 +1724,27 @@ parse_html = (args) ->
                         return
                 # Anything else
                 ins_mode_after_head_else t
+               return
  
         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
-               for el, i in open_els
-                       if el.name is name and el.namespace is NS_HTML
+               node = open_els[0]
+               loop
+                       if node.name is name and node.namespace is NS_HTML
                                 generate_implied_end_tags name # arg is exception
-                               parse_error() unless i is 0
-                               while i >= 0
-                                       open_els.shift()
-                                       i -= 1
-                               return
-                       if special_elements[el.name] is el.namespace
+                               unless node is open_els[0]
+                                       parse_error()
+                               loop
+                                       el = open_els.shift()
+                                       if el is node
+                                               return
+                       if special_elements[node.name] is node.namespace
                                 parse_error()
                                 return
+                       for el, i in open_els
+                               if node is el
+                                       node = open_els[i + 1]
+                                       break
                 return
         ins_mode_in_body = (t) ->
                 if t.type is TYPE_TEXT and t.text is "\u0000"
@@ -1809,11 +1872,7 @@ parse_html = (args) ->
                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
                         close_p_if_in_button_scope()
                         insert_html_element t
-                       # spec: If the next token is a "LF" (U+000A) character token, then
-                       # ignore that token and move on to the next one. (Newlines at the
-                       # start of pre blocks are ignored as an authoring convenience.)
-                       if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
-                               cur += 1
+                       eat_next_token_if_newline()
                         flag_frameset_ok = false
                         return
                 if t.type is TYPE_START_TAG and t.name is 'form'
@@ -2008,6 +2067,10 @@ parse_html = (args) ->
                         return
                 if t.type is TYPE_START_TAG and t.name is 'nobr'
                         reconstruct_afe()
+                       if is_in_scope 'nobr', NS_HTML
+                               parse_error()
+                               adoption_agency 'nobr'
+                               reconstruct_afe()
                         el = insert_html_element t
                         afe_push el
                         return
@@ -2034,14 +2097,16 @@ parse_html = (args) ->
                         clear_afe_to_marker()
                         return
                 if t.type is TYPE_START_TAG and t.name is 'table'
-                       close_p_if_in_button_scope() # fixfull quirksmode thing
+                       unless doc.flag('quirks mode') is QUIRKS_YES
+                               close_p_if_in_button_scope() # test
                         insert_html_element t
                         flag_frameset_ok = false
                         ins_mode = ins_mode_in_table
                         return
                 if t.type is TYPE_END_TAG and t.name is 'br'
                         parse_error()
-                       t.type is TYPE_START_TAG
+                       # W3C: t.type = TYPE_START_TAG
+                       t = new_open_tag 'br' # WHATWG
                         # fall through
                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
                         reconstruct_afe()
@@ -2058,7 +2123,8 @@ parse_html = (args) ->
                         unless is_input_hidden_tok t
                                 flag_frameset_ok = false
                         return
-               if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
+               if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
+                       # WHATWG adds 'menuitem' for this block
                         insert_html_element t
                         open_els.shift()
                         t.acknowledge_self_closing()
@@ -2118,8 +2184,7 @@ parse_html = (args) ->
                         return
                 if t.type is TYPE_START_TAG and t.name is 'textarea'
                         insert_html_element t
-                       if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
-                               cur += 1
+                       eat_next_token_if_newline()
                         tok_state = tok_state_rcdata
                         original_ins_mode = ins_mode
                         flag_frameset_ok = false
@@ -2237,7 +2302,7 @@ parse_html = (args) ->
                         open_els.shift()
                         ins_mode = original_ins_mode
                         return
-               console.log 'warning: end of ins_mode_text reached'
+               return
  
         # the functions below implement the tokenizer stats described here:
         # http://www.w3.org/TR/html5/syntax.html#tokenization
@@ -2338,6 +2403,7 @@ parse_html = (args) ->
                                 ins_mode_in_body t
                         else
                                 ins_mode_in_table_else t
+               return
  
  
         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
@@ -2364,6 +2430,7 @@ parse_html = (args) ->
                 pending_table_character_tokens = []
                 ins_mode = original_ins_mode
                 process_token t
+               return
  
         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
         ins_mode_in_caption = (t) ->
@@ -2399,6 +2466,7 @@ parse_html = (args) ->
                         return
                 # Anything else
                 ins_mode_in_body t
+               return
  
         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
         ins_mode_in_column_group = (t) ->
@@ -2487,6 +2555,7 @@ parse_html = (args) ->
                         return
                 # Anything else
                 ins_mode_in_table t
+               return
  
         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
         ins_mode_in_row = (t) ->
@@ -2528,6 +2597,7 @@ parse_html = (args) ->
                         return
                 # Anything else
                 ins_mode_in_table t
+               return
  
         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
         close_the_cell = ->
@@ -2540,6 +2610,7 @@ parse_html = (args) ->
                                 break
                 clear_afe_to_marker()
                 ins_mode = ins_mode_in_row
+               return
  
         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
         ins_mode_in_cell = (t) ->
@@ -2583,6 +2654,7 @@ parse_html = (args) ->
                         return
                 # Anything Else
                 ins_mode_in_body t
+               return
  
         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
         ins_mode_in_select = (t) ->
@@ -2614,7 +2686,7 @@ parse_html = (args) ->
                         insert_html_element t
                         return
                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
-                       if open_els[0].name is 'option' and open_els[0].namespace in NS_HTML
+                       if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
                                 if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
                                         open_els.shift()
                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
@@ -2650,7 +2722,7 @@ parse_html = (args) ->
                         return
                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
                         parse_error()
-                       if is_in_select_scope 'select', NS_HTML
+                       unless is_in_select_scope 'select', NS_HTML
                                 return
                         loop
                                 el = open_els.shift()
@@ -2749,6 +2821,7 @@ parse_html = (args) ->
                         template_ins_modes.shift()
                         reset_ins_mode()
                         process_token t
+               return
  
         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
         ins_mode_after_body = (t) ->
@@ -2778,6 +2851,7 @@ parse_html = (args) ->
                 parse_error()
                 ins_mode = ins_mode_in_body
                 process_token t
+               return
  
         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
         ins_mode_in_frameset = (t) ->
@@ -2965,6 +3039,7 @@ parse_html = (args) ->
                                 if node.namespace is NS_HTML
                                         break
                         ins_mode t # explicitly call HTML insertion mode
+               return
  
  
         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
@@ -2976,7 +3051,7 @@ parse_html = (args) ->
                                 tok_state = tok_state_tag_open
                         when "\u0000"
                                 parse_error()
-                               return new_text_node "\ufffd"
+                               return new_text_node c
                         when '' # EOF
                                 return new_eof_token()
                         else
@@ -3158,12 +3233,8 @@ parse_html = (args) ->
  
         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
         is_appropriate_end_tag = (t) ->
-               # spec says to check against "the tag name of the last start tag to
-               # have been emitted from this tokenizer", but this is only called from
-               # the various "raw" states, so it's hopefully ok to assume that
-               # open_els[0].name will work instead TODO: verify this after the script
-               # data states are implemented
-               debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
+               # fixfull: this assumes that open_els[0].name is "the tag name of the last
+               # start tag to have been emitted from this tokenizer"
                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
  
         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
@@ -3673,7 +3744,7 @@ parse_html = (args) ->
                         return
                 if c is '>'
                         tok_state = tok_state_data
-                       return
+                       return tok_cur_tag
                 if is_uc_alpha(c)
                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
                         tok_state = tok_state_attribute_name
@@ -3694,6 +3765,7 @@ parse_html = (args) ->
                 # Anything else
                 tok_cur_tag.attrs_a.unshift [c, '']
                 tok_state = tok_state_attribute_name
+               return
  
         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
         tok_state_before_attribute_value = ->
@@ -4426,7 +4498,10 @@ parse_html = (args) ->
                 else
                         val = txt.substr cur, (next_gt - cur)
                         cur = next_gt + 3
-               return new_character_token val # fixfull split
+               val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
+               if val.length > 0
+                       return new_character_token val # fixfull split
+               return null
  
         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
         # Don't set this as a state, just call it
@@ -4517,11 +4592,31 @@ parse_html = (args) ->
                                         return '&'
                 return # never reached
  
+       eat_next_token_if_newline = ->
+               old_cur = cur
+               t = null
+               until t?
+                       t = tok_state()
+               if t.type is TYPE_TEXT
+                       # definition of a newline depends on whether it was a character ref or not
+                       if cur - old_cur is 1
+                               # not a character reference
+                               if t.text is "\u000d" or t.text is "\u000a"
+                                       return
+                       else
+                               if t.text is "\u000a"
+                                       return
+               # not a "newline"
+               cur = old_cur
+               return
+
         # tree constructor initialization
         # see comments on TYPE_TAG/etc for the structure of this data
         txt = args.html
         cur = 0
-       doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+       doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+       doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
+       fragment_root = null # fragment parsing algorithm returns children of this
         open_els = []
         afe = [] # active formatting elements
         template_ins_modes = []
@@ -4535,38 +4630,108 @@ parse_html = (args) ->
         temporary_buffer = null
         pending_table_character_tokens = []
         head_element_pointer = null
-       flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
-       context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+       flag_fragment_parsing = false
+       context_element = null
         prev_node_id = 0 # just for debugging
  
         # tokenizer initialization
         tok_state = tok_state_data
  
-       # text pre-processing
-       # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
-       txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
-       txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
-       txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+       parse_init = ->
+               # fragment parsing (text arg)
+               if args.fragment?
+                       # this handles the fragment from the tests in the format described here:
+                       # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
+                       f = args.fragment
+                       ns = NS_HTML
+                       if f.substr(0, 5) is 'math '
+                               f = f.substr 5
+                               ns = NS_MATHML
+                       else if f.substr(0, 4) is 'svg '
+                               f = f.substr 4
+                               ns = NS_SVG
+                       t = new_open_tag f
+                       context_element = token_to_element t, ns
+                       context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
+                       context_element.document.flag 'quirks mode', QUIRKS_NO
+               # fragment parsing (Node arg)
+               if args.context?
+                       context_element = args.context
+
+               # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
+               # fragment parsing algorithm
+               if context_element?
+                       flag_fragment_parsing = true
+                       doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+                       # search up the tree from context, to try to find it's document,
+                       # because this file only puts a "document" property on the root
+                       # element.
+                       old_doc = null
+                       el = context_element
+                       loop
+                               if el.document?
+                                       old_doc = el.document
+                                       break
+                               if el.parent
+                                       el = el.parent
+                               else
+                                       break
+                       if old_doc
+                               doc.flag 'quirks mode', old_doc.flag 'quirks mode'
+                       # set tok_state
+                       if context_element.namespace is NS_HTML
+                               switch context_element.name
+                                       when 'title', 'textarea'
+                                               tok_state = tok_state_rcdata
+                                       when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
+                                               tok_state = tok_state_rawtext
+                                       when 'script'
+                                               tok_state = tok_state_script_data
+                                       when 'noscript'
+                                               if flag_scripting
+                                                       tok_state = tok_state_rawtext
+                                       when 'plaintext'
+                                               tok_state = tok_state_plaintext
+                       fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
+                       doc.children.push fragment_root
+                       fragment_root.document = doc
+                       open_els = [fragment_root]
+                       if context_element.name is 'template' and context_element.namespace is NS_HTML
+                               template_ins_modes.unshift ins_mode_in_template
+                       # fixfull create token for context (it should have it's original one already)
+                       reset_ins_mode()
+                       # set form_element pointer... in the foreign doc?!
+                       el = context_element
+                       loop
+                               if el.name is 'form' and el.namespace is NS_HTML
+                                       form_element_pointer = el
+                                       break
+                               if el.parent
+                                       el = el.parent
+                               else
+                                       break
+
+               # text pre-processing
+               # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
+               txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
+               txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
+
+               return
  
-       if args.name is "tests18.dat #17"
-               console.log "hi"
-       # proccess input
         # http://www.w3.org/TR/html5/syntax.html#tree-construction
-       while flag_parsing
-               t = tok_state()
-               if t?
-                       process_token t
-                       # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
-       return doc.children
+       parse_main_loop = ->
+               while flag_parsing
+                       t = tok_state()
+                       if t?
+                               process_token t
+                               # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
+               return
+       parse_init()
+       parse_main_loop()
  
-serialize_els = (els, shallow, show_ids) ->
-       serialized = ''
-       sep = ''
-       for t in els
-               serialized += sep
-               sep = ','
-               serialized += t.serialize shallow, show_ids
-       return serialized
+       if flag_fragment_parsing
+               return fragment_root.children
+       return doc.children
  
  module.exports.parse_html = parse_html
  module.exports.debug_log_reset = debug_log_reset
@@ -4578,3 +4743,6 @@ module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
  module.exports.NS_HTML = NS_HTML
  module.exports.NS_MATHML = NS_MATHML
  module.exports.NS_SVG = NS_SVG
+module.exports.QUIRKS_NO = QUIRKS_NO
+module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
+module.exports.QUIRKS_YES = QUIRKS_YES