TYPE_TAG = 0 # name, {attributes}, [children]
TYPE_TEXT = 1 # "text"
-TYPE_WHITESPACE = 2
-TYPE_COMMENT = 3
+TYPE_COMMENT = 2
+TYPE_DOCTYPE = 3
+# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
+TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
+TYPE_END_TAG = 5 # name
+TYPE_EOF = 6
-alnum = "abcdefghijklmnopqrstuvwxqzABCDEFGHIJKLMNOPQRSTUVWXQZ0123456789"
-hex_chars = "0123456789abcdefABCDEF"
+lc_alpha = "abcdefghijklmnopqrstuvwxyz"
+uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
+alnum = lc_alpha + uc_alpha + digits
+hex_chars = digits + "abcdefABCDEF"
+scopers = { # FIXME these are supposed to be namespace specific
+ 'applet', 'caption', 'html', 'table', 'td', 'th', 'marquee', 'object',
+ 'template', 'mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml',
+ 'foreignObject', 'desc', 'title'
+}
# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
# foreign_elements = [svg_elements..., mathml_elements...]
#normal_elements = All other allowed HTML elements are normal elements.
+special_elements = {
+ # from HTML:
+ address: true, applet: true, area: true, article: true, aside: true,
+ base: true, basefont: true, bgsound: true, blockquote: true, body: true,
+ br: true, button: true, caption: true, center: true, col: true,
+ colgroup: true, dd: true, details: true, dir: true, div: true, dl: true,
+ dt: true, embed: true, fieldset: true, figcaption: true, figure: true,
+ footer: true, form: true, frame: true, frameset: true, h1: true, h2: true,
+ h3: true, h4: true, h5: true, h6: true, head: true, header: true,
+ hgroup: true, hr: true, html: true, iframe: true, img: true, input: true,
+ isindex: true, li: true, link: true, listing: true, main: true,
+ marquee: true, meta: true, nav: true, noembed: true, noframes: true,
+ noscript: true, object: true, ol: true, p: true, param: true,
+ plaintext: true, pre: true, script: true, section: true, select: true,
+ source: true, style: true, summary: true, table: true, tbody: true,
+ td: true, template: true, textarea: true, tfoot: true, th: true,
+ thead: true, title: true, tr: true, track: true, ul: true, wbr: true,
+ xmp: true,
+
+ # from MathML:
+ mi: true, mo: true, mn: true, ms: true, mtext: true, 'annotation-xml': true,
+
+ # from SVG:
+ foreignObject: true, desc: true, title: true
+}
+
+formatting_elements = {
+ a: true, b: true, big: true, code: true, em: true, font: true, i: true,
+ nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
+ u: true
+}
+
# decode_named_char_ref()
#
return null if decoded is txt
return g_dncr.cache[txt] = decoded
-parse_html = (txt) ->
+parse_html = (txt, parse_error_cb = null) ->
cur = 0 # index of next char in txt to be parsed
# declare tree and tokenizer variables so they're in scope below
tree = null
- tree_append_point = null
+ open_tags = [] # stack of open elements
tree_state = null
tok_state = null
+ tok_cur_tag = null # partially parsed tag
+ flag_frameset_ok = null
+ flag_parsing = null
+
+ parse_error = ->
+ if parse_error_cb?
+ parse_error_cb cur
+ else
+ console.log "Parse error at character #{cur} of #{txt.length}"
+
+
+	# the functions below implement the Tree Construction algorithm
+ # http://www.w3.org/TR/html5/syntax.html#tree-construction
+
+ # But first... the helpers
+ template_tag_is_open = ->
+ for t of open_tags
+ if t[0] is TYPE_TAG and t[1] is 'template'
+ return true
+ return false
+ is_in_scope = (tag_name) ->
+ for t of open_tags
+ if t[0] is TYPE_TAG and t[1] is tag_name
+ return true
+ # FIXME bail if in scopers
+ return false
+
+ reconstruct_active_formatting_elements = ->
+ # FIXME implement this
+
+ # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
+ # FIXME implement this
+ close_p_if_in_button_scope = ->
+ if open_tags[0][1] is 'p' # FIXME
+ open_tags.pop()
+ return
+ #p = find_button_scope 'p'
+ #if p?
+ # TODO generate_implied_end_tags except for p tags
+ # TODO parse_error unless open_tags[0][1] is 'p'
+ # TODO pop stack until 'p' popped
+
+
+
+ # http://www.w3.org/TR/html5/syntax.html#insert-a-character
+ tree_insert_a_character = (t) ->
+		# FIXME read spec for "adjusted insertion location", etc; this might be wrong
+ if open_tags[0][3].length > 0 and open_tags[0][3][open_tags[0][3].length - 1][0] is TYPE_TEXT
+ open_tags[0][3][open_tags[0][3].length - 1][1] += t[1]
+ else
+ open_tags[0][3].push t
+
+ # FIXME read spec, do this right
+ # note: this assumes it's an open tag
+ tree_insert_tag = (t) ->
+ t[0] = TYPE_TAG # not TYPE_OPEN_TAG
+ # convert attributes into a hash
+ attrs = {}
+ while t[2].length
+ a = t[2].pop()
+ attrs[a[0]] = a[1]
+ t[2] = attrs
+ open_tags[0][3].push t
+ open_tags.unshift t
+
+ # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
+ tree_insert_a_comment = (t) ->
+		# FIXME read spec for "adjusted insertion location", etc; this might be wrong
+ open_tags[0][3].push t
+
+ # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
+ tree_in_body = (t) ->
+ switch t[0]
+ when TYPE_TEXT
+ switch t[1]
+ when "\u0000"
+ parse_error()
+ when "\t", "\u000a", "\u000c", "\u000d", ' '
+ reconstruct_active_formatting_elements()
+ tree_insert_a_character t
+ else
+ reconstruct_active_formatting_elements()
+ tree_insert_a_character t
+ flag_frameset_ok = false
+ when TYPE_COMMENT
+ tree_insert_a_comment t
+ when TYPE_DOCTYPE
+ parse_error()
+ when TYPE_OPEN_TAG
+ switch t[1]
+ when 'html'
+ parse_error()
+ return if template_tag_is_open()
+ root_attrs = open_tags[open_tags.length - 1][3]
+ for k, v of t[2]
+ root_attrs[k] = v unless root_attrs[k]?
+ when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
+ # FIXME also do this for </template> (end tag)
+ return tree_in_head t
+ when 'body'
+ parse_error()
+ # TODO
+ when 'frameset'
+ parse_error()
+ # TODO
+ when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
+ close_p_if_in_button_scope()
+ tree_insert_tag t
+ when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
+ close_p_if_in_button_scope()
+ if open_tags[0][1] in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
+ parse_error()
+ open_tags.shift()
+ tree_insert_tag t
+ # TODO lots more to implement here
+ else # any other start tag
+ reconstruct_active_formatting_elements()
+ tree_insert_tag t
+ when TYPE_EOF
+ ok_tags = {
+ dd: true, dt: true, li: true, p: true, tbody: true, td: true,
+ tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
+ }
+ for t in open_tags
+ unless ok_tags[t[1]]?
+ parse_error()
+ break
+ # TODO stack of template insertion modes thing
+ flag_parsing = false # stop parsing
+ when TYPE_END_TAG
+ switch t[1]
+ when 'body'
+ unless is_in_scope 'body'
+ parse_error()
+ return
+ # TODO implement parse error and move to tree_after_body
+ when 'html'
+ unless is_in_scope 'body' # weird, but it's what the spec says
+ parse_error()
+ return
+ # TODO implement parse error and move to tree_after_body, reprocess
+ # TODO lots more close tags to implement here
+ else
+ for node, i in open_tags
+ if node[1] is t[1]
+ # FIXME generate implied end tags except those with name==t[1]
+ parse_error() unless i is 0
+ while i > 0
+ open_tags.shift()
+ i -= 1
+ open_tags.shift()
+ return
+ if special_elements[node[1]]?
+ parse_error()
+ return
# the functions below implement the tokenizer stats described here:
# http://www.w3.org/TR/html5/syntax.html#tokenization
+ # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
tok_state_data = ->
- if cur >= txt.length
- return null
switch c = txt.charAt(cur++)
when '&'
- tok_state = tok_state_character_reference_in_data
+ return [TYPE_TEXT, tokenize_character_reference()]
when '<'
tok_state = tok_state_tag_open
when "\u0000"
- # Parse error
+ parse_error()
return [TYPE_TEXT, c]
+ when '' # EOF
+ return [TYPE_EOF]
else
return [TYPE_TEXT, c]
return null
- # & just got consumed
- tok_state_character_reference_in_data = ->
- tok_state = tok_state_data
+ # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
+ # not needed: tok_state_character_reference_in_data = ->
+	# just call tokenize_character_reference()
+
+ # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
+ tok_state_tag_open = ->
+ switch c = txt.charAt(cur++)
+ when '!'
+ tok_state = tok_state_markup_declaration_open
+ when '/'
+ tok_state = tok_state_end_tag_open
+ when '?'
+ parse_error()
+ tok_state = tok_state_bogus_comment
+ else
+ if lc_alpha.indexOf(c) > -1
+ tok_cur_tag = [TYPE_OPEN_TAG, c, [], []]
+ tok_state = tok_state_tag_name
+ else if uc_alpha.indexOf(c) > -1
+ tok_cur_tag = [TYPE_OPEN_TAG, c.toLowerCase(), [], []]
+ tok_state = tok_state_tag_name
+ else
+ parse_error()
+ tok_state = tok_state_data
+ cur -= 1 # we didn't parse/handle the char after <
+ return [TYPE_TEXT, '<']
+ return null
+
+ # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
+ tok_state_end_tag_open = ->
+ switch c = txt.charAt(cur++)
+ when '>'
+ parse_error()
+ tok_state = tok_state_data
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ return [TYPE_TEXT, '</']
+ else
+ if uc_alpha.indexOf(c) > -1
+ tok_cur_tag = [TYPE_END_TAG, c.toLowerCase(), [], []]
+ tok_state = tok_state_tag_name
+ else if lc_alpha.indexOf(c) > -1
+ tok_cur_tag = [TYPE_END_TAG, c, [], []]
+ tok_state = tok_state_tag_name
+ else
+ parse_error()
+ tok_state = tok_state_bogus_comment
+ return null
+
+ # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
+ tok_state_tag_name = ->
+ switch c = txt.charAt(cur++)
+ when "\t", "\n", "\u000c", ' '
+ tok_state = tok_state_before_attribute_name
+ when '/'
+ tok_state = tok_state_self_closing_start_tag
+ when '>'
+ tok_state = tok_state_data
+ tmp = tok_cur_tag
+ tok_cur_tag = null
+ return tmp
+ when "\u0000"
+ parse_error()
+ tok_cur_tag[1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ if uc_alpha.indexOf(c) > -1
+ tok_cur_tag[1] += c.toLowerCase()
+ else
+ tok_cur_tag[1] += c
+ return null
+
+ # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
+ tok_state_before_attribute_name = ->
+ attr_name = null
+ switch c = txt.charAt(cur++)
+ when "\t", "\n", "\u000c", ' '
+ return null
+ when '/'
+ tok_state = tok_state_self_closing_start_tag
+ return null
+ when '>'
+ tok_state = tok_state_data
+ tmp = tok_cur_tag
+ tok_cur_tag = null
+ return tmp
+ when "\u0000"
+ parse_error()
+ attr_name = "\ufffd"
+ when '"', "'", '<', '='
+ parse_error()
+ attr_name = c
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ if uc_alpha.indexOf(c) > -1
+ attr_name = c.toLowerCase()
+ else
+ attr_name = c
+ if attr_name?
+ tok_cur_tag[2].unshift [attr_name, '']
+ tok_state = tok_state_attribute_name
+ return null
+
+ # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
+ tok_state_attribute_name = ->
+ switch c = txt.charAt(cur++)
+ when "\t", "\n", "\u000c", ' '
+ tok_state = tok_state_after_attribute_name
+ when '/'
+ tok_state = tok_state_self_closing_start_tag
+ when '='
+ tok_state = tok_state_before_attribute_value
+ when '>'
+ tok_state = tok_state_data
+ tmp = tok_cur_tag
+ tok_cur_tag = null
+ return tmp
+ when "\u0000"
+ parse_error()
+ tok_cur_tag[2][0][0] = "\ufffd"
+ when '"', "'", '<'
+ parse_error()
+ tok_cur_tag[2][0][0] = c
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ if uc_alpha.indexOf(c) > -1
+ tok_cur_tag[2][0][0] = c.toLowerCase()
+ else
+ tok_cur_tag[2][0][0] += c
+ return null
+
+ # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
+ tok_state_before_attribute_value = ->
+ switch c = txt.charAt(cur++)
+ when "\t", "\n", "\u000c", ' '
+ return null
+ when '"'
+ tok_state = tok_state_attribute_value_double_quoted
+ when '&'
+ tok_state = tok_state_attribute_value_unquoted
+ cur -= 1
+ when "'"
+ tok_state = tok_state_attribute_value_single_quoted
+ when "\u0000"
+ # Parse error
+ tok_cur_tag[2][0][1] += "\ufffd"
+ tok_state = tok_state_attribute_value_unquoted
+ when '>'
+ # Parse error
+ tok_state = tok_state_data
+ tmp = tok_cur_tag
+ tok_cur_tag = null
+ return tmp
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ tok_cur_tag[2][0][1] += c
+ tok_state = tok_state_attribute_value_unquoted
+ return null
+
+ # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
+ tok_state_attribute_value_double_quoted = ->
+ switch c = txt.charAt(cur++)
+ when '"'
+ tok_state = tok_state_after_attribute_value_quoted
+ when '&'
+ tok_cur_tag[2][0][1] += tokenize_character_reference '"', true
+ when "\u0000"
+ # Parse error
+ tok_cur_tag[2][0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ tok_cur_tag[2][0][1] += c
+ return null
+
+ # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
+ tok_state_attribute_value_single_quoted = ->
+ switch c = txt.charAt(cur++)
+ when "'"
+ tok_state = tok_state_after_attribute_value_quoted
+ when '&'
+ tok_cur_tag[2][0][1] += tokenize_character_reference "'", true
+ when "\u0000"
+ # Parse error
+ tok_cur_tag[2][0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ tok_cur_tag[2][0][1] += c
+ return null
+
+ # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
+ tok_state_attribute_value_unquoted = ->
+ switch c = txt.charAt(cur++)
+ when "\t", "\n", "\u000c", ' '
+ tok_state = tok_state_before_attribute_name
+ when '&'
+ tok_cur_tag[2][0][1] += tokenize_character_reference '>', true
+ when '>'
+ tok_state = tok_state_data
+ tmp = tok_cur_tag
+ tok_cur_tag = null
+ return tmp
+ when "\u0000"
+ tok_cur_tag[2][0][1] += "\ufffd"
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ # Parse Error if ', <, = or ` (backtick)
+ tok_cur_tag[2][0][1] += c
+ return null
+
+ # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
+ tok_state_after_attribute_value_quoted = ->
+ switch c = txt.charAt(cur++)
+ when "\t", "\n", "\u000c", ' '
+ tok_state = tok_state_before_attribute_name
+ when '/'
+ tok_state = tok_state_self_closing_start_tag
+ when '>'
+ tok_state = tok_state_data
+ tmp = tok_cur_tag
+ tok_cur_tag = null
+ return tmp
+ when '' # EOF
+ parse_error()
+ tok_state = tok_state_data
+ else
+ # Parse Error
+ tok_state = tok_state_before_attribute_name
+ cur -= 1 # we didn't handle that char
+ return null
+
+ # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
+ # Don't set this as a state, just call it
+ # returns a string (NOT a text node)
+ tokenize_character_reference = (allowed_char = null, in_attr = false) ->
if cur >= txt.length
- return [TYPE_TEXT, '&']
+ return '&'
switch c = txt.charAt(cur)
+ when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
+ # explicitly not a parse error
+ return '&'
when ';'
- return [TYPE_TEXT, '&']
+ # there has to be "one or more" alnums between & and ; to be a parse error
+ return '&'
when '#'
if cur + 1 >= txt.length
- return [TYPE_TEXT, '&']
+ return '&'
if txt.charAt(cur + 1).toLowerCase() is 'x'
prefix = '#x'
charset = hex_chars
while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
i += 1
if i is 0
- return [TYPE_TEXT, '&']
+ return '&'
if txt.charAt(start + i) is ';'
i += 1
+ # FIXME This is supposed to generate parse errors for some chars
decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
if decoded?
cur = start + i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
+ return decoded
+ return '&'
else
for i in [0...31]
if alnum.indexOf(txt.charAt(cur + i)) is -1
break
if i is 0
- return [TYPE_TEXT, '&']
+ # exit early, because parse_error() below needs at least one alnum
+ return '&'
if txt.charAt(cur + i) is ';'
i += 1 # include ';' terminator in value
decoded = decode_named_char_ref txt.substr(cur, i)
if decoded?
cur += i
- return [TYPE_TEXT, decoded]
- return [TYPE_TEXT, '&']
+ return decoded
+ parse_error()
+ return '&'
else
# no ';' terminator (only legacy char refs)
- if i < 2 or i > 6
- return [TYPE_TEXT, '&']
- # FIXME: if we're inside an attribute:
- # 1. don't parse refs that are followed by =
- # 2. don't parse refs that are followed by alnum
max = i
for i in [2..max] # no prefix matches, so ok to check shortest first
c = legacy_char_refs[txt.substr(cur, i)]
if c?
+ if in_attr
+ if txt.charAt(cur + i) is '='
+ # "because some legacy user agents will
+ # misinterpret the markup in those cases"
+ parse_error()
+ return '&'
+ if alnum.indexOf(txt.charAt(cur + i)) > -1
+ # this makes attributes forgiving about url args
+ return '&'
+ # ok, and besides the weird exceptions for attributes...
+ # return the matching char
cur += i # consume entity chars
- return [TYPE_TEXT, c]
- return null
-
-
- # the functions below impliment the Tree Contstruction algorithm here:
- # http://www.w3.org/TR/html5/syntax.html#tree-construction
- tree_append = (t) ->
- if t[0] is TYPE_TEXT and tree_append_point.length > 0 and tree_append_point[tree_append_point.length - 1][0] is TYPE_TEXT
- tree_append_point[tree_append_point.length - 1][1] += t[1]
- else
- tree_append_point.push t
+ parse_error() # because no terminating ";"
+ return c
+ parse_error()
+ return '&'
+ return # never reached
# tree constructor initialization
- tree = [] # see comments on TYPE_TAG/etc for the structure of this data
- tree_append_point = tree
- tree_state = tree_append
+ # see comments on TYPE_TAG/etc for the structure of this data
+ tree = [TYPE_TAG, 'html', {}, []]
+ open_tags = [tree]
+ tree_state = tree_in_body
+ flag_frameset_ok = true
+ flag_parsing = true
# tokenizer initialization
tok_state = tok_state_data
# proccess input
- while cur < txt.length
+ while flag_parsing
t = tok_state()
if t?
tree_state t
-
- return tree
+ return tree[3]
# everything below is tests on the above
-test_equals = (description, fn, args..., expected_output) ->
- output = fn.apply this, args
+test_equals = (description, output, expected_output) ->
if output is expected_output
- console.log "passed: #{description}."
+ console.log "passed." # don't say name, so smart consoles can merge all of these
else
- console.log "FAILED: #{description}. Expected: #{expected_output}, actual: #{output}"
-html_to_json = (html) ->
- return JSON.stringify parse_html html
-test_equals "empty", html_to_json, "", '[]'
-test_equals "just text", html_to_json, "abc", '[[1,"abc"]]'
-test_equals "named entity", html_to_json, "a&1234", '[[1,"a&1234"]]'
-test_equals "broken named character references", html_to_json, "1&2&&3&aabbcc;", '[[1,"1&2&&3&aabbcc;"]]'
-test_equals "numbered entity overrides", html_to_json, "1€€ ƒ", '[[1,"1€€ ƒ"]]'
+ console.log "FAILED: \"#{description}\""
+ console.log " Expected: #{expected_output}"
+ console.log " Actual: #{output}"
+test_parser = (args) ->
+ parse_errors = []
+ errors_cb = (i) ->
+ parse_errors.push i
+ parsed = parse_html args.html, errors_cb
+ parsed = JSON.stringify parsed
+ if parsed isnt args.expected or parse_errors.length isnt args.errors
+ console.log "test FAILED: \"#{args.name}\""
+ else
+ console.log 'test passed'
+ if parsed isnt args.expected
+ console.log " Input: #{args.html}"
+ console.log " Correct: #{args.expected}"
+ console.log " Output: #{parsed}"
+ if parse_errors.length isnt args.errors
+ console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
+
+test_parser name: "empty", \
+ html: "",
+ expected: '[]',
+ errors: 0
+test_parser name: "just text", \
+ html: "abc",
+ expected: '[[1,"abc"]]',
+ errors: 0
+test_parser name: "named entity", \
+ html: "a&1234",
+ expected: '[[1,"a&1234"]]',
+ errors: 0
+test_parser name: "broken named character references", \
+ html: "1&2&&3&aabbcc;",
+ expected: '[[1,"1&2&&3&aabbcc;"]]',
+ errors: 2
+test_parser name: "numbered entity overrides", \
+ html: "1€€ ƒ",
+ expected: '[[1,"1€€ ƒ"]]',
+ errors: 0
+test_parser name: "open tag", \
+ html: "foo<span>bar",
+ expected: '[[1,"foo"],[0,"span",{},[[1,"bar"]]]]',
+ errors: 1 # no close tag
+test_parser name: "open tag with attributes", \
+ html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
+ expected: '[[1,"foo"],[0,"span",{"style":"foo: bar","title":"hi"},[[1,"bar"]]]]',
+ errors: 1 # no close tag
+test_parser name: "open tag with attributes of various quotings", \
+ html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
+ expected: '[[1,"foo"],[0,"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\\"","autofocus":""},[[1,"bar"]]]]',
+ errors: 1 # no close tag
+test_parser name: "attribute entity exceptions dq", \
+ html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
+ expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "attribute entity exceptions sq", \
+ html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
+ expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "attribute entity exceptions uq", \
+ html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
+ expected: '[[1,"foo"],[0,"a",{"href":"foo?t=1&=2&o=3<=foo"},[[1,"bar"]]]]',
+ errors: 2 # no close tag, &= in attr
+test_parser name: "matching closing tags", \
+ html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
+ expected: '[[1,"foo"],[0,"a",{"href":"hi"},[[1,"hi"]]],[0,"div",{},[[1,"1"],[0,"div",{},[[1,"foo"]]],[1,"2"]]],[1,"bar"]]',
+ errors: 0
+test_parser name: "mis-matched closing tags", \
+ html: "foo<div>bar<span>baz</div>qux",
+ expected: '[[1,"foo"],[0,"div",{},[[1,"bar"],[0,"span",{},[[1,"baz"]]]]],[1,"qux"]]',
+ errors: 1 # close tag mismatch