JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implemented adoption agency algorithm, tested a littl
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body.
22 #
23 # Instead, the data structure produced by this parser is an array of nodes.
24 #
25 # Each node is an object of the Node class. Here are the Node types:
# Node types, numbered consecutively from 0:
#   TYPE_TAG           name, {attributes}, [children]
#   TYPE_TEXT          "text"
#   TYPE_COMMENT
#   TYPE_DOCTYPE
# The following types are emitted by the tokenizer, but shouldn't end up in
# the tree:
#   TYPE_OPEN_TAG      name, [attributes ([key,value]...) in reverse order], [children]
#   TYPE_END_TAG       name
#   TYPE_EOF
#   TYPE_MARKER        http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
#   TYPE_AAA_BOOKMARK  http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
[TYPE_TAG, TYPE_TEXT, TYPE_COMMENT, TYPE_DOCTYPE,
 TYPE_OPEN_TAG, TYPE_END_TAG, TYPE_EOF,
 TYPE_MARKER, TYPE_AAA_BOOKMARK] = [0..8]

# namespace constants
[NS_HTML, NS_MATHML, NS_SVG] = [1, 2, 3]
41
# A node of the parse tree (also used for tokenizer tokens and for the
# marker/bookmark entries in the list of active formatting elements).
class Node
        # type: one of the TYPE_* constants
        # args: optional hash; recognised keys are name, text, attrs, attrs_a,
        #       children, namespace and parent. Missing keys get empty defaults
        #       (namespace defaults to NS_HTML, parent to null).
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                # attrs in progress, TYPE_OPEN_TAG only. The key used to be read as
                # "attr_k", which made it impossible to pass "attrs_a"; accept both.
                @attrs_a = args.attrs_a ? (args.attr_k ? [])
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
        # Return a new node that's the same except without the children or parent.
        # Both the attribute hash and any not-yet-converted [key, value] pairs are
        # copied, so the clone never shares mutable state with the original.
        shallow_clone: ->
                attrs = {}
                attrs[k] = v for k, v of @attrs
                attrs_a = (pair.slice() for pair in @attrs_a)
                return new Node @type, name: @name, text: @text, attrs: attrs, attrs_a: attrs_a, namespace: @namespace
        # Compact text form of the subtree, for unit tests.
        # NOTE: a tag with no children serializes as `tag:"x",{},]` (the opening
        # bracket is only emitted before the first child). Kept as is, because
        # existing expected strings depend on it.
        serialize: ->
                ret = ''
                switch @type
                        when TYPE_TAG
                                ret += 'tag:'
                                ret += JSON.stringify @name
                                ret += ','
                                ret += JSON.stringify @attrs
                                ret += ','
                                sep = '['
                                for c in @children
                                        ret += sep
                                        sep = ','
                                        ret += c.serialize()
                                ret += ']'
                        when TYPE_TEXT
                                ret += 'text:'
                                ret += JSON.stringify @text
                        when TYPE_COMMENT
                                ret += 'comment:'
                                ret += JSON.stringify @text
                        when TYPE_DOCTYPE
                                ret += 'doctype'
                                # FIXME
                        else
                                ret += 'unknown:'
                                console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
                return ret
85
86 # helpers: (only take args that are normally known when parser creates nodes)
# Each factory wraps `new Node` for one token/node type and takes only the
# single value the parser has at hand when it creates that node.
new_open_tag = (name) -> new Node TYPE_OPEN_TAG, {name}
new_end_tag = (name) -> new Node TYPE_END_TAG, {name}
new_text_node = (txt) -> new Node TYPE_TEXT, {text: txt}
new_comment_node = (txt) -> new Node TYPE_COMMENT, {text: txt}
new_eof_token = -> new Node TYPE_EOF
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
99
# Character classes used by the tokenizer (membership is tested with indexOf).
# The alphabets previously ended in "wxqz"/"WXQZ", so "y"/"Y" were never
# recognised as letters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
108
109 # http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = String.fromCharCode 0x09, 0x0a, 0x0c, 0x0d, 0x20

# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = String.fromCharCode(
        0x0009, 0x000a, 0x000b, 0x000c, 0x000d, 0x0020, 0x0085, 0x00a0, 0x1680,
        0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008,
        0x2009, 0x200a, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000
)
114
115 # These are the character references that don't need a terminating semicolon
116 # min length: 2, max: 6, none are a prefix of any other.
# Map from legacy entity name (no "&", no ";") to the character it decodes to.
# Entries are grouped by initial letter; the order is unchanged.
legacy_char_refs = {
        # A
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´',
        AElig: 'Æ', aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&',
        Aring: 'Å', aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä',
        # B - D
        brvbar: '¦',
        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©', curren: '¤',
        deg: '°', divide: '÷',
        # E - G
        Eacute: 'É', eacute: 'é', Ecirc: 'Ê', ecirc: 'ê', Egrave: 'È', egrave: 'è',
        ETH: 'Ð', eth: 'ð', Euml: 'Ë', euml: 'ë',
        frac12: '½', frac14: '¼', frac34: '¾',
        GT: '>', gt: '>',
        # I - N
        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡',
        Igrave: 'Ì', igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï',
        laquo: '«', LT: '<', lt: '<',
        macr: '¯', micro: 'µ', middot: '·',
        nbsp: "\u00a0", not: '¬', Ntilde: 'Ñ', ntilde: 'ñ',
        # O - R
        Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô', Ograve: 'Ò', ograve: 'ò',
        ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
        Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö',
        para: '¶', plusmn: '±', pound: '£',
        QUOT: '"', quot: '"',
        raquo: '»', REG: '®', reg: '®',
        # S - T (shy is U+00AD SOFT HYPHEN, written as an escape so it is visible)
        sect: '§', shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß',
        THORN: 'Þ', thorn: 'þ', times: '×',
        # U - Y
        Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù', ugrave: 'ù',
        uml: '¨', Uuml: 'Ü', uuml: 'ü',
        Yacute: 'Ý', yacute: 'ý', yen: '¥', yuml: 'ÿ'
}
137
# Element-name lists, written as space-separated words and split into arrays.
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
raw_text_elements = 'script style'.split ' '
escapable_raw_text_elements = 'textarea title'.split ' '
141 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
# SVG 1.1 element names, one initial-letter group per line (same order as
# before; names are case-sensitive).
svg_elements = [
        'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor', 'animateMotion', 'animateTransform',
        'circle', 'clipPath', 'color-profile', 'cursor',
        'defs', 'desc',
        'ellipse',
        'feBlend', 'feColorMatrix', 'feComponentTransfer', 'feComposite', 'feConvolveMatrix', 'feDiffuseLighting',
        'feDisplacementMap', 'feDistantLight', 'feFlood', 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR',
        'feGaussianBlur', 'feImage', 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
        'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence',
        'filter', 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src', 'font-face-uri', 'foreignObject',
        'g', 'glyph', 'glyphRef',
        'hkern',
        'image',
        'line', 'linearGradient',
        'marker', 'mask', 'metadata', 'missing-glyph', 'mpath',
        'path', 'pattern', 'polygon', 'polyline',
        'radialGradient', 'rect',
        'script', 'set', 'stop', 'style', 'svg', 'switch', 'symbol',
        'text', 'textPath', 'title', 'tref', 'tspan',
        'use',
        'view', 'vkern'
]
159
160 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# MathML 3.0 element names. ('mi' used to be listed twice; each name now
# appears exactly once.)
mathml_elements = [
        'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
        'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
        'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
        'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
        'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
        'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
        'determinant', 'diff', 'divergence', 'divide', 'domain',
        'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
        'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
        'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
        'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
        'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
        'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
        'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
        'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
        'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
        'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
        'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
        'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
        'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
        'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
        'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
        'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
        'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
        'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
        'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
        'vectorproduct', 'xor'
]
191 # foreign_elements = [svg_elements..., mathml_elements...]
192 #normal_elements = All other allowed HTML elements are normal elements.
193
# "Special" elements from the HTML5 parsing spec: tag name -> the namespace in
# which that name is special. A key can appear only once in an object literal,
# so names that are special in a second namespace are listed in
# special_elements_alt_ns below.
special_elements = {
        # HTML:
        address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
        aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
        blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
        caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
        details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
        embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
        footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
        h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
        header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
        img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
        listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
        noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
        ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
        script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
        style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
        template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
        thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
        wbr:NS_HTML, xmp:NS_HTML,

        # MathML:
        mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
        'annotation-xml':NS_MATHML,

        # SVG ("title" is also special in SVG; see special_elements_alt_ns):
        foreignObject:NS_SVG, desc:NS_SVG
}

# Names that are special in a second namespace, in addition to the one
# recorded in special_elements.
special_elements_alt_ns = {
        title:NS_SVG
}

formatting_elements = {
        a: true, b: true, big: true, code: true, em: true, font: true, i: true,
        nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
        u: true
}

# Returns true when node `e` is a "special" element, i.e. its tag name is
# listed for exactly the namespace the node is in.
# (The lookup must use e.name; indexing the table with the node object itself
# never matches.)
el_is_special = (e) ->
        return true if special_elements[e.name] is e.namespace
        return special_elements_alt_ns[e.name] is e.namespace
231
232 # decode_named_char_ref()
233 #
234 # The list of named character references is _huge_ so ask the browser to decode
235 # for us instead of wasting bandwidth/space on including the table here.
236 #
237 # Pass without the "&" but with the ";" examples:
238 #    for "&amp" pass "amp;"
239 #    for "&#x2032" pass "x2032;"
# Shared state for decode_named_char_ref(): a memo of successful decodes and
# the detached <textarea> used to ask the browser for the decoding.
g_dncr =
        cache: {}
        textarea: document.createElement 'textarea'
# TODO test this in IE8
# Decodes one character reference given without its leading "&".
# Returns the decoded string, or null when the browser hands the text back
# unchanged. Only successful decodes are memoised.
decode_named_char_ref = (txt) ->
        ref = "&#{txt}"
        hit = g_dncr.cache[ref]
        return hit if hit?
        scratch = g_dncr.textarea
        scratch.innerHTML = ref
        result = scratch.value
        if result is ref
                return null
        g_dncr.cache[ref] = result
        return result
253
254 parse_html = (txt, parse_error_cb = null) ->
255         cur = 0 # index of next char in txt to be parsed
256         # declare tree and tokenizer variables so they're in scope below
257         tree = null
258         open_els = [] # stack of open elements
259         tree_state = null
260         tok_state = null
261         tok_cur_tag = null # partially parsed tag
262         flag_frameset_ok = null
263         flag_parsing = null
264         afe = [] # active formatting elements
265
266         parse_error = ->
267                 if parse_error_cb?
268                         parse_error_cb cur
269                 else
270                         console.log "Parse error at character #{cur} of #{txt.length}"
271
272
273         # the functions below implement the Tree Construction algorithm
274         # http://www.w3.org/TR/html5/syntax.html#tree-construction
275
276         # But first... the helpers
277         template_tag_is_open = ->
278                 for t in open_els
279                         if t.type is TYPE_TAG and t.name is 'template'
280                                 return true
281                 return false
282         is_in_scope_x = (tag_name, scope) ->
283                 for t in open_els
284                         if t.name is tag_name
285                                 return true
286                         if t.name of scope
287                                 return false
288                 return false
289         is_in_scope_x_y = (tag_name, scope, scope2) ->
290                 for t in open_els
291                         if t.name is tag_name
292                                 return true
293                         if t.name of scope
294                                 return false
295                         if t.name of scope2
296                                 return false
297                 return false
298         standard_scopers = { # FIXME these are supposed to be namespace specific
299                 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
300                 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
301                 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
302                 'foreignObject': true, 'desc': true, 'title'
303         }
304         button_scopers = button: true
305         li_scopers = ol: true, ul: true
306         table_scopers = html: true, table: true, template: true
307         is_in_scope = (tag_name) ->
308                 return is_in_scope_x tag_name, standard_scopers
309         is_in_button_scope = (tag_name) ->
310                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers
311         is_in_table_scope = (tag_name) ->
312                 return is_in_scope_x tag_name, table_scopers
313         is_in_select_scope = (tag_name) ->
314                 for t in open_els
315                         if t.name is tag_name
316                                 return true
317                         if t.name isnt 'optgroup' and t.name isnt 'option'
318                                 return false
319                 return false
320         # this checks for a particular element, not by name
321         el_is_in_scope = (el) ->
322                 for t in open_els
323                         if t is el
324                                 return true
325                         if t.name of standard_scopers
326                                 return false
327                 return false
328
329         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
330         # this implementation is structured (mostly) as described at the link above.
331         # capitalized comments are the "labels" described at the link above.
332         reconstruct_active_formatting_elements = ->
333                 return if afe.length is 0
334                 if afe[0].type is TYPE_MARKER or afe[0] in open_els
335                         return
336                 # Rewind
337                 i = 0
338                 loop
339                         if i is afe.length - 1
340                                 break
341                         i += 1
342                         if afe[i].type is TYPE_MARKER or afe[i] in open_els
343                                 i -= 1 # Advance
344                                 break
345                 # Create
346                 loop
347                         el = afe[i].shallow_clone()
348                         tree_insert_tag el
349                         afe[i] = el
350                         break if i is 0
351                         i -= 1
352
353         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
354         # adoption agency algorithm
355         adoption_agency = (subject) ->
356                 if open_els[0].name is subject
357                         el = open_els[0]
358                         open_els.shift()
359                         # remove it from the list of active formatting elements (if found)
360                         for t, i in afe
361                                 if t is el
362                                         afe.splice i, 1
363                                         break
364                         return
365                 outer = 0
366                 loop
367                         if outer >= 8
368                                 return
369                         outer += 1
370                         fe = null
371                         for t, fe_index in afe
372                                 if t.type is TYPE_MARKER
373                                         break
374                                 if t.name is subject
375                                         fe = t
376                                         break
377                         if fe is null
378                                 in_body_any_other_end_tag subject
379                                 return
380                         in_open_els = false
381                         for t in open_els
382                                 if t is fe
383                                         in_open_els = true
384                                         break
385                         unless in_open_els
386                                 parse_error()
387                                 # "remove it from the list" must mean afe, since it's not in open_els
388                                 afe.splice fe_index, 1
389                                 return
390                         unless el_is_in_scope fe
391                                 parse_error()
392                                 return
393                         unless open_els[0] is fe
394                                 parse_error()
395                                 # continue
396                         fb = null
397                         fb_index
398                         for t, i in open_els
399                                 if t is fe
400                                         break
401                                 if el_is_special t
402                                         fb = t
403                                         fb_index = i
404                         if fb is null
405                                 loop
406                                         t = open_els.shift()
407                                         if t is fe
408                                                 afe.splice fe_index, 1
409                                                 return
410                         ca = open_els[fe_index + 1] # common ancestor
411                         node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
412                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
413                         bookmark = new_aaa_bookmark()
414                         for t, i in afe
415                                 if t is fe
416                                         afe.splice i, 0, bookmark
417                         node = last_node = fb
418                         inner = 0
419                         loop
420                                 inner += 1
421                                 node_next = null
422                                 for t, i in open_els
423                                         if t is node
424                                                 node_next = open_els[i + 1]
425                                                 break
426                                 node = node_next ? node_above
427                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
428                                 if node is fe
429                                         break
430                                 node_in_afe = false
431                                 for t, i of afe
432                                         if t is node
433                                                 if inner > 3
434                                                         afe.splice i, 1
435                                                 else
436                                                         node_in_afe = true
437                                                 break
438                                 unless node_in_afe
439                                         for t, i in open_els
440                                                 if t is node
441                                                         node_above = open_els[i + 1]
442                                                         open_els.splice i, 1
443                                                         break
444                                         continue
445                                 # 7. reate an element for the token for which the element node
446                                 # was created, in the HTML namespace, with common ancestor as
447                                 # the intended parent; replace the entry for node in the list
448                                 # of active formatting elements with an entry for the new
449                                 # element, replace the entry for node in the stack of open
450                                 # elements with an entry for the new element, and let node be
451                                 # the new element.
452                                 new_node = node.shallow_clone()
453                                 for t, i in afe
454                                         if t is node
455                                                 afe[i] = new_node
456                                                 break
457                                 for t, i in open_els
458                                         if t is node
459                                                 open_els[i] = new_node
460                                                 break
461                                 node = new_node
462                                 # 8. If last node is furthest block, then move the
463                                 # aforementioned bookmark to be immediately after the new node
464                                 # in the list of active formatting elements.
465                                 if last_node is fb
466                                         for t, i in afe
467                                                 if t is bookmark
468                                                         afe.splice i, 1
469                                         for t, i in afe
470                                                 if t is node
471                                                         # TODO test: position i gets you "after"?
472                                                         afe.splice i, 0, new_aaa_bookmark()
473                                 # 9. Insert last node into node, first removing it from its
474                                 # previous parent node if any.
475                                 if last_node.parent?
476                                         for c, i of last_node.parent.children
477                                                 if c is last_node
478                                                         last_node.parent.children.splice i, 1
479                                 node.children.push last_node
480                                 last_node.parent = node
481                                 # 10. Let last node be node.
482                                 last_node = node
483                                 # 11. Return to the step labeled inner loop.
484                         # 14. Insert whatever last node ended up being in the previous step
485                         # at the appropriate place for inserting a node, but using common
486                         # ancestor as the override target.
487                         tree_insert_tag last_node, ca
488                         # 15. Create an element for the token for which formatting element
489                         # was created, in the HTML namespace, with furthest block as the
490                         # intended parent.
491                         new_element = fe.shallow_clone()
492                         # 16. Take all of the child nodes of furthest block and append them
493                         # to the element created in the last step.
494                         while fb.children.length
495                                 t = fb.children.shift()
496                                 t.parent = new_element
497                                 new_element.children.push t
498                         # 17. Append that new element to furthest block.
499                         new_element.parent = fb
500                         fb.children.push new_element
501                         # 18. Remove formatting element from the list of active formatting
502                         # elements, and insert the new element into the list of active
503                         # formatting elements at the position of the aforementioned
504                         # bookmark.
505                         for t, i in afe
506                                 if t is fe
507                                         afe.splice i, 1
508                                         break
509                         for t, i in afe
510                                 if t is bookmark
511                                         afe[i] = node
512                                         break
513                         # 19. Remove formatting element from the stack of open elements,
514                         # and insert the new element into the stack of open elements
515                         # immediately below the position of furthest block in that stack.
516                         for t, i of open_els
517                                 if t is fe
518                                         open_els.splice i, 1
519                                         break
520                         for t, i of open_els
521                                 if t is fb
522                                         open_els.splice i, 0, new_element
523                                         break
524                         # 20. Jump back to the step labeled outer loop.
525
526         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
527         # FIXME implement this
528         close_p_if_in_button_scope = ->
529                 if open_els[0].name is 'p'
530                         open_els.pop()
531                 return
532                 #p = find_button_scope 'p'
533                 #if p?
534                         # TODO generate_implied_end_tags except for p tags
535                         # TODO parse_error unless open_els[0].name is 'p'
536                         # TODO pop stack until 'p' popped
537
538         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
539         tree_insert_a_character = (t) ->
540                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
541                 dest = open_els[0].children
542                 if dest.length > 0 and dest[dest.length - 1].type is TYPE_TEXT
543                         dest[dest.length - 1].text += t.text
544                 else
545                         dest.push t
546
547         # FIXME read spec, do this right
548         # FIXME implement the override target thing
549         # note: this assumes it's an open tag
550         tree_insert_tag = (t, override_target = null) ->
551                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
552                 # convert attributes into a hash
553                 while t.attrs_a.length
554                         a = t.attrs_a.pop()
555                         t.attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
556                 if t.parent?
557                         for c, i of t.parent.children
558                                 if c is t
559                                         t.parent.children.splice i, 1
560                 # FIXME spec says to do something to figure out what parent should be
561                 parent = open_els[0]
562                 open_els.unshift t
563                 parent.children.push t
564                 t.parent = parent
565
566         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
567         tree_insert_a_comment = (t) ->
568                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
569                 open_els[0].children.push t
570
571         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
572         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
573                 for node, i in open_els
574                         if node.name is name
575                                 # FIXME generate implied end tags except those with name==name
576                                 parse_error() unless i is 0
577                                 while i > 0
578                                         open_els.shift()
579                                         i -= 1
580                                 open_els.shift()
581                                 return
582                         if special_elements[node.name]?
583                                 parse_error()
584                                 return
585         tree_in_body = (t) ->
586                 switch t.type
587                         when TYPE_TEXT
588                                 switch t.text
589                                         when "\u0000"
590                                                 parse_error()
591                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
592                                                 reconstruct_active_formatting_elements()
593                                                 tree_insert_a_character t
594                                         else
595                                                 reconstruct_active_formatting_elements()
596                                                 tree_insert_a_character t
597                                                 flag_frameset_ok = false
598                         when TYPE_COMMENT
599                                 tree_insert_a_comment t
600                         when TYPE_DOCTYPE
601                                 parse_error()
602                         when TYPE_OPEN_TAG
603                                 switch t.name
604                                         when 'html'
605                                                 parse_error()
606                                                 return if template_tag_is_open()
607                                                 root_attrs = open_els[open_els.length - 1].children
608                                                 for k, v of t.attrs
609                                                         root_attrs[k] = v unless root_attrs[k]?
610                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
611                                                 # FIXME also do this for </template> (end tag)
612                                                 return tree_in_head t
613                                         when 'body'
614                                                 parse_error()
615                                                 # TODO
616                                         when 'frameset'
617                                                 parse_error()
618                                                 # TODO
619                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
620                                                 close_p_if_in_button_scope()
621                                                 tree_insert_tag t
622                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
623                                                 close_p_if_in_button_scope()
624                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
625                                                         parse_error()
626                                                         open_els.shift()
627                                                 tree_insert_tag t
628                                         # TODO lots more to implement here
629                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
630                                                 reconstruct_active_formatting_elements()
631                                                 tree_insert_tag t
632                                                 afe.push t
633                                         # TODO lots more to implement here
634                                         else # any other start tag
635                                                 reconstruct_active_formatting_elements()
636                                                 tree_insert_tag t
637                         when TYPE_EOF
638                                 ok_tags = {
639                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
640                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
641                                 }
642                                 for t in open_els
643                                         unless ok_tags[t.name]?
644                                                 parse_error()
645                                                 break
646                                 # TODO stack of template insertion modes thing
647                                 flag_parsing = false # stop parsing
648                         when TYPE_END_TAG
649                                 switch t.name
650                                         when 'body'
651                                                 unless is_in_scope 'body'
652                                                         parse_error()
653                                                         return
654                                                 # TODO implement parse error and move to tree_after_body
655                                         when 'html'
656                                                 unless is_in_scope 'body' # weird, but it's what the spec says
657                                                         parse_error()
658                                                         return
659                                                 # TODO implement parse error and move to tree_after_body, reprocess
660                                         # TODO lots more close tags to implement here
661                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
662                                                 adoption_agency t.name
663                                         # TODO lots more close tags to implement here
664                                         else
665                                                 in_body_any_other_end_tag t.name
666                 return
667
668
        # the functions below implement the tokenizer states described here:
        # http://www.w3.org/TR/html5/syntax.html#tokenization
671
672         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
673         tok_state_data = ->
674                 switch c = txt.charAt(cur++)
675                         when '&'
676                                 return new_text_node tokenize_character_reference()
677                         when '<'
678                                 tok_state = tok_state_tag_open
679                         when "\u0000"
680                                 parse_error()
681                                 return new_text_node c
682                         when '' # EOF
683                                 return new_eof_token()
684                         else
685                                 return new_text_node c
686                 return null
687
        # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
        # not needed: tok_state_character_reference_in_data = ->
        # just call tokenize_character_reference()
691
692         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
693         tok_state_tag_open = ->
694                 switch c = txt.charAt(cur++)
695                         when '!'
696                                 tok_state = tok_state_markup_declaration_open
697                         when '/'
698                                 tok_state = tok_state_end_tag_open
699                         when '?'
700                                 parse_error()
701                                 tok_state = tok_state_bogus_comment
702                         else
703                                 if lc_alpha.indexOf(c) > -1
704                                         tok_cur_tag = new_open_tag c
705                                         tok_state = tok_state_tag_name
706                                 else if uc_alpha.indexOf(c) > -1
707                                         tok_cur_tag = new_open_tag c.toLowerCase()
708                                         tok_state = tok_state_tag_name
709                                 else
710                                         parse_error()
711                                         tok_state = tok_state_data
712                                         cur -= 1 # we didn't parse/handle the char after <
713                                         return new_text_node '<'
714                 return null
715
716         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
717         tok_state_end_tag_open = ->
718                 switch c = txt.charAt(cur++)
719                         when '>'
720                                 parse_error()
721                                 tok_state = tok_state_data
722                         when '' # EOF
723                                 parse_error()
724                                 tok_state = tok_state_data
725                                 return new_text_node '</'
726                         else
727                                 if uc_alpha.indexOf(c) > -1
728                                         tok_cur_tag = new_end_tag c.toLowerCase()
729                                         tok_state = tok_state_tag_name
730                                 else if lc_alpha.indexOf(c) > -1
731                                         tok_cur_tag = new_end_tag c
732                                         tok_state = tok_state_tag_name
733                                 else
734                                         parse_error()
735                                         tok_state = tok_state_bogus_comment
736                 return null
737
738         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
739         tok_state_tag_name = ->
740                 switch c = txt.charAt(cur++)
741                         when "\t", "\n", "\u000c", ' '
742                                 tok_state = tok_state_before_attribute_name
743                         when '/'
744                                 tok_state = tok_state_self_closing_start_tag
745                         when '>'
746                                 tok_state = tok_state_data
747                                 tmp = tok_cur_tag
748                                 tok_cur_tag = null
749                                 return tmp
750                         when "\u0000"
751                                 parse_error()
752                                 tok_cur_tag.name += "\ufffd"
753                         when '' # EOF
754                                 parse_error()
755                                 tok_state = tok_state_data
756                         else
757                                 if uc_alpha.indexOf(c) > -1
758                                         tok_cur_tag.name += c.toLowerCase()
759                                 else
760                                         tok_cur_tag.name += c
761                 return null
762
763         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
764         tok_state_before_attribute_name = ->
765                 attr_name = null
766                 switch c = txt.charAt(cur++)
767                         when "\t", "\n", "\u000c", ' '
768                                 return null
769                         when '/'
770                                 tok_state = tok_state_self_closing_start_tag
771                                 return null
772                         when '>'
773                                 tok_state = tok_state_data
774                                 tmp = tok_cur_tag
775                                 tok_cur_tag = null
776                                 return tmp
777                         when "\u0000"
778                                 parse_error()
779                                 attr_name = "\ufffd"
780                         when '"', "'", '<', '='
781                                 parse_error()
782                                 attr_name = c
783                         when '' # EOF
784                                 parse_error()
785                                 tok_state = tok_state_data
786                         else
787                                 if uc_alpha.indexOf(c) > -1
788                                         attr_name = c.toLowerCase()
789                                 else
790                                         attr_name = c
791                 if attr_name?
792                         tok_cur_tag.attrs_a.unshift [attr_name, '']
793                         tok_state = tok_state_attribute_name
794                 return null
795
796         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
797         tok_state_attribute_name = ->
798                 switch c = txt.charAt(cur++)
799                         when "\t", "\n", "\u000c", ' '
800                                 tok_state = tok_state_after_attribute_name
801                         when '/'
802                                 tok_state = tok_state_self_closing_start_tag
803                         when '='
804                                 tok_state = tok_state_before_attribute_value
805                         when '>'
806                                 tok_state = tok_state_data
807                                 tmp = tok_cur_tag
808                                 tok_cur_tag = null
809                                 return tmp
810                         when "\u0000"
811                                 parse_error()
812                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
813                         when '"', "'", '<'
814                                 parse_error()
815                                 tok_cur_tag.attrs_a[0][0] = c
816                         when '' # EOF
817                                 parse_error()
818                                 tok_state = tok_state_data
819                         else
820                                 if uc_alpha.indexOf(c) > -1
821                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
822                                 else
823                                         tok_cur_tag.attrs_a[0][0] += c
824                 return null
825
826         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
827         tok_state_before_attribute_value = ->
828                 switch c = txt.charAt(cur++)
829                         when "\t", "\n", "\u000c", ' '
830                                 return null
831                         when '"'
832                                 tok_state = tok_state_attribute_value_double_quoted
833                         when '&'
834                                 tok_state = tok_state_attribute_value_unquoted
835                                 cur -= 1
836                         when "'"
837                                 tok_state = tok_state_attribute_value_single_quoted
838                         when "\u0000"
839                                 # Parse error
840                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
841                                 tok_state = tok_state_attribute_value_unquoted
842                         when '>'
843                                 # Parse error
844                                 tok_state = tok_state_data
845                                 tmp = tok_cur_tag
846                                 tok_cur_tag = null
847                                 return tmp
848                         when '' # EOF
849                                 parse_error()
850                                 tok_state = tok_state_data
851                         else
852                                 tok_cur_tag.attrs_a[0][1] += c
853                                 tok_state = tok_state_attribute_value_unquoted
854                 return null
855
856         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
857         tok_state_attribute_value_double_quoted = ->
858                 switch c = txt.charAt(cur++)
859                         when '"'
860                                 tok_state = tok_state_after_attribute_value_quoted
861                         when '&'
862                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
863                         when "\u0000"
864                                 # Parse error
865                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
866                         when '' # EOF
867                                 parse_error()
868                                 tok_state = tok_state_data
869                         else
870                                 tok_cur_tag.attrs_a[0][1] += c
871                 return null
872
873         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
874         tok_state_attribute_value_single_quoted = ->
875                 switch c = txt.charAt(cur++)
876                         when "'"
877                                 tok_state = tok_state_after_attribute_value_quoted
878                         when '&'
879                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
880                         when "\u0000"
881                                 # Parse error
882                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
883                         when '' # EOF
884                                 parse_error()
885                                 tok_state = tok_state_data
886                         else
887                                 tok_cur_tag.attrs_a[0][1] += c
888                 return null
889
890         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
891         tok_state_attribute_value_unquoted = ->
892                 switch c = txt.charAt(cur++)
893                         when "\t", "\n", "\u000c", ' '
894                                 tok_state = tok_state_before_attribute_name
895                         when '&'
896                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
897                         when '>'
898                                 tok_state = tok_state_data
899                                 tmp = tok_cur_tag
900                                 tok_cur_tag = null
901                                 return tmp
902                         when "\u0000"
903                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
904                         when '' # EOF
905                                 parse_error()
906                                 tok_state = tok_state_data
907                         else
908                                 # Parse Error if ', <, = or ` (backtick)
909                                 tok_cur_tag.attrs_a[0][1] += c
910                 return null
911
912         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
913         tok_state_after_attribute_value_quoted = ->
914                 switch c = txt.charAt(cur++)
915                         when "\t", "\n", "\u000c", ' '
916                                 tok_state = tok_state_before_attribute_name
917                         when '/'
918                                 tok_state = tok_state_self_closing_start_tag
919                         when '>'
920                                 tok_state = tok_state_data
921                                 tmp = tok_cur_tag
922                                 tok_cur_tag = null
923                                 return tmp
924                         when '' # EOF
925                                 parse_error()
926                                 tok_state = tok_state_data
927                         else
928                                 # Parse Error
929                                 tok_state = tok_state_before_attribute_name
930                                 cur -= 1 # we didn't handle that char
931                 return null
932
933         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
934         # Don't set this as a state, just call it
935         # returns a string (NOT a text node)
936         tokenize_character_reference = (allowed_char = null, in_attr = false) ->
937                 if cur >= txt.length
938                         return '&'
939                 switch c = txt.charAt(cur)
940                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
941                                 # explicitly not a parse error
942                                 return '&'
943                         when ';'
944                                 # there has to be "one or more" alnums between & and ; to be a parse error
945                                 return '&'
946                         when '#'
947                                 if cur + 1 >= txt.length
948                                         return '&'
949                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
950                                         prefix = '#x'
951                                         charset = hex_chars
952                                         start = cur + 2
953                                 else
954                                         charset = digits
955                                         start = cur + 1
956                                         prefix = '#'
957                                 i = 0
958                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
959                                         i += 1
960                                 if i is 0
961                                         return '&'
962                                 if txt.charAt(start + i) is ';'
963                                         i += 1
964                                 # FIXME This is supposed to generate parse errors for some chars
965                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
966                                 if decoded?
967                                         cur = start + i
968                                         return decoded
969                                 return '&'
970                         else
971                                 for i in [0...31]
972                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
973                                                 break
974                                 if i is 0
975                                         # exit early, because parse_error() below needs at least one alnum
976                                         return '&'
977                                 if txt.charAt(cur + i) is ';'
978                                         i += 1 # include ';' terminator in value
979                                         decoded = decode_named_char_ref txt.substr(cur, i)
980                                         if decoded?
981                                                 cur += i
982                                                 return decoded
983                                         parse_error()
984                                         return '&'
985                                 else
986                                         # no ';' terminator (only legacy char refs)
987                                         max = i
988                                         for i in [2..max] # no prefix matches, so ok to check shortest first
989                                                 c = legacy_char_refs[txt.substr(cur, i)]
990                                                 if c?
991                                                         if in_attr
992                                                                 if txt.charAt(cur + i) is '='
993                                                                         # "because some legacy user agents will
994                                                                         # misinterpret the markup in those cases"
995                                                                         parse_error()
996                                                                         return '&'
997                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
998                                                                         # this makes attributes forgiving about url args
999                                                                         return '&'
1000                                                         # ok, and besides the weird exceptions for attributes...
1001                                                         # return the matching char
1002                                                         cur += i # consume entity chars
1003                                                         parse_error() # because no terminating ";"
1004                                                         return c
1005                                         parse_error()
1006                                         return '&'
1007                 return # never reached
1008
1009         # tree constructor initialization
1010         # see comments on TYPE_TAG/etc for the structure of this data
1011         tree = new Node TYPE_TAG, name: 'html'
1012         open_els = [tree]
1013         tree_state = tree_in_body
1014         flag_frameset_ok = true
1015         flag_parsing = true
1016         afe = [] # active formatting elements
1017
1018         # tokenizer initialization
1019         tok_state = tok_state_data
1020
1021         # proccess input
1022         while flag_parsing
1023                 t = tok_state()
1024                 if t?
1025                         tree_state t
1026         return tree.children
1027
1028 # everything below is tests on the above
# Logs whether `output` is identical (is / ===) to `expected_output`.
# On a mismatch the description and both values are printed.
test_equals = (description, output, expected_output) ->
        if output isnt expected_output
                console.log "FAILED: \"#{description}\""
                console.log "   Expected: #{expected_output}"
                console.log "     Actual: #{output}"
        else
                console.log "passed." # don't say name, so smart consoles can merge all of these
# Runs one parser test case and logs the outcome.
#
# args.name:     label printed with the result
# args.html:     input handed to parse_html
# args.expected: expected serialization (each top level node's serialize()
#                output, joined with ',')
# args.errors:   expected number of parse errors reported through the callback
test_parser = (args) ->
        parse_errors = []
        errors_cb = (i) ->
                parse_errors.push i
        parsed = parse_html args.html, errors_cb
        serialized = (t.serialize() for t in parsed).join ','
        output_ok = serialized is args.expected
        errors_ok = parse_errors.length is args.errors
        if output_ok and errors_ok
                console.log "passed \"#{args.name}\""
        else
                console.log "FAILED: \"#{args.name}\""
        # details for whichever of the two checks failed
        unless output_ok
                console.log "      Input: #{args.html}"
                console.log "    Correct: #{args.expected}"
                console.log "     Output: #{serialized}"
        unless errors_ok
                console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
1057
# Test cases. Each call parses `html`, compares the serialized tree with
# `expected` and the number of reported parse errors with `errors`.

# plain text and character references
test_parser name: "empty", \
        html: "",
        expected: '',
        errors: 0
test_parser name: "just text", \
        html: "abc",
        expected: 'text:"abc"',
        errors: 0
test_parser name: "named entity", \
        html: "a&amp;1234",
        expected: 'text:"a&1234"',
        errors: 0
test_parser name: "broken named character references", \
        html: "1&amp2&&amp;3&aabbcc;",
        expected: 'text:"1&2&&3&aabbcc;"',
        errors: 2
test_parser name: "numbered entity overrides", \
        html: "1&#X80&#x80; &#x83",
        expected: 'text:"1€€ ƒ"',
        errors: 0
# open tags and attributes
test_parser name: "open tag", \
        html: "foo<span>bar",
        expected: 'text:"foo",tag:"span",{},[text:"bar"]',
        errors: 1 # no close tag
test_parser name: "open tag with attributes", \
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
        expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
        errors: 1 # no close tag
test_parser name: "open tag with attributes of various quotings", \
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
        expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
        errors: 1 # no close tag
# character references inside attribute values (dq/sq/uq = double quoted,
# single quoted, unquoted)
test_parser name: "attribute entity exceptions dq", \
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
        errors: 2 # no close tag, &amp= in attr
test_parser name: "attribute entity exceptions sq", \
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
        errors: 2 # no close tag, &amp= in attr
test_parser name: "attribute entity exceptions uq", \
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]',
        errors: 2 # no close tag, &amp= in attr
# closing tags, well formed and mis-nested
test_parser name: "matching closing tags", \
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
        expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
        errors: 0
test_parser name: "missing closing tag inside", \
        html: "foo<div>bar<span>baz</div>qux",
        expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
        errors: 1 # close tag mismatch
test_parser name: "mis-matched closing tags", \
        html: "<span>12<div>34</span>56</div>78",
        expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
        errors: 2 # misplaced </span>, no </span> at the end
# formatting elements (these go through the adoption agency algorithm)
test_parser name: "mis-matched formatting elements", \
        html: "12<b>34<i>56</b>78</i>90",
        expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
        errors: 1 # no idea how many there should be
test_parser name: "crazy formatting elements test", \
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
        # firefox does this:
        expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
        errors: 6 # no idea how many there should be