JasonWoof Got questions, comments, patches, etc.? Contact Jason Woofenden
implement most details about where to insert nodes
[peach-html5-editor.git] / parse-html.coffee
1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
3 #
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
7 # later version.
8 #
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
12 # details.
13 #
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
16
17
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
23 # parsing.
24 #
25 # Instead, the data structure produced by this parser is an array of nodes.
26 #
# Each node is an object of the Node class. Here are the Node types:
TYPE_TAG     = 0 # name, {attributes}, [children]
TYPE_TEXT    = 1 # "text"
TYPE_COMMENT = 2
TYPE_DOCTYPE = 3
# The following types are emitted by the tokenizer, but shouldn't end up in the tree:
TYPE_OPEN_TAG     = 4 # name, [attributes ([key,value]...) in reverse order], [children]
TYPE_END_TAG      = 5 # name
TYPE_EOF          = 6
TYPE_MARKER       = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm

# Namespace constants (stored in Node::namespace)
NS_HTML   = 1
NS_MATHML = 2
NS_SVG    = 3
43
# A node in the parse tree, or a token travelling from the tokenizer to the
# tree constructor. Which fields are meaningful depends on @type.
class Node
        # type: one of the TYPE_* constants
        # args: optional initial values for name, text, attrs, attrs_a, children,
        #       namespace and parent. Anything left out gets an empty default.
        constructor: (type, args = {}) ->
                @type = type # one of the TYPE_* constants above
                @name = args.name ? '' # tag name
                @text = args.text ? '' # contents for text/comment nodes
                @attrs = args.attrs ? {}
                # attrs in progress, TYPE_OPEN_TAG only. The key matching the field
                # name is accepted, as well as the older "attr_k" spelling.
                @attrs_a = args.attrs_a ? args.attr_k ? []
                @children = args.children ? []
                @namespace = args.namespace ? NS_HTML
                @parent = args.parent ? null
        # Returns a new node that's the same except without the children or parent.
        # The attrs object is copied, so changing the clone's attributes leaves
        # the original alone.
        shallow_clone: ->
                # WARNING this doesn't work right on open tags that are still being parsed
                # (attrs_a is not copied)
                attrs = {}
                attrs[k] = v for k, v of @attrs
                return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace
        # Returns a compact string form of this node and its descendants (for unit tests).
        serialize: ->
                ret = ''
                switch @type
                        when TYPE_TAG
                                ret += 'tag:'
                                ret += JSON.stringify @name
                                ret += ','
                                ret += JSON.stringify @attrs
                                ret += ',['
                                sep = ''
                                for c in @children
                                        ret += sep
                                        sep = ','
                                        ret += c.serialize()
                                ret += ']'
                        when TYPE_TEXT
                                ret += 'text:'
                                ret += JSON.stringify @text
                        when TYPE_COMMENT
                                ret += 'comment:'
                                ret += JSON.stringify @text
                        when TYPE_DOCTYPE
                                ret += 'doctype'
                                # FIXME
                        else
                                ret += 'unknown:'
                                # A node whose parent lists it as a child is a cycle, and
                                # JSON.stringify throws on cycles. Fall back to the flat
                                # fields so that logging can't abort serialization.
                                try
                                        dump = JSON.stringify @
                                catch err
                                        dump = JSON.stringify type: @type, name: @name, text: @text, attrs: @attrs
                                console.log "unknown: #{dump}" # backtrace is just as well
                return ret
87
# Factory helpers. Each takes only what the parser normally knows at the
# moment it creates the node; everything else gets Node's defaults.
new_open_tag = (name) -> new Node(TYPE_OPEN_TAG, {name})
new_end_tag = (name) -> new Node(TYPE_END_TAG, {name})
new_text_node = (txt) -> new Node(TYPE_TEXT, {text: txt})
new_comment_node = (txt) -> new Node(TYPE_COMMENT, {text: txt})
new_eof_token = -> new Node(TYPE_EOF)
new_aaa_bookmark = -> new Node(TYPE_AAA_BOOKMARK)
101
# Character classes used by the tokenizer. Each is a plain string of the
# characters it contains.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"

# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
116
# Named character references that are recognised even when the terminating
# semicolon is missing.
# Names are 2 to 6 characters long, and none is a prefix of any other.
legacy_char_refs = {
        # A
        Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´',
        AElig: 'Æ', aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&',
        Aring: 'Å', aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä',
        # B - D
        brvbar: '¦',
        Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
        curren: '¤',
        deg: '°', divide: '÷',
        # E - G
        Eacute: 'É', eacute: 'é', Ecirc: 'Ê', ecirc: 'ê', Egrave: 'È',
        egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë', euml: 'ë',
        frac12: '½', frac14: '¼', frac34: '¾',
        GT: '>', gt: '>',
        # I
        Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡',
        Igrave: 'Ì', igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï',
        # L - N
        laquo: '«', LT: '<', lt: '<',
        macr: '¯', micro: 'µ', middot: '·',
        nbsp: "\u00a0", not: '¬', Ntilde: 'Ñ', ntilde: 'ñ',
        # O
        Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô', Ograve: 'Ò',
        ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
        Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö',
        # P - T
        para: '¶', plusmn: '±', pound: '£',
        QUOT: '"', quot: '"',
        raquo: '»', REG: '®', reg: '®',
        # shy is the (invisible) soft hyphen, written as an escape so it can be seen
        sect: '§', shy: "\u00ad", sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß',
        THORN: 'Þ', thorn: 'þ', times: '×',
        # U - Y
        Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
        ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü',
        Yacute: 'Ý', yacute: 'ý', yen: '¥', yuml: 'ÿ'
}
139
# Element categories, each an array of tag names.
# void elements: listed here as the tags that never have content or an end tag
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split(' ')
raw_text_elements = 'script style'.split(' ')
escapable_raw_text_elements = 'textarea title'.split(' ')
# http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
# Written as a whitespace-separated list; match(/\S+/g) turns it into an
# array with one tag name per entry.
svg_elements = '''
        a altGlyph altGlyphDef altGlyphItem animate animateColor
        animateMotion animateTransform circle clipPath color-profile
        cursor defs desc ellipse feBlend feColorMatrix
        feComponentTransfer feComposite feConvolveMatrix
        feDiffuseLighting feDisplacementMap feDistantLight feFlood
        feFuncA feFuncB feFuncG feFuncR feGaussianBlur feImage
        feMerge feMergeNode feMorphology feOffset fePointLight
        feSpecularLighting feSpotLight feTile feTurbulence filter
        font font-face font-face-format font-face-name font-face-src
        font-face-uri foreignObject g glyph glyphRef hkern
        image line linearGradient marker mask metadata
        missing-glyph mpath path pattern polygon polyline
        radialGradient rect script set stop style svg
        switch symbol text textPath title tref tspan use
        view vkern
'''.match(/\S+/g)
161
# http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
# Array of MathML tag names; each name appears exactly once.
mathml_elements = [
        'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
        'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
        'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
        'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
        'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
        'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
        'determinant', 'diff', 'divergence', 'divide', 'domain',
        'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
        'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
        'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
        'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
        'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
        'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
        'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
        'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'min',
        'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
        'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
        'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
        'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
        'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
        'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
        'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
        'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
        'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
        'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
        'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
        'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
        'vectorproduct', 'xor'
]
# foreign_elements = [svg_elements..., mathml_elements...]
# normal_elements = All other allowed HTML elements are normal elements.
195
# Maps a tag name to the namespace (NS_* constant) in which that tag belongs
# to the "special" category.
#
# NOTE: 'title' is listed for both HTML and SVG. A name can hold only one
# value, so the later (SVG) entry is the one that ends up in the table.
special_elements = do ->
        by_namespace = [
                [NS_HTML, [
                        'address', 'applet', 'area', 'article', 'aside', 'base',
                        'basefont', 'bgsound', 'blockquote', 'body', 'br', 'button',
                        'caption', 'center', 'col', 'colgroup', 'dd', 'details',
                        'dir', 'div', 'dl', 'dt', 'embed', 'fieldset', 'figcaption',
                        'figure', 'footer', 'form', 'frame', 'frameset', 'h1', 'h2',
                        'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup', 'hr',
                        'html', 'iframe', 'img', 'input', 'isindex', 'li', 'link',
                        'listing', 'main', 'marquee', 'meta', 'nav', 'noembed',
                        'noframes', 'noscript', 'object', 'ol', 'p', 'param',
                        'plaintext', 'pre', 'script', 'section', 'select', 'source',
                        'style', 'summary', 'table', 'tbody', 'td', 'template',
                        'textarea', 'tfoot', 'th', 'thead', 'title', 'tr', 'track',
                        'ul', 'wbr', 'xmp'
                ]]
                [NS_MATHML, ['mi', 'mo', 'mn', 'ms', 'mtext', 'annotation-xml']]
                [NS_SVG, ['foreignObject', 'desc', 'title']]
        ]
        table = {}
        for [ns, names] in by_namespace
                table[n] = ns for n in names
        return table
224
# Set of formatting-element tag names: each name maps to true.
formatting_elements = do ->
        flags = {}
        for tag in ['a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u']
                flags[tag] = true
        return flags
230
# Returns true if the element node `e` is in the "special" category, i.e. its
# tag name is listed in special_elements under the namespace the node is in.
el_is_special = (e) ->
        # 'title' is listed for both HTML and SVG in special_elements, but the
        # table can only remember one namespace per name, so check it directly.
        if e.name is 'title'
                return e.namespace is NS_HTML or e.namespace is NS_SVG
        # the table is keyed by tag name, not by node
        return special_elements[e.name] is e.namespace
233
# decode_named_char_ref()
#
# The list of named character references is _huge_, so instead of shipping the
# table we let the browser decode them: the reference is written into a
# textarea's innerHTML and the decoded text is read back from its value.
# Successful lookups are remembered in g_dncr.cache.
#
# Pass the reference without the "&" but with the ";". Examples:
#    to decode "&amp;" pass "amp;"
#    to decode "&#x2032;" pass "x2032;"
#
# Returns the decoded text, or null if the browser left the text unchanged.
g_dncr = {
        cache: {}
        textarea: document.createElement('textarea')
}
# TODO test this in IE8
decode_named_char_ref = (txt) ->
        txt = "&#{txt}"
        decoded = g_dncr.cache[txt]
        if decoded?
                return decoded
        g_dncr.textarea.innerHTML = txt
        decoded = g_dncr.textarea.value
        # unchanged text means the browser didn't recognise the reference
        if decoded is txt
                return null
        g_dncr.cache[txt] = decoded
        return decoded
255
256 parse_html = (txt, parse_error_cb = null) ->
257         cur = 0 # index of next char in txt to be parsed
258         # declare tree and tokenizer variables so they're in scope below
259         tree = null
260         open_els = [] # stack of open elements
261         tree_state = null
262         tok_state = null
263         tok_cur_tag = null # partially parsed tag
264         flag_frameset_ok = null
265         flag_parsing = null
266         flag_foster_parenting = null
267         afe = [] # active formatting elements
268
269         parse_error = ->
270                 if parse_error_cb?
271                         parse_error_cb cur
272                 else
273                         console.log "Parse error at character #{cur} of #{txt.length}"
274
275
        # the functions below implement the Tree Construction algorithm
        # http://www.w3.org/TR/html5/syntax.html#tree-construction

        # But first... the helpers
280         template_tag_is_open = ->
281                 for t in open_els
282                         if t.type is TYPE_TAG and t.name is 'template'
283                                 return true
284                 return false
285         is_in_scope_x = (tag_name, scope) ->
286                 for t in open_els
287                         if t.name is tag_name
288                                 return true
289                         if t.name of scope
290                                 return false
291                 return false
292         is_in_scope_x_y = (tag_name, scope, scope2) ->
293                 for t in open_els
294                         if t.name is tag_name
295                                 return true
296                         if t.name of scope
297                                 return false
298                         if t.name of scope2
299                                 return false
300                 return false
301         standard_scopers = { # FIXME these are supposed to be namespace specific
302                 'applet': true, 'caption': true, 'html': true, 'table': true, 'td': true,
303                 'th': true, 'marquee': true, 'object': true, 'template': true, 'mi': true,
304                 'mo': true, 'mn': true, 'ms': true, 'mtext': true, 'annotation-xml': true,
305                 'foreignObject': true, 'desc': true, 'title'
306         }
307         button_scopers = button: true
308         li_scopers = ol: true, ul: true
309         table_scopers = html: true, table: true, template: true
310         is_in_scope = (tag_name) ->
311                 return is_in_scope_x tag_name, standard_scopers
312         is_in_button_scope = (tag_name) ->
313                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers
314         is_in_table_scope = (tag_name) ->
315                 return is_in_scope_x tag_name, table_scopers
316         is_in_select_scope = (tag_name) ->
317                 for t in open_els
318                         if t.name is tag_name
319                                 return true
320                         if t.name isnt 'optgroup' and t.name isnt 'option'
321                                 return false
322                 return false
323         # this checks for a particular element, not by name
324         el_is_in_scope = (el) ->
325                 for t in open_els
326                         if t is el
327                                 return true
328                         if t.name of standard_scopers
329                                 return false
330                 return false
331
332         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
333         # this implementation is structured (mostly) as described at the link above.
334         # capitalized comments are the "labels" described at the link above.
335         reconstruct_active_formatting_elements = ->
336                 return if afe.length is 0
337                 if afe[0].type is TYPE_MARKER or afe[0] in open_els
338                         return
339                 # Rewind
340                 i = 0
341                 loop
342                         if i is afe.length - 1
343                                 break
344                         i += 1
345                         if afe[i].type is TYPE_MARKER or afe[i] in open_els
346                                 i -= 1 # Advance
347                                 break
348                 # Create
349                 loop
350                         el = afe[i].shallow_clone()
351                         tree_insert_element el
352                         afe[i] = el
353                         break if i is 0
354                         i -= 1
355
356         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
357         # adoption agency algorithm
358         adoption_agency = (subject) ->
359                 if open_els[0].name is subject
360                         el = open_els[0]
361                         open_els.shift()
362                         # remove it from the list of active formatting elements (if found)
363                         for t, i in afe
364                                 if t is el
365                                         afe.splice i, 1
366                                         break
367                         return
368                 outer = 0
369                 loop
370                         if outer >= 8
371                                 return
372                         outer += 1
373                         fe = null
374                         for t, fe_index in afe
375                                 if t.type is TYPE_MARKER
376                                         break
377                                 if t.name is subject
378                                         fe = t
379                                         break
380                         if fe is null
381                                 in_body_any_other_end_tag subject
382                                 return
383                         in_open_els = false
384                         for t in open_els
385                                 if t is fe
386                                         in_open_els = true
387                                         break
388                         unless in_open_els
389                                 parse_error()
390                                 # "remove it from the list" must mean afe, since it's not in open_els
391                                 afe.splice fe_index, 1
392                                 return
393                         unless el_is_in_scope fe
394                                 parse_error()
395                                 return
396                         unless open_els[0] is fe
397                                 parse_error()
398                                 # continue
399                         fb = null
400                         fb_index
401                         for t, i in open_els
402                                 if t is fe
403                                         break
404                                 if el_is_special t
405                                         fb = t
406                                         fb_index = i
407                         if fb is null
408                                 loop
409                                         t = open_els.shift()
410                                         if t is fe
411                                                 afe.splice fe_index, 1
412                                                 return
413                         ca = open_els[fe_index + 1] # common ancestor
414                         node_above = open_els[fb_index + 1] # next node if node isn't in open_els anymore
415                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
416                         bookmark = new_aaa_bookmark()
417                         for t, i in afe
418                                 if t is fe
419                                         afe.splice i, 0, bookmark
420                         node = last_node = fb
421                         inner = 0
422                         loop
423                                 inner += 1
424                                 node_next = null
425                                 for t, i in open_els
426                                         if t is node
427                                                 node_next = open_els[i + 1]
428                                                 break
429                                 node = node_next ? node_above
430                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
431                                 if node is fe
432                                         break
433                                 node_in_afe = false
434                                 for t, i of afe
435                                         if t is node
436                                                 if inner > 3
437                                                         afe.splice i, 1
438                                                 else
439                                                         node_in_afe = true
440                                                 break
441                                 unless node_in_afe
442                                         for t, i in open_els
443                                                 if t is node
444                                                         node_above = open_els[i + 1]
445                                                         open_els.splice i, 1
446                                                         break
447                                         continue
448                                 # 7. reate an element for the token for which the element node
449                                 # was created, in the HTML namespace, with common ancestor as
450                                 # the intended parent; replace the entry for node in the list
451                                 # of active formatting elements with an entry for the new
452                                 # element, replace the entry for node in the stack of open
453                                 # elements with an entry for the new element, and let node be
454                                 # the new element.
455                                 new_node = node.shallow_clone()
456                                 for t, i in afe
457                                         if t is node
458                                                 afe[i] = new_node
459                                                 break
460                                 for t, i in open_els
461                                         if t is node
462                                                 open_els[i] = new_node
463                                                 break
464                                 node = new_node
465                                 # 8. If last node is furthest block, then move the
466                                 # aforementioned bookmark to be immediately after the new node
467                                 # in the list of active formatting elements.
468                                 if last_node is fb
469                                         for t, i in afe
470                                                 if t is bookmark
471                                                         afe.splice i, 1
472                                         for t, i in afe
473                                                 if t is node
474                                                         # TODO test: position i gets you "after"?
475                                                         afe.splice i, 0, new_aaa_bookmark()
476                                 # 9. Insert last node into node, first removing it from its
477                                 # previous parent node if any.
478                                 if last_node.parent?
479                                         for c, i of last_node.parent.children
480                                                 if c is last_node
481                                                         last_node.parent.children.splice i, 1
482                                 node.children.push last_node
483                                 last_node.parent = node
484                                 # 10. Let last node be node.
485                                 last_node = node
486                                 # 11. Return to the step labeled inner loop.
487                         # 14. Insert whatever last node ended up being in the previous step
488                         # at the appropriate place for inserting a node, but using common
489                         # ancestor as the override target.
490                         tree_insert_element last_node, ca
491                         # 15. Create an element for the token for which formatting element
492                         # was created, in the HTML namespace, with furthest block as the
493                         # intended parent.
494                         new_element = fe.shallow_clone()
495                         # 16. Take all of the child nodes of furthest block and append them
496                         # to the element created in the last step.
497                         while fb.children.length
498                                 t = fb.children.shift()
499                                 t.parent = new_element
500                                 new_element.children.push t
501                         # 17. Append that new element to furthest block.
502                         new_element.parent = fb
503                         fb.children.push new_element
504                         # 18. Remove formatting element from the list of active formatting
505                         # elements, and insert the new element into the list of active
506                         # formatting elements at the position of the aforementioned
507                         # bookmark.
508                         for t, i in afe
509                                 if t is fe
510                                         afe.splice i, 1
511                                         break
512                         for t, i in afe
513                                 if t is bookmark
514                                         afe[i] = node
515                                         break
516                         # 19. Remove formatting element from the stack of open elements,
517                         # and insert the new element into the stack of open elements
518                         # immediately below the position of furthest block in that stack.
519                         for t, i of open_els
520                                 if t is fe
521                                         open_els.splice i, 1
522                                         break
523                         for t, i of open_els
524                                 if t is fb
525                                         open_els.splice i, 0, new_element
526                                         break
527                         # 20. Jump back to the step labeled outer loop.
528
529         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
530         # FIXME implement this
531         close_p_if_in_button_scope = ->
532                 if open_els[0].name is 'p'
533                         open_els.pop()
534                 return
535                 #p = find_button_scope 'p'
536                 #if p?
537                         # TODO generate_implied_end_tags except for p tags
538                         # TODO parse_error unless open_els[0].name is 'p'
539                         # TODO pop stack until 'p' popped
540
541         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
542         tree_insert_text = (t) ->
543                 dest = adjusted_insertion_location()
544                 if dest[1] > 0
545                         prev = dest[0].children[dest[1] - 1]
546                         if prev.type is TYPE_TEXT
547                                 prev.text += t.text
548                                 return
549                 dest[0].children.splice dest[1], 0, t
550
551         # 8.2.5.1
552         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
553         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
554         adjusted_insertion_location = (override_target = null) ->
555                 # 1. If there was an override target specified, then let target be the
556                 # override target.
557                 if override_target?
558                         target = override_target
559                 else # Otherwise, let target be the current node.
560                         target = open_els[0]
561                 # 2. Determine the adjusted insertion location using the first matching
562                 # steps from the following list:
563                 #
564                 # If foster parenting is enabled and target is a table, tbody, tfoot,
565                 # thead, or tr element Foster parenting happens when content is
566                 # misnested in tables.
567                 if flag_foster_parenting and target.name in foster_parenting_targets
568                         console.log "foster parenting isn't implemented yet" # TODO
569                         # 1. Let last template be the last template element in the stack of
570                         # open elements, if any.
571                         # 2. Let last table be the last table element in the stack of open
572                         # elements, if any.
573
574                         # 3. If there is a last template and either there is no last table,
575                         # or there is one, but last template is lower (more recently added)
576                         # than last table in the stack of open elements, then: let adjusted
577                         # insertion location be inside last template's template contents,
578                         # after its last child (if any), and abort these substeps.
579
580                         # 4. If there is no last table, then let adjusted insertion
581                         # location be inside the first element in the stack of open
582                         # elements (the html element), after its last child (if any), and
583                         # abort these substeps. (fragment case)
584
585                         # 5. If last table has a parent element, then let adjusted
586                         # insertion location be inside last table's parent element,
587                         # immediately before last table, and abort these substeps.
588
589                         # 6. Let previous element be the element immediately above last
590                         # table in the stack of open elements.
591
592                         # 7. Let adjusted insertion location be inside previous element,
593                         # after its last child (if any).
594
595                         # Note: These steps are involved in part because it's possible for
596                         # elements, the table element in this case in particular, to have
597                         # been moved by a script around in the DOM, or indeed removed from
598                         # the DOM entirely, after the element was inserted by the parser.
599                 else
600                         # Otherwise Let adjusted insertion location be inside target, after
601                         # its last child (if any).
602                         target_i = target.children.length
603
604                 # 3. If the adjusted insertion location is inside a template element,
605                 # let it instead be inside the template element's template contents,
606                 # after its last child (if any). TODO
607
608                 # 4. Return the adjusted insertion location.
609                 return [target, target_i]
610
611         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
612         # aka create_an_element_for_token
613         token_to_element = (t, namespace, intended_parent) ->
614                 t.type = TYPE_TAG # not TYPE_OPEN_TAG
615                 # convert attributes into a hash
616                 attrs = {}
617                 while t.attrs_a.length
618                         a = t.attrs_a.pop()
619                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
620                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
621
622                 # TODO 2. If the newly created element has an xmlns attribute in the
623                 # XMLNS namespace whose value is not exactly the same as the element's
624                 # namespace, that is a parse error. Similarly, if the newly created
625                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
626                 # value is not the XLink Namespace, that is a parse error.
627
628                 # fixfull: the spec says stuff about form pointers and ownerDocument
629
630                 return el
631
632         # FIXME implement the "foster parenting" part
633         # FIXME read spec, do this right
634         # FIXME implement the override target thing
635         # note: this assumes it's an open tag
636         # TODO tree_insert_html_element = (t, ...
637         tree_insert_element = (el, override_target = null, namespace = null) ->
638                 dest = adjusted_insertion_location override_target
639                 if el.type is TYPE_OPEN_TAG # means it's a "token"
640                         el = token_to_element el, namespace, dest[0]
641                 # fixfull: Document nodes sometimes can't accept more chidren
642                 dest[0].children.splice dest[1], 0, el
643                 el.parent = dest[0]
644                 open_els.unshift el
645                 return el
646
647         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
648         tree_insert_a_comment = (t) ->
649                 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
650                 open_els[0].children.push t
651
652         # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
653         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
654                 for node, i in open_els
655                         if node.name is name
656                                 # FIXME generate implied end tags except those with name==name
657                                 parse_error() unless i is 0
658                                 while i > 0
659                                         open_els.shift()
660                                         i -= 1
661                                 open_els.shift()
662                                 return
663                         if special_elements[node.name]?
664                                 parse_error()
665                                 return
666         tree_in_body = (t) ->
667                 switch t.type
668                         when TYPE_TEXT
669                                 switch t.text
670                                         when "\u0000"
671                                                 parse_error()
672                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
673                                                 reconstruct_active_formatting_elements()
674                                                 tree_insert_text t
675                                         else
676                                                 reconstruct_active_formatting_elements()
677                                                 tree_insert_text t
678                                                 flag_frameset_ok = false
679                         when TYPE_COMMENT
680                                 tree_insert_a_comment t
681                         when TYPE_DOCTYPE
682                                 parse_error()
683                         when TYPE_OPEN_TAG
684                                 switch t.name
685                                         when 'html'
686                                                 parse_error()
687                                                 return if template_tag_is_open()
688                                                 root_attrs = open_els[open_els.length - 1].children
689                                                 for k, v of t.attrs
690                                                         root_attrs[k] = v unless root_attrs[k]?
691                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
692                                                 # FIXME also do this for </template> (end tag)
693                                                 return tree_in_head t
694                                         when 'body'
695                                                 parse_error()
696                                                 # TODO
697                                         when 'frameset'
698                                                 parse_error()
699                                                 # TODO
700                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
701                                                 close_p_if_in_button_scope()
702                                                 tree_insert_element t
703                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
704                                                 close_p_if_in_button_scope()
705                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
706                                                         parse_error()
707                                                         open_els.shift()
708                                                 tree_insert_element t
709                                         # TODO lots more to implement here
710                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
711                                                 reconstruct_active_formatting_elements()
712                                                 el = tree_insert_element t
713                                                 afe.push el
714                                         # TODO lots more to implement here
715                                         else # any other start tag
716                                                 reconstruct_active_formatting_elements()
717                                                 tree_insert_element t
718                         when TYPE_EOF
719                                 ok_tags = {
720                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
721                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
722                                 }
723                                 for t in open_els
724                                         unless ok_tags[t.name]?
725                                                 parse_error()
726                                                 break
727                                 # TODO stack of template insertion modes thing
728                                 flag_parsing = false # stop parsing
729                         when TYPE_END_TAG
730                                 switch t.name
731                                         when 'body'
732                                                 unless is_in_scope 'body'
733                                                         parse_error()
734                                                         return
735                                                 # TODO implement parse error and move to tree_after_body
736                                         when 'html'
737                                                 unless is_in_scope 'body' # weird, but it's what the spec says
738                                                         parse_error()
739                                                         return
740                                                 # TODO implement parse error and move to tree_after_body, reprocess
741                                         # TODO lots more close tags to implement here
742                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
743                                                 adoption_agency t.name
744                                         # TODO lots more close tags to implement here
745                                         else
746                                                 in_body_any_other_end_tag t.name
747                 return
748
749
750         # the functions below implement the tokenizer states described here:
751         # http://www.w3.org/TR/html5/syntax.html#tokenization
752
753         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
754         tok_state_data = ->
755                 switch c = txt.charAt(cur++)
756                         when '&'
757                                 return new_text_node tokenize_character_reference()
758                         when '<'
759                                 tok_state = tok_state_tag_open
760                         when "\u0000"
761                                 parse_error()
762                                 return new_text_node c
763                         when '' # EOF
764                                 return new_eof_token()
765                         else
766                                 return new_text_node c
767                 return null
768
769         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
770         # not needed: tok_state_character_reference_in_data = ->
771         # just call tokenize_character_reference()
772
773         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
774         tok_state_tag_open = ->
775                 switch c = txt.charAt(cur++)
776                         when '!'
777                                 tok_state = tok_state_markup_declaration_open
778                         when '/'
779                                 tok_state = tok_state_end_tag_open
780                         when '?'
781                                 parse_error()
782                                 tok_state = tok_state_bogus_comment
783                         else
784                                 if lc_alpha.indexOf(c) > -1
785                                         tok_cur_tag = new_open_tag c
786                                         tok_state = tok_state_tag_name
787                                 else if uc_alpha.indexOf(c) > -1
788                                         tok_cur_tag = new_open_tag c.toLowerCase()
789                                         tok_state = tok_state_tag_name
790                                 else
791                                         parse_error()
792                                         tok_state = tok_state_data
793                                         cur -= 1 # we didn't parse/handle the char after <
794                                         return new_text_node '<'
795                 return null
796
797         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
798         tok_state_end_tag_open = ->
799                 switch c = txt.charAt(cur++)
800                         when '>'
801                                 parse_error()
802                                 tok_state = tok_state_data
803                         when '' # EOF
804                                 parse_error()
805                                 tok_state = tok_state_data
806                                 return new_text_node '</'
807                         else
808                                 if uc_alpha.indexOf(c) > -1
809                                         tok_cur_tag = new_end_tag c.toLowerCase()
810                                         tok_state = tok_state_tag_name
811                                 else if lc_alpha.indexOf(c) > -1
812                                         tok_cur_tag = new_end_tag c
813                                         tok_state = tok_state_tag_name
814                                 else
815                                         parse_error()
816                                         tok_state = tok_state_bogus_comment
817                 return null
818
819         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
820         tok_state_tag_name = ->
821                 switch c = txt.charAt(cur++)
822                         when "\t", "\n", "\u000c", ' '
823                                 tok_state = tok_state_before_attribute_name
824                         when '/'
825                                 tok_state = tok_state_self_closing_start_tag
826                         when '>'
827                                 tok_state = tok_state_data
828                                 tmp = tok_cur_tag
829                                 tok_cur_tag = null
830                                 return tmp
831                         when "\u0000"
832                                 parse_error()
833                                 tok_cur_tag.name += "\ufffd"
834                         when '' # EOF
835                                 parse_error()
836                                 tok_state = tok_state_data
837                         else
838                                 if uc_alpha.indexOf(c) > -1
839                                         tok_cur_tag.name += c.toLowerCase()
840                                 else
841                                         tok_cur_tag.name += c
842                 return null
843
844         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
845         tok_state_before_attribute_name = ->
846                 attr_name = null
847                 switch c = txt.charAt(cur++)
848                         when "\t", "\n", "\u000c", ' '
849                                 return null
850                         when '/'
851                                 tok_state = tok_state_self_closing_start_tag
852                                 return null
853                         when '>'
854                                 tok_state = tok_state_data
855                                 tmp = tok_cur_tag
856                                 tok_cur_tag = null
857                                 return tmp
858                         when "\u0000"
859                                 parse_error()
860                                 attr_name = "\ufffd"
861                         when '"', "'", '<', '='
862                                 parse_error()
863                                 attr_name = c
864                         when '' # EOF
865                                 parse_error()
866                                 tok_state = tok_state_data
867                         else
868                                 if uc_alpha.indexOf(c) > -1
869                                         attr_name = c.toLowerCase()
870                                 else
871                                         attr_name = c
872                 if attr_name?
873                         tok_cur_tag.attrs_a.unshift [attr_name, '']
874                         tok_state = tok_state_attribute_name
875                 return null
876
877         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
878         tok_state_attribute_name = ->
879                 switch c = txt.charAt(cur++)
880                         when "\t", "\n", "\u000c", ' '
881                                 tok_state = tok_state_after_attribute_name
882                         when '/'
883                                 tok_state = tok_state_self_closing_start_tag
884                         when '='
885                                 tok_state = tok_state_before_attribute_value
886                         when '>'
887                                 tok_state = tok_state_data
888                                 tmp = tok_cur_tag
889                                 tok_cur_tag = null
890                                 return tmp
891                         when "\u0000"
892                                 parse_error()
893                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
894                         when '"', "'", '<'
895                                 parse_error()
896                                 tok_cur_tag.attrs_a[0][0] = c
897                         when '' # EOF
898                                 parse_error()
899                                 tok_state = tok_state_data
900                         else
901                                 if uc_alpha.indexOf(c) > -1
902                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
903                                 else
904                                         tok_cur_tag.attrs_a[0][0] += c
905                 return null
906
907         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
908         tok_state_before_attribute_value = ->
909                 switch c = txt.charAt(cur++)
910                         when "\t", "\n", "\u000c", ' '
911                                 return null
912                         when '"'
913                                 tok_state = tok_state_attribute_value_double_quoted
914                         when '&'
915                                 tok_state = tok_state_attribute_value_unquoted
916                                 cur -= 1
917                         when "'"
918                                 tok_state = tok_state_attribute_value_single_quoted
919                         when "\u0000"
920                                 # Parse error
921                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
922                                 tok_state = tok_state_attribute_value_unquoted
923                         when '>'
924                                 # Parse error
925                                 tok_state = tok_state_data
926                                 tmp = tok_cur_tag
927                                 tok_cur_tag = null
928                                 return tmp
929                         when '' # EOF
930                                 parse_error()
931                                 tok_state = tok_state_data
932                         else
933                                 tok_cur_tag.attrs_a[0][1] += c
934                                 tok_state = tok_state_attribute_value_unquoted
935                 return null
936
937         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
938         tok_state_attribute_value_double_quoted = ->
939                 switch c = txt.charAt(cur++)
940                         when '"'
941                                 tok_state = tok_state_after_attribute_value_quoted
942                         when '&'
943                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
944                         when "\u0000"
945                                 # Parse error
946                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
947                         when '' # EOF
948                                 parse_error()
949                                 tok_state = tok_state_data
950                         else
951                                 tok_cur_tag.attrs_a[0][1] += c
952                 return null
953
954         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
955         tok_state_attribute_value_single_quoted = ->
956                 switch c = txt.charAt(cur++)
957                         when "'"
958                                 tok_state = tok_state_after_attribute_value_quoted
959                         when '&'
960                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
961                         when "\u0000"
962                                 # Parse error
963                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
964                         when '' # EOF
965                                 parse_error()
966                                 tok_state = tok_state_data
967                         else
968                                 tok_cur_tag.attrs_a[0][1] += c
969                 return null
970
971         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
972         tok_state_attribute_value_unquoted = ->
973                 switch c = txt.charAt(cur++)
974                         when "\t", "\n", "\u000c", ' '
975                                 tok_state = tok_state_before_attribute_name
976                         when '&'
977                                 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
978                         when '>'
979                                 tok_state = tok_state_data
980                                 tmp = tok_cur_tag
981                                 tok_cur_tag = null
982                                 return tmp
983                         when "\u0000"
984                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
985                         when '' # EOF
986                                 parse_error()
987                                 tok_state = tok_state_data
988                         else
989                                 # Parse Error if ', <, = or ` (backtick)
990                                 tok_cur_tag.attrs_a[0][1] += c
991                 return null
992
993         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
994         tok_state_after_attribute_value_quoted = ->
995                 switch c = txt.charAt(cur++)
996                         when "\t", "\n", "\u000c", ' '
997                                 tok_state = tok_state_before_attribute_name
998                         when '/'
999                                 tok_state = tok_state_self_closing_start_tag
1000                         when '>'
1001                                 tok_state = tok_state_data
1002                                 tmp = tok_cur_tag
1003                                 tok_cur_tag = null
1004                                 return tmp
1005                         when '' # EOF
1006                                 parse_error()
1007                                 tok_state = tok_state_data
1008                         else
1009                                 # Parse Error
1010                                 tok_state = tok_state_before_attribute_name
1011                                 cur -= 1 # we didn't handle that char
1012                 return null
1013
1014         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1015         # Don't set this as a state, just call it
1016         # returns a string (NOT a text node)
        tokenize_character_reference = (allowed_char = null, in_attr = false) ->
                # Try to consume a character reference starting at txt[cur], i.e. the
                # text that follows an '&' the caller has already consumed.
                #
                # allowed_char: extra character that, when it comes right after the
                #               '&', means "not a character reference"
                # in_attr: true when called from an attribute value state; enables
                #          the attribute-only exceptions for legacy references below
                #
                # Returns the decoded string and advances cur past the reference on
                # success. Otherwise returns '&' and leaves cur where it was.
                if cur >= txt.length
                        return '&'
                switch c = txt.charAt(cur)
                        when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
                                # explicitly not a parse error
                                return '&'
                        when ';'
                                # there has to be "one or more" alnums between & and ; to be a parse error
                                return '&'
                        when '#'
                                # numeric reference: "#" + decimal digits, or "#x" + hex digits
                                if cur + 1 >= txt.length
                                        return '&'
                                if txt.charAt(cur + 1).toLowerCase() is 'x'
                                        prefix = '#x'
                                        charset = hex_chars
                                        start = cur + 2
                                else
                                        charset = digits
                                        start = cur + 1
                                        prefix = '#'
                                # i = number of digit characters found at start
                                i = 0
                                while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
                                        i += 1
                                if i is 0
                                        return '&'
                                # an optional ';' terminator is counted as part of the reference
                                if txt.charAt(start + i) is ';'
                                        i += 1
                                # FIXME This is supposed to generate parse errors for some chars
                                decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
                                if decoded?
                                        cur = start + i
                                        return decoded
                                return '&'
                        else
                                # named reference: count the leading alnum characters (at most
                                # 31 are examined); after the loop i is that count
                                for i in [0...31]
                                        if alnum.indexOf(txt.charAt(cur + i)) is -1
                                                break
                                if i is 0
                                        # exit early, because parse_error() below needs at least one alnum
                                        return '&'
                                if txt.charAt(cur + i) is ';'
                                        i += 1 # include ';' terminator in value
                                        decoded = decode_named_char_ref txt.substr(cur, i)
                                        if decoded?
                                                cur += i
                                                return decoded
                                        parse_error()
                                        return '&'
                                else
                                        # no ';' terminator (only legacy char refs)
                                        max = i
                                        # NOTE(review): when max is 1, the CoffeeScript range [2..1]
                                        # counts downwards (2, then 1) rather than being empty --
                                        # confirm that is harmless for legacy_char_refs lookups
                                        for i in [2..max] # no prefix matches, so ok to check shortest first
                                                c = legacy_char_refs[txt.substr(cur, i)]
                                                if c?
                                                        if in_attr
                                                                if txt.charAt(cur + i) is '='
                                                                        # "because some legacy user agents will
                                                                        # misinterpret the markup in those cases"
                                                                        parse_error()
                                                                        return '&'
                                                                if alnum.indexOf(txt.charAt(cur + i)) > -1
                                                                        # this makes attributes forgiving about url args
                                                                        return '&'
                                                        # ok, and besides the weird exceptions for attributes...
                                                        # return the matching char
                                                        cur += i # consume entity chars
                                                        parse_error() # because no terminating ";"
                                                        return c
                                        # alnums followed '&' but matched no legacy reference
                                        parse_error()
                                        return '&'
                return # never reached
1089
1090         # tree constructor initialization
1091         # see comments on TYPE_TAG/etc for the structure of this data
1092         tree = new Node TYPE_TAG, name: 'html'
1093         open_els = [tree]
1094         tree_state = tree_in_body
1095         flag_frameset_ok = true
1096         flag_parsing = true
1097         flag_foster_parenting = false
1098         afe = [] # active formatting elements
1099
1100         # tokenizer initialization
1101         tok_state = tok_state_data
1102
1103         # proccess input
1104         while flag_parsing
1105                 t = tok_state()
1106                 if t?
1107                         tree_state t
1108         return tree.children
1109
1110 # everything below is tests on the above
# Minimal assertion helper for the tests in this file: strictly compares
# `output` with `expected_output` and reports the outcome via console.log.
#   description      label shown when the comparison fails
#   output           the value that was actually produced
#   expected_output  the value that should have been produced
test_equals = (description, output, expected_output) ->
        # successes intentionally omit the description, so consoles that
        # collapse identical messages can merge them into one line
        return console.log "passed." if output is expected_output
        console.log "FAILED: \"#{description}\""
        console.log "   Expected: #{expected_output}"
        console.log "     Actual: #{output}"
# Run one parser test case and report the result via console.log.
#
# `args` is an object with these keys:
#   name      label used in the passed/FAILED line
#   html      input handed to parse_html
#   expected  comma-joined serialization expected for the returned nodes
#   errors    number of parse errors the input is expected to trigger
#
# The case passes only when both the serialization and the number of reported
# parse errors match; on a mismatch the relevant details are printed.
test_parser = (args) ->
        # parse_html reports each parse error through the callback; collect them
        collected_errors = []
        parsed = parse_html args.html, (err) -> collected_errors.push err

        # serialize every returned node and join the pieces with commas
        serialized = ("#{node.serialize()}" for node in parsed).join ','

        output_ok = serialized is args.expected
        errors_ok = collected_errors.length is args.errors

        if output_ok and errors_ok
                console.log "passed \"#{args.name}\""
        else
                console.log "FAILED: \"#{args.name}\""
        unless output_ok
                console.log "      Input: #{args.html}"
                console.log "    Correct: #{args.expected}"
                console.log "     Output: #{serialized}"
        unless errors_ok
                console.log "   Expected #{args.errors} parse errors, but got these: #{JSON.stringify collected_errors}"
1139
# Parser test cases. Each call hands test_parser one object:
#   name      label for the report
#   html      input to parse
#   expected  expected serialization of the resulting nodes
#   errors    expected number of parse errors
test_parser
        name: "empty"
        html: ""
        expected: ''
        errors: 0
test_parser
        name: "just text"
        html: "abc"
        expected: 'text:"abc"'
        errors: 0
test_parser
        name: "named entity"
        html: "a&amp;1234"
        expected: 'text:"a&1234"'
        errors: 0
test_parser
        name: "broken named character references"
        html: "1&amp2&&amp;3&aabbcc;"
        expected: 'text:"1&2&&3&aabbcc;"'
        errors: 2
test_parser
        name: "numbered entity overrides"
        html: "1&#X80&#x80; &#x83"
        expected: 'text:"1€€ ƒ"'
        errors: 0
test_parser
        name: "open tag"
        html: "foo<span>bar"
        expected: 'text:"foo",tag:"span",{},[text:"bar"]'
        errors: 1 # no close tag
test_parser
        name: "open tag with attributes"
        html: "foo<span style=\"foo: bar\" title=\"hi\">bar"
        expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
        errors: 1 # no close tag
test_parser
        name: "open tag with attributes of various quotings"
        html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar"
        expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
        errors: 1 # no close tag
test_parser
        name: "attribute entity exceptions dq"
        html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
        errors: 2 # no close tag, &amp= in attr
test_parser
        name: "attribute entity exceptions sq"
        html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
        errors: 2 # no close tag, &amp= in attr
test_parser
        name: "attribute entity exceptions uq"
        html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar"
        expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
        errors: 2 # no close tag, &amp= in attr
test_parser
        name: "matching closing tags"
        html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar"
        expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
        errors: 0
test_parser
        name: "missing closing tag inside"
        html: "foo<div>bar<span>baz</div>qux"
        expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
        errors: 1 # close tag mismatch
test_parser
        name: "mis-matched closing tags"
        html: "<span>12<div>34</span>56</div>78"
        expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
        errors: 2 # misplaced </span>, no </span> at the end
test_parser
        name: "mis-matched formatting elements"
        html: "12<b>34<i>56</b>78</i>90"
        expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
        errors: 1 # no idea how many there should be
test_parser
        name: "crazy formatting elements test"
        html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>"
        # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
        # firefox does this:
        expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
        errors: 6 # no idea how many there should be