1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost"
43 # 4: e "previous" (to d), "above"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an obect of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
57 TYPE_OPEN_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
79 constructor: (type, args = {}) ->
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {}
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_OPEN_TAG only
85 @children = args.children ? []
86 @namespace = args.namespace ? NS_HTML
87 @parent = args.parent ? null
91 @id = "#{++prev_node_id}"
92 shallow_clone: -> # return a new node that's the same except without the children or parent
93 # WARNING this doesn't work right on open tags that are still being parsed
95 attrs[k] = v for k, v of @attrs
96 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97 serialize: (shallow = false, show_ids = false) -> # for unit tests
102 ret += JSON.stringify @name
108 ret += JSON.stringify @attrs
114 ret += c.serialize shallow, show_ids
118 ret += JSON.stringify @text
121 ret += JSON.stringify @text
127 when TYPE_AAA_BOOKMARK
128 ret += 'aaa_bookmark'
131 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
134 # helpers: (only take args that are normally known when parser creates nodes)
135 new_open_tag = (name) ->
136 return new Node TYPE_OPEN_TAG, name: name
137 new_end_tag = (name) ->
138 return new Node TYPE_END_TAG, name: name
139 new_element = (name) ->
140 return new Node TYPE_TAG, name: name
141 new_text_node = (txt) ->
142 return new Node TYPE_TEXT, text: txt
143 new_comment_node = (txt) ->
144 return new Node TYPE_COMMENT, text: txt
146 return new Node TYPE_EOF
147 new_aaa_bookmark = ->
148 return new Node TYPE_AAA_BOOKMARK
150 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
151 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
152 digits = "0123456789"
153 alnum = lc_alpha + uc_alpha + digits
154 hex_chars = digits + "abcdefABCDEF"
156 # some SVG elements have dashes in them
157 tag_name_chars = alnum + "-"
159 # http://www.w3.org/TR/html5/infrastructure.html#space-character
160 space_chars = "\u0009\u000a\u000c\u000d\u0020"
162 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
163 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
165 # These are the character references that don't need a terminating semicolon
166 # min length: 2, max: 6, none are a prefix of any other.
168 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
169 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
170 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
171 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
172 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
173 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
174 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
175 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
176 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
177 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
178 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
179 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
180 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
181 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
182 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
183 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
184 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
188 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
189 raw_text_elements = ['script', 'style']
190 escapable_raw_text_elements = ['textarea', 'title']
191 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
193 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
194 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
195 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
196 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
197 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
198 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
199 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
200 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
201 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
202 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
203 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
204 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
205 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
206 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
210 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
212 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
213 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
214 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
215 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
216 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
217 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
218 'determinant', 'diff', 'divergence', 'divide', 'domain',
219 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
220 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
221 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
222 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
223 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
224 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
225 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
226 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
227 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
228 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
229 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
230 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
231 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
232 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
233 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
234 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
235 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
236 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
237 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
238 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
239 'vectorproduct', 'xor'
241 # foreign_elements = [svg_elements..., mathml_elements...]
242 #normal_elements = All other allowed HTML elements are normal elements.
246 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
247 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
248 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
249 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
250 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
251 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
252 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
253 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
254 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
255 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
256 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
257 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
258 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
259 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
260 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
261 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
262 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
263 wbr:NS_HTML, xmp:NS_HTML,
266 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
267 'annotation-xml':NS_MATHML,
270 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
273 formatting_elements = {
274 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
275 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
293 el_is_special = (e) ->
294 return special_elements[e.name]?
295 # FIXME it should really be:
296 #return special_elements[e.name] is e.namespace
298 # decode_named_char_ref()
300 # The list of named character references is _huge_ so ask the browser to decode
301 # for us instead of wasting bandwidth/space on including the table here.
303 # Pass without the "&" but with the ";" examples:
304 # for "&" pass "amp;"
305 # for "′" pass "x2032;"
308 textarea: document.createElement('textarea')
310 # TODO test this in IE8
311 decode_named_char_ref = (txt) ->
313 decoded = g_dncr.cache[txt]
314 return decoded if decoded?
315 g_dncr.textarea.innerHTML = txt
316 decoded = g_dncr.textarea.value
317 return null if decoded is txt
318 return g_dncr.cache[txt] = decoded
320 parse_html = (txt, parse_error_cb = null) ->
321 cur = 0 # index of next char in txt to be parsed
322 # declare tree and tokenizer variables so they're in scope below
324 open_els = [] # stack of open elements
327 tok_cur_tag = null # partially parsed tag
328 flag_frameset_ok = null
330 flag_foster_parenting = null
331 afe = [] # active formatting elements
337 console.log "Parse error at character #{cur} of #{txt.length}"
340 # the functions below impliment the Tree Contstruction algorithm
341 # http://www.w3.org/TR/html5/syntax.html#tree-construction
343 # But first... the helpers
344 template_tag_is_open = ->
346 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
349 is_in_scope_x = (tag_name, scope, namespace) ->
351 if t.name is tag_name and (namespace is null or namespace is t.namespace)
353 if scope[t.name] is t.namespace
356 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
358 if t.name is tag_name and (namespace is null or namespace is t.namespace)
360 if scope[t.name] is t.namespace
362 if scope2[t.name] is t.namespace
365 standard_scopers = { # FIXME these are supposed to be namespace specific
366 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
367 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
368 template: NS_HTML, mi: NS_MATHML,
370 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
371 'annotation-xml': NS_MATHML,
373 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
375 button_scopers = button: NS_HTML
376 li_scopers = ol: NS_HTML, ul: NS_HTML
377 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
378 is_in_scope = (tag_name, namespace = null) ->
379 return is_in_scope_x tag_name, standard_scopers, namespace
380 is_in_button_scope = (tag_name, namespace = null) ->
381 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
382 is_in_table_scope = (tag_name, namespace = null) ->
383 return is_in_scope_x tag_name, table_scopers, namespace
384 is_in_select_scope = (tag_name, namespace = null) ->
386 if t.name is tag_name and (namespace is null or namespace is t.namespace)
388 if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
391 # this checks for a particular element, not by name
392 el_is_in_scope = (el) ->
396 if standard_scopers[t.name] is t.namespace
400 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
401 # this implementation is structured (mostly) as described at the link above.
402 # capitalized comments are the "labels" described at the link above.
403 reconstruct_active_formatting_elements = ->
404 return if afe.length is 0
405 if afe[0].type is TYPE_MARKER or afe[0] in open_els
410 if i is afe.length - 1
413 if afe[i].type is TYPE_MARKER or afe[i] in open_els
418 el = afe[i].shallow_clone()
419 tree_insert_element el
424 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
425 # adoption agency algorithm
427 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
428 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
429 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
430 adoption_agency = (subject) ->
431 if open_els[0].name is subject
434 # remove it from the list of active formatting elements (if found)
445 # 5. Let formatting element be the last element in the list of
446 # active formatting elements that: is between the end of the list
447 # and the last scope marker in the list, if any, or the start of
448 # the list otherwise, and has the tag name subject.
450 for t, fe_of_afe in afe
451 if t.type is TYPE_MARKER
456 # If there is no such element, then abort these steps and instead
457 # act as described in the "any other end tag" entry above.
459 in_body_any_other_end_tag subject
461 # 6. If formatting element is not in the stack of open elements,
462 # then this is a parse error; remove the element from the list, and
465 for t, fe_of_open_els in open_els
471 # "remove it from the list" must mean afe, since it's not in open_els
472 afe.splice fe_of_afe, 1
474 # 7. If formatting element is in the stack of open elements, but
475 # the element is not in scope, then this is a parse error; abort
477 unless el_is_in_scope fe
480 # 8. If formatting element is not the current node, this is a parse
481 # error. (But do not abort these steps.)
482 unless open_els[0] is fe
485 # 9. Let furthest block be the topmost node in the stack of open
486 # elements that is lower in the stack than formatting element, and
487 # is an element in the special category. There might not be one.
489 fb_of_open_els = null
496 # and continue, to see if there's one that's more "topmost"
497 # 10. If there is no furthest block, then the UA must first pop all
498 # the nodes from the bottom of the stack of open elements, from the
499 # current node up to and including formatting element, then remove
500 # formatting element from the list of active formatting elements,
501 # and finally abort these steps.
506 afe.splice fe_of_afe, 1
508 # 11. Let common ancestor be the element immediately above
509 # formatting element in the stack of open elements.
510 ca = open_els[fe_of_open_els + 1] # common ancestor
512 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
513 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
514 bookmark = new_aaa_bookmark()
517 afe.splice i, 0, bookmark
519 node = last_node = fb
523 # 3. Let node be the element immediately above node in the
524 # stack of open elements, or if node is no longer in the stack
525 # of open elements (e.g. because it got removed by this
526 # algorithm), the element that was immediately above node in
527 # the stack of open elements before node was removed.
531 node_next = open_els[i + 1]
533 node = node_next ? node_above
534 debug_log "inner loop #{inner}"
535 debug_log "open_els: #{serialize_els open_els, true, true}"
536 debug_log "tree: #{serialize_els tree.children, false, true}"
537 debug_log "afe: #{serialize_els afe, true, true}"
538 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
539 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
540 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
541 debug_log "node: #{node.serialize true, true}"
542 # TODO make sure node_above gets re-set if/when node is removed from open_els
544 # 4. If node is formatting element, then go to the next step in
545 # the overall algorithm.
549 # 5. If inner loop counter is greater than three and node is in
550 # the list of active formatting elements, then remove node from
551 # the list of active formatting elements.
557 debug_log "max out inner"
562 # 6. If node is not in the list of active formatting elements,
563 # then remove node from the stack of open elements and then go
564 # back to the step labeled inner loop.
566 debug_log "not in afe"
569 node_above = open_els[i + 1]
573 debug_log "the bones"
574 # 7. create an element for the token for which the element node
575 # was created, in the HTML namespace, with common ancestor as
576 # the intended parent; replace the entry for node in the list
577 # of active formatting elements with an entry for the new
578 # element, replace the entry for node in the stack of open
579 # elements with an entry for the new element, and let node be
581 new_node = node.shallow_clone()
585 debug_log "replaced in afe"
589 node_above = open_els[i + 1]
590 open_els[i] = new_node
591 debug_log "replaced in open_els"
594 # 8. If last node is furthest block, then move the
595 # aforementioned bookmark to be immediately after the new node
596 # in the list of active formatting elements.
601 debug_log "removed bookmark"
605 # "after" means lower
606 afe.splice i, 0, bookmark # "after as <-
607 debug_log "placed bookmark after node"
608 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
610 # 9. Insert last node into node, first removing it from its
611 # previous parent node if any.
613 debug_log "last_node has parent"
614 for c, i in last_node.parent.children
616 debug_log "removing last_node from parent"
617 last_node.parent.children.splice i, 1
619 node.children.push last_node
620 last_node.parent = node
621 # 10. Let last node be node.
624 # 11. Return to the step labeled inner loop.
625 # 14. Insert whatever last node ended up being in the previous step
626 # at the appropriate place for inserting a node, but using common
627 # ancestor as the override target.
629 # JASON: In the case where fe is immediately followed by fb:
630 # * inner loop exits out early (node==fe)
632 # * last_node is still in the tree (not a duplicate)
634 debug_log "FEFIRST? last_node has parent"
635 for c, i in last_node.parent.children
637 debug_log "removing last_node from parent"
638 last_node.parent.children.splice i, 1
641 debug_log "after aaa inner loop"
642 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
643 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
644 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
645 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
646 debug_log "tree: #{serialize_els tree.children, false, true}"
651 # can't use standard insert token thing, because it's already in
652 # open_els and must stay at it's current position in open_els
653 dest = adjusted_insertion_location ca
654 dest[0].children.splice dest[1], 0, last_node
655 last_node.parent = dest[0]
658 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
659 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
660 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
661 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
662 debug_log "tree: #{serialize_els tree.children, false, true}"
664 # 15. Create an element for the token for which formatting element
665 # was created, in the HTML namespace, with furthest block as the
667 new_element = fe.shallow_clone() # FIXME intended parent thing
668 # 16. Take all of the child nodes of furthest block and append them
669 # to the element created in the last step.
670 while fb.children.length
671 t = fb.children.shift()
672 t.parent = new_element
673 new_element.children.push t
674 # 17. Append that new element to furthest block.
675 new_element.parent = fb
676 fb.children.push new_element
677 # 18. Remove formatting element from the list of active formatting
678 # elements, and insert the new element into the list of active
679 # formatting elements at the position of the aforementioned
689 # 19. Remove formatting element from the stack of open elements,
690 # and insert the new element into the stack of open elements
691 # immediately below the position of furthest block in that stack.
698 open_els.splice i, 0, new_element
700 # 20. Jump back to the step labeled outer loop.
701 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
702 debug_log "tree: #{serialize_els tree.children, false, true}"
703 debug_log "open_els: #{serialize_els open_els, true, true}"
704 debug_log "afe: #{serialize_els afe, true, true}"
707 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
708 # FIXME test this (particularly emplied end tags)
710 generate_implied_end_tags 'p' # arg is exception
711 if open_els[0].name isnt 'p'
713 while open_els.length > 1 # just in case
717 close_p_if_in_button_scope = ->
718 if is_in_button_scope 'p'
721 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
722 tree_insert_text = (t) ->
723 dest = adjusted_insertion_location()
725 prev = dest[0].children[dest[1] - 1]
726 if prev.type is TYPE_TEXT
729 dest[0].children.splice dest[1], 0, t
732 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
733 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
734 adjusted_insertion_location = (override_target = null) ->
735 # 1. If there was an override target specified, then let target be the
738 target = override_target
739 else # Otherwise, let target be the current node.
741 # 2. Determine the adjusted insertion location using the first matching
742 # steps from the following list:
744 # If foster parenting is enabled and target is a table, tbody, tfoot,
745 # thead, or tr element Foster parenting happens when content is
746 # misnested in tables.
747 if flag_foster_parenting and target.name in foster_parenting_targets
748 console.log "foster parenting isn't implemented yet" # TODO
749 # 1. Let last template be the last template element in the stack of
750 # open elements, if any.
751 # 2. Let last table be the last table element in the stack of open
754 # 3. If there is a last template and either there is no last table,
755 # or there is one, but last template is lower (more recently added)
756 # than last table in the stack of open elements, then: let adjusted
757 # insertion location be inside last template's template contents,
758 # after its last child (if any), and abort these substeps.
760 # 4. If there is no last table, then let adjusted insertion
761 # location be inside the first element in the stack of open
762 # elements (the html element), after its last child (if any), and
763 # abort these substeps. (fragment case)
765 # 5. If last table has a parent element, then let adjusted
766 # insertion location be inside last table's parent element,
767 # immediately before last table, and abort these substeps.
769 # 6. Let previous element be the element immediately above last
770 # table in the stack of open elements.
772 # 7. Let adjusted insertion location be inside previous element,
773 # after its last child (if any).
775 # Note: These steps are involved in part because it's possible for
776 # elements, the table element in this case in particular, to have
777 # been moved by a script around in the DOM, or indeed removed from
778 # the DOM entirely, after the element was inserted by the parser.
780 # Otherwise Let adjusted insertion location be inside target, after
781 # its last child (if any).
782 target_i = target.children.length
784 # 3. If the adjusted insertion location is inside a template element,
785 # let it instead be inside the template element's template contents,
786 # after its last child (if any). TODO
788 # 4. Return the adjusted insertion location.
789 return [target, target_i]
791 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
792 # aka create_an_element_for_token
793 token_to_element = (t, namespace, intended_parent) ->
794 t.type = TYPE_TAG # not TYPE_OPEN_TAG
795 # convert attributes into a hash
797 while t.attrs_a.length
799 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
800 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
802 # TODO 2. If the newly created element has an xmlns attribute in the
803 # XMLNS namespace whose value is not exactly the same as the element's
804 # namespace, that is a parse error. Similarly, if the newly created
805 # element has an xmlns:xlink attribute in the XMLNS namespace whose
806 # value is not the XLink Namespace, that is a parse error.
808 # fixfull: the spec says stuff about form pointers and ownerDocument
812 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
813 insert_foreign_element = (token, namespace) ->
814 ail = adjusted_insertion_location()
817 el = token_to_element token, namespace, ail_el
818 # TODO skip this next step if it's broken (eg ail_el is document with child already)
820 ail_el.children.splice ail_i, 0, el
823 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
824 insert_html_element = insert_foreign_element # (token, namespace) ->
826 # FIXME read implement "foster parenting" part
827 # FIXME read spec, do this right
828 # FIXME implement the override target thing
829 # note: this assumes it's an open tag
830 # FIXME what part of the spec is this?
831 # TODO look through all callers of this, and see what they should really be doing.
832 # eg probably insert_html_element for tokens
833 tree_insert_element = (el, override_target = null, namespace = null) ->
835 el.namespace = namespace
836 dest = adjusted_insertion_location override_target
837 if el.type is TYPE_OPEN_TAG # means it's a "token"
838 el = token_to_element el, namespace, dest[0]
840 namespace = dest.namespace
841 # fixfull: Document nodes sometimes can't accept more chidren
842 dest[0].children.splice dest[1], 0, el
847 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
848 tree_insert_a_comment = (t) ->
849 # FIXME read spec for "adjusted insertion location, etc, this might be wrong
850 open_els[0].children.push t
852 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
853 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
854 generate_implied_end_tags = (except = null) ->
855 while end_tag_implied[open_els[0]] and open_els[0].name isnt except
858 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
859 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
860 for node, i in open_els
861 if node.name is name # FIXME check namespace too
862 generate_implied_end_tags name # arg is exception
863 parse_error() unless i is 0
868 if special_elements[node.name]? # FIXME check namespac too
871 tree_in_body = (t) ->
877 when "\t", "\u000a", "\u000c", "\u000d", ' '
878 reconstruct_active_formatting_elements()
881 reconstruct_active_formatting_elements()
883 flag_frameset_ok = false
885 tree_insert_a_comment t
892 return if template_tag_is_open()
893 root_attrs = open_els[open_els.length - 1].attrs
895 root_attrs[k] = v unless root_attrs[k]?
896 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
897 # FIXME also do this for </template> (end tag)
898 return tree_in_head t
905 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
906 close_p_if_in_button_scope()
907 insert_html_element t
908 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
909 close_p_if_in_button_scope()
910 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
913 insert_html_element t
914 # TODO lots more to implement here
916 # If the list of active formatting elements
917 # contains an a element between the end of the list and
918 # the last marker on the list (or the start of the list
919 # if there is no marker on the list), then this is a
920 # parse error; run the adoption agency algorithm for
921 # the tag name "a", then remove that element from the
922 # list of active formatting elements and the stack of
923 # open elements if the adoption agency algorithm didn't
924 # already remove it (it might not have if the element
925 # is not in table scope).
928 if el.type is TYPE_MARKER
938 for el, i in open_els
941 reconstruct_active_formatting_elements()
942 el = tree_insert_element t
944 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
945 reconstruct_active_formatting_elements()
946 el = tree_insert_element t
948 # TODO lots more to implement here
949 else # any other start tag
950 reconstruct_active_formatting_elements()
951 tree_insert_element t
954 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
955 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
958 unless ok_tags[t.name]?
961 # TODO stack of template insertion modes thing
962 flag_parsing = false # stop parsing
966 unless is_in_scope 'body'
969 # TODO implement parse error and move to tree_after_body
971 unless is_in_scope 'body' # weird, but it's what the spec says
974 # TODO implement parse error and move to tree_after_body, reprocess
975 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
976 unless is_in_scope t.name, NS_HTML
979 generate_implied_end_tags()
980 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
983 el = open_els.shift()
984 if el.name is t.name and el.namespace is NS_HTML
986 # TODO lots more close tags to implement here
988 unless is_in_button_scope 'p'
990 insert_html_element new_open_tag 'p'
992 # TODO lots more close tags to implement here
993 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
994 adoption_agency t.name
995 # TODO lots more close tags to implement here
997 in_body_any_other_end_tag t.name
1001 # the functions below implement the tokenizer stats described here:
1002 # http://www.w3.org/TR/html5/syntax.html#tokenization
1004 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1006 switch c = txt.charAt(cur++)
1008 return new_text_node tokenize_character_reference()
1010 tok_state = tok_state_tag_open
1013 return new_text_node c
1015 return new_eof_token()
1017 return new_text_node c
1020 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1021 # not needed: tok_state_character_reference_in_data = ->
1022 # just call tok_state_character_reference_in_data()
1024 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1025 tok_state_tag_open = ->
1026 switch c = txt.charAt(cur++)
1028 tok_state = tok_state_markup_declaration_open
1030 tok_state = tok_state_end_tag_open
1033 tok_state = tok_state_bogus_comment
1035 if lc_alpha.indexOf(c) > -1
1036 tok_cur_tag = new_open_tag c
1037 tok_state = tok_state_tag_name
1038 else if uc_alpha.indexOf(c) > -1
1039 tok_cur_tag = new_open_tag c.toLowerCase()
1040 tok_state = tok_state_tag_name
1043 tok_state = tok_state_data
1044 cur -= 1 # we didn't parse/handle the char after <
1045 return new_text_node '<'
1048 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1049 tok_state_end_tag_open = ->
1050 switch c = txt.charAt(cur++)
1053 tok_state = tok_state_data
1056 tok_state = tok_state_data
1057 return new_text_node '</'
1059 if uc_alpha.indexOf(c) > -1
1060 tok_cur_tag = new_end_tag c.toLowerCase()
1061 tok_state = tok_state_tag_name
1062 else if lc_alpha.indexOf(c) > -1
1063 tok_cur_tag = new_end_tag c
1064 tok_state = tok_state_tag_name
1067 tok_state = tok_state_bogus_comment
1070 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1071 tok_state_tag_name = ->
1072 switch c = txt.charAt(cur++)
1073 when "\t", "\n", "\u000c", ' '
1074 tok_state = tok_state_before_attribute_name
1076 tok_state = tok_state_self_closing_start_tag
1078 tok_state = tok_state_data
1084 tok_cur_tag.name += "\ufffd"
1087 tok_state = tok_state_data
1089 if uc_alpha.indexOf(c) > -1
1090 tok_cur_tag.name += c.toLowerCase()
1092 tok_cur_tag.name += c
1095 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1096 tok_state_before_attribute_name = ->
1098 switch c = txt.charAt(cur++)
1099 when "\t", "\n", "\u000c", ' '
1102 tok_state = tok_state_self_closing_start_tag
1105 tok_state = tok_state_data
1111 attr_name = "\ufffd"
1112 when '"', "'", '<', '='
1117 tok_state = tok_state_data
1119 if uc_alpha.indexOf(c) > -1
1120 attr_name = c.toLowerCase()
1124 tok_cur_tag.attrs_a.unshift [attr_name, '']
1125 tok_state = tok_state_attribute_name
1128 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1129 tok_state_attribute_name = ->
1130 switch c = txt.charAt(cur++)
1131 when "\t", "\n", "\u000c", ' '
1132 tok_state = tok_state_after_attribute_name
1134 tok_state = tok_state_self_closing_start_tag
1136 tok_state = tok_state_before_attribute_value
1138 tok_state = tok_state_data
1144 tok_cur_tag.attrs_a[0][0] = "\ufffd"
1147 tok_cur_tag.attrs_a[0][0] = c
1150 tok_state = tok_state_data
1152 if uc_alpha.indexOf(c) > -1
1153 tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
1155 tok_cur_tag.attrs_a[0][0] += c
1158 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
1159 tok_state_before_attribute_value = ->
1160 switch c = txt.charAt(cur++)
1161 when "\t", "\n", "\u000c", ' '
1164 tok_state = tok_state_attribute_value_double_quoted
1166 tok_state = tok_state_attribute_value_unquoted
1169 tok_state = tok_state_attribute_value_single_quoted
1172 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1173 tok_state = tok_state_attribute_value_unquoted
1176 tok_state = tok_state_data
1182 tok_state = tok_state_data
1184 tok_cur_tag.attrs_a[0][1] += c
1185 tok_state = tok_state_attribute_value_unquoted
1188 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
1189 tok_state_attribute_value_double_quoted = ->
1190 switch c = txt.charAt(cur++)
1192 tok_state = tok_state_after_attribute_value_quoted
1194 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
1197 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1200 tok_state = tok_state_data
1202 tok_cur_tag.attrs_a[0][1] += c
1205 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
1206 tok_state_attribute_value_single_quoted = ->
1207 switch c = txt.charAt(cur++)
1209 tok_state = tok_state_after_attribute_value_quoted
1211 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
1214 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1217 tok_state = tok_state_data
1219 tok_cur_tag.attrs_a[0][1] += c
1222 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
1223 tok_state_attribute_value_unquoted = ->
1224 switch c = txt.charAt(cur++)
1225 when "\t", "\n", "\u000c", ' '
1226 tok_state = tok_state_before_attribute_name
1228 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
1230 tok_state = tok_state_data
1235 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1238 tok_state = tok_state_data
1240 # Parse Error if ', <, = or ` (backtick)
1241 tok_cur_tag.attrs_a[0][1] += c
1244 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
1245 tok_state_after_attribute_value_quoted = ->
1246 switch c = txt.charAt(cur++)
1247 when "\t", "\n", "\u000c", ' '
1248 tok_state = tok_state_before_attribute_name
1250 tok_state = tok_state_self_closing_start_tag
1252 tok_state = tok_state_data
1258 tok_state = tok_state_data
1261 tok_state = tok_state_before_attribute_name
1262 cur -= 1 # we didn't handle that char
1265 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1266 # Don't set this as a state, just call it
1267 # returns a string (NOT a text node)
1268 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1269 if cur >= txt.length
1271 switch c = txt.charAt(cur)
1272 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1273 # explicitly not a parse error
1276 # there has to be "one or more" alnums between & and ; to be a parse error
1279 if cur + 1 >= txt.length
1281 if txt.charAt(cur + 1).toLowerCase() is 'x'
1290 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1294 if txt.charAt(start + i) is ';'
1296 # FIXME This is supposed to generate parse errors for some chars
1297 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1304 if alnum.indexOf(txt.charAt(cur + i)) is -1
1307 # exit early, because parse_error() below needs at least one alnum
1309 if txt.charAt(cur + i) is ';'
1310 i += 1 # include ';' terminator in value
1311 decoded = decode_named_char_ref txt.substr(cur, i)
1318 # no ';' terminator (only legacy char refs)
1320 for i in [2..max] # no prefix matches, so ok to check shortest first
1321 c = legacy_char_refs[txt.substr(cur, i)]
1324 if txt.charAt(cur + i) is '='
1325 # "because some legacy user agents will
1326 # misinterpret the markup in those cases"
1329 if alnum.indexOf(txt.charAt(cur + i)) > -1
1330 # this makes attributes forgiving about url args
1332 # ok, and besides the weird exceptions for attributes...
1333 # return the matching char
1334 cur += i # consume entity chars
1335 parse_error() # because no terminating ";"
1339 return # never reached
1341 # tree constructor initialization
1342 # see comments on TYPE_TAG/etc for the structure of this data
1343 tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
1345 tree_state = tree_in_body
1346 flag_frameset_ok = true
1348 flag_foster_parenting = false
1349 afe = [] # active formatting elements
1351 # tokenizer initialization
1352 tok_state = tok_state_data
1359 return tree.children
1361 # everything below is tests on the above
1362 test_equals = (description, output, expected_output) ->
1363 if output is expected_output
1364 console.log "passed." # don't say name, so smart consoles can merge all of these
1366 console.log "FAILED: \"#{description}\""
1367 console.log " Expected: #{expected_output}"
1368 console.log " Actual: #{output}"
1369 serialize_els = (els, shallow, show_ids) ->
1375 serialized += t.serialize shallow, show_ids
1377 test_parser = (args) ->
1382 prev_node_id = 0 # reset counter
1383 parsed = parse_html args.html, errors_cb
1384 serialized = serialize_els parsed, false, false
1385 if serialized isnt args.expected # or parse_errors.length isnt args.errors
1386 debug_log_each (str) ->
1388 console.log "FAILED: \"#{args.name}\""
1390 console.log "passed \"#{args.name}\""
1391 if serialized isnt args.expected
1392 console.log " Input: #{args.html}"
1393 console.log " Correct: #{args.expected}"
1394 console.log " Output: #{serialized}"
1395 if parse_errors.length isnt args.errors
1396 console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
1398 test_parser name: "empty", \
1402 test_parser name: "just text", \
1404 expected: 'text:"abc"',
1406 test_parser name: "named entity", \
1408 expected: 'text:"a&1234"',
1410 test_parser name: "broken named character references", \
1411 html: "1&2&&3&aabbcc;",
1412 expected: 'text:"1&2&&3&aabbcc;"',
1414 test_parser name: "numbered entity overrides", \
1415 html: "1€€ ƒ",
1416 expected: 'text:"1€€ ƒ"',
1418 test_parser name: "open tag", \
1419 html: "foo<span>bar",
1420 expected: 'text:"foo",tag:"span",{},[text:"bar"]',
1421 errors: 1 # no close tag
1422 test_parser name: "open tag with attributes", \
1423 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1424 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
1425 errors: 1 # no close tag
1426 test_parser name: "open tag with attributes of various quotings", \
1427 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1428 expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
1429 errors: 1 # no close tag
1430 test_parser name: "attribute entity exceptions dq", \
1431 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
1432 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1433 errors: 2 # no close tag, &= in attr
1434 test_parser name: "attribute entity exceptions sq", \
1435 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
1436 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1437 errors: 2 # no close tag, &= in attr
1438 test_parser name: "attribute entity exceptions uq", \
1439 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
1440 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1441 errors: 2 # no close tag, &= in attr
1442 test_parser name: "matching closing tags", \
1443 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1444 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
1446 test_parser name: "missing closing tag inside", \
1447 html: "foo<div>bar<span>baz</div>qux",
1448 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
1449 errors: 1 # close tag mismatch
1450 test_parser name: "mis-matched closing tags", \
1451 html: "<span>12<div>34</span>56</div>78",
1452 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
1453 errors: 2 # misplaced </span>, no </span> at the end
1454 test_parser name: "mis-matched formatting elements", \
1455 html: "12<b>34<i>56</b>78</i>90",
1456 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
1457 errors: 1 # no idea how many their should be
1458 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
1459 html: '<p>1<b>2<i>3</b>4</i>5</p>',
1460 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]',
1462 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
1463 html: '<b>1<p>2</b>3</p>',
1464 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]',
1466 test_parser name: "crazy formatting elements test", \
1467 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1468 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1469 # firefox does this:
1470 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
1471 errors: 6 # no idea how many there should be
1472 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
1473 test_parser name: "html5lib aaa 1", \
1474 html: '<a><p></a></p>',
1475 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
1477 test_parser name: "html5lib aaa 2", \
1478 html: '<a>1<p>2</a>3</p>',
1479 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
1481 test_parser name: "html5lib aaa 3", \
1482 html: '<a>1<button>2</a>3</button>',
1483 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
1485 test_parser name: "html5lib aaa 4", \
1486 html: '<a>1<b>2</a>3</b>',
1487 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
1489 test_parser name: "html5lib aaa 5", \
1490 html: '<a>1<div>2<div>3</a>4</div>5</div>',
1491 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]',