1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an object of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
56 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
# Build a node of the given type. Every other field is taken from the optional
# args object and falls back to the default shown on its line when absent.
79 constructor: (type, args = {}) ->
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {} # attribute map, name -> value
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only. NOTE(review): read from args.attr_k, not args.attrs_a -- confirm the key name is intended
85 @children = args.children ? [] # child nodes
86 @namespace = args.namespace ? NS_HTML # HTML namespace unless the caller says otherwise
87 @parent = args.parent ? null # parent node (null by default)
91 @id = "#{++prev_node_id}" # prev_node_id is incremented first, then stringified
92 shallow_clone: -> # return a new node that's the same except without the children or parent
93 # WARNING this doesn't work right on open tags that are still being parsed
95 attrs[k] = v for k, v of @attrs
96 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97 acknowledge_self_closing: ->
98 @flag 'did_self_close', true
101 serialize: (shallow = false, show_ids = false) -> # for unit tests
106 ret += JSON.stringify @name
121 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
127 ret += c.serialize shallow, show_ids
131 ret += JSON.stringify @text
134 ret += JSON.stringify @text
140 when TYPE_AAA_BOOKMARK
141 ret += 'aaa_bookmark'
144 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
147 # helpers: (only take args that are normally known when parser creates nodes)
# Node of type TYPE_START_TAG with the given name.
148 new_open_tag = (name) ->
149 return new Node TYPE_START_TAG, name: name
# Node of type TYPE_END_TAG with the given name.
150 new_end_tag = (name) ->
151 return new Node TYPE_END_TAG, name: name
# Node of type TYPE_TAG with the given name.
152 new_element = (name) ->
153 return new Node TYPE_TAG, name: name
# Node of type TYPE_TEXT whose text is txt.
154 new_text_node = (txt) ->
155 return new Node TYPE_TEXT, text: txt
# Alias: character tokens are built by the same function as text nodes.
156 new_character_token = new_text_node
# Node of type TYPE_COMMENT whose text is txt.
157 new_comment_node = (txt) ->
158 return new Node TYPE_COMMENT, text: txt
160 return new Node TYPE_EOF
162 return new Node TYPE_AFE_MARKER
163 new_aaa_bookmark = ->
164 return new Node TYPE_AAA_BOOKMARK
166 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
167 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
168 digits = "0123456789"
169 alnum = lc_alpha + uc_alpha + digits
170 hex_chars = digits + "abcdefABCDEF"
172 # some SVG elements have dashes in them
173 tag_name_chars = alnum + "-"
175 # http://www.w3.org/TR/html5/infrastructure.html#space-character
176 space_chars = "\u0009\u000a\u000c\u000d\u0020"
178 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a TYPE_TEXT node whose text is exactly one character and that
# character appears in space_chars.
179 is_space_tok = (t) ->
180 return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
182 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
183 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
185 # These are the character references that don't need a terminating semicolon
186 # min length: 2, max: 6, none are a prefix of any other.
188 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
189 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
190 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
191 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
192 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
193 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
194 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
195 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
196 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
197 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
198 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
199 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
200 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
201 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
202 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
203 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
204 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
208 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
209 raw_text_elements = ['script', 'style']
210 escapable_raw_text_elements = ['textarea', 'title']
211 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
213 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
214 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
215 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
216 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
217 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
218 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
219 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
220 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
221 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
222 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
223 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
224 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
225 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
226 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
230 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
232 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
233 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
234 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
235 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
236 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
237 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
238 'determinant', 'diff', 'divergence', 'divide', 'domain',
239 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
240 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
241 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
242 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
243 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
244 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
245 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
246 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
247 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
248 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
249 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
250 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
251 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
252 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
253 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
254 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
255 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
256 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
257 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
258 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
259 'vectorproduct', 'xor'
261 # foreign_elements = [svg_elements..., mathml_elements...]
262 #normal_elements = All other allowed HTML elements are normal elements.
266 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
267 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
268 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
269 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
270 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
271 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
272 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
273 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
274 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
275 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
276 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
277 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
278 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
279 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
280 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
281 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
282 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
283 wbr:NS_HTML, xmp:NS_HTML,
286 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
287 'annotation-xml':NS_MATHML,
290 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
293 formatting_elements = {
294 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
295 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
299 foster_parenting_targets = {
# True when special_elements maps e's name to the same namespace e is in.
321 el_is_special = (e) ->
322 return special_elements[e.name] is e.namespace
324 # decode_named_char_ref()
326 # The list of named character references is _huge_ so ask the browser to decode
327 # for us instead of wasting bandwidth/space on including the table here.
329 # Pass without the "&" but with the ";" examples:
330 # for "&" pass "amp;"
331 # for "′" pass "x2032;"
334 textarea: document.createElement('textarea')
336 # TODO test this in IE8
# Decode txt by assigning it to g_dncr.textarea.innerHTML and reading back
# .value. Returns null when the text comes back unchanged; otherwise the result
# is stored in g_dncr.cache under txt and returned.
337 decode_named_char_ref = (txt) ->
339 decoded = g_dncr.cache[txt] # reuse an earlier result if one is cached
340 return decoded if decoded?
341 g_dncr.textarea.innerHTML = txt
342 decoded = g_dncr.textarea.value
343 return null if decoded is txt # unchanged text is reported as null
344 return g_dncr.cache[txt] = decoded
346 parse_html = (txt, parse_error_cb = null) ->
347 cur = 0 # index of next char in txt to be parsed
348 # declare doc and tokenizer variables so they're in scope below
350 open_els = null # stack of open elements
351 afe = null # active formatting elements
352 template_insertion_modes = null
353 insertion_mode = null
354 original_insertion_mode = null
356 tok_cur_tag = null # partially parsed tag
357 flag_scripting = null
358 flag_frameset_ok = null
360 flag_foster_parenting = null
361 form_element_pointer = null
362 temporary_buffer = null
363 pending_table_character_tokens = null
364 head_element_pointer = null
370 console.log "Parse error at character #{cur} of #{txt.length}"
372 afe_push = (new_el) ->
375 if el.name is new_el.name and el.namespace is new_el.namespace
377 continue unless new_el.attrs[k] is v
378 for k, v of new_el.attrs
379 continue unless el.attrs[k] is v
386 afe.unshift new_afe_marker()
388 # the functions below implement the Tree Construction algorithm
389 # http://www.w3.org/TR/html5/syntax.html#tree-construction
391 # But first... the helpers
392 template_tag_is_open = ->
394 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
397 is_in_scope_x = (tag_name, scope, namespace) ->
399 if t.name is tag_name and (namespace is null or namespace is t.namespace)
401 if scope[t.name] is t.namespace
404 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
406 if t.name is tag_name and (namespace is null or namespace is t.namespace)
408 if scope[t.name] is t.namespace
410 if scope2[t.name] is t.namespace
413 standard_scopers = { # FIXME these are supposed to be namespace specific
414 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
415 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
416 template: NS_HTML, mi: NS_MATHML,
418 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
419 'annotation-xml': NS_MATHML,
421 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
423 button_scopers = button: NS_HTML
424 li_scopers = ol: NS_HTML, ul: NS_HTML
425 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Thin wrappers around is_in_scope_x / is_in_scope_x_y; each supplies its own
# table(s) of scope-delimiting elements. namespace defaults to null and is
# passed through unchanged.
426 is_in_scope = (tag_name, namespace = null) ->
427 return is_in_scope_x tag_name, standard_scopers, namespace
# Button scope: standard_scopers plus button_scopers.
428 is_in_button_scope = (tag_name, namespace = null) ->
429 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
# Table scope: table_scopers only.
430 is_in_table_scope = (tag_name, namespace = null) ->
431 return is_in_scope_x tag_name, table_scopers, namespace
# "Has an element in select scope": every element except HTML <optgroup> and
# HTML <option> delimits the scope. namespace defaults to null, which matches
# a tag in any namespace.
432 is_in_select_scope = (tag_name, namespace = null) ->
434 if t.name is tag_name and (namespace is null or namespace is t.namespace)
# FIX: this used to read `t.ns isnt NS_HTML t.name isnt ...`. Nodes store the
# namespace in `namespace` (there is no `ns` field), and with no operator after
# NS_HTML CoffeeScript parsed `NS_HTML t.name ...` as a call on NS_HTML.
436 if t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt 'option')
439 # this checks for a particular element, not by name
440 el_is_in_scope = (el) ->
444 if standard_scopers[t.name] is t.namespace
448 clear_to_table_stopers = {
453 clear_stack_to_table_context = ->
455 if clear_to_table_stopers[open_els[0].name]?
459 clear_to_table_body_stopers = {
466 clear_stack_to_table_body_context = ->
468 if clear_to_table_body_stopers[open_els[0].name]?
472 clear_to_table_row_stopers = {
477 clear_stack_to_table_row_context = ->
479 if clear_to_table_row_stopers[open_els[0].name]?
483 clear_afe_to_marker = ->
486 if el.type is TYPE_AFE_MARKER
490 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
491 reset_insertion_mode = ->
492 # 1. Let last be false.
494 # 2. Let node be the last node in the stack of open elements.
496 node = open_els[node_i]
497 # 3. Loop: If node is the first node in the stack of open elements,
498 # then set last to true, and, if the parser was originally created as
499 # part of the HTML fragment parsing algorithm (fragment case) set node
500 # to the context element.
502 if node_i is open_els.length - 1
504 # fixfull (fragment case)
506 # 4. If node is a select element, run these substeps:
507 if node.name is 'select'
508 # 1. If last is true, jump to the step below labeled done.
510 # 2. Let ancestor be node.
513 # 3. Loop: If ancestor is the first node in the stack of
514 # open elements, jump to the step below labeled done.
516 if ancestor_i is open_els.length - 1
518 # 4. Let ancestor be the node before ancestor in the stack
521 ancestor = open_els[ancestor_i]
522 # 5. If ancestor is a template node, jump to the step below
524 if ancestor.name is 'template'
526 # 6. If ancestor is a table node, switch the insertion mode
527 # to "in select in table" and abort these steps.
528 if ancestor.name is 'table'
529 insertion_mode = ins_mode_in_select_in_table
531 # 7. Jump back to the step labeled loop.
532 # 8. Done: Switch the insertion mode to "in select" and abort
534 insertion_mode = ins_mode_in_select
536 # 5. If node is a td or th element and last is false, then switch
537 # the insertion mode to "in cell" and abort these steps.
538 if (node.name is 'td' or node.name is 'th') and last is false
539 insertion_mode = ins_mode_in_cell
541 # 6. If node is a tr element, then switch the insertion mode to "in
542 # row" and abort these steps.
544 insertion_mode = ins_mode_in_row
546 # 7. If node is a tbody, thead, or tfoot element, then switch the
547 # insertion mode to "in table body" and abort these steps.
548 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
549 insertion_mode = ins_mode_in_table_body
551 # 8. If node is a caption element, then switch the insertion mode
552 # to "in caption" and abort these steps.
553 if node.name is 'caption'
554 insertion_mode = ins_mode_in_caption
556 # 9. If node is a colgroup element, then switch the insertion mode
557 # to "in column group" and abort these steps.
558 if node.name is 'colgroup'
559 insertion_mode = ins_mode_in_column_group
561 # 10. If node is a table element, then switch the insertion mode to
562 # "in table" and abort these steps.
563 if node.name is 'table'
564 insertion_mode = ins_mode_in_table
566 # 11. If node is a template element, then switch the insertion mode
567 # to the current template insertion mode and abort these steps.
568 # fixfull (template insertion mode stack)
570 # 12. If node is a head element and last is true, then switch the
571 # insertion mode to "in body" ("in body"! not "in head"!) and abort
572 # these steps. (fragment case)
573 if node.name is 'head' and last
574 insertion_mode = ins_mode_in_body
576 # 13. If node is a head element and last is false, then switch the
577 # insertion mode to "in head" and abort these steps.
578 if node.name is 'head' and last is false
579 insertion_mode = ins_mode_in_head
581 # 14. If node is a body element, then switch the insertion mode to
582 # "in body" and abort these steps.
583 if node.name is 'body'
584 insertion_mode = ins_mode_in_body
586 # 15. If node is a frameset element, then switch the insertion mode
587 # to "in frameset" and abort these steps. (fragment case)
588 if node.name is 'frameset'
589 insertion_mode = ins_mode_in_frameset
591 # 16. If node is an html element, run these substeps:
592 if node.name is 'html'
593 # 1. If the head element pointer is null, switch the insertion
594 # mode to "before head" and abort these steps. (fragment case)
595 # fixfull (fragment case)
597 # 2. Otherwise, the head element pointer is not null, switch
598 # the insertion mode to "after head" and abort these steps.
599 insertion_mode = ins_mode_in_body # FIXME fixfull
601 # 17. If last is true, then switch the insertion mode to "in body"
602 # and abort these steps. (fragment case)
604 insertion_mode = ins_mode_in_body
606 # 18. Let node now be the node before node in the stack of open
609 node = open_els[node_i]
610 # 19. Return to the step labeled loop.
612 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
613 # this implementation is structured (mostly) as described at the link above.
614 # capitalized comments are the "labels" described at the link above.
615 reconstruct_active_formatting_elements = ->
616 return if afe.length is 0
617 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
622 if i is afe.length - 1
625 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
630 el = afe[i].shallow_clone()
631 tree_insert_element el
636 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
637 # adoption agency algorithm
639 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
640 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
641 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
642 adoption_agency = (subject) ->
643 debug_log "adoption_agency()"
644 debug_log "tree: #{serialize_els doc.children, false, true}"
645 debug_log "open_els: #{serialize_els open_els, true, true}"
646 debug_log "afe: #{serialize_els afe, true, true}"
647 if open_els[0].name is subject
650 # remove it from the list of active formatting elements (if found)
655 debug_log "aaa: starting off with subject on top of stack, exiting"
662 # 5. Let formatting element be the last element in the list of
663 # active formatting elements that: is between the end of the list
664 # and the last scope marker in the list, if any, or the start of
665 # the list otherwise, and has the tag name subject.
667 for t, fe_of_afe in afe
668 if t.type is TYPE_AFE_MARKER
673 # If there is no such element, then abort these steps and instead
674 # act as described in the "any other end tag" entry above.
676 debug_log "aaa: fe not found in afe"
677 in_body_any_other_end_tag subject
679 # 6. If formatting element is not in the stack of open elements,
680 # then this is a parse error; remove the element from the list, and
683 for t, fe_of_open_els in open_els
688 debug_log "aaa: fe not found in open_els"
690 # "remove it from the list" must mean afe, since it's not in open_els
691 afe.splice fe_of_afe, 1
693 # 7. If formatting element is in the stack of open elements, but
694 # the element is not in scope, then this is a parse error; abort
696 unless el_is_in_scope fe
697 debug_log "aaa: fe not in scope"
700 # 8. If formatting element is not the current node, this is a parse
701 # error. (But do not abort these steps.)
702 unless open_els[0] is fe
705 # 9. Let furthest block be the topmost node in the stack of open
706 # elements that is lower in the stack than formatting element, and
707 # is an element in the special category. There might not be one.
709 fb_of_open_els = null
716 # and continue, to see if there's one that's more "topmost"
717 # 10. If there is no furthest block, then the UA must first pop all
718 # the nodes from the bottom of the stack of open elements, from the
719 # current node up to and including formatting element, then remove
720 # formatting element from the list of active formatting elements,
721 # and finally abort these steps.
723 debug_log "aaa: no fb"
727 afe.splice fe_of_afe, 1
729 # 11. Let common ancestor be the element immediately above
730 # formatting element in the stack of open elements.
731 ca = open_els[fe_of_open_els + 1] # common ancestor
733 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
734 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
735 bookmark = new_aaa_bookmark()
738 afe.splice i, 0, bookmark
740 node = last_node = fb
744 # 3. Let node be the element immediately above node in the
745 # stack of open elements, or if node is no longer in the stack
746 # of open elements (e.g. because it got removed by this
747 # algorithm), the element that was immediately above node in
748 # the stack of open elements before node was removed.
752 node_next = open_els[i + 1]
754 node = node_next ? node_above
755 debug_log "inner loop #{inner}"
756 debug_log "tree: #{serialize_els doc.children, false, true}"
757 debug_log "open_els: #{serialize_els open_els, true, true}"
758 debug_log "afe: #{serialize_els afe, true, true}"
759 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
760 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
761 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
762 debug_log "node: #{node.serialize true, true}"
763 # TODO make sure node_above gets re-set if/when node is removed from open_els
765 # 4. If node is formatting element, then go to the next step in
766 # the overall algorithm.
770 # 5. If inner loop counter is greater than three and node is in
771 # the list of active formatting elements, then remove node from
772 # the list of active formatting elements.
778 debug_log "max out inner"
783 # 6. If node is not in the list of active formatting elements,
784 # then remove node from the stack of open elements and then go
785 # back to the step labeled inner loop.
787 debug_log "not in afe"
790 node_above = open_els[i + 1]
794 debug_log "the bones"
795 # 7. create an element for the token for which the element node
796 # was created, in the HTML namespace, with common ancestor as
797 # the intended parent; replace the entry for node in the list
798 # of active formatting elements with an entry for the new
799 # element, replace the entry for node in the stack of open
800 # elements with an entry for the new element, and let node be
802 new_node = node.shallow_clone()
806 debug_log "replaced in afe"
810 node_above = open_els[i + 1]
811 open_els[i] = new_node
812 debug_log "replaced in open_els"
815 # 8. If last node is furthest block, then move the
816 # aforementioned bookmark to be immediately after the new node
817 # in the list of active formatting elements.
822 debug_log "removed bookmark"
826 # "after" means lower
827 afe.splice i, 0, bookmark # "after as <-
828 debug_log "placed bookmark after node"
829 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
831 # 9. Insert last node into node, first removing it from its
832 # previous parent node if any.
834 debug_log "last_node has parent"
835 for c, i in last_node.parent.children
837 debug_log "removing last_node from parent"
838 last_node.parent.children.splice i, 1
840 node.children.push last_node
841 last_node.parent = node
842 # 10. Let last node be node.
845 # 11. Return to the step labeled inner loop.
846 # 14. Insert whatever last node ended up being in the previous step
847 # at the appropriate place for inserting a node, but using common
848 # ancestor as the override target.
850 # In the case where fe is immediately followed by fb:
851 # * inner loop exits out early (node==fe)
853 # * last_node is still in the tree (not a duplicate)
855 debug_log "FEFIRST? last_node has parent"
856 for c, i in last_node.parent.children
858 debug_log "removing last_node from parent"
859 last_node.parent.children.splice i, 1
862 debug_log "after aaa inner loop"
863 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
864 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
865 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
866 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
867 debug_log "tree: #{serialize_els doc.children, false, true}"
872 # can't use standard insert token thing, because it's already in
873 # open_els and must stay at its current position in open_els
874 dest = adjusted_insertion_location ca
875 dest[0].children.splice dest[1], 0, last_node
876 last_node.parent = dest[0]
879 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
880 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
881 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
882 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
883 debug_log "tree: #{serialize_els doc.children, false, true}"
885 # 15. Create an element for the token for which formatting element
886 # was created, in the HTML namespace, with furthest block as the
888 new_element = fe.shallow_clone() # FIXME intended parent thing
889 # 16. Take all of the child nodes of furthest block and append them
890 # to the element created in the last step.
891 while fb.children.length
892 t = fb.children.shift()
893 t.parent = new_element
894 new_element.children.push t
895 # 17. Append that new element to furthest block.
896 new_element.parent = fb
897 fb.children.push new_element
898 # 18. Remove formatting element from the list of active formatting
899 # elements, and insert the new element into the list of active
900 # formatting elements at the position of the aforementioned
910 # 19. Remove formatting element from the stack of open elements,
911 # and insert the new element into the stack of open elements
912 # immediately below the position of furthest block in that stack.
919 open_els.splice i, 0, new_element
921 # 20. Jump back to the step labeled outer loop.
922 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
923 debug_log "tree: #{serialize_els doc.children, false, true}"
924 debug_log "open_els: #{serialize_els open_els, true, true}"
925 debug_log "afe: #{serialize_els afe, true, true}"
928 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
930 generate_implied_end_tags 'p' # arg is exception
931 if open_els[0].name isnt 'p'
933 while open_els.length > 1 # just in case
934 el = open_els.shift()
937 close_p_if_in_button_scope = ->
938 if is_in_button_scope 'p'
941 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
942 # aka insert_a_character = (t) ->
943 insert_character = (t) ->
944 dest = adjusted_insertion_location()
945 # fixfull check for Document node
947 prev = dest[0].children[dest[1] - 1]
948 if prev.type is TYPE_TEXT
951 dest[0].children.splice dest[1], 0, t
954 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
955 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
956 adjusted_insertion_location = (override_target = null) ->
957 # 1. If there was an override target specified, then let target be the
960 target = override_target
961 else # Otherwise, let target be the current node.
963 # 2. Determine the adjusted insertion location using the first matching
964 # steps from the following list:
966 # If foster parenting is enabled and target is a table, tbody, tfoot,
967 # thead, or tr element Foster parenting happens when content is
968 # misnested in tables.
969 if flag_foster_parenting and foster_parenting_targets[target.name]
970 loop # once. this is here so we can ``break`` to "abort these substeps"
971 # 1. Let last template be the last template element in the
972 # stack of open elements, if any.
974 last_template_i = null
975 for el, i in open_els
976 if el.name is 'template'
980 # 2. Let last table be the last table element in the stack of
981 # open elements, if any.
984 for el, i in open_els
985 if el.name is 'table'
989 # 3. If there is a last template and either there is no last
990 # table, or there is one, but last template is lower (more
991 # recently added) than last table in the stack of open
992 # elements, then: let adjusted insertion location be inside
993 # last template's template contents, after its last child (if
994 # any), and abort these substeps.
995 if last_template and (last_table is null or last_template_i < last_table_i)
# FIX: this assigned the bare name `template`, which the visible code never
# defines; the element tested in the condition above is last_template.
996 target = last_template # fixfull should be its contents
997 target_i = target.children.length
999 # 4. If there is no last table, then let adjusted insertion
1000 # location be inside the first element in the stack of open
1001 # elements (the html element), after its last child (if any),
1002 # and abort these substeps. (fragment case)
1003 if last_table is null
1005 target = open_els[open_els.length - 1]
1006 target_i = target.children.length
1007 # 5. If last table has a parent element, then let adjusted
1008 # insertion location be inside last table's parent element,
1009 # immediately before last table, and abort these substeps.
1010 if last_table.parent?
1011 for c, i in last_table.parent.children
1013 target = last_table.parent
1017 # 6. Let previous element be the element immediately above last
1018 # table in the stack of open elements.
1020 # huh? how could it not have a parent?
1021 previous_element = open_els[last_table_i + 1]
1022 # 7. Let adjusted insertion location be inside previous
1023 # element, after its last child (if any).
1024 target = previous_element
1025 target_i = target.children.length
1026 # Note: These steps are involved in part because it's possible
1027 # for elements, the table element in this case in particular,
1028 # to have been moved by a script around in the DOM, or indeed
1029 # removed from the DOM entirely, after the element was inserted
1031 break # don't really loop
1033 # Otherwise Let adjusted insertion location be inside target, after
1034 # its last child (if any).
1035 target_i = target.children.length
1037 # 3. If the adjusted insertion location is inside a template element,
1038 # let it instead be inside the template element's template contents,
1039 # after its last child (if any).
1040 # fixfull (template)
1042 # 4. Return the adjusted insertion location.
1043 return [target, target_i]
1045 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1046 # aka create_an_element_for_token
1047 token_to_element = (t, namespace, intended_parent) ->
1048 t.type = TYPE_TAG # not TYPE_START_TAG
1049 # convert attributes into a hash
1051 while t.attrs_a.length
1053 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1054 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
1056 # TODO 2. If the newly created element has an xmlns attribute in the
1057 # XMLNS namespace whose value is not exactly the same as the element's
1058 # namespace, that is a parse error. Similarly, if the newly created
1059 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1060 # value is not the XLink Namespace, that is a parse error.
1062 # fixfull: the spec says stuff about form pointers and ownerDocument
1066 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1067 insert_foreign_element = (token, namespace) ->
1068 ail = adjusted_insertion_location()
1071 el = token_to_element token, namespace, ail_el
1072 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1074 ail_el.children.splice ail_i, 0, el
1077 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1078 insert_html_element = insert_foreign_element # (token, namespace) ->
1080 # FIXME read implement "foster parenting" part
1081 # FIXME read spec, do this right
1082 # FIXME implement the override target thing
1083 # note: this assumes it's an open tag
1084 # FIXME what part of the spec is this?
1085 # TODO look through all callers of this, and see what they should really be doing.
1086 # eg probably insert_html_element for tokens
#
# Inserts el at the adjusted insertion location.
#   el: an element, or a start-tag token (type TYPE_START_TAG), which is
#       first converted with token_to_element
#   override_target: passed through to adjusted_insertion_location
#   namespace: namespace to give the element (default null)
1087 tree_insert_element = (el, override_target = null, namespace = null) ->
1089 el.namespace = namespace
1090 dest = adjusted_insertion_location override_target
1091 if el.type is TYPE_START_TAG # means it's a "token"
1092 el = token_to_element el, namespace, dest[0]
1093 unless el.namespace?
# fixed: this used to read ``namespace = dest.namespace``, which read a
# property off the [node, index] array and stored it in a local that is
# not used afterwards. Inherit the namespace from the node being
# inserted into instead.
1094 el.namespace = dest[0].namespace
1095 # fixfull: Document nodes sometimes can't accept more children
1096 dest[0].children.splice dest[1], 0, el
1101 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1102 # position should be [node, index_within_children]
#
# Inserts the comment t into the tree.
#   t: the comment to insert
#   position: optional [node, index]; when omitted (null) the adjusted
#             insertion location is used
1103 insert_comment = (t, position = null) ->
1104 position ?= adjusted_insertion_location()
# splice t into the node's children at the given index
1105 position[0].children.splice position[1], 0, t
1108 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
#
# Inserts an element for start-tag token t, switches the tokenizer to the
# rawtext state, and switches tree construction to ins_mode_text. The
# insertion mode in effect beforehand is saved in original_insertion_mode.
1109 parse_generic_raw_text = (t) ->
1110 insert_html_element t
1111 tok_state = tok_state_rawtext
# remember the mode to return to once the text content has been consumed
1112 original_insertion_mode = insertion_mode
1113 insertion_mode = ins_mode_text
# RCDATA counterpart of parse_generic_raw_text: inserts an element for
# start-tag token t, switches the tokenizer to the rcdata state, and switches
# tree construction to ins_mode_text. The insertion mode in effect beforehand
# is saved in original_insertion_mode.
1114 parse_generic_rcdata_text = (t) ->
1115 insert_html_element t
1116 tok_state = tok_state_rcdata
# remember the mode to return to once the text content has been consumed
1117 original_insertion_mode = insertion_mode
1118 insertion_mode = ins_mode_text
1120 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1121 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1122 generate_implied_end_tags = (except = null) ->
1123 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1126 # 8.2.5.4 The rules for parsing tokens in HTML content
1127 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1129 # 8.2.5.4.1 The "initial" insertion mode
1130 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1131 ins_mode_initial = (t) ->
1134 if t.type is TYPE_COMMENT
1135 # fixfull this is supposed to be "the last child of the document object"
1138 if t.type is TYPE_DOCTYPE
1142 insertion_mode = ins_mode_before_html
1145 #fixfull (iframe, quirks)
1146 insertion_mode = ins_mode_before_html
1147 insertion_mode t # reprocess the token
1150 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
#
# Tree-construction handler for the "before html" insertion mode.
#   t: the token to process
# Creates the html element (from an explicit <html> start tag, or an implied
# one for any other content) and moves on to ins_mode_before_head.
1151 ins_mode_before_html = (t) ->
1152 if t.type is TYPE_DOCTYPE
1155 if t.type is TYPE_COMMENT
1160 if t.type is TYPE_START_TAG and t.name is 'html'
1161 el = token_to_element t, NS_HTML, doc
# fixed: append the new element to the document, as the implied-<html>
# branch below already does; previously it was only put on open_els
doc.children.push el
1162 open_els.unshift(el)
1163 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1164 insertion_mode = ins_mode_before_head
1166 if t.type is TYPE_END_TAG
1167 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1168 # fall through to "anything else"
# anything else: create an implied <html> element
1173 html_tok = new_open_tag 'html'
1174 el = token_to_element html_tok, NS_HTML, doc
1175 doc.children.push el
1177 # ?fixfull browsing context
1178 insertion_mode = ins_mode_before_head
1182 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1183 ins_mode_before_head = (t) ->
1186 if t.type is TYPE_COMMENT
1189 if t.type is TYPE_DOCTYPE
1192 if t.type is TYPE_START_TAG and t.name is 'html'
1195 if t.type is TYPE_START_TAG and t.name is 'head'
1196 el = insert_html_element t
1197 head_element_pointer = el
1198 insertion_mode = ins_mode_in_head
1199 if t.type is TYPE_END_TAG
1200 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1201 # fall through to Anything else below
1206 head_tok = new_open_tag 'head'
1207 el = insert_html_element head_tok
1208 head_element_pointer = el
1209 insertion_mode = ins_mode_in_head
1210 insertion_mode t # reprocess current token
1212 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
#
# Shared fallback for ins_mode_in_head: pops the current node off open_els
# and switches the insertion mode to ins_mode_after_head.
#   t: the token being processed
# NOTE(review): confirm whether t should also be reprocessed in the new mode
# here, the way ins_mode_after_head_else does.
1213 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1214 open_els.shift() # spec says this will be a 'head' node
1215 insertion_mode = ins_mode_after_head
# Tree-construction handler for the "in head" insertion mode.
#   t: the token to process
# Each ``if`` below handles one group of tokens from the spec section; tokens
# not handled by any of them end up in ins_mode_in_head_else.
1217 ins_mode_in_head = (t) ->
1218 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1221 if t.type is TYPE_COMMENT
1224 if t.type is TYPE_DOCTYPE
1227 if t.type is TYPE_START_TAG and t.name is 'html'
1230 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1231 el = insert_html_element t
1233 el.acknowledge_self_closing()
1235 if t.type is TYPE_START_TAG and t.name is 'meta'
1236 el = insert_html_element t
1238 el.acknowledge_self_closing()
1239 # fixfull encoding stuff
1241 if t.type is TYPE_START_TAG and t.name is 'title'
# fixed: was ``parse_generic_rcdata_element``; the helper defined in this
# file is parse_generic_rcdata_text
1242 parse_generic_rcdata_text t
1244 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1245 parse_generic_raw_text t
1247 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1248 insert_html_element t
# fixed: was ``in_head_noscript``; the mode function is named
# ins_mode_in_head_noscript like every other insertion mode
1249 insertion_mode = ins_mode_in_head_noscript # FIXME implement
1251 if t.type is TYPE_START_TAG and t.name is 'script'
1252 ail = adjusted_insertion_location()
1253 el = token_to_element t, NS_HTML, ail
1254 el.flag_parser_inserted true # FIXME implement
1255 # fixfull fragment case
1256 ail[0].children.splice ail[1], 0, el
1258 tok_state = tok_state_script_data
1259 original_insertion_mode = insertion_mode # make sure orig... is defined
1260 insertion_mode = ins_mode_text # FIXME implement
1262 if t.type is TYPE_END_TAG and t.name is 'head'
1263 open_els.shift() # will be a head element... spec says so
1264 insertion_mode = ins_mode_after_head
1266 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1267 ins_mode_in_head_else t
1269 if t.type is TYPE_START_TAG and t.name is 'template'
1270 insert_html_element t
1272 flag_frameset_ok = false
1273 insertion_mode = ins_mode_in_template
1274 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1276 if t.type is TYPE_END_TAG and t.name is 'template'
1277 if template_tag_is_open()
# fixed: a bare ``generate_implied_end_tags`` only references the
# function; the parentheses are needed to actually call it
1278 generate_implied_end_tags()
1279 if open_els[0].name isnt 'template'
1282 el = open_els.shift()
1283 if el.name is 'template'
1285 clear_afe_to_marker()
1286 template_insertion_modes.shift()
1287 reset_insertion_mode()
1291 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1294 ins_mode_in_head_else t
1296 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1297 ins_mode_in_head_noscript = (t) ->
1299 console.log "ins_mode_in_head_noscript unimplemented"
1301 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
#
# Shared fallback for ins_mode_after_head: inserts an implied <body> element,
# switches the insertion mode to ins_mode_in_body, and then hands the same
# token t to the new mode.
1302 ins_mode_after_head_else = (t) ->
# synthesize the start tag that the input left out
1303 body_tok = new_open_tag 'body'
1304 insert_html_element body_tok
1305 insertion_mode = ins_mode_in_body
1306 insertion_mode t # reprocess token
# Tree-construction handler for the "after head" insertion mode.
#   t: the token to process
# Tokens not handled by one of the ``if`` groups below end up in
# ins_mode_after_head_else, which inserts an implied <body>.
1308 ins_mode_after_head = (t) ->
1312 if t.type is TYPE_COMMENT
1315 if t.type is TYPE_DOCTYPE
1318 if t.type is TYPE_START_TAG and t.name is 'html'
1321 if t.type is TYPE_START_TAG and t.name is 'body'
1322 insert_html_element t
1323 flag_frameset_ok = false
1324 insertion_mode = ins_mode_in_body
1326 if t.type is TYPE_START_TAG and t.name is 'frameset'
1327 insert_html_element t
1328 insertion_mode = ins_mode_in_frameset
1330 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# temporarily put the head element back on the stack of open elements
1332 open_els.unshift head_element_pointer
# fixed: was ``for el, i of open_els``. ``of`` iterates keys, so el was an
# index string and never equal to head_element_pointer; ``in`` iterates the
# array's elements with i as the index, which is what splice needs.
1334 for el, i in open_els
1335 if el is head_element_pointer
1336 open_els.splice i, 1
1338 console.log "warning: 23904 couldn't find head element in open_els"
1340 if t.type is TYPE_END_TAG and t.name is 'template'
1343 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1344 ins_mode_after_head_else t
1346 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1350 ins_mode_after_head_else t
1352 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1353 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1354 for node, i in open_els
1355 if node.name is name # FIXME check namespace too
1356 generate_implied_end_tags name # arg is exception
1357 parse_error() unless i is 0
1362 if special_elements[node.name]? # FIXME check namespac too
1365 ins_mode_in_body = (t) ->
1371 when "\t", "\u000a", "\u000c", "\u000d", ' '
1372 reconstruct_active_formatting_elements()
1375 reconstruct_active_formatting_elements()
1377 flag_frameset_ok = false
1386 return if template_tag_is_open()
1387 root_attrs = open_els[open_els.length - 1].attrs
1389 root_attrs[k] = v unless root_attrs[k]?
1390 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1391 # FIXME also do this for </template> (end tag)
1392 return ins_mode_in_head t
1399 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1400 close_p_if_in_button_scope()
1401 insert_html_element t
1402 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1403 close_p_if_in_button_scope()
1404 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1407 insert_html_element t
1408 # TODO lots more to implement here
1410 # If the list of active formatting elements
1411 # contains an a element between the end of the list and
1412 # the last marker on the list (or the start of the list
1413 # if there is no marker on the list), then this is a
1414 # parse error; run the adoption agency algorithm for
1415 # the tag name "a", then remove that element from the
1416 # list of active formatting elements and the stack of
1417 # open elements if the adoption agency algorithm didn't
1418 # already remove it (it might not have if the element
1419 # is not in table scope).
1422 if el.type is TYPE_AFE_MARKER
1432 for el, i in open_els
1434 open_els.splice i, 1
1435 reconstruct_active_formatting_elements()
1436 el = insert_html_element t
1438 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1439 reconstruct_active_formatting_elements()
1440 el = insert_html_element t
1443 # fixfull quirksmode thing
1444 close_p_if_in_button_scope()
1445 insert_html_element t
1446 insertion_mode = ins_mode_in_table
1447 # TODO lots more to implement here
1448 else # any other start tag
1449 reconstruct_active_formatting_elements()
1450 insert_html_element t
1453 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1454 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1457 unless ok_tags[t.name]?
1460 # TODO stack of template insertion modes thing
1461 flag_parsing = false # stop parsing
1465 unless is_in_scope 'body'
1468 # TODO implement parse error and move to tree_after_body
1470 unless is_in_scope 'body' # weird, but it's what the spec says
1473 # TODO implement parse error and move to tree_after_body, reprocess
1474 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1475 unless is_in_scope t.name, NS_HTML
1478 generate_implied_end_tags()
1479 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1482 el = open_els.shift()
1483 if el.name is t.name and el.namespace is NS_HTML
1485 # TODO lots more close tags to implement here
1487 unless is_in_button_scope 'p'
1489 insert_html_element new_open_tag 'p'
1491 # TODO lots more close tags to implement here
1492 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1493 adoption_agency t.name
1494 # TODO lots more close tags to implement here
1496 in_body_any_other_end_tag t.name
1499 ins_mode_in_table_else = (t) ->
1501 flag_foster_parenting = true # FIXME
1503 flag_foster_parenting = false
1504 can_in_table = { # FIXME do this inline like everywhere else
1512 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1513 ins_mode_text = (t) ->
1514 if t.type is TYPE_TEXT
1517 if t.type is TYPE_EOF
1519 if open_els[0].name is 'script'
1520 open_els[0].flag 'already started', true
1522 insertion_mode = original_insertion_mode
1525 if t.type is TYPE_END_TAG and t.name is 'script'
1527 insertion_mode = original_insertion_mode
1528 # fixfull the spec seems to assume that I'm going to run the script
1529 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1531 if t.type is TYPE_END_TAG
1533 insertion_mode = original_insertion_mode
1535 console.log 'warning: end of ins_mode_text reached'
1537 # the functions below implement the tokenizer states described here:
1538 # http://www.w3.org/TR/html5/syntax.html#tokenization
1540 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1541 ins_mode_in_table = (t) ->
1544 if can_in_table[t.name]
1545 original_insertion_mode = insertion_mode
1546 insertion_mode = ins_mode_in_table_text
1549 ins_mode_in_table_else t
1557 clear_stack_to_table_context()
1559 insert_html_element t
1560 insertion_mode = ins_mode_in_caption
1562 clear_stack_to_table_context()
1563 insert_html_element t
1564 insertion_mode = ins_mode_in_column_group
1566 clear_stack_to_table_context()
1567 insert_html_element new_open_tag 'colgroup'
1568 insertion_mode = ins_mode_in_column_group
1570 when 'tbody', 'tfoot', 'thead'
1571 clear_stack_to_table_context()
1572 insert_html_element t
1573 insertion_mode = ins_mode_in_table_body
1574 when 'td', 'th', 'tr'
1575 clear_stack_to_table_context()
1576 insert_html_element new_open_tag 'tbody'
1577 insertion_mode = ins_mode_in_table_body
1581 if is_in_table_scope 'table'
1583 el = open_els.shift()
1584 if el.name is 'table'
1586 reset_insertion_mode()
1588 when 'style', 'script', 'template'
1591 if token_is_input_hidden t
1592 ins_mode_in_table_else t
1595 el = insert_html_element t
1597 el.acknowledge_self_closing()
1600 if form_element_pointer?
1602 if template_tag_is_open()
1604 form_element_pointer = insert_html_element t
1607 ins_mode_in_table_else t
1611 if is_in_table_scope 'table'
1613 el = open_els.shift()
1614 if el.name is 'table'
1616 reset_insertion_mode()
1619 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1624 ins_mode_in_table_else t
1628 ins_mode_in_table_else t
1631 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
#
# Tree-construction handler for the "in table text" insertion mode.
#   t: the token to process
# Text tokens are collected in pending_table_character_tokens. The buffered
# tokens are later either inserted with insert_character or passed one by one
# to ins_mode_in_table_else, after which the buffer is emptied and
# original_insertion_mode is restored.
1632 ins_mode_in_table_text = (t) ->
1633 if t.type is TYPE_TEXT and t.text is "\u0000"
1634 # huh? I thought the tokenizer didn't emit these
1637 if t.type is TYPE_TEXT
1638 pending_table_character_tokens.push t
1642 for old in pending_table_character_tokens
1643 unless is_space_tok old
1647 for old in pending_table_character_tokens
1648 insert_character old
1650 for old in pending_table_character_tokens
# fixed: was ``ins_mode_table_else``; the function defined in this file
# is ins_mode_in_table_else
1651 ins_mode_in_table_else old
1652 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1653 insertion_mode = original_insertion_mode
1656 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
#
# Tree-construction handler for the "in caption" insertion mode.
#   t: the token to process
# Closing the caption pops open_els down to the caption element, clears the
# active formatting elements to the last marker, and returns to the
# "in table" insertion mode.
1657 ins_mode_in_caption = (t) ->
1658 if t.type is TYPE_END_TAG and t.name is 'caption'
1659 if is_in_table_scope 'caption'
1660 generate_implied_end_tags()
1661 if open_els[0].name isnt 'caption'
1664 el = open_els.shift()
1665 if el.name is 'caption'
1667 clear_afe_to_marker()
# fixed: was ``in_table``; the mode function is ins_mode_in_table
1668 insertion_mode = ins_mode_in_table
1673 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1675 if is_in_table_scope 'caption'
1677 el = open_els.shift()
1678 if el.name is 'caption'
1680 clear_afe_to_marker()
# fixed: was ``in_table``; the mode function is ins_mode_in_table
1681 insertion_mode = ins_mode_in_table
1683 # else fragment case
1685 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1691 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1692 ins_mode_in_column_group = (t) ->
1696 if t.type is TYPE_COMMENT
1699 if t.type is TYPE_DOCTYPE
1702 if t.type is TYPE_START_TAG and t.name is 'html'
1705 if t.type is TYPE_START_TAG and t.name is 'col'
1706 el = insert_html_element t
1708 el.acknowledge_self_closing()
1710 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1711 if open_els[0].name is 'colgroup'
1713 insertion_mode = ins_mode_in_table
1717 if t.type is TYPE_END_TAG and t.name is 'col'
1720 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1723 if t.type is TYPE_EOF
1727 if open_els[0].name isnt 'colgroup'
1731 insertion_mode = ins_mode_in_table
1735 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1736 ins_mode_in_table_body = (t) ->
1737 if t.type is TYPE_START_TAG and t.name is 'tr'
1738 clear_stack_to_table_body_context()
1739 insert_html_element t
1740 insertion_mode = ins_mode_in_row
1742 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1744 clear_stack_to_table_body_context()
1745 insert_html_element new_open_tag 'tr'
1746 insertion_mode = ins_mode_in_row
1749 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1750 unless is_in_table_scope t.name # fixfull check namespace
1753 clear_stack_to_table_body_context()
1755 insertion_mode = ins_mode_in_table
1757 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1760 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1763 if table_scopers[el.name]
1768 clear_stack_to_table_body_context()
1770 insertion_mode = ins_mode_in_table
1773 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1779 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1780 ins_mode_in_row = (t) ->
1781 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1782 clear_stack_to_table_row_context()
1783 insert_html_element t
1784 insertion_mode = ins_mode_in_cell
1787 if t.type is TYPE_END_TAG and t.name is 'tr'
1788 if is_in_table_scope 'tr'
1789 clear_stack_to_table_row_context()
1791 insertion_mode = ins_mode_in_table_body
1795 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1796 if is_in_table_scope 'tr'
1797 clear_stack_to_table_row_context()
1799 insertion_mode = ins_mode_in_table_body
1804 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1805 if is_in_table_scope t.name # fixfull namespace
1806 if is_in_table_scope 'tr'
1807 clear_stack_to_table_row_context()
1809 insertion_mode = ins_mode_in_table_body
1814 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1820 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
#
# Closes the currently open table cell: generates implied end tags, pops
# open_els down to the td/th element, clears the active formatting elements
# to the last marker, and switches the insertion mode to ins_mode_in_row.
1822 generate_implied_end_tags()
# fixed: the second test used to compare the node itself to 'th'
# (``open_els[0] is 'th'``), which is never true; compare its name
1823 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1826 el = open_els.shift()
1827 if el.name is 'td' or el.name is 'th'
1829 clear_afe_to_marker()
1830 insertion_mode = ins_mode_in_row
1832 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1833 ins_mode_in_cell = (t) ->
1834 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1835 if is_in_table_scope t.name
1836 generate_implied_end_tags()
1837 if open_els[0].name isnt t.name
1840 el = open_els.shift()
1841 if el.name is t.name
1843 clear_afe_to_marker()
1844 insertion_mode = ins_mode_in_row
1848 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1851 if el.name is 'td' or el.name is 'th'
1854 if table_scopers[el.name]
1862 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1865 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1866 if is_in_table_scope t.name # fixfull namespace
1875 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1876 ins_mode_in_select = (t) ->
1877 if t.type is TYPE_TEXT and t.text is "\u0000"
1880 if t.type is TYPE_TEXT
1883 if t.type is TYPE_COMMENT
1886 if t.type is TYPE_DOCTYPE
1889 if t.type is TYPE_START_TAG and t.name is 'html'
1892 if t.type is TYPE_START_TAG and t.name is 'option'
1893 if open_els[0].name is 'option'
1895 insert_html_element t
1897 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1898 if open_els[0].name is 'option'
1900 if open_els[0].name is 'optgroup'
1902 insert_html_element t
1904 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1905 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1907 if open_els[0].name is 'optgroup'
1912 if t.type is TYPE_END_TAG and t.name is 'option'
1913 if open_els[0].name is 'option'
1918 if t.type is TYPE_END_TAG and t.name is 'select'
1919 if is_in_select_scope 'select'
1921 el = open_els.shift()
1922 if el.name is 'select'
1924 reset_insertion_mode()
1928 if t.type is TYPE_START_TAG and t.name is 'select'
1931 el = open_els.shift()
1932 if el.name is 'select'
1934 reset_insertion_mode()
1935 # spec says that this is the same as </select> but it doesn't say
1936 # to check scope first
1938 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1940 if is_in_select_scope 'select'
1943 el = open_els.shift()
1944 if el.name is 'select'
1946 reset_insertion_mode()
1949 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1952 if t.type is TYPE_EOF
1959 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1960 ins_mode_in_select_in_table = (t) ->
1961 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1964 el = open_els.shift()
1965 if el.name is 'select'
1967 reset_insertion_mode()
1970 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1972 unless is_in_table_scope t.name, NS_HTML
1975 el = open_els.shift()
1976 if el.name is 'select'
1978 reset_insertion_mode()
1982 ins_mode_in_select t
1985 # CONTINUE more insertion modes!
1998 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2000 switch c = txt.charAt(cur++)
2002 return new_text_node parse_character_reference()
2004 tok_state = tok_state_tag_open
2007 return new_text_node c
2009 return new_eof_token()
2011 return new_text_node c
2014 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2015 # not needed: tok_state_character_reference_in_data = ->
2016 # just call parse_character_reference()
2018 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2019 tok_state_rcdata = ->
2020 switch c = txt.charAt(cur++)
2022 return new_text_node parse_character_reference()
2024 tok_state = tok_state_rcdata_less_than_sign
2027 return new_character_token "\ufffd"
2029 return new_eof_token()
2031 return new_character_token c
2034 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2035 # not needed: tok_state_character_reference_in_rcdata = ->
2036 # just call parse_character_reference()
2038 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2039 tok_state_rawtext = ->
2040 switch c = txt.charAt(cur++)
2042 tok_state = tok_state_rawtext_less_than_sign
2045 return new_character_token "\ufffd"
2047 return new_eof_token()
2049 return new_character_token c
2052 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2053 tok_state_script_data = ->
2054 switch c = txt.charAt(cur++)
2056 tok_state = tok_state_script_data_less_than_sign
2059 return new_character_token "\ufffd"
2061 return new_eof_token()
2063 return new_character_token c
2066 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2067 tok_state_plaintext = ->
2068 switch c = txt.charAt(cur++)
2071 return new_character_token "\ufffd"
2073 return new_eof_token()
2075 return new_character_token c
2079 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2080 tok_state_tag_open = ->
2081 switch c = txt.charAt(cur++)
2083 tok_state = tok_state_markup_declaration_open
2085 tok_state = tok_state_end_tag_open
2088 tok_state = tok_state_bogus_comment
2090 if lc_alpha.indexOf(c) > -1
2091 tok_cur_tag = new_open_tag c
2092 tok_state = tok_state_tag_name
2093 else if uc_alpha.indexOf(c) > -1
2094 tok_cur_tag = new_open_tag c.toLowerCase()
2095 tok_state = tok_state_tag_name
2098 tok_state = tok_state_data
2099 cur -= 1 # we didn't parse/handle the char after <
2100 return new_text_node '<'
2103 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2104 tok_state_end_tag_open = ->
2105 switch c = txt.charAt(cur++)
2108 tok_state = tok_state_data
2111 tok_state = tok_state_data
2112 return new_text_node '</'
2114 if uc_alpha.indexOf(c) > -1
2115 tok_cur_tag = new_end_tag c.toLowerCase()
2116 tok_state = tok_state_tag_name
2117 else if lc_alpha.indexOf(c) > -1
2118 tok_cur_tag = new_end_tag c
2119 tok_state = tok_state_tag_name
2122 tok_state = tok_state_bogus_comment
2125 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2126 tok_state_tag_name = ->
2127 switch c = txt.charAt(cur++)
2128 when "\t", "\n", "\u000c", ' '
2129 tok_state = tok_state_before_attribute_name
2131 tok_state = tok_state_self_closing_start_tag
2133 tok_state = tok_state_data
2139 tok_cur_tag.name += "\ufffd"
2142 tok_state = tok_state_data
2144 if uc_alpha.indexOf(c) > -1
2145 tok_cur_tag.name += c.toLowerCase()
2147 tok_cur_tag.name += c
2150 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2151 tok_state_rcdata_less_than_sign = ->
2152 c = txt.charAt(cur++)
2154 temporary_buffer = ''
2155 tok_state = tok_state_rcdata_end_tag_open
2158 tok_state = tok_state_rcdata
2159 cur -= 1 # reconsume the input character
2160 return new_character_token '<'
2162 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2163 tok_state_rcdata_end_tag_open = ->
2164 c = txt.charAt(cur++)
2165 if uc_alpha.indexOf(c) > -1
2166 tok_cur_tag = new_end_tag c.toLowerCase()
2167 temporary_buffer += c
2168 tok_state = tok_state_rcdata_end_tag_name
2170 if lc_alpha.indexOf(c) > -1
2171 tok_cur_tag = new_end_tag c
2172 temporary_buffer += c
2173 tok_state = tok_state_rcdata_end_tag_name
2176 tok_state = tok_state_rcdata
2177 cur -= 1 # reconsume the input character
2178 return new_character_token "</" # fixfull separate these
2180 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
#
# Returns true when token t is an end tag whose name equals the name of the
# current node (open_els[0]); false otherwise.
#   t: the tag token under construction
2181 is_appropriate_end_tag = (t) ->
2182 # spec says to check against "the tag name of the last start tag to
2183 # have been emitted from this tokenizer", but this is only called from
2184 # the various "raw" states, which I'm pretty sure all push the start
2185 # token onto open_els. TODO: verify this after the script data states
# log the token and the stack of open elements
2187 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2188 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2190 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer state that accumulates the name of a possible RCDATA end tag.
# Three state changes are visible, each made only when
# `is_appropriate_end_tag tok_cur_tag` holds:
#   - after tab, LF, FF or space -> before-attribute-name state
#   - -> self-closing-start-tag state
#   - -> data state
# NOTE(review): the character tests guarding the second and third change are
# not visible in this listing (presumably '/' and '>' per the spec section
# linked above), nor is whatever is returned on those paths -- confirm.
# A character found in uc_alpha / lc_alpha is appended to tok_cur_tag.name
# (lower-cased when it matched uc_alpha) and, as typed, to temporary_buffer.
# The last three lines handle everything else: back to the RCDATA state,
# un-read the character, and return '</' plus temporary_buffer as a
# character token, i.e. the text consumed so far is emitted as plain text.
2191 tok_state_rcdata_end_tag_name = ->
2192 c = txt.charAt(cur++)
2193 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2194 if is_appropriate_end_tag tok_cur_tag
2195 tok_state = tok_state_before_attribute_name
2197 # else fall through to "Anything else"
2199 if is_appropriate_end_tag tok_cur_tag
2200 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2202 # else fall through to "Anything else"
2204 if is_appropriate_end_tag tok_cur_tag
2205 tok_state = tok_state_data
2207 # else fall through to "Anything else"
2208 if uc_alpha.indexOf(c) > -1
2209 tok_cur_tag.name += c.toLowerCase()
2210 temporary_buffer += c
2212 if lc_alpha.indexOf(c) > -1
2213 tok_cur_tag.name += c
2214 temporary_buffer += c
2217 tok_state = tok_state_rcdata
2218 cur -= 1 # reconsume the input character
2219 return new_character_token '</' + temporary_buffer # fixfull separate these
2221 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer state entered after a '<' was seen inside RAWTEXT content.
# Reads one character, then either
#   - clears temporary_buffer and switches to the RAWTEXT end-tag-open state,
#     or
#   - switches back to the RAWTEXT state, un-reads the character and returns
#     the '<' as a character token.
# NOTE(review): the test that selects between the two paths is not visible
# in this listing (presumably `c is '/'`, per the spec section linked
# above) -- confirm against the full function.
2222 tok_state_rawtext_less_than_sign = ->
2223 c = txt.charAt(cur++)
# path 1: start collecting a possible end-tag name
2225 temporary_buffer = ''
2226 tok_state = tok_state_rawtext_end_tag_open
# path 2: the '<' was ordinary text
2229 tok_state = tok_state_rawtext
2230 cur -= 1 # reconsume the input character
2231 return new_character_token '<'
2233 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer state reached after "</" inside RAWTEXT content.
# Reads one character. If it is found in uc_alpha or lc_alpha (presumably the
# ASCII upper-/lower-case letters; both are defined outside this block) a new
# end tag token is started in tok_cur_tag, named with the lower-case form of
# the letter, the letter as typed is appended to temporary_buffer, and the
# tokenizer moves to the RAWTEXT end-tag-name state.
# The last three lines put the tokenizer back into the RAWTEXT state, un-read
# the character and return "</" as a character token.
# NOTE(review): the `else`/`return` lines that separate these three paths are
# not visible in this listing -- confirm against the full function.
2234 tok_state_rawtext_end_tag_open = ->
2235 c = txt.charAt(cur++)
2236 if uc_alpha.indexOf(c) > -1
2237 tok_cur_tag = new_end_tag c.toLowerCase()
2238 temporary_buffer += c
2239 tok_state = tok_state_rawtext_end_tag_name
2241 if lc_alpha.indexOf(c) > -1
2242 tok_cur_tag = new_end_tag c
2243 temporary_buffer += c
2244 tok_state = tok_state_rawtext_end_tag_name
2247 tok_state = tok_state_rawtext
2248 cur -= 1 # reconsume the input character
2249 return new_character_token "</" # fixfull separate these
2251 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer state that accumulates the name of a possible RAWTEXT end tag.
# Three state changes are visible, each made only when
# `is_appropriate_end_tag tok_cur_tag` holds:
#   - after tab, LF, FF or space -> before-attribute-name state
#   - -> self-closing-start-tag state
#   - -> data state
# NOTE(review): the character tests guarding the second and third change are
# not visible in this listing (presumably '/' and '>' per the spec section
# linked above), nor is whatever is returned on those paths -- confirm.
# A character found in uc_alpha / lc_alpha is appended to tok_cur_tag.name
# (lower-cased when it matched uc_alpha) and, as typed, to temporary_buffer.
# The last three lines handle everything else: back to the RAWTEXT state,
# un-read the character, and return '</' plus temporary_buffer as a
# character token, i.e. the text consumed so far is emitted as plain text.
2252 tok_state_rawtext_end_tag_name = ->
2253 c = txt.charAt(cur++)
2254 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2255 if is_appropriate_end_tag tok_cur_tag
2256 tok_state = tok_state_before_attribute_name
2258 # else fall through to "Anything else"
2260 if is_appropriate_end_tag tok_cur_tag
2261 tok_state = tok_state_self_closing_start_tag
2263 # else fall through to "Anything else"
2265 if is_appropriate_end_tag tok_cur_tag
2266 tok_state = tok_state_data
2268 # else fall through to "Anything else"
2269 if uc_alpha.indexOf(c) > -1
2270 tok_cur_tag.name += c.toLowerCase()
2271 temporary_buffer += c
2273 if lc_alpha.indexOf(c) > -1
2274 tok_cur_tag.name += c
2275 temporary_buffer += c
2278 tok_state = tok_state_rawtext
2279 cur -= 1 # reconsume the input character
2280 return new_character_token '</' + temporary_buffer # fixfull separate these
2282 # TODO _all_ of the missing states here (17-33) are for parsing script tags
2284 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer state: inside a tag, positioned before the next attribute name.
# Dispatches on the next input character. What is visible in this listing:
#   - tab, LF, FF, space: handling not shown (presumably skipped)
#   - a change to the self-closing-start-tag state and two changes to the
#     data state; their `when` labels are not shown (presumably '/', '>' and
#     end of input) -- TODO confirm
#   - attr_name set to U+FFFD (label not shown; presumably for U+0000)
#   - '"', "'", '<', '=': handling not shown
#   - a character found in uc_alpha: attr_name becomes its lower-case form
# The last two lines prepend a new [name, value] pair with an empty value to
# tok_cur_tag.attrs_a and switch to the attribute-name state. Because the
# pair is unshifted, the attribute under construction is attrs_a[0].
2285 tok_state_before_attribute_name = ->
2287 switch c = txt.charAt(cur++)
2288 when "\t", "\n", "\u000c", ' '
2291 tok_state = tok_state_self_closing_start_tag
2294 tok_state = tok_state_data
2300 attr_name = "\ufffd"
2301 when '"', "'", '<', '='
2306 tok_state = tok_state_data
2308 if uc_alpha.indexOf(c) > -1
2309 attr_name = c.toLowerCase()
2313 tok_cur_tag.attrs_a.unshift [attr_name, '']
2314 tok_state = tok_state_attribute_name
2317 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state that accumulates the name of the attribute currently being
# built, tok_cur_tag.attrs_a[0][0] (index 0 of the pair is the name).
# Dispatches on the next input character. What is visible in this listing:
#   - tab, LF, FF, space -> after-attribute-name state
#   - changes to the self-closing-start-tag, before-attribute-value and data
#     states; their `when` labels are not shown (presumably '/', '=', '>' and
#     end of input) -- TODO confirm
#   - U+FFFD, the character itself, or (for a character found in uc_alpha)
#     its lower-case form is appended to the attribute name
# Every branch that contributes to the name must APPEND (+=), never assign:
# the first character of the name was already stored by the state that led
# here, and the spec's attribute name state says "append" in each case.
# Assigning would discard what has been collected so far (e.g. `aBc` would
# end up as `bc`).
2318 tok_state_attribute_name = ->
2319 switch c = txt.charAt(cur++)
2320 when "\t", "\n", "\u000c", ' '
2321 tok_state = tok_state_after_attribute_name
2323 tok_state = tok_state_self_closing_start_tag
2325 tok_state = tok_state_before_attribute_value
2327 tok_state = tok_state_data
# replacement character: appended to the name
2333 tok_cur_tag.attrs_a[0][0] += "\ufffd"
# parse-error characters are still appended to the name as typed
2336 tok_cur_tag.attrs_a[0][0] += c
2339 tok_state = tok_state_data
2341 if uc_alpha.indexOf(c) > -1
# attribute names are stored lower-cased
2342 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
2344 tok_cur_tag.attrs_a[0][0] += c
2347 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer state: an attribute name has ended (whitespace was seen) and the
# tokenizer is waiting for '=', the end of the tag, or another attribute.
# Reads one character. What is visible in this listing:
#   - tab, LF, FF, space: handling not shown (presumably skipped)
#   - changes to the self-closing-start-tag, before-attribute-value and data
#     states; the character tests guarding them are not shown (presumably
#     '/', '=' and '>') -- TODO confirm
#   - a character found in uc_alpha starts a new attribute named with its
#     lower-case form; another path starts one named U+FFFD (test not shown,
#     presumably U+0000); both switch to the attribute-name state
#   - a path that switches to the data state and un-reads the character
#     (test not shown, presumably end of input)
#   - '"', "'" and '<' are tested for, then fall through to the default
#   - default: start a new attribute named with the character as typed and
#     switch to the attribute-name state
# New attributes are [name, ''] pairs unshifted onto tok_cur_tag.attrs_a, so
# the newest attribute is attrs_a[0].
2348 tok_state_after_attribute_name = ->
2349 c = txt.charAt(cur++)
2350 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2353 tok_state = tok_state_self_closing_start_tag
2356 tok_state = tok_state_before_attribute_value
2359 tok_state = tok_state_data
2361 if uc_alpha.indexOf(c) > -1
2362 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2363 tok_state = tok_state_attribute_name
2367 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2368 tok_state = tok_state_attribute_name
2372 tok_state = tok_state_data
2373 cur -= 1 # reconsume
2375 if c is '"' or c is "'" or c is '<'
2377 # fall through to Anything else
2379 tok_cur_tag.attrs_a.unshift [c, '']
2380 tok_state = tok_state_attribute_name
2382 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer state: after the '=' of an attribute, before its value.
# Dispatches on the next input character; the value being built is
# tok_cur_tag.attrs_a[0][1] (index 1 of the newest attribute pair).
# What is visible in this listing:
#   - tab, LF, FF, space: handling not shown (presumably skipped)
#   - changes to the double-quoted, unquoted and single-quoted value states
#     (labels not shown; presumably '"', '&' and "'") -- TODO confirm
#   - U+FFFD appended to the value, then unquoted state (label not shown,
#     presumably U+0000)
#   - two changes to the data state (labels not shown; presumably '>' and
#     end of input)
#   - default: the character starts an unquoted value
2383 tok_state_before_attribute_value = ->
2384 switch c = txt.charAt(cur++)
2385 when "\t", "\n", "\u000c", ' '
2388 tok_state = tok_state_attribute_value_double_quoted
2390 tok_state = tok_state_attribute_value_unquoted
2393 tok_state = tok_state_attribute_value_single_quoted
2396 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2397 tok_state = tok_state_attribute_value_unquoted
2400 tok_state = tok_state_data
2406 tok_state = tok_state_data
2408 tok_cur_tag.attrs_a[0][1] += c
2409 tok_state = tok_state_attribute_value_unquoted
2412 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer state: inside a double-quoted attribute value. Each character is
# dispatched on; the value being built is tok_cur_tag.attrs_a[0][1].
# What is visible in this listing (the `when` labels are not shown):
#   - change to the after-attribute-value-(quoted) state (presumably on '"')
#   - the result of parse_character_reference, called with '"' as the
#     additional allowed character and in_attr = true, is appended
#     (presumably on '&')
#   - U+FFFD is appended (presumably on U+0000)
#   - change to the data state (presumably on end of input)
#   - default: the character is appended to the value as typed
2413 tok_state_attribute_value_double_quoted = ->
2414 switch c = txt.charAt(cur++)
2416 tok_state = tok_state_after_attribute_value_quoted
2418 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2421 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2424 tok_state = tok_state_data
2426 tok_cur_tag.attrs_a[0][1] += c
2429 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer state: inside a single-quoted attribute value. Each character is
# dispatched on; the value being built is tok_cur_tag.attrs_a[0][1].
# What is visible in this listing (the `when` labels are not shown):
#   - change to the after-attribute-value-(quoted) state (presumably on "'")
#   - the result of parse_character_reference, called with "'" as the
#     additional allowed character and in_attr = true, is appended
#     (presumably on '&')
#   - U+FFFD is appended (presumably on U+0000)
#   - change to the data state (presumably on end of input)
#   - default: the character is appended to the value as typed
2430 tok_state_attribute_value_single_quoted = ->
2431 switch c = txt.charAt(cur++)
2433 tok_state = tok_state_after_attribute_value_quoted
2435 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2438 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2441 tok_state = tok_state_data
2443 tok_cur_tag.attrs_a[0][1] += c
2446 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer state: inside an unquoted attribute value. Each character is
# dispatched on; the value being built is tok_cur_tag.attrs_a[0][1].
# What is visible in this listing:
#   - tab, LF, FF, space end the value -> before-attribute-name state
#   - the result of parse_character_reference, called with '>' as the
#     additional allowed character and in_attr = true, is appended (label
#     not shown, presumably '&')
#   - two changes to the data state (labels not shown; presumably '>' and
#     end of input) -- TODO confirm
#   - U+FFFD is appended (label not shown, presumably U+0000)
#   - default: the character is appended as typed; per the comment below
#     some of these characters are parse errors but are still kept
2447 tok_state_attribute_value_unquoted = ->
2448 switch c = txt.charAt(cur++)
2449 when "\t", "\n", "\u000c", ' '
2450 tok_state = tok_state_before_attribute_name
2452 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2454 tok_state = tok_state_data
2459 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2462 tok_state = tok_state_data
2464 # Parse Error if ', <, = or ` (backtick)
2465 tok_cur_tag.attrs_a[0][1] += c
2468 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state: just after the closing quote of a quoted attribute value.
# Dispatches on the next input character. What is visible in this listing:
#   - tab, LF, FF, space -> before-attribute-name state
#   - change to the self-closing-start-tag state and two changes to the data
#     state (labels not shown; presumably '/', '>' and end of input)
#     -- TODO confirm
#   - default: switch to the before-attribute-name state and un-read the
#     character so that state processes it (e.g. junk directly after a
#     closing quote starts a new attribute)
2469 tok_state_after_attribute_value_quoted = ->
2470 switch c = txt.charAt(cur++)
2471 when "\t", "\n", "\u000c", ' '
2472 tok_state = tok_state_before_attribute_name
2474 tok_state = tok_state_self_closing_start_tag
2476 tok_state = tok_state_data
2482 tok_state = tok_state_data
2485 tok_state = tok_state_before_attribute_name
2486 cur -= 1 # we didn't handle that char
2489 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2490 # Don't set this as a state, just call it
2491 # returns a string (NOT a text node)
# Tries to consume a character reference from txt at position cur.
#
# allowed_char (default null): one extra character that, when it is the next
#   input character, is treated like whitespace, '<', '&' and end of input in
#   the first `when` below (commented "explicitly not a parse error"). The
#   attribute-value states pass the closing quote, or '>' for unquoted
#   values.
# in_attr (default false): the attribute-value states pass true. The test on
#   this flag is not visible in this listing; presumably it enables the '='
#   and alphanumeric exceptions in the legacy section below -- TODO confirm.
#
# Stages visible in this listing (many lines, including every `return`
# except the last, are not shown, so the exact return values cannot be
# stated here):
#   1. end-of-input and "not a reference" checks
#   2. numeric references: an 'x'/'X' at cur + 1 is tested for (presumably
#      selecting hexadecimal after a '#'), characters found in charset are
#      scanned, an optional ';' is checked, and prefix + the lower-cased
#      digits are passed to decode_named_char_ref
#   3. named references ending in ';': the text from cur up to and including
#      the ';' is passed to decode_named_char_ref
#   4. legacy references without ';': substrings of length 2..max starting at
#      cur are looked up in legacy_char_refs, shortest first; a following
#      '=' or alphanumeric character is tested for; on a match cur is
#      advanced past the name and parse_error() is called
2492 parse_character_reference = (allowed_char = null, in_attr = false) ->
2493 if cur >= txt.length
2495 switch c = txt.charAt(cur)
2496 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2497 # explicitly not a parse error
2500 # there has to be "one or more" alnums between & and ; to be a parse error
2503 if cur + 1 >= txt.length
2505 if txt.charAt(cur + 1).toLowerCase() is 'x'
2514 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2518 if txt.charAt(start + i) is ';'
2520 # FIXME This is supposed to generate parse errors for some chars
2521 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2528 if alnum.indexOf(txt.charAt(cur + i)) is -1
2531 # exit early, because parse_error() below needs at least one alnum
2533 if txt.charAt(cur + i) is ';'
2534 i += 1 # include ';' terminator in value
2535 decoded = decode_named_char_ref txt.substr(cur, i)
2542 # no ';' terminator (only legacy char refs)
2544 for i in [2..max] # no prefix matches, so ok to check shortest first
2545 c = legacy_char_refs[txt.substr(cur, i)]
2548 if txt.charAt(cur + i) is '='
2549 # "because some legacy user agents will
2550 # misinterpret the markup in those cases"
2553 if alnum.indexOf(txt.charAt(cur + i)) > -1
2554 # this makes attributes forgiving about url args
2556 # ok, and besides the weird exceptions for attributes...
2557 # return the matching char
2558 cur += i # consume entity chars
2559 parse_error() # because no terminating ";"
2563 return # never reached
2565 # tree constructor initialization
2566 # see comments on TYPE_TAG/etc for the structure of this data
# Initial values for the tree-construction state, followed by the tokenizer's
# starting state:
#   doc                  root Node, a TYPE_TAG named 'html' in NS_HTML
#   afe                  list of active formatting elements, initially empty
#   insertion_mode       starts as ins_mode_initial; original_insertion_mode
#                        starts as a copy of it
#   flag_*               scripting and frameset-ok start true, foster
#                        parenting starts false
#   *_pointer            form and head element pointers start as null
#   temporary_buffer     starts as null (the tokenizer states assign '' to it
#                        before appending)
#   tok_state            the tokenizer starts in the data state
# NOTE(review): some initialisation lines are not visible in this listing
# (e.g. open_els is used elsewhere but not initialised here) -- confirm
# against the full file.
2567 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2569 afe = [] # active formatting elements
2570 template_insertion_modes = []
2571 insertion_mode = ins_mode_initial
2572 original_insertion_mode = insertion_mode # TODO check spec
2573 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2574 flag_frameset_ok = true
2576 flag_foster_parenting = false
2577 form_element_pointer = null
2578 temporary_buffer = null
2579 pending_table_character_tokens = []
2580 head_element_pointer = null
2582 # tokenizer initialization
2583 tok_state = tok_state_data
2592 # everything below is tests on the above
# Minimal test helper: compares output with expected_output using `is`
# (strict equality) and reports the result on the console.
# description: label printed only when the comparison fails
# output: the value actually produced
# expected_output: the value that was expected
# Logs "passed." on a match; the three remaining log lines print the
# description, the expected value and the actual value.
# NOTE(review): the `else` separating the two outcomes is not visible in this
# listing -- confirm.
2593 test_equals = (description, output, expected_output) ->
2594 if output is expected_output
2595 console.log "passed." # don't say name, so smart consoles can merge all of these
2597 console.log "FAILED: \"#{description}\""
2598 console.log " Expected: #{expected_output}"
2599 console.log " Actual: #{output}"
# Builds a string from a list of nodes by appending, for each element t, the
# result of `t.serialize shallow, show_ids`.
# els: the nodes to serialize (callers pass open_els or a parse result)
# shallow, show_ids: passed straight through to each node's serialize method
# NOTE(review): the loop header, any separator between elements and the
# return statement are not visible in this listing -- confirm against the
# full function.
2600 serialize_els = (els, shallow, show_ids) ->
2606 serialized += t.serialize shallow, show_ids
# Runs one parser test case and reports the result on the console.
# args.name: label for the test
# args.html: HTML snippet handed to parse_html
# args.expected: expected serialization of the children of <body>; it is
#   wrapped in the fixed html/head/body skeleton below before comparing
# The node id counter (prev_node_id) is reset first. The parse result is
# serialized with shallow = false and show_ids = false and compared with the
# wrapped expectation. On a mismatch the test name, input, expected and
# actual serializations and any collected parse errors are logged; the last
# line logs the name of a passing test.
# NOTE(review): the definitions of errors_cb and parse_errors, the body of
# the debug_log_each callback and the `else` lines are not visible in this
# listing -- confirm against the full function.
2608 test_parser = (args) ->
2613 prev_node_id = 0 # reset counter
2614 parsed = parse_html args.html, errors_cb
2615 serialized = serialize_els parsed, false, false
2616 expected = 'tag:"html",{},[tag:"head",{},[],tag:"body",{},[' + args.expected + ']]'
2617 if serialized isnt expected
2618 debug_log_each (str) ->
2620 console.log "FAILED: \"#{args.name}\""
2621 console.log " Input: #{args.html}"
2622 console.log " Correct: #{expected}"
2623 console.log " Output: #{serialized}"
2624 if parse_errors.length > 0
2625 console.log " parse errs: #{JSON.stringify parse_errors}"
2627 console.log " No parse errors"
2629 console.log "passed \"#{args.name}\""
# Basic test cases: plain text, character references, attribute quoting
# styles, character-reference exceptions inside attribute values, and
# mis-nested or unclosed tags. Each call passes name / html / expected, where
# expected is the serialization of the children of <body>.
# NOTE(review): a few html:/expected: lines are not visible in this listing
# (e.g. for the "empty" and "just text" cases) -- confirm against the full
# file.
2631 test_parser name: "empty", \
2634 test_parser name: "just text", \
2636 expected: 'text:"abc"'
2637 test_parser name: "named entity", \
2639 expected: 'text:"a&1234"'
2640 test_parser name: "broken named character references", \
2641 html: "1&2&&3&aabbcc;",
2642 expected: 'text:"1&2&&3&aabbcc;"'
2643 test_parser name: "numbered entity overrides", \
2644 html: "1€€ ƒ",
2645 expected: 'text:"1€€ ƒ"'
2646 test_parser name: "open tag", \
2647 html: "foo<span>bar",
2648 expected: 'text:"foo",tag:"span",{},[text:"bar"]'
2649 test_parser name: "open tag with attributes", \
2650 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
2651 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
2652 test_parser name: "open tag with attributes of various quotings", \
2653 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
2654 expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
2655 test_parser name: "attribute entity exceptions dq", \
2656 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
2657 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2658 test_parser name: "attribute entity exceptions sq", \
2659 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
2660 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2661 test_parser name: "attribute entity exceptions uq", \
2662 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
2663 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2664 test_parser name: "matching closing tags", \
2665 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
2666 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
2667 test_parser name: "missing closing tag inside", \
2668 html: "foo<div>bar<span>baz</div>qux",
2669 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
2670 test_parser name: "mis-matched closing tags", \
2671 html: "<span>12<div>34</span>56</div>78",
2672 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
2673 test_parser name: "mis-matched formatting elements", \
2674 html: "12<b>34<i>56</b>78</i>90",
2675 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
2676 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
2677 html: '<p>1<b>2<i>3</b>4</i>5</p>',
2678 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
2679 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
2680 html: '<b>1<p>2</b>3</p>',
2681 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
2682 test_parser name: "crazy formatting elements test", \
2683 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
2684 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
2685 # firefox does this:
2686 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
# Test cases for mis-nested formatting elements ("aaa" in the test names
# presumably stands for the adoption agency algorithm), including foster
# parenting out of tables, end-of-input handling and attribute ordering.
# Each call passes name / html / expected, where expected is the
# serialization of the children of <body>. Cases that need svg/namespace
# support are kept commented out.
2687 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
2688 test_parser name: "html5lib aaa 1", \
2689 html: '<a><p></a></p>',
2690 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
2691 test_parser name: "html5lib aaa 2", \
2692 html: '<a>1<p>2</a>3</p>',
2693 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
2694 test_parser name: "html5lib aaa 3", \
2695 html: '<a>1<button>2</a>3</button>',
2696 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
2697 test_parser name: "html5lib aaa 4", \
2698 html: '<a>1<b>2</a>3</b>',
2699 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
2700 test_parser name: "html5lib aaa 5 (two divs deep)", \
2701 html: '<a>1<div>2<div>3</a>4</div>5</div>',
2702 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
2703 test_parser name: "html5lib aaa 6 (foster parenting)", \
2704 html: '<table><a>1<p>2</a>3</p>',
2705 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
2706 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2707 html: '<b><b><a><p></a>',
2708 expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2709 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2710 html: '<b><a><b><p></a>',
2711 expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2712 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2713 html: '<a><b><b><p></a>',
2714 expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2715 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2716 html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2717 expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
2718 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2719 html: '<table><a>1<td>2</td>3</table>',
2720 expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
2721 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2722 html: '<table>A<td>B</td>C</table>',
2723 expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2724 # TODO implement svg and namespacing
2725 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2726 # html: '<a><svg><tr><input></a>',
2727 # expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
2728 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2729 html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2730 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2731 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2732 html: '<div><a><b><u><i><code><div></a>',
2733 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2734 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2735 html: '<b><b><b><b>x</b></b></b></b>y',
2736 expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
2737 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2738 html: '<p><b><b><b><b><p>x',
2739 expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
2740 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2741 html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2742 expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
2743 test_parser name: "junk after attribute close-quote", \
2744 html: '<p><b c="d", e="f">foo<p>x',
2745 expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
2746 test_parser name: "html5lib aaa02 1", \
2747 html: '<b>1<i>2<p>3</b>4',
2748 expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
2749 test_parser name: "html5lib aaa02 2", \
2750 html: '<a><div><style></style><address><a>',
2751 expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
# Table-related test cases (named after the html5lib "tables" tests):
# implied tbody/tr/colgroup elements, stray end tags inside tables, <select>
# inside <table>, and content that is moved in front of the table (foster
# parenting). Each call passes name / html / expected, where expected is the
# serialization of the children of <body>. The svg case is kept commented
# out until svg parsing exists.
2752 test_parser name: "html5lib tables 1", \
2753 html: '<table><th>',
2754 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
2755 test_parser name: "html5lib tables 2", \
2756 html: '<table><td>',
2757 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
2758 test_parser name: "html5lib tables 3", \
2759 html: "<table><col foo='bar'>",
2760 expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
2761 test_parser name: "html5lib tables 4", \
2762 html: '<table><colgroup></html>foo',
2763 expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
2764 test_parser name: "html5lib tables 5", \
2765 html: '<table></table><p>foo',
2766 expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
2767 test_parser name: "html5lib tables 6", \
2768 html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
2769 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
2770 test_parser name: "html5lib tables 7", \
2771 html: '<table><select><option>3</select></table>',
2772 expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
2773 test_parser name: "html5lib tables 8", \
2774 html: '<table><select><table></table></select></table>',
2775 expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
2776 test_parser name: "html5lib tables 9", \
2777 html: '<table><select></table>',
2778 expected: 'tag:"select",{},[],tag:"table",{},[]'
2779 test_parser name: "html5lib tables 10", \
2780 html: '<table><select><option>A<tr><td>B</td></tr></table>',
2781 expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2782 test_parser name: "html5lib tables 11", \
2783 html: '<table><td></body></caption></col></colgroup></html>foo',
2784 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2785 test_parser name: "html5lib tables 12", \
2786 html: '<table><td>A</table>B',
2787 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
2788 test_parser name: "html5lib tables 13", \
2789 html: '<table><tr><caption>',
2790 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
2791 test_parser name: "html5lib tables 14", \
2792 html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
2793 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2794 test_parser name: "html5lib tables 15", \
2795 html: '<table><td><tr>',
2796 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
2797 test_parser name: "html5lib tables 16", \
2798 html: '<table><td><button><td>',
2799 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
2800 # TODO implement svg parsing
2801 #test_parser name: "html5lib tables 17", \
2802 # html: '<table><tr><td><svg><desc><td>',
2803 # expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'