1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an object of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
56 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
79 constructor: (type, args = {}) ->
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {}
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85 @children = args.children ? []
86 @namespace = args.namespace ? NS_HTML
87 @parent = args.parent ? null
88 @token = args.token ? null
92 @id = "#{++prev_node_id}"
93 shallow_clone: -> # return a new node that's the same except without the children or parent
94 # WARNING this doesn't work right on open tags that are still being parsed
96 attrs[k] = v for k, v of @attrs
97 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
98 acknowledge_self_closing: ->
100 @token.flag 'did_self_close'
102 @flag 'did_self_close', true
105 serialize: (shallow = false, show_ids = false) -> # for unit tests
110 ret += JSON.stringify @name
125 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
131 ret += c.serialize shallow, show_ids
135 ret += JSON.stringify @text
138 ret += JSON.stringify @text
144 when TYPE_AAA_BOOKMARK
145 ret += 'aaa_bookmark'
148 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
151 # helpers: (only take args that are normally known when parser creates nodes)
# Build a TYPE_START_TAG node that carries only the given tag name.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name: name})
# Build a TYPE_END_TAG node that carries only the given tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name: name})
# Build a TYPE_TAG (element) node with the given name; all other Node
# fields take the constructor's defaults.
new_element = (name) ->
	new Node(TYPE_TAG, {name: name})
# Build a TYPE_TEXT node whose text is txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are represented by the very same node shape, so this is
# just a second name for new_text_node.
new_character_token = new_text_node
# Build a TYPE_COMMENT node whose text is txt.
new_comment_node = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
164 return new Node TYPE_EOF
166 return new Node TYPE_AFE_MARKER
# Build a TYPE_AAA_BOOKMARK placeholder node (no name, text or children are
# passed, so the Node constructor defaults apply).
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character-class strings; the later ones are built by concatenating the
# earlier ones.
170 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
171 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
172 digits = "0123456789"
# ASCII letters of both cases followed by the decimal digits
173 alnum = lc_alpha + uc_alpha + digits
# decimal digits plus a-f in both cases
174 hex_chars = digits + "abcdefABCDEF"
176 # some SVG elements have dashes in them
177 tag_name_chars = alnum + "-"
179 # http://www.w3.org/TR/html5/infrastructure.html#space-character
180 space_chars = "\u0009\u000a\u000c\u000d\u0020"
182 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True only when token t is a TYPE_TEXT token whose text is exactly one
# character long and that character appears in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	return space_chars.indexOf(t.text) isnt -1
186 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
187 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
189 # These are the character references that don't need a terminating semicolon
190 # min length: 2, max: 6, none are a prefix of any other.
192 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
193 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
194 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
195 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
196 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
197 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
198 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
199 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
200 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
201 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
202 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
203 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
204 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
205 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
206 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
207 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
208 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists grouped by element kind (void, raw text, escapable raw
# text), as the variable names indicate.
# NOTE(review): how these lists are consumed is not visible here - confirm
# against the tokenizer/tree-construction code before changing them.
212 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
213 raw_text_elements = ['script', 'style']
214 escapable_raw_text_elements = ['textarea', 'title']
215 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
217 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
218 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
219 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
220 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
221 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
222 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
223 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
224 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
225 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
226 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
227 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
228 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
229 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
230 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
234 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
236 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
237 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
238 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
239 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
240 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
241 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
242 'determinant', 'diff', 'divergence', 'divide', 'domain',
243 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
244 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
245 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
246 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
247 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
248 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
249 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
250 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
251 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
252 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
253 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
254 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
255 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
256 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
257 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
258 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
259 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
260 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
261 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
262 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
263 'vectorproduct', 'xor'
265 # foreign_elements = [svg_elements..., mathml_elements...]
266 #normal_elements = All other allowed HTML elements are normal elements.
270 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
271 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
272 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
273 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
274 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
275 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
276 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
277 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
278 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
279 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
280 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
281 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
282 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
283 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
284 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
285 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
286 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
287 wbr:NS_HTML, xmp:NS_HTML,
290 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
291 'annotation-xml':NS_MATHML,
294 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
297 formatting_elements = {
298 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
299 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
303 foster_parenting_targets = {
# Reports whether element e is listed in special_elements: the table is
# keyed by tag name and stores the namespace that name must have, so the
# element matches only when both its name and namespace agree.
el_is_special = (e) ->
	expected_ns = special_elements[e.name]
	expected_ns is e.namespace
328 # decode_named_char_ref()
330 # The list of named character references is _huge_ so ask the browser to decode
331 # for us instead of wasting bandwidth/space on including the table here.
333 # Pass without the "&" but with the ";" examples:
334 # for "&amp;" pass "amp;"
335 # for "′" pass "x2032;"
338 textarea: document.createElement('textarea')
340 # TODO test this in IE8
341 decode_named_char_ref = (txt) ->
343 decoded = g_dncr.cache[txt]
344 return decoded if decoded?
345 g_dncr.textarea.innerHTML = txt
346 decoded = g_dncr.textarea.value
347 return null if decoded is txt
348 return g_dncr.cache[txt] = decoded
350 parse_html = (txt, parse_error_cb = null) ->
351 cur = 0 # index of next char in txt to be parsed
352 # declare doc and tokenizer variables so they're in scope below
354 open_els = null # stack of open elements
355 afe = null # active formatting elements
356 template_insertion_modes = null
357 insertion_mode = null
358 original_insertion_mode = null
360 tok_cur_tag = null # partially parsed tag
361 flag_scripting = null
362 flag_frameset_ok = null
364 flag_foster_parenting = null
365 form_element_pointer = null
366 temporary_buffer = null
367 pending_table_character_tokens = null
368 head_element_pointer = null
369 flag_fragment_parsing = null
378 console.log "Parse error at character #{cur} of #{txt.length}"
380 afe_push = (new_el) ->
383 if el.name is new_el.name and el.namespace is new_el.namespace
385 continue unless new_el.attrs[k] is v
386 for k, v of new_el.attrs
387 continue unless el.attrs[k] is v
394 afe.unshift new_afe_marker()
396 # the functions below implement the Tree Construction algorithm
397 # http://www.w3.org/TR/html5/syntax.html#tree-construction
399 # But first... the helpers
400 template_tag_is_open = ->
402 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
405 is_in_scope_x = (tag_name, scope, namespace) ->
407 if t.name is tag_name and (namespace is null or namespace is t.namespace)
409 if scope[t.name] is t.namespace
412 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
414 if t.name is tag_name and (namespace is null or namespace is t.namespace)
416 if scope[t.name] is t.namespace
418 if scope2[t.name] is t.namespace
421 standard_scopers = { # FIXME these are supposed to be namespace specific
422 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
423 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
424 template: NS_HTML, mi: NS_MATHML,
426 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
427 'annotation-xml': NS_MATHML,
429 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
431 button_scopers = button: NS_HTML
432 li_scopers = ol: NS_HTML, ul: NS_HTML
433 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# "In scope" test: delegates to is_in_scope_x with the standard_scopers
# table. namespace defaults to null, which is_in_scope_x treats as "any".
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# "In button scope" test: same as the standard scope check, but with
# button_scopers passed as the additional table.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# "In table scope" test: delegates to is_in_scope_x using only the
# table_scopers table (html, table, template).
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# http://www.w3.org/TR/html5/syntax.html#has-an-element-in-select-scope
#
# Returns true when an element named tag_name is in "select scope" on the
# stack of open elements, false otherwise.
#
#   tag_name:  element name to look for
#   namespace: when not null, the element must also be in this namespace
#
# open_els is walked from index 0 (the current node) upward. The walk stops
# with false at the first element that is not the target and is anything
# other than an HTML optgroup or HTML option - those two are the only
# elements that do not end select scope.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		# Nodes store their namespace in `namespace` (there is no `ns`
		# property), and the two tests must be joined explicitly: without an
		# operator CoffeeScript would parse `NS_HTML t.name ...` as a call.
		if t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt 'option')
			return false
	return false
447 # this checks for a particular element, not by name
448 el_is_in_scope = (el) ->
452 if standard_scopers[t.name] is t.namespace
456 clear_to_table_stopers = {
461 clear_stack_to_table_context = ->
463 if clear_to_table_stopers[open_els[0].name]?
467 clear_to_table_body_stopers = {
474 clear_stack_to_table_body_context = ->
476 if clear_to_table_body_stopers[open_els[0].name]?
480 clear_to_table_row_stopers = {
485 clear_stack_to_table_row_context = ->
487 if clear_to_table_row_stopers[open_els[0].name]?
491 clear_afe_to_marker = ->
494 if el.type is TYPE_AFE_MARKER
498 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
499 reset_insertion_mode = ->
500 # 1. Let last be false.
502 # 2. Let node be the last node in the stack of open elements.
504 node = open_els[node_i]
505 # 3. Loop: If node is the first node in the stack of open elements,
506 # then set last to true, and, if the parser was originally created as
507 # part of the HTML fragment parsing algorithm (fragment case) set node
508 # to the context element.
510 if node_i is open_els.length - 1
512 # fixfull (fragment case)
514 # 4. If node is a select element, run these substeps:
515 if node.name is 'select'
516 # 1. If last is true, jump to the step below labeled done.
518 # 2. Let ancestor be node.
521 # 3. Loop: If ancestor is the first node in the stack of
522 # open elements, jump to the step below labeled done.
524 if ancestor_i is open_els.length - 1
526 # 4. Let ancestor be the node before ancestor in the stack
529 ancestor = open_els[ancestor_i]
530 # 5. If ancestor is a template node, jump to the step below
532 if ancestor.name is 'template'
534 # 6. If ancestor is a table node, switch the insertion mode
535 # to "in select in table" and abort these steps.
536 if ancestor.name is 'table'
537 insertion_mode = ins_mode_in_select_in_table
539 # 7. Jump back to the step labeled loop.
540 # 8. Done: Switch the insertion mode to "in select" and abort
542 insertion_mode = ins_mode_in_select
544 # 5. If node is a td or th element and last is false, then switch
545 # the insertion mode to "in cell" and abort these steps.
546 if (node.name is 'td' or node.name is 'th') and last is false
547 insertion_mode = ins_mode_in_cell
549 # 6. If node is a tr element, then switch the insertion mode to "in
550 # row" and abort these steps.
552 insertion_mode = ins_mode_in_row
554 # 7. If node is a tbody, thead, or tfoot element, then switch the
555 # insertion mode to "in table body" and abort these steps.
556 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
557 insertion_mode = ins_mode_in_table_body
559 # 8. If node is a caption element, then switch the insertion mode
560 # to "in caption" and abort these steps.
561 if node.name is 'caption'
562 insertion_mode = ins_mode_in_caption
564 # 9. If node is a colgroup element, then switch the insertion mode
565 # to "in column group" and abort these steps.
566 if node.name is 'colgroup'
567 insertion_mode = ins_mode_in_column_group
569 # 10. If node is a table element, then switch the insertion mode to
570 # "in table" and abort these steps.
571 if node.name is 'table'
572 insertion_mode = ins_mode_in_table
574 # 11. If node is a template element, then switch the insertion mode
575 # to the current template insertion mode and abort these steps.
576 # fixfull (template insertion mode stack)
578 # 12. If node is a head element and last is true, then switch the
579 # insertion mode to "in body" ("in body"! not "in head"!) and abort
580 # these steps. (fragment case)
581 if node.name is 'head' and last
582 insertion_mode = ins_mode_in_body
584 # 13. If node is a head element and last is false, then switch the
585 # insertion mode to "in head" and abort these steps.
586 if node.name is 'head' and last is false
587 insertion_mode = ins_mode_in_head
589 # 14. If node is a body element, then switch the insertion mode to
590 # "in body" and abort these steps.
591 if node.name is 'body'
592 insertion_mode = ins_mode_in_body
594 # 15. If node is a frameset element, then switch the insertion mode
595 # to "in frameset" and abort these steps. (fragment case)
596 if node.name is 'frameset'
597 insertion_mode = ins_mode_in_frameset
599 # 16. If node is an html element, run these substeps:
600 if node.name is 'html'
601 # 1. If the head element pointer is null, switch the insertion
602 # mode to "before head" and abort these steps. (fragment case)
603 # fixfull (fragment case)
605 # 2. Otherwise, the head element pointer is not null, switch
606 # the insertion mode to "after head" and abort these steps.
607 insertion_mode = ins_mode_in_body # FIXME fixfull
609 # 17. If last is true, then switch the insertion mode to "in body"
610 # and abort these steps. (fragment case)
612 insertion_mode = ins_mode_in_body
614 # 18. Let node now be the node before node in the stack of open
617 node = open_els[node_i]
618 # 19. Return to the step labeled loop.
620 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
621 # this implementation is structured (mostly) as described at the link above.
622 # capitalized comments are the "labels" described at the link above.
623 reconstruct_active_formatting_elements = ->
624 return if afe.length is 0
625 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
630 if i is afe.length - 1
633 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
638 el = afe[i].shallow_clone()
639 tree_insert_element el
644 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
645 # adoption agency algorithm
647 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
648 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
649 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
650 adoption_agency = (subject) ->
651 debug_log "adoption_agency()"
652 debug_log "tree: #{serialize_els doc.children, false, true}"
653 debug_log "open_els: #{serialize_els open_els, true, true}"
654 debug_log "afe: #{serialize_els afe, true, true}"
655 if open_els[0].name is subject
658 # remove it from the list of active formatting elements (if found)
663 debug_log "aaa: starting off with subject on top of stack, exiting"
670 # 5. Let formatting element be the last element in the list of
671 # active formatting elements that: is between the end of the list
672 # and the last scope marker in the list, if any, or the start of
673 # the list otherwise, and has the tag name subject.
675 for t, fe_of_afe in afe
676 if t.type is TYPE_AFE_MARKER
681 # If there is no such element, then abort these steps and instead
682 # act as described in the "any other end tag" entry above.
684 debug_log "aaa: fe not found in afe"
685 in_body_any_other_end_tag subject
687 # 6. If formatting element is not in the stack of open elements,
688 # then this is a parse error; remove the element from the list, and
691 for t, fe_of_open_els in open_els
696 debug_log "aaa: fe not found in open_els"
698 # "remove it from the list" must mean afe, since it's not in open_els
699 afe.splice fe_of_afe, 1
701 # 7. If formatting element is in the stack of open elements, but
702 # the element is not in scope, then this is a parse error; abort
704 unless el_is_in_scope fe
705 debug_log "aaa: fe not in scope"
708 # 8. If formatting element is not the current node, this is a parse
709 # error. (But do not abort these steps.)
710 unless open_els[0] is fe
713 # 9. Let furthest block be the topmost node in the stack of open
714 # elements that is lower in the stack than formatting element, and
715 # is an element in the special category. There might not be one.
717 fb_of_open_els = null
724 # and continue, to see if there's one that's more "topmost"
725 # 10. If there is no furthest block, then the UA must first pop all
726 # the nodes from the bottom of the stack of open elements, from the
727 # current node up to and including formatting element, then remove
728 # formatting element from the list of active formatting elements,
729 # and finally abort these steps.
731 debug_log "aaa: no fb"
735 afe.splice fe_of_afe, 1
737 # 11. Let common ancestor be the element immediately above
738 # formatting element in the stack of open elements.
739 ca = open_els[fe_of_open_els + 1] # common ancestor
741 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
742 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
743 bookmark = new_aaa_bookmark()
746 afe.splice i, 0, bookmark
748 node = last_node = fb
752 # 3. Let node be the element immediately above node in the
753 # stack of open elements, or if node is no longer in the stack
754 # of open elements (e.g. because it got removed by this
755 # algorithm), the element that was immediately above node in
756 # the stack of open elements before node was removed.
760 node_next = open_els[i + 1]
762 node = node_next ? node_above
763 debug_log "inner loop #{inner}"
764 debug_log "tree: #{serialize_els doc.children, false, true}"
765 debug_log "open_els: #{serialize_els open_els, true, true}"
766 debug_log "afe: #{serialize_els afe, true, true}"
767 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
768 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
769 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
770 debug_log "node: #{node.serialize true, true}"
771 # TODO make sure node_above gets re-set if/when node is removed from open_els
773 # 4. If node is formatting element, then go to the next step in
774 # the overall algorithm.
778 # 5. If inner loop counter is greater than three and node is in
779 # the list of active formatting elements, then remove node from
780 # the list of active formatting elements.
786 debug_log "max out inner"
791 # 6. If node is not in the list of active formatting elements,
792 # then remove node from the stack of open elements and then go
793 # back to the step labeled inner loop.
795 debug_log "not in afe"
798 node_above = open_els[i + 1]
802 debug_log "the bones"
803 # 7. create an element for the token for which the element node
804 # was created, in the HTML namespace, with common ancestor as
805 # the intended parent; replace the entry for node in the list
806 # of active formatting elements with an entry for the new
807 # element, replace the entry for node in the stack of open
808 # elements with an entry for the new element, and let node be
810 new_node = node.shallow_clone()
814 debug_log "replaced in afe"
818 node_above = open_els[i + 1]
819 open_els[i] = new_node
820 debug_log "replaced in open_els"
823 # 8. If last node is furthest block, then move the
824 # aforementioned bookmark to be immediately after the new node
825 # in the list of active formatting elements.
830 debug_log "removed bookmark"
834 # "after" means lower
835 afe.splice i, 0, bookmark # "after as <-
836 debug_log "placed bookmark after node"
837 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
839 # 9. Insert last node into node, first removing it from its
840 # previous parent node if any.
842 debug_log "last_node has parent"
843 for c, i in last_node.parent.children
845 debug_log "removing last_node from parent"
846 last_node.parent.children.splice i, 1
848 node.children.push last_node
849 last_node.parent = node
850 # 10. Let last node be node.
853 # 11. Return to the step labeled inner loop.
854 # 14. Insert whatever last node ended up being in the previous step
855 # at the appropriate place for inserting a node, but using common
856 # ancestor as the override target.
858 # In the case where fe is immediately followed by fb:
859 # * inner loop exits out early (node==fe)
861 # * last_node is still in the tree (not a duplicate)
863 debug_log "FEFIRST? last_node has parent"
864 for c, i in last_node.parent.children
866 debug_log "removing last_node from parent"
867 last_node.parent.children.splice i, 1
870 debug_log "after aaa inner loop"
871 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
872 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
873 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
874 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
875 debug_log "tree: #{serialize_els doc.children, false, true}"
880 # can't use standard insert token thing, because it's already in
881 # open_els and must stay at its current position in open_els
882 dest = adjusted_insertion_location ca
883 dest[0].children.splice dest[1], 0, last_node
884 last_node.parent = dest[0]
887 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
888 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
889 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
890 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
891 debug_log "tree: #{serialize_els doc.children, false, true}"
893 # 15. Create an element for the token for which formatting element
894 # was created, in the HTML namespace, with furthest block as the
896 new_element = fe.shallow_clone() # FIXME intended parent thing
897 # 16. Take all of the child nodes of furthest block and append them
898 # to the element created in the last step.
899 while fb.children.length
900 t = fb.children.shift()
901 t.parent = new_element
902 new_element.children.push t
903 # 17. Append that new element to furthest block.
904 new_element.parent = fb
905 fb.children.push new_element
906 # 18. Remove formatting element from the list of active formatting
907 # elements, and insert the new element into the list of active
908 # formatting elements at the position of the aforementioned
918 # 19. Remove formatting element from the stack of open elements,
919 # and insert the new element into the stack of open elements
920 # immediately below the position of furthest block in that stack.
927 open_els.splice i, 0, new_element
929 # 20. Jump back to the step labeled outer loop.
930 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
931 debug_log "tree: #{serialize_els doc.children, false, true}"
932 debug_log "open_els: #{serialize_els open_els, true, true}"
933 debug_log "afe: #{serialize_els afe, true, true}"
936 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
938 generate_implied_end_tags 'p' # arg is exception
939 if open_els[0].name isnt 'p'
941 while open_els.length > 1 # just in case
942 el = open_els.shift()
945 close_p_if_in_button_scope = ->
946 if is_in_button_scope 'p'
949 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
950 # aka insert_a_character = (t) ->
951 insert_character = (t) ->
952 dest = adjusted_insertion_location()
953 # fixfull check for Document node
955 prev = dest[0].children[dest[1] - 1]
956 if prev.type is TYPE_TEXT
959 dest[0].children.splice dest[1], 0, t
962 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
963 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes where the next node should be inserted.
#   override_target: optional node to use as the target instead of the
#                    current node
#   returns: [target, target_i] -- the node to insert into and the index
#            within target.children at which to insert
964 adjusted_insertion_location = (override_target = null) ->
965 # 1. If there was an override target specified, then let target be the
968 target = override_target
969 else # Otherwise, let target be the current node.
971 # 2. Determine the adjusted insertion location using the first matching
972 # steps from the following list:
974 # If foster parenting is enabled and target is a table, tbody, tfoot,
975 # thead, or tr element Foster parenting happens when content is
976 # misnested in tables.
977 if flag_foster_parenting and foster_parenting_targets[target.name]
978 loop # once. this is here so we can ``break`` to "abort these substeps"
979 # 1. Let last template be the last template element in the
980 # stack of open elements, if any.
982 last_template_i = null
983 for el, i in open_els
984 if el.name is 'template'
988 # 2. Let last table be the last table element in the stack of
989 # open elements, if any.
992 for el, i in open_els
993 if el.name is 'table'
997 # 3. If there is a last template and either there is no last
998 # table, or there is one, but last template is lower (more
999 # recently added) than last table in the stack of open
1000 # elements, then: let adjusted insertion location be inside
1001 # last template's template contents, after its last child (if
1002 # any), and abort these substeps.
1003 if last_template and (last_table is null or last_template_i < last_table_i)
# insert inside the template element that the condition above just tested
1004 target = last_template # fixfull should be its contents
1005 target_i = target.children.length
1007 # 4. If there is no last table, then let adjusted insertion
1008 # location be inside the first element in the stack of open
1009 # elements (the html element), after its last child (if any),
1010 # and abort these substeps. (fragment case)
1011 if last_table is null
# the "first" element of the stack is stored at the highest index
1013 target = open_els[open_els.length - 1]
1014 target_i = target.children.length
1015 # 5. If last table has a parent element, then let adjusted
1016 # insertion location be inside last table's parent element,
1017 # immediately before last table, and abort these substeps.
1018 if last_table.parent?
1019 for c, i in last_table.parent.children
1021 target = last_table.parent
1025 # 6. Let previous element be the element immediately above last
1026 # table in the stack of open elements.
1028 # huh? how could it not have a parent?
1029 previous_element = open_els[last_table_i + 1]
1030 # 7. Let adjusted insertion location be inside previous
1031 # element, after its last child (if any).
1032 target = previous_element
1033 target_i = target.children.length
1034 # Note: These steps are involved in part because it's possible
1035 # for elements, the table element in this case in particular,
1036 # to have been moved by a script around in the DOM, or indeed
1037 # removed from the DOM entirely, after the element was inserted
1039 break # don't really loop
1041 # Otherwise Let adjusted insertion location be inside target, after
1042 # its last child (if any).
1043 target_i = target.children.length
1045 # 3. If the adjusted insertion location is inside a template element,
1046 # let it instead be inside the template element's template contents,
1047 # after its last child (if any).
1048 # fixfull (template)
1050 # 4. Return the adjusted insertion location.
1051 return [target, target_i]
1053 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1054 # aka create_an_element_for_token
# Builds an element Node from a tag token.
#   t: tag token; its type is overwritten with TYPE_TAG and the token is kept
#      on the new node as `token`
#   namespace: stored on the new node
#   intended_parent: not referenced in the lines visible here
1055 token_to_element = (t, namespace, intended_parent) ->
1056 t.type = TYPE_TAG # not TYPE_START_TAG
1057 # convert attributes into a hash
1059 while t.attrs_a.length
1061 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1062 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1064 # TODO 2. If the newly created element has an xmlns attribute in the
1065 # XMLNS namespace whose value is not exactly the same as the element's
1066 # namespace, that is a parse error. Similarly, if the newly created
1067 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1068 # value is not the XLink Namespace, that is a parse error.
1070 # fixfull: the spec says stuff about form pointers and ownerDocument
1074 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the
# children of the adjusted insertion location.
# NOTE(review): ail_el / ail_i are presumably ail[0] / ail[1]; their
# assignments are not visible in this excerpt -- confirm.
1075 insert_foreign_element = (token, namespace) ->
1076 ail = adjusted_insertion_location()
1079 el = token_to_element token, namespace, ail_el
1080 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1082 ail_el.children.splice ail_i, 0, el
1085 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Alias: HTML elements are inserted with the same routine as foreign elements.
# NOTE(review): callers visible in this file pass only a token, so `namespace`
# is undefined on this path -- confirm that is intended.
1086 insert_html_element = insert_foreign_element # (token, namespace) ->
1088 # FIXME read implement "foster parenting" part
1089 # FIXME read spec, do this right
1090 # FIXME implement the override target thing
1091 # note: this assumes it's an open tag
1092 # FIXME what part of the spec is this?
1093 # TODO look through all callers of this, and see what they should really be doing.
1094 # eg probably insert_html_element for tokens
# Inserts el at the adjusted insertion location. A start-tag token is first
# converted to an element with token_to_element.
#   el: element Node or TYPE_START_TAG token
#   override_target: forwarded to adjusted_insertion_location
#   namespace: namespace for the element; an element that still has none
#              takes the namespace of the node it is inserted into
1095 tree_insert_element = (el, override_target = null, namespace = null) ->
1097 el.namespace = namespace
1098 dest = adjusted_insertion_location override_target
1099 if el.type is TYPE_START_TAG # means it's a "token"
1100 el = token_to_element el, namespace, dest[0]
1101 unless el.namespace?
# dest is [node, index]: the namespace lives on the node, and it has to be
# stored on el (not on the local `namespace`) to have any effect
1102 el.namespace = dest[0].namespace
1103 # fixfull: Document nodes sometimes can't accept more children
1104 dest[0].children.splice dest[1], 0, el
1109 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1110 # position should be [node, index_within_children]
# Inserts comment token t at `position`; when no position is given the
# adjusted insertion location is used.
1111 insert_comment = (t, position = null) ->
1112 position ?= adjusted_insertion_location()
1113 position[0].children.splice position[1], 0, t
1116 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an element for token t, points the tokenizer at tok_state_rawtext,
# remembers the current insertion mode in original_insertion_mode and
# switches to ins_mode_text.
1117 parse_generic_raw_text = (t) ->
1118 insert_html_element t
1119 tok_state = tok_state_rawtext
1120 original_insertion_mode = insertion_mode
1121 insertion_mode = ins_mode_text
# RCDATA variant of parse_generic_raw_text: inserts an element for token t,
# points the tokenizer at tok_state_rcdata, remembers the current insertion
# mode in original_insertion_mode and switches to ins_mode_text.
1122 parse_generic_rcdata_text = (t) ->
1123 insert_html_element t
1124 tok_state = tok_state_rcdata
1125 original_insertion_mode = insertion_mode
1126 insertion_mode = ins_mode_text
1128 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1129 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) is listed in end_tag_implied and
# its name is not the excepted one.
#   except: tag name to leave alone (null for none)
# NOTE(review): the loop body is not visible in this excerpt; presumably it
# pops the current node -- confirm.
1130 generate_implied_end_tags = (except = null) ->
1131 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1134 # 8.2.5.4 The rules for parsing tokens in HTML content
1135 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1137 # 8.2.5.4.1 The "initial" insertion mode
1138 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handler for the "initial" insertion mode; t is the token being processed.
# Comment and doctype tokens are tested for explicitly; the last visible path
# switches to ins_mode_before_html and hands t to the new mode.
1139 ins_mode_initial = (t) ->
1142 if t.type is TYPE_COMMENT
1143 # fixfull this is supposed to be "the last child of the document object"
1146 if t.type is TYPE_DOCTYPE
1150 insertion_mode = ins_mode_before_html
1153 #fixfull (iframe, quirks)
1154 insertion_mode = ins_mode_before_html
1155 insertion_mode t # reprocess the token
1158 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handler for the "before html" insertion mode; t is the token being processed.
# An <html> start tag becomes an element pushed onto open_els; the last
# visible path synthesizes an 'html' start tag instead and appends the
# resulting element to doc.children. Both paths move to ins_mode_before_head.
1159 ins_mode_before_html = (t) ->
1160 if t.type is TYPE_DOCTYPE
1163 if t.type is TYPE_COMMENT
1168 if t.type is TYPE_START_TAG and t.name is 'html'
1169 el = token_to_element t, NS_HTML, doc
1170 open_els.unshift(el)
1171 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1172 insertion_mode = ins_mode_before_head
1174 if t.type is TYPE_END_TAG
1175 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1176 # fall through to "anything else"
1181 html_tok = new_open_tag 'html'
1182 el = token_to_element html_tok, NS_HTML, doc
1183 doc.children.push el
1185 # ?fixfull browsing context
1186 insertion_mode = ins_mode_before_head
1190 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handler for the "before head" insertion mode; t is the token being processed.
# A <head> start tag is inserted and remembered in head_element_pointer; the
# last visible path inserts a synthesized 'head' tag instead, then hands t to
# ins_mode_in_head.
1191 ins_mode_before_head = (t) ->
1194 if t.type is TYPE_COMMENT
1197 if t.type is TYPE_DOCTYPE
1200 if t.type is TYPE_START_TAG and t.name is 'html'
1203 if t.type is TYPE_START_TAG and t.name is 'head'
1204 el = insert_html_element t
1205 head_element_pointer = el
1206 insertion_mode = ins_mode_in_head
1207 if t.type is TYPE_END_TAG
1208 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1209 # fall through to Anything else below
1214 head_tok = new_open_tag 'head'
1215 el = insert_html_element head_tok
1216 head_element_pointer = el
1217 insertion_mode = ins_mode_in_head
1218 insertion_mode t # reprocess current token
1220 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the in-head mode: pops the current node off
# open_els and switches to ins_mode_after_head.
# NOTE(review): whether t is then reprocessed is not visible in this excerpt.
1221 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1222 open_els.shift() # spec says this will be a 'head' node
1223 insertion_mode = ins_mode_after_head
# Handler for the "in head" insertion mode; t is the token being processed.
# Dispatches on t.type / t.name: metadata elements are inserted directly,
# title/noscript/noframes/style/script switch the tokenizer and insertion mode
# for their text content, template start/end tags maintain
# template_insertion_modes, and the remaining tokens go to
# ins_mode_in_head_else.
1225 ins_mode_in_head = (t) ->
1226 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1229 if t.type is TYPE_COMMENT
1232 if t.type is TYPE_DOCTYPE
1235 if t.type is TYPE_START_TAG and t.name is 'html'
1238 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1239 el = insert_html_element t
1241 t.acknowledge_self_closing()
1243 if t.type is TYPE_START_TAG and t.name is 'meta'
1244 el = insert_html_element t
1246 t.acknowledge_self_closing()
1247 # fixfull encoding stuff
1249 if t.type is TYPE_START_TAG and t.name is 'title'
# the RCDATA helper defined in this file is parse_generic_rcdata_text
1250 parse_generic_rcdata_text t
1252 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1253 parse_generic_raw_text t
1255 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1256 insert_html_element t
1257 insertion_mode = ins_mode_in_head_noscript # FIXME implement
1259 if t.type is TYPE_START_TAG and t.name is 'script'
1260 ail = adjusted_insertion_location()
# ail is [node, index]; the intended parent is the node
1261 el = token_to_element t, NS_HTML, ail[0]
1262 el.flag_parser_inserted true # FIXME implement
1263 # fixfull fragment case
1264 ail[0].children.splice ail[1], 0, el
1266 tok_state = tok_state_script_data
1267 original_insertion_mode = insertion_mode # make sure orig... is defined
1268 insertion_mode = ins_mode_text # FIXME implement
1270 if t.type is TYPE_END_TAG and t.name is 'head'
1271 open_els.shift() # will be a head element... spec says so
1272 insertion_mode = ins_mode_after_head
1274 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1275 ins_mode_in_head_else t
1277 if t.type is TYPE_START_TAG and t.name is 'template'
1278 insert_html_element t
1280 flag_frameset_ok = false
1281 insertion_mode = ins_mode_in_template
1282 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1284 if t.type is TYPE_END_TAG and t.name is 'template'
1285 if template_tag_is_open()
# parentheses are required: a bare name does not call the function
1286 generate_implied_end_tags()
1287 if open_els[0].name isnt 'template'
1290 el = open_els.shift()
1291 if el.name is 'template'
1293 clear_afe_to_marker()
1294 template_insertion_modes.shift()
1295 reset_insertion_mode()
1299 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1302 ins_mode_in_head_else t
1304 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# Placeholder for the "in head noscript" insertion mode: the visible body only
# logs that the mode is unimplemented.
1305 ins_mode_in_head_noscript = (t) ->
1307 console.log "ins_mode_in_head_noscript unimplemented"
1309 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the after-head mode: inserts a synthesized 'body'
# start tag, switches to ins_mode_in_body and hands t to that mode.
1310 ins_mode_after_head_else = (t) ->
1311 body_tok = new_open_tag 'body'
1312 insert_html_element body_tok
1313 insertion_mode = ins_mode_in_body
1314 insertion_mode t # reprocess token
# Handler for the "after head" insertion mode; t is the token being processed.
# <body> and <frameset> start tags are inserted and select the next mode;
# head-only start tags temporarily push head_element_pointer back onto
# open_els and remove it again afterwards; the remaining tokens go to
# ins_mode_after_head_else.
1316 ins_mode_after_head = (t) ->
1320 if t.type is TYPE_COMMENT
1323 if t.type is TYPE_DOCTYPE
1326 if t.type is TYPE_START_TAG and t.name is 'html'
1329 if t.type is TYPE_START_TAG and t.name is 'body'
1330 insert_html_element t
1331 flag_frameset_ok = false
1332 insertion_mode = ins_mode_in_body
1334 if t.type is TYPE_START_TAG and t.name is 'frameset'
1335 insert_html_element t
1336 insertion_mode = ins_mode_in_frameset
1338 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1340 open_els.unshift head_element_pointer
# array iteration needs `in`: with `of`, el would be the key and i the element
1342 for el, i in open_els
1343 if el is head_element_pointer
1344 open_els.splice i, 1
1346 console.log "warning: 23904 couldn't find head element in open_els"
1348 if t.type is TYPE_END_TAG and t.name is 'template'
1351 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1352 ins_mode_after_head_else t
1354 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1358 ins_mode_after_head_else t
1360 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling for the in-body mode.
#   name: tag name of the end tag
# Walks open_els from the current node (index 0). On a node with a matching
# name it generates implied end tags (except for `name`) and reports a parse
# error unless the match is the current node. Nodes listed in
# special_elements are tested for separately.
1361 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1362 for node, i in open_els
1363 if node.name is name # FIXME check namespace too
1364 generate_implied_end_tags name # arg is exception
1365 parse_error() unless i is 0
1370 if special_elements[node.name]? # FIXME check namespace too
# Handler for the "in body" insertion mode; t is the token being processed.
# The visible `when` clauses dispatch on whitespace characters and on tag
# names: block-level start tags first call close_p_if_in_button_scope,
# formatting start tags first call reconstruct_active_formatting_elements,
# formatting end tags run adoption_agency, and other end tags go to
# in_body_any_other_end_tag.
# NOTE(review): the enclosing switch/if lines are not visible in this excerpt,
# so which clauses belong to start tags vs end tags is inferred from their
# bodies -- confirm against the full file.
1373 ins_mode_in_body = (t) ->
1379 when "\t", "\u000a", "\u000c", "\u000d", ' '
1380 reconstruct_active_formatting_elements()
1383 reconstruct_active_formatting_elements()
1385 flag_frameset_ok = false
1394 return if template_tag_is_open()
# the root element sits at the highest index of open_els
1395 root_attrs = open_els[open_els.length - 1].attrs
# only fill in attributes the root element does not already have
1397 root_attrs[k] = v unless root_attrs[k]?
1398 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1399 # FIXME also do this for </template> (end tag)
1400 return ins_mode_in_head t
1407 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1408 close_p_if_in_button_scope()
1409 insert_html_element t
1410 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1411 close_p_if_in_button_scope()
1412 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1415 insert_html_element t
1416 # TODO lots more to implement here
1418 # If the list of active formatting elements
1419 # contains an a element between the end of the list and
1420 # the last marker on the list (or the start of the list
1421 # if there is no marker on the list), then this is a
1422 # parse error; run the adoption agency algorithm for
1423 # the tag name "a", then remove that element from the
1424 # list of active formatting elements and the stack of
1425 # open elements if the adoption agency algorithm didn't
1426 # already remove it (it might not have if the element
1427 # is not in table scope).
1430 if el.type is TYPE_AFE_MARKER
1440 for el, i in open_els
1442 open_els.splice i, 1
1443 reconstruct_active_formatting_elements()
1444 el = insert_html_element t
1446 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1447 reconstruct_active_formatting_elements()
1448 el = insert_html_element t
1451 # fixfull quirksmode thing
1452 close_p_if_in_button_scope()
1453 insert_html_element t
1454 insertion_mode = ins_mode_in_table
1455 # TODO lots more to implement here
1456 else # any other start tag
1457 reconstruct_active_formatting_elements()
1458 insert_html_element t
1461 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1462 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1465 unless ok_tags[t.name]?
1468 # TODO stack of template insertion modes thing
1473 unless is_in_scope 'body'
1476 # TODO implement parse error and move to tree_after_body
1478 unless is_in_scope 'body' # weird, but it's what the spec says
1481 # TODO implement parse error and move to tree_after_body, reprocess
1482 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1483 unless is_in_scope t.name, NS_HTML
1486 generate_implied_end_tags()
1487 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1490 el = open_els.shift()
1491 if el.name is t.name and el.namespace is NS_HTML
1493 # TODO lots more close tags to implement here
1495 unless is_in_button_scope 'p'
1497 insert_html_element new_open_tag 'p'
1499 # TODO lots more close tags to implement here
1500 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1501 adoption_agency t.name
1502 # TODO lots more close tags to implement here
1504 in_body_any_other_end_tag t.name
# "Anything else" branch of the in-table mode: turns flag_foster_parenting on
# and later off again.
# NOTE(review): the statement executed between the two assignments is not
# visible in this excerpt -- presumably the token is processed by the in-body
# rules while foster parenting is enabled; confirm.
1507 ins_mode_in_table_else = (t) ->
1509 flag_foster_parenting = true # FIXME
1511 flag_foster_parenting = false
# Lookup table indexed by name; ins_mode_in_table consults it before switching
# to ins_mode_in_table_text. Its entries are not visible in this excerpt.
1512 can_in_table = { # FIXME do this inline like everywhere else
1520 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Handler for the "text" insertion mode; t is the token being processed.
# On EOF and on end tags the mode saved in original_insertion_mode is
# restored; reaching the end of the function logs a warning.
1521 ins_mode_text = (t) ->
1522 if t.type is TYPE_TEXT
1525 if t.type is TYPE_EOF
1527 if open_els[0].name is 'script'
1528 open_els[0].flag 'already started', true
1530 insertion_mode = original_insertion_mode
1533 if t.type is TYPE_END_TAG and t.name is 'script'
1535 insertion_mode = original_insertion_mode
1536 # fixfull the spec seems to assume that I'm going to run the script
1537 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1539 if t.type is TYPE_END_TAG
1541 insertion_mode = original_insertion_mode
1543 console.log 'warning: end of ins_mode_text reached'
1545 # the functions below implement the tokenizer states described here:
1546 # http://www.w3.org/TR/html5/syntax.html#tokenization
1548 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Handler for the "in table" insertion mode; t is the token being processed.
# Table-structure start tags clear the stack to a table context, insert the
# element (or a synthesized colgroup/tbody) and select the matching table
# sub-mode. The visible table-closing paths pop open_els looking for 'table'
# and then call reset_insertion_mode. Remaining tokens go to
# ins_mode_in_table_else.
# NOTE(review): the enclosing switch/when lines for several clauses are not
# visible in this excerpt.
1549 ins_mode_in_table = (t) ->
1552 if can_in_table[t.name]
# remember the current mode so ins_mode_in_table_text can return to it
1553 original_insertion_mode = insertion_mode
1554 insertion_mode = ins_mode_in_table_text
1557 ins_mode_in_table_else t
1565 clear_stack_to_table_context()
1567 insert_html_element t
1568 insertion_mode = ins_mode_in_caption
1570 clear_stack_to_table_context()
1571 insert_html_element t
1572 insertion_mode = ins_mode_in_column_group
1574 clear_stack_to_table_context()
1575 insert_html_element new_open_tag 'colgroup'
1576 insertion_mode = ins_mode_in_column_group
1578 when 'tbody', 'tfoot', 'thead'
1579 clear_stack_to_table_context()
1580 insert_html_element t
1581 insertion_mode = ins_mode_in_table_body
1582 when 'td', 'th', 'tr'
1583 clear_stack_to_table_context()
1584 insert_html_element new_open_tag 'tbody'
1585 insertion_mode = ins_mode_in_table_body
1589 if is_in_table_scope 'table'
1591 el = open_els.shift()
1592 if el.name is 'table'
1594 reset_insertion_mode()
1596 when 'style', 'script', 'template'
1599 if token_is_input_hidden t
1600 ins_mode_in_table_else t
1603 el = insert_html_element t
1605 t.acknowledge_self_closing()
1608 if form_element_pointer?
1610 if template_tag_is_open()
1612 form_element_pointer = insert_html_element t
1615 ins_mode_in_table_else t
1619 if is_in_table_scope 'table'
1621 el = open_els.shift()
1622 if el.name is 'table'
1624 reset_insertion_mode()
1627 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1632 ins_mode_in_table_else t
1636 ins_mode_in_table_else t
1639 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Handler for the "in table text" insertion mode; t is the token being
# processed. Text tokens are collected in pending_table_character_tokens.
# The pending tokens are later checked with is_space_tok and either inserted
# with insert_character or passed to ins_mode_in_table_else; the list is then
# emptied and the mode saved in original_insertion_mode is restored.
1640 ins_mode_in_table_text = (t) ->
1641 if t.type is TYPE_TEXT and t.text is "\u0000"
1642 # huh? I thought the tokenizer didn't emit these
1645 if t.type is TYPE_TEXT
1646 pending_table_character_tokens.push t
1650 for old in pending_table_character_tokens
1651 unless is_space_tok old
1655 for old in pending_table_character_tokens
1656 insert_character old
1658 for old in pending_table_character_tokens
# the in-table "anything else" handler defined in this file
1659 ins_mode_in_table_else old
1660 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1661 insertion_mode = original_insertion_mode
1664 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Handler for the "in caption" insertion mode; t is the token being processed.
# A </caption> end tag, or a table-structure tag that implies one, pops
# open_els looking for the caption, clears the active formatting elements to
# the last marker and returns to the "in table" insertion mode.
1665 ins_mode_in_caption = (t) ->
1666 if t.type is TYPE_END_TAG and t.name is 'caption'
1667 if is_in_table_scope 'caption'
1668 generate_implied_end_tags()
1669 if open_els[0].name isnt 'caption'
1672 el = open_els.shift()
1673 if el.name is 'caption'
1675 clear_afe_to_marker()
# the in-table handler defined in this file is ins_mode_in_table
1676 insertion_mode = ins_mode_in_table
1681 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1683 if is_in_table_scope 'caption'
1685 el = open_els.shift()
1686 if el.name is 'caption'
1688 clear_afe_to_marker()
1689 insertion_mode = ins_mode_in_table
1691 # else fragment case
1693 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1699 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Handler for the "in column group" insertion mode; t is the token being
# processed. <col> start tags are inserted and their self-closing flag
# acknowledged; </colgroup> and the last visible path both check that the
# current node is a colgroup and switch to ins_mode_in_table.
1700 ins_mode_in_column_group = (t) ->
1704 if t.type is TYPE_COMMENT
1707 if t.type is TYPE_DOCTYPE
1710 if t.type is TYPE_START_TAG and t.name is 'html'
1713 if t.type is TYPE_START_TAG and t.name is 'col'
1714 el = insert_html_element t
1716 t.acknowledge_self_closing()
1718 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1719 if open_els[0].name is 'colgroup'
1721 insertion_mode = ins_mode_in_table
1725 if t.type is TYPE_END_TAG and t.name is 'col'
1728 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1731 if t.type is TYPE_EOF
1735 if open_els[0].name isnt 'colgroup'
1739 insertion_mode = ins_mode_in_table
1743 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Handler for the "in table body" insertion mode; t is the token being
# processed. <tr> (or a synthesized 'tr' for a bare th/td) is inserted after
# clearing the stack to a table body context and selects ins_mode_in_row;
# tokens that end the section clear the stack and return to
# ins_mode_in_table.
1744 ins_mode_in_table_body = (t) ->
1745 if t.type is TYPE_START_TAG and t.name is 'tr'
1746 clear_stack_to_table_body_context()
1747 insert_html_element t
1748 insertion_mode = ins_mode_in_row
1750 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1752 clear_stack_to_table_body_context()
1753 insert_html_element new_open_tag 'tr'
1754 insertion_mode = ins_mode_in_row
1757 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1758 unless is_in_table_scope t.name # fixfull check namespace
1761 clear_stack_to_table_body_context()
1763 insertion_mode = ins_mode_in_table
1765 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1768 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1771 if table_scopers[el.name]
1776 clear_stack_to_table_body_context()
1778 insertion_mode = ins_mode_in_table
1781 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1787 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Handler for the "in row" insertion mode; t is the token being processed.
# th/td start tags are inserted after clearing the stack to a table row
# context and select ins_mode_in_cell; tokens that end the row check that a
# 'tr' is in table scope, clear the stack and return to
# ins_mode_in_table_body.
1788 ins_mode_in_row = (t) ->
1789 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1790 clear_stack_to_table_row_context()
1791 insert_html_element t
1792 insertion_mode = ins_mode_in_cell
1795 if t.type is TYPE_END_TAG and t.name is 'tr'
1796 if is_in_table_scope 'tr'
1797 clear_stack_to_table_row_context()
1799 insertion_mode = ins_mode_in_table_body
1803 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1804 if is_in_table_scope 'tr'
1805 clear_stack_to_table_row_context()
1807 insertion_mode = ins_mode_in_table_body
1812 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1813 if is_in_table_scope t.name # fixfull namespace
1814 if is_in_table_scope 'tr'
1815 clear_stack_to_table_row_context()
1817 insertion_mode = ins_mode_in_table_body
1822 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1828 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell": generates implied end tags, pops open_els while checking
# for a td/th, clears the active formatting elements to the last marker and
# switches to ins_mode_in_row.
1830 generate_implied_end_tags()
# compare the current node's name on both sides; the node object itself never equals 'th'
1831 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1834 el = open_els.shift()
1835 if el.name is 'td' or el.name is 'th'
1837 clear_afe_to_marker()
1838 insertion_mode = ins_mode_in_row
1840 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Handler for the "in cell" insertion mode; t is the token being processed.
# A </td> or </th> whose element is in table scope generates implied end
# tags, pops open_els looking for that cell, clears the active formatting
# elements to the last marker and returns to ins_mode_in_row.
# NOTE(review): the bodies of the remaining branches are not visible in this
# excerpt.
1841 ins_mode_in_cell = (t) ->
1842 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1843 if is_in_table_scope t.name
1844 generate_implied_end_tags()
1845 if open_els[0].name isnt t.name
1848 el = open_els.shift()
1849 if el.name is t.name
1851 clear_afe_to_marker()
1852 insertion_mode = ins_mode_in_row
1856 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1859 if el.name is 'td' or el.name is 'th'
1862 if table_scopers[el.name]
1870 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1873 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1874 if is_in_table_scope t.name # fixfull namespace
1883 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Handler for the "in select" insertion mode; t is the token being processed.
# option/optgroup start tags are inserted after checking the current node;
# </select>, a nested <select>, and input/keygen/textarea start tags pop
# open_els looking for the select element and then call reset_insertion_mode.
1884 ins_mode_in_select = (t) ->
1885 if t.type is TYPE_TEXT and t.text is "\u0000"
1888 if t.type is TYPE_TEXT
1891 if t.type is TYPE_COMMENT
1894 if t.type is TYPE_DOCTYPE
1897 if t.type is TYPE_START_TAG and t.name is 'html'
1900 if t.type is TYPE_START_TAG and t.name is 'option'
1901 if open_els[0].name is 'option'
1903 insert_html_element t
1905 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1906 if open_els[0].name is 'option'
1908 if open_els[0].name is 'optgroup'
1910 insert_html_element t
1912 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# open_els[1] is the node immediately above the current node
1913 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1915 if open_els[0].name is 'optgroup'
1920 if t.type is TYPE_END_TAG and t.name is 'option'
1921 if open_els[0].name is 'option'
1926 if t.type is TYPE_END_TAG and t.name is 'select'
1927 if is_in_select_scope 'select'
1929 el = open_els.shift()
1930 if el.name is 'select'
1932 reset_insertion_mode()
1936 if t.type is TYPE_START_TAG and t.name is 'select'
1939 el = open_els.shift()
1940 if el.name is 'select'
1942 reset_insertion_mode()
1943 # spec says that this is the same as </select> but it doesn't say
1944 # to check scope first
1946 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1948 if is_in_select_scope 'select'
1951 el = open_els.shift()
1952 if el.name is 'select'
1954 reset_insertion_mode()
1957 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1960 if t.type is TYPE_EOF
1967 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Handler for the "in select in table" insertion mode; t is the token being
# processed. Table-structure start and end tags pop open_els looking for the
# select element and call reset_insertion_mode (end tags first check table
# scope); other tokens are delegated to ins_mode_in_select.
1968 ins_mode_in_select_in_table = (t) ->
1969 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1972 el = open_els.shift()
1973 if el.name is 'select'
1975 reset_insertion_mode()
1978 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1980 unless is_in_table_scope t.name, NS_HTML
1983 el = open_els.shift()
1984 if el.name is 'select'
1986 reset_insertion_mode()
1990 ins_mode_in_select t
1993 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Handler for the "in template" insertion mode; t is the token being
# processed. For each start tag it replaces the entry at the front of
# template_insertion_modes with the mode that fits the tag and makes that the
# current insertion mode. The last visible path pops open_els looking for the
# template element, clears the active formatting elements to the last marker,
# drops the front template insertion mode and calls reset_insertion_mode.
1994 ins_mode_in_template = (t) ->
1995 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
1998 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2001 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2002 template_insertion_modes.shift()
2003 template_insertion_modes.unshift ins_mode_in_table
2004 insertion_mode = ins_mode_in_table
2007 if t.type is TYPE_START_TAG and t.name is 'col'
2008 template_insertion_modes.shift()
2009 template_insertion_modes.unshift ins_mode_in_column_group
2010 insertion_mode = ins_mode_in_column_group
2013 if t.type is TYPE_START_TAG and t.name is 'tr'
2014 template_insertion_modes.shift()
2015 template_insertion_modes.unshift ins_mode_in_table_body
2016 insertion_mode = ins_mode_in_table_body
2019 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2020 template_insertion_modes.shift()
2021 template_insertion_modes.unshift ins_mode_in_row
2022 insertion_mode = ins_mode_in_row
2025 if t.type is TYPE_START_TAG
2026 template_insertion_modes.shift()
2027 template_insertion_modes.unshift ins_mode_in_body
2028 insertion_mode = ins_mode_in_body
2031 if t.type is TYPE_END_TAG
2035 unless template_tag_is_open()
2040 el = open_els.shift()
2041 if el.name is 'template' # fixfull check namespace
2043 clear_afe_to_marker()
2044 template_insertion_modes.shift()
2045 reset_insertion_mode()
2048 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Handler for the "after body" insertion mode; t is the token being processed.
# Comments are appended to the children of the current node (open_els[0]);
# </html> moves to ins_mode_after_after_body; the last visible path switches
# back to ins_mode_in_body.
2049 ins_mode_after_body = (t) ->
2053 if t.type is TYPE_COMMENT
2054 insert_comment t, [open_els[0], open_els[0].children.length]
2056 if t.type is TYPE_DOCTYPE
2059 if t.type is TYPE_START_TAG and t.name is 'html'
2062 if t.type is TYPE_END_TAG and t.name is 'html'
2063 # fixfull fragment case
2064 insertion_mode = ins_mode_after_after_body
2066 if t.type is TYPE_EOF
2071 insertion_mode = ins_mode_in_body
2074 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Handler for the "in frameset" insertion mode; t is the token being
# processed. <frameset> and <frame> start tags are inserted (the latter with
# its self-closing flag acknowledged); </frameset> can move the parser to
# ins_mode_after_frameset.
2075 ins_mode_in_frameset = (t) ->
2079 if t.type is TYPE_COMMENT
2082 if t.type is TYPE_DOCTYPE
2085 if t.type is TYPE_START_TAG and t.name is 'html'
2088 if t.type is TYPE_START_TAG and t.name is 'frameset'
2089 insert_html_element t
2091 if t.type is TYPE_END_TAG and t.name is 'frameset'
2092 # TODO ?correct for: "if the current node is the root html element"
2093 if open_els.length is 1
2095 return # fragment case
2097 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2098 insertion_mode = ins_mode_after_frameset
2100 if t.type is TYPE_START_TAG and t.name is 'frame'
2101 insert_html_element t
2103 t.acknowledge_self_closing()
2105 if t.type is TYPE_START_TAG and t.name is 'noframes'
2108 if t.type is TYPE_EOF
2109 # TODO ?correct for: "if the current node is not the root html element"
2110 if open_els.length isnt 1
2118 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Handler for the "after frameset" insertion mode; t is the token being
# processed. </html> moves the parser to ins_mode_after_after_frameset.
2119 ins_mode_after_frameset = (t) ->
2123 if t.type is TYPE_COMMENT
2126 if t.type is TYPE_DOCTYPE
2129 if t.type is TYPE_START_TAG and t.name is 'html'
2132 if t.type is TYPE_END_TAG and t.name is 'html'
# the parser state variable is insertion_mode
2133 insertion_mode = ins_mode_after_after_frameset
2135 if t.type is TYPE_START_TAG and t.name is 'noframes'
2138 if t.type is TYPE_EOF
2145 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Handler for the "after after body" insertion mode; t is the token being
# processed. Comments are appended to doc's children; the last visible path
# switches back to ins_mode_in_body.
2146 ins_mode_after_after_body = (t) ->
2147 if t.type is TYPE_COMMENT
2148 insert_comment t, [doc, doc.children.length]
2150 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2153 if t.type is TYPE_EOF
2158 insertion_mode = ins_mode_in_body
2161 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Handler for the "after after frameset" insertion mode; t is the token being
# processed. Comments are appended to doc's children.
# NOTE(review): the bodies of the other branches are not visible in this
# excerpt.
2162 ins_mode_after_after_frameset = (t) ->
2163 if t.type is TYPE_COMMENT
2164 insert_comment t, [doc, doc.children.length]
2166 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2169 if t.type is TYPE_EOF
2172 if t.type is TYPE_START_TAG and t.name is 'noframes'
2183 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer data state: consumes one character from txt (advancing cur) and
# either returns a text node / EOF token or switches to tok_state_tag_open.
# NOTE(review): the function's definition line and the `when` labels are not
# visible in this excerpt.
2185 switch c = txt.charAt(cur++)
2187 return new_text_node parse_character_reference()
2189 tok_state = tok_state_tag_open
2192 return new_text_node c
2194 return new_eof_token()
2196 return new_text_node c
2199 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2200 # not needed: tok_state_character_reference_in_data = ->
2201 # just call parse_character_reference()
2203 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer RCDATA state: consumes one character from txt (advancing cur) and
# returns a character-reference text node, a U+FFFD character token, an EOF
# token or the character itself, or switches to
# tok_state_rcdata_less_than_sign.
# NOTE(review): the character-reference branch uses new_text_node while the
# other branches use new_character_token -- confirm that is intentional.
2204 tok_state_rcdata = ->
2205 switch c = txt.charAt(cur++)
2207 return new_text_node parse_character_reference()
2209 tok_state = tok_state_rcdata_less_than_sign
2212 return new_character_token "\ufffd"
2214 return new_eof_token()
2216 return new_character_token c
2219 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2220 # not needed: tok_state_character_reference_in_rcdata = ->
2221 # just call parse_character_reference()
2223 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer RAWTEXT state: consumes one character, then either returns a token
# or switches tok_state. The `when` labels are not visible in this listing.
2224 tok_state_rawtext = ->
2225 switch c = txt.charAt(cur++)
2227 tok_state = tok_state_rawtext_less_than_sign
# emits U+FFFD (replacement character)
2230 return new_character_token "\ufffd"
2232 return new_eof_token()
# any other character is passed through unchanged
2234 return new_character_token c
2237 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer script-data state: consumes one character, then either returns a
# token or switches tok_state. The `when` labels are not visible in this listing.
2238 tok_state_script_data = ->
2239 switch c = txt.charAt(cur++)
2241 tok_state = tok_state_script_data_less_than_sign
# emits U+FFFD (replacement character)
2244 return new_character_token "\ufffd"
2246 return new_eof_token()
# any other character is passed through unchanged
2248 return new_character_token c
2251 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer PLAINTEXT state: consumes one character and returns a token for it.
# No tok_state change is visible here. The `when` labels are not visible in
# this listing.
2252 tok_state_plaintext = ->
2253 switch c = txt.charAt(cur++)
# emits U+FFFD (replacement character)
2256 return new_character_token "\ufffd"
2258 return new_eof_token()
2260 return new_character_token c
2264 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer tag-open state (entered from the data state via tok_state_tag_open).
# Consumes one character and picks the next state; the `when` labels for the
# first three transitions are not visible in this listing.
2265 tok_state_tag_open = ->
2266 switch c = txt.charAt(cur++)
2268 tok_state = tok_state_markup_declaration_open
2270 tok_state = tok_state_end_tag_open
2273 tok_state = tok_state_bogus_comment
# a letter starts a new open tag; the tag name is kept lowercase
2275 if lc_alpha.indexOf(c) > -1
2276 tok_cur_tag = new_open_tag c
2277 tok_state = tok_state_tag_name
2278 else if uc_alpha.indexOf(c) > -1
2279 tok_cur_tag = new_open_tag c.toLowerCase()
2280 tok_state = tok_state_tag_name
# not a tag after all: return to the data state, un-consume the character and
# emit the '<' as literal text
2283 tok_state = tok_state_data
2284 cur -= 1 # we didn't parse/handle the char after <
2285 return new_text_node '<'
2288 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer end-tag-open state. Consumes one character; the `when` labels are
# not visible in this listing.
2289 tok_state_end_tag_open = ->
2290 switch c = txt.charAt(cur++)
2293 tok_state = tok_state_data
2296 tok_state = tok_state_data
# the already-consumed "</" is emitted as literal text
2297 return new_text_node '</'
# a letter starts a new end tag; the tag name is kept lowercase
2299 if uc_alpha.indexOf(c) > -1
2300 tok_cur_tag = new_end_tag c.toLowerCase()
2301 tok_state = tok_state_tag_name
2302 else if lc_alpha.indexOf(c) > -1
2303 tok_cur_tag = new_end_tag c
2304 tok_state = tok_state_tag_name
2307 tok_state = tok_state_bogus_comment
2310 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer tag-name state: each call consumes one character and either
# extends tok_cur_tag.name or moves to another state. Some `when` labels are
# not visible in this listing.
2311 tok_state_tag_name = ->
2312 switch c = txt.charAt(cur++)
# whitespace ends the tag name
2313 when "\t", "\n", "\u000c", ' '
2314 tok_state = tok_state_before_attribute_name
2316 tok_state = tok_state_self_closing_start_tag
2318 tok_state = tok_state_data
# append U+FFFD (replacement character) to the name
2324 tok_cur_tag.name += "\ufffd"
2327 tok_state = tok_state_data
# uppercase letters are lowercased before being appended
2329 if uc_alpha.indexOf(c) > -1
2330 tok_cur_tag.name += c.toLowerCase()
2332 tok_cur_tag.name += c
2335 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer state entered from tok_state_rcdata. The condition lines of the
# two branches are not visible in this listing.
2336 tok_state_rcdata_less_than_sign = ->
2337 c = txt.charAt(cur++)
# possible end tag: start with an empty temporary buffer
2339 temporary_buffer = ''
2340 tok_state = tok_state_rcdata_end_tag_open
# otherwise go back to RCDATA and emit the '<' as a character
2343 tok_state = tok_state_rcdata
2344 cur -= 1 # reconsume the input character
2345 return new_character_token '<'
2347 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer RCDATA end-tag-open state: a letter starts a candidate end tag;
# anything else falls back to RCDATA.
2348 tok_state_rcdata_end_tag_open = ->
2349 c = txt.charAt(cur++)
# the tag name is lowercased, but temporary_buffer keeps the character as written
2350 if uc_alpha.indexOf(c) > -1
2351 tok_cur_tag = new_end_tag c.toLowerCase()
2352 temporary_buffer += c
2353 tok_state = tok_state_rcdata_end_tag_name
2355 if lc_alpha.indexOf(c) > -1
2356 tok_cur_tag = new_end_tag c
2357 temporary_buffer += c
2358 tok_state = tok_state_rcdata_end_tag_name
# not an end tag: back to RCDATA, emitting the consumed "</" as text
2361 tok_state = tok_state_rcdata
2362 cur -= 1 # reconsume the input character
2363 return new_character_token "</" # fixfull separate these
2365 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when `t` is an end-tag token whose name equals the name of
# open_els[0] (index 0 is the current node, per the stack notes at the top of
# this file).
2366 is_appropriate_end_tag = (t) ->
2367 # spec says to check against "the tag name of the last start tag to
2368 # have been emitted from this tokenizer", but this is only called from
2369 # the various "raw" states, which I'm pretty sure all push the start
2370 # token onto open_els. TODO: verify this after the script data states
2372 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
# NOTE(review): open_els[0] is dereferenced unconditionally, so this throws if
# open_els is empty -- confirm callers guarantee a non-empty stack
2373 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2375 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer RCDATA end-tag-name state: extends the candidate end tag one
# character per call. A terminator only ends the tag when
# is_appropriate_end_tag accepts it; otherwise the consumed text is emitted as
# characters. The condition lines for the second and third terminator are not
# visible in this listing.
2376 tok_state_rcdata_end_tag_name = ->
2377 c = txt.charAt(cur++)
2378 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2379 if is_appropriate_end_tag tok_cur_tag
2380 tok_state = tok_state_before_attribute_name
2382 # else fall through to "Anything else"
2384 if is_appropriate_end_tag tok_cur_tag
2385 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2387 # else fall through to "Anything else"
2389 if is_appropriate_end_tag tok_cur_tag
2390 tok_state = tok_state_data
2392 # else fall through to "Anything else"
# letters extend the (lowercased) tag name; temporary_buffer keeps the original case
2393 if uc_alpha.indexOf(c) > -1
2394 tok_cur_tag.name += c.toLowerCase()
2395 temporary_buffer += c
2397 if lc_alpha.indexOf(c) > -1
2398 tok_cur_tag.name += c
2399 temporary_buffer += c
# "Anything else": back to RCDATA, emitting "</" plus the buffered characters as text
2402 tok_state = tok_state_rcdata
2403 cur -= 1 # reconsume the input character
2404 return new_character_token '</' + temporary_buffer # fixfull separate these
2406 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer state entered from tok_state_rawtext. The condition lines of the
# two branches are not visible in this listing.
2407 tok_state_rawtext_less_than_sign = ->
2408 c = txt.charAt(cur++)
# possible end tag: start with an empty temporary buffer
2410 temporary_buffer = ''
2411 tok_state = tok_state_rawtext_end_tag_open
# otherwise go back to RAWTEXT and emit the '<' as a character
2414 tok_state = tok_state_rawtext
2415 cur -= 1 # reconsume the input character
2416 return new_character_token '<'
2418 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer RAWTEXT end-tag-open state: a letter starts a candidate end tag;
# anything else falls back to RAWTEXT.
2419 tok_state_rawtext_end_tag_open = ->
2420 c = txt.charAt(cur++)
# the tag name is lowercased, but temporary_buffer keeps the character as written
2421 if uc_alpha.indexOf(c) > -1
2422 tok_cur_tag = new_end_tag c.toLowerCase()
2423 temporary_buffer += c
2424 tok_state = tok_state_rawtext_end_tag_name
2426 if lc_alpha.indexOf(c) > -1
2427 tok_cur_tag = new_end_tag c
2428 temporary_buffer += c
2429 tok_state = tok_state_rawtext_end_tag_name
# not an end tag: back to RAWTEXT, emitting the consumed "</" as text
2432 tok_state = tok_state_rawtext
2433 cur -= 1 # reconsume the input character
2434 return new_character_token "</" # fixfull separate these
2436 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer RAWTEXT end-tag-name state: extends the candidate end tag one
# character per call. A terminator only ends the tag when
# is_appropriate_end_tag accepts it; otherwise the consumed text is emitted as
# characters. The condition lines for the second and third terminator are not
# visible in this listing.
2437 tok_state_rawtext_end_tag_name = ->
2438 c = txt.charAt(cur++)
2439 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2440 if is_appropriate_end_tag tok_cur_tag
2441 tok_state = tok_state_before_attribute_name
2443 # else fall through to "Anything else"
2445 if is_appropriate_end_tag tok_cur_tag
2446 tok_state = tok_state_self_closing_start_tag
2448 # else fall through to "Anything else"
2450 if is_appropriate_end_tag tok_cur_tag
2451 tok_state = tok_state_data
2453 # else fall through to "Anything else"
# letters extend the (lowercased) tag name; temporary_buffer keeps the original case
2454 if uc_alpha.indexOf(c) > -1
2455 tok_cur_tag.name += c.toLowerCase()
2456 temporary_buffer += c
2458 if lc_alpha.indexOf(c) > -1
2459 tok_cur_tag.name += c
2460 temporary_buffer += c
# "Anything else": back to RAWTEXT, emitting "</" plus the buffered characters as text
2463 tok_state = tok_state_rawtext
2464 cur -= 1 # reconsume the input character
2465 return new_character_token '</' + temporary_buffer # fixfull separate these
2467 # TODO _all_ of the missing states here (17-33) are for parsing script tags
2469 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer before-attribute-name state: consumes one character and, when it
# starts an attribute, pushes a new [name, ''] pair onto the front of
# tok_cur_tag.attrs_a. Several `when` labels and branch bodies are not visible
# in this listing.
2470 tok_state_before_attribute_name = ->
2472 switch c = txt.charAt(cur++)
2473 when "\t", "\n", "\u000c", ' '
2476 tok_state = tok_state_self_closing_start_tag
2479 tok_state = tok_state_data
# the attribute name starts with U+FFFD (replacement character) in this branch
2485 attr_name = "\ufffd"
2486 when '"', "'", '<', '='
2491 tok_state = tok_state_data
# uppercase first letter is stored lowercased
2493 if uc_alpha.indexOf(c) > -1
2494 attr_name = c.toLowerCase()
# new attributes go to the front, so attrs_a[0] is the attribute being built
2498 tok_cur_tag.attrs_a.unshift [attr_name, '']
2499 tok_state = tok_state_attribute_name
2502 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer attribute-name state: each call consumes one character and either
# extends the name of the attribute being built (tok_cur_tag.attrs_a[0][0]) or
# moves to another state. Several `when` labels are not visible in this listing.
2503 tok_state_attribute_name = ->
2504 switch c = txt.charAt(cur++)
# whitespace ends the attribute name
2505 when "\t", "\n", "\u000c", ' '
2506 tok_state = tok_state_after_attribute_name
2508 tok_state = tok_state_self_closing_start_tag
2510 tok_state = tok_state_before_attribute_value
2512 tok_state = tok_state_data
# Every character-consuming branch below appends to the name (+=). Plain
# assignment would discard the characters accumulated so far, e.g. `HREF`
# would end up as the one-letter name "f".
2518 tok_cur_tag.attrs_a[0][0] += "\ufffd"
2521 tok_cur_tag.attrs_a[0][0] += c
2524 tok_state = tok_state_data
# uppercase letters are lowercased before being appended
2526 if uc_alpha.indexOf(c) > -1
2527 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
2529 tok_cur_tag.attrs_a[0][0] += c
2532 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer after-attribute-name state: consumes one character after an
# attribute name. Either moves on (value, self-closing, data) or starts a new
# attribute by unshifting a [name, ''] pair onto tok_cur_tag.attrs_a. Most
# condition lines are not visible in this listing.
2533 tok_state_after_attribute_name = ->
2534 c = txt.charAt(cur++)
2535 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2538 tok_state = tok_state_self_closing_start_tag
2541 tok_state = tok_state_before_attribute_value
2544 tok_state = tok_state_data
# an uppercase letter starts a new attribute with a lowercased name
2546 if uc_alpha.indexOf(c) > -1
2547 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2548 tok_state = tok_state_attribute_name
# new attribute whose name starts with U+FFFD (replacement character)
2552 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2553 tok_state = tok_state_attribute_name
2557 tok_state = tok_state_data
2558 cur -= 1 # reconsume
2560 if c is '"' or c is "'" or c is '<'
2562 # fall through to Anything else
# "Anything else": the character itself starts a new attribute name
2564 tok_cur_tag.attrs_a.unshift [c, '']
2565 tok_state = tok_state_attribute_name
2567 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer before-attribute-value state: consumes one character and selects
# the double-quoted, single-quoted or unquoted value state. The value being
# built is tok_cur_tag.attrs_a[0][1]. Most `when` labels are not visible in
# this listing.
2568 tok_state_before_attribute_value = ->
2569 switch c = txt.charAt(cur++)
2570 when "\t", "\n", "\u000c", ' '
2573 tok_state = tok_state_attribute_value_double_quoted
2575 tok_state = tok_state_attribute_value_unquoted
2578 tok_state = tok_state_attribute_value_single_quoted
# value starts with U+FFFD (replacement character), then continues unquoted
2581 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2582 tok_state = tok_state_attribute_value_unquoted
2585 tok_state = tok_state_data
2591 tok_state = tok_state_data
# any other character is the first character of an unquoted value
2593 tok_cur_tag.attrs_a[0][1] += c
2594 tok_state = tok_state_attribute_value_unquoted
2597 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer state for a double-quoted attribute value: each call consumes one
# character and appends to tok_cur_tag.attrs_a[0][1] or leaves the state. The
# `when` labels are not visible in this listing.
2598 tok_state_attribute_value_double_quoted = ->
2599 switch c = txt.charAt(cur++)
2601 tok_state = tok_state_after_attribute_value_quoted
# character reference inside an attribute: '"' is the additional allowed
# character and in_attr is true
2603 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2606 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2609 tok_state = tok_state_data
2611 tok_cur_tag.attrs_a[0][1] += c
2614 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer state for a single-quoted attribute value: each call consumes one
# character and appends to tok_cur_tag.attrs_a[0][1] or leaves the state. The
# `when` labels are not visible in this listing.
2615 tok_state_attribute_value_single_quoted = ->
2616 switch c = txt.charAt(cur++)
2618 tok_state = tok_state_after_attribute_value_quoted
# character reference inside an attribute: "'" is the additional allowed
# character and in_attr is true
2620 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2623 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2626 tok_state = tok_state_data
2628 tok_cur_tag.attrs_a[0][1] += c
2631 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer state for an unquoted attribute value: each call consumes one
# character and appends to tok_cur_tag.attrs_a[0][1] or leaves the state. Some
# `when` labels are not visible in this listing.
2632 tok_state_attribute_value_unquoted = ->
2633 switch c = txt.charAt(cur++)
# whitespace ends an unquoted value
2634 when "\t", "\n", "\u000c", ' '
2635 tok_state = tok_state_before_attribute_name
# character reference inside an attribute: '>' is the additional allowed
# character and in_attr is true
2637 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2639 tok_state = tok_state_data
2644 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2647 tok_state = tok_state_data
2649 # Parse Error if ', <, = or ` (backtick)
# NOTE(review): no parse_error() call is visible for those characters in this
# listing; the character is appended either way
2650 tok_cur_tag.attrs_a[0][1] += c
2653 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state entered after the closing quote of an attribute value. Some
# `when` labels are not visible in this listing.
2654 tok_state_after_attribute_value_quoted = ->
2655 switch c = txt.charAt(cur++)
2656 when "\t", "\n", "\u000c", ' '
2657 tok_state = tok_state_before_attribute_name
2659 tok_state = tok_state_self_closing_start_tag
2661 tok_state = tok_state_data
2667 tok_state = tok_state_data
# anything else: hand the character back and let the before-attribute-name
# state process it
2670 tok_state = tok_state_before_attribute_name
2671 cur -= 1 # we didn't handle that char
2674 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2675 # Don't set this as a state, just call it
2676 # returns a string (NOT a text node)
# Parameters:
#   allowed_char - extra character that, when it follows the '&', means "not a
#                  character reference" (callers pass the attribute's closing
#                  delimiter)
#   in_attr      - true when called from an attribute value state
# Reads txt starting at cur (the character after the '&').
# NOTE(review): many lines of this function, including most return statements,
# are not visible in this listing; the notes below cover only what is shown.
2677 parse_character_reference = (allowed_char = null, in_attr = false) ->
# '&' was the last character of the input
2678 if cur >= txt.length
# peek at the next character (cur is not advanced here)
2680 switch c = txt.charAt(cur)
2681 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2682 # explicitly not a parse error
2685 # there has to be "one or more" alnums between & and ; to be a parse error
2688 if cur + 1 >= txt.length
# an 'x' or 'X' after the first character selects hexadecimal notation (presumably
# the first character is '#'; that line is not visible)
2690 if txt.charAt(cur + 1).toLowerCase() is 'x'
# count the characters belonging to `charset`, starting at `start`
2699 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2703 if txt.charAt(start + i) is ';'
2705 # FIXME This is supposed to generate parse errors for some chars
2706 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2713 if alnum.indexOf(txt.charAt(cur + i)) is -1
2716 # exit early, because parse_error() below needs at least one alnum
2718 if txt.charAt(cur + i) is ';'
2719 i += 1 # include ';' terminator in value
2720 decoded = decode_named_char_ref txt.substr(cur, i)
2727 # no ';' terminator (only legacy char refs)
# try candidate names of increasing length against legacy_char_refs
2729 for i in [2..max] # no prefix matches, so ok to check shortest first
2730 c = legacy_char_refs[txt.substr(cur, i)]
2733 if txt.charAt(cur + i) is '='
2734 # "because some legacy user agents will
2735 # misinterpret the markup in those cases"
2738 if alnum.indexOf(txt.charAt(cur + i)) > -1
2739 # this makes attributes forgiving about url args
2741 # ok, and besides the weird exceptions for attributes...
2742 # return the matching char
2743 cur += i # consume entity chars
2744 parse_error() # because no terminating ";"
2748 return # never reached
2750 # tree constructor initialization
2751 # see comments on TYPE_TAG/etc for the structure of this data
# root node: an 'html' tag node in the HTML namespace
2752 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2754 afe = [] # active formatting elements
2755 template_insertion_modes = []
# parsing starts in the "initial" insertion mode; the ins_mode_* handlers
# change modes by assigning this variable
2756 insertion_mode = ins_mode_initial
2757 original_insertion_mode = insertion_mode # TODO check spec
2758 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2759 flag_frameset_ok = true
2761 flag_foster_parenting = false
2762 form_element_pointer = null
# scratch string that the raw-text end-tag tokenizer states reset and append to
2763 temporary_buffer = null
2764 pending_table_character_tokens = []
2765 head_element_pointer = null
2766 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
2768 # tokenizer initialization
# the tokenizer starts in the data state; tok_state always holds the current state function
2769 tok_state = tok_state_data
2776 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# running totals of test outcomes
2779 test_results = passed: 0, failed: 0
2780 # everything below is tests on the above
# Compares `output` with `expected_output` using `is` (strict equality) and logs
# either "passed." or a failure report naming `description` and both values.
# NOTE(review): no update of test_results is visible in this listing.
2781 test_equals = (description, output, expected_output) ->
2782 if output is expected_output
2783 console.log "passed." # don't say name, so smart consoles can merge all of these
2785 console.log "FAILED: \"#{description}\""
2786 console.log " Expected: #{expected_output}"
2787 console.log " Actual: #{output}"
# Builds the string `serialized` by appending t.serialize(shallow, show_ids).
# NOTE(review): presumably `t` iterates over `els` and `serialized` is returned --
# the loop and return lines are not visible in this listing.
2788 serialize_els = (els, shallow, show_ids) ->
2794 serialized += t.serialize shallow, show_ids
# Runs one parser test case. `args` has: name (label used in failure output),
# html (input markup) and expected (serialized form of the expected body
# children). On a mismatch the details are logged and test_results.failed is
# incremented; otherwise test_results.passed is incremented.
# NOTE(review): the definitions of errors_cb and parse_errors are not visible
# in this listing.
2796 test_parser = (args) ->
2801 prev_node_id = 0 # reset counter
2802 parsed = parse_html args.html, errors_cb
2803 serialized = serialize_els parsed, false, false
# args.expected only describes the children of <body>; wrap it in the
# html/head/body skeleton before comparing
2804 expected = 'tag:"html",{},[tag:"head",{},[],tag:"body",{},[' + args.expected + ']]'
2805 if serialized isnt expected
2806 debug_log_each (str) ->
2808 console.log "FAILED: \"#{args.name}\""
2809 console.log " Input: #{args.html}"
2810 console.log " Correct: #{expected}"
2811 console.log " Output: #{serialized}"
2812 if parse_errors.length > 0
2813 console.log " parse errs: #{JSON.stringify parse_errors}"
2815 console.log " No parse errors"
2816 test_results.failed += 1
2818 #console.log "passed \"#{args.name}\""
2819 test_results.passed += 1
# Prints the pass/fail totals accumulated in test_results.
# NOTE(review): the line that opens the enclosing definition is not visible in this listing.
2821 console.log "Tests passed: #{test_results.passed}"
2822 console.log "Tests Failed: #{test_results.failed}"
# Test cases. Each test_parser call parses `html` and compares the serialized
# result with `expected` (the expected children of <body>, which test_parser
# wraps in the html/head/body skeleton).
2824 test_parser name: "empty", \
2827 test_parser name: "just text", \
2829 expected: 'text:"abc"'
2830 test_parser name: "named entity", \
2832 expected: 'text:"a&1234"'
2833 test_parser name: "broken named character references", \
2834 html: "1&2&&3&aabbcc;",
2835 expected: 'text:"1&2&&3&aabbcc;"'
2837 html: "1€€ ƒ",
2838 expected: 'text:"1€€ ƒ"'
2839 test_parser name: "open tag", \
2840 html: "foo<span>bar",
2841 expected: 'text:"foo",tag:"span",{},[text:"bar"]'
2842 test_parser name: "open tag with attributes", \
2843 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
2844 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
2845 test_parser name: "open tag with attributes of various quotings", \
2846 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
2847 expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
2848 test_parser name: "attribute entity exceptions dq", \
2849 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
2850 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2851 test_parser name: "attribute entity exceptions sq", \
2852 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
2853 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2854 test_parser name: "attribute entity exceptions uq", \
2855 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
2856 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2857 test_parser name: "matching closing tags", \
2858 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
2859 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
2860 test_parser name: "missing closing tag inside", \
2861 html: "foo<div>bar<span>baz</div>qux",
2862 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
2863 test_parser name: "mis-matched closing tags", \
2864 html: "<span>12<div>34</span>56</div>78",
2865 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
2866 test_parser name: "mis-matched formatting elements", \
2867 html: "12<b>34<i>56</b>78</i>90",
2868 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
2869 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
2870 html: '<p>1<b>2<i>3</b>4</i>5</p>',
2871 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
2872 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
2873 html: '<b>1<p>2</b>3</p>',
2874 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
2875 test_parser name: "crazy formatting elements test", \
2876 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
2877 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
2878 # firefox does this:
2879 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
2880 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
2881 test_parser name: "html5lib aaa 1", \
2882 html: '<a><p></a></p>',
2883 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
2884 test_parser name: "html5lib aaa 2", \
2885 html: '<a>1<p>2</a>3</p>',
2886 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
2887 test_parser name: "html5lib aaa 3", \
2888 html: '<a>1<button>2</a>3</button>',
2889 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
2890 test_parser name: "html5lib aaa 4", \
2891 html: '<a>1<b>2</a>3</b>',
2892 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
2893 test_parser name: "html5lib aaa 5 (two divs deep)", \
2894 html: '<a>1<div>2<div>3</a>4</div>5</div>',
2895 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
2896 test_parser name: "html5lib aaa 6 (foster parenting)", \
2897 html: '<table><a>1<p>2</a>3</p>',
2898 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
2899 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2900 html: '<b><b><a><p></a>',
2901 expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2902 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2903 html: '<b><a><b><p></a>',
2904 expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2905 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2906 html: '<a><b><b><p></a>',
2907 expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2908 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2909 html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2910 expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
2911 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2912 html: '<table><a>1<td>2</td>3</table>',
2913 expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
2914 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2915 html: '<table>A<td>B</td>C</table>',
2916 expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2917 # TODO implement svg and namespacing
2918 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2919 # html: '<a><svg><tr><input></a>',
2920 # expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
2921 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2922 html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2923 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2924 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2925 html: '<div><a><b><u><i><code><div></a>',
2926 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2927 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2928 html: '<b><b><b><b>x</b></b></b></b>y',
2929 expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
2930 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2931 html: '<p><b><b><b><b><p>x',
2932 expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
2933 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2934 html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2935 expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
2936 test_parser name: "junk after attribute close-quote", \
2937 html: '<p><b c="d", e="f">foo<p>x',
2938 expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
2939 test_parser name: "html5lib aaa02 1", \
2940 html: '<b>1<i>2<p>3</b>4',
2941 expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
2942 test_parser name: "html5lib aaa02 2", \
2943 html: '<a><div><style></style><address><a>',
2944 expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
2945 test_parser name: "html5lib tables 1", \
2946 html: '<table><th>',
2947 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
2948 test_parser name: "html5lib tables 2", \
2949 html: '<table><td>',
2950 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
2951 test_parser name: "html5lib tables 3", \
2952 html: "<table><col foo='bar'>",
2953 expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
2954 test_parser name: "html5lib tables 4", \
2955 html: '<table><colgroup></html>foo',
2956 expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
2957 test_parser name: "html5lib tables 5", \
2958 html: '<table></table><p>foo',
2959 expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
2960 test_parser name: "html5lib tables 6", \
2961 html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
2962 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
2963 test_parser name: "html5lib tables 7", \
2964 html: '<table><select><option>3</select></table>',
2965 expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
2966 test_parser name: "html5lib tables 8", \
2967 html: '<table><select><table></table></select></table>',
2968 expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
2969 test_parser name: "html5lib tables 9", \
2970 html: '<table><select></table>',
2971 expected: 'tag:"select",{},[],tag:"table",{},[]'
2972 test_parser name: "html5lib tables 10", \
2973 html: '<table><select><option>A<tr><td>B</td></tr></table>',
2974 expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2975 test_parser name: "html5lib tables 11", \
2976 html: '<table><td></body></caption></col></colgroup></html>foo',
2977 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2978 test_parser name: "html5lib tables 12", \
2979 html: '<table><td>A</table>B',
2980 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
2981 test_parser name: "html5lib tables 13", \
2982 html: '<table><tr><caption>',
2983 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
2984 test_parser name: "html5lib tables 14", \
2985 html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
2986 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2987 test_parser name: "html5lib tables 15", \
2988 html: '<table><td><tr>',
2989 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
2990 test_parser name: "html5lib tables 16", \
2991 html: '<table><td><button><td>',
2992 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
2993 # TODO implement svg parsing
2994 #test_parser name: "html5lib tables 17", \
2995 # html: '<table><tr><td><svg><desc><td>',
2996 # expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'