1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an object of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
56 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
79 constructor: (type, args = {}) ->
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {}
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85 @children = args.children ? []
86 @namespace = args.namespace ? NS_HTML
87 @parent = args.parent ? null
91 @id = "#{++prev_node_id}"
92 shallow_clone: -> # return a new node that's the same except without the children or parent
93 # WARNING this doesn't work right on open tags that are still being parsed
95 attrs[k] = v for k, v of @attrs
96 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97 acknowledge_self_closing: ->
99 serialize: (shallow = false, show_ids = false) -> # for unit tests
104 ret += JSON.stringify @name
119 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
125 ret += c.serialize shallow, show_ids
129 ret += JSON.stringify @text
132 ret += JSON.stringify @text
138 when TYPE_AAA_BOOKMARK
139 ret += 'aaa_bookmark'
142 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
145 # helpers: (only take args that are normally known when parser creates nodes)
# Build a TYPE_START_TAG token carrying only the tag name; every other Node
# field keeps the constructor's default.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name})
# Build a TYPE_END_TAG token carrying only the tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name})
# Build a TYPE_TAG node (an element as it appears in the tree) with the given
# tag name and default attrs/children/namespace.
new_element = (name) ->
	new Node(TYPE_TAG, {name})
# Build a TYPE_TEXT node whose text is `txt`.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens share the text-node representation, so this is an alias.
new_character_token = new_text_node
# Build a TYPE_COMMENT node whose text is `txt`.
new_comment_node = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
158 return new Node TYPE_EOF
160 return new Node TYPE_AFE_MARKER
# Build the placeholder node (TYPE_AAA_BOOKMARK) that the adoption agency
# algorithm splices into the list of active formatting elements.
new_aaa_bookmark = -> new Node(TYPE_AAA_BOOKMARK)
# ASCII character classes, each kept as a plain string of its member characters.
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# letters (both cases) followed by digits
alnum = [lc_alpha, uc_alpha, digits].join ''
# digits followed by a-f in both cases
hex_chars = "#{digits}abcdefABCDEF"
170 # some SVG elements have dashes in them
171 tag_name_chars = alnum + "-"
173 # http://www.w3.org/TR/html5/infrastructure.html#space-character
174 space_chars = "\u0009\u000a\u000c\u000d\u0020"
176 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
177 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
179 # These are the character references that don't need a terminating semicolon
180 # min length: 2, max: 6, none are a prefix of any other.
182 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
183 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
184 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
185 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
186 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
187 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
188 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
189 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
190 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
191 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
192 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
193 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
194 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
195 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
196 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
197 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
198 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
202 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
203 raw_text_elements = ['script', 'style']
204 escapable_raw_text_elements = ['textarea', 'title']
205 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
207 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
208 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
209 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
210 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
211 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
212 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
213 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
214 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
215 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
216 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
217 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
218 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
219 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
220 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
224 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
226 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
227 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
228 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
229 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
230 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
231 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
232 'determinant', 'diff', 'divergence', 'divide', 'domain',
233 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
234 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
235 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
236 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
237 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
238 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
239 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
240 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
241 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
242 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
243 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
244 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
245 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
246 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
247 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
248 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
249 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
250 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
251 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
252 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
253 'vectorproduct', 'xor'
255 # foreign_elements = [svg_elements..., mathml_elements...]
256 #normal_elements = All other allowed HTML elements are normal elements.
260 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
261 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
262 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
263 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
264 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
265 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
266 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
267 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
268 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
269 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
270 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
271 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
272 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
273 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
274 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
275 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
276 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
277 wbr:NS_HTML, xmp:NS_HTML,
280 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
281 'annotation-xml':NS_MATHML,
284 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
287 formatting_elements = {
288 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
289 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
293 foster_parenting_targets = {
# Report whether element `e` is in the "special" category.
#
# special_elements is expected to map each special tag name to the namespace
# in which that name is special, so the entry must equal the element's own
# namespace. A bare existence check is wrong in two ways:
#   * an element from another namespace that merely shares its name with a
#     special element would be reported as special
#   * a tag whose name collides with a property inherited from
#     Object.prototype (e.g. "constructor") would be reported as special
#
# NOTE(review): `title` is listed for both HTML and SVG in the table; a single
# object key can only hold one namespace, so confirm which entry is meant to
# survive.
el_is_special = (e) ->
	return special_elements[e.name] is e.namespace
320 # decode_named_char_ref()
322 # The list of named character references is _huge_ so ask the browser to decode
323 # for us instead of wasting bandwidth/space on including the table here.
325 # Pass without the "&" but with the ";" examples:
326 # for "&" pass "amp;"
327 # for "′" pass "x2032;"
330 textarea: document.createElement('textarea')
332 # TODO test this in IE8
333 decode_named_char_ref = (txt) ->
335 decoded = g_dncr.cache[txt]
336 return decoded if decoded?
337 g_dncr.textarea.innerHTML = txt
338 decoded = g_dncr.textarea.value
339 return null if decoded is txt
340 return g_dncr.cache[txt] = decoded
342 parse_html = (txt, parse_error_cb = null) ->
343 cur = 0 # index of next char in txt to be parsed
344 # declare tree and tokenizer variables so they're in scope below
346 open_els = null # stack of open elements
347 afe = null # active formatting elements
348 template_insertion_modes = null
349 insertion_mode = null
350 original_insertion_mode = null
352 tok_cur_tag = null # partially parsed tag
353 flag_scripting = null
354 flag_frameset_ok = null
356 flag_foster_parenting = null
357 form_element_pointer = null
358 temporary_buffer = null
364 console.log "Parse error at character #{cur} of #{txt.length}"
366 afe_push = (new_el) ->
369 if el.name is new_el.name and el.namespace is new_el.namespace
371 continue unless new_el.attrs[k] is v
372 for k, v of new_el.attrs
373 continue unless el.attrs[k] is v
380 afe.unshift new_afe_marker()
382 # the functions below implement the Tree Construction algorithm
383 # http://www.w3.org/TR/html5/syntax.html#tree-construction
385 # But first... the helpers
386 template_tag_is_open = ->
388 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
391 is_in_scope_x = (tag_name, scope, namespace) ->
393 if t.name is tag_name and (namespace is null or namespace is t.namespace)
395 if scope[t.name] is t.namespace
398 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
400 if t.name is tag_name and (namespace is null or namespace is t.namespace)
402 if scope[t.name] is t.namespace
404 if scope2[t.name] is t.namespace
407 standard_scopers = { # FIXME these are supposed to be namespace specific
408 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
409 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
410 template: NS_HTML, mi: NS_MATHML,
412 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
413 'annotation-xml': NS_MATHML,
415 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
417 button_scopers = button: NS_HTML
418 li_scopers = ol: NS_HTML, ul: NS_HTML
419 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Scope test against the standard scoping elements: forwards to is_in_scope_x
# with standard_scopers as the scope table. With the default namespace (null)
# candidates are matched on tag name alone.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Button-scope test: forwards to is_in_scope_x_y with standard_scopers and
# button_scopers as the two scope tables. With the default namespace (null)
# candidates are matched on tag name alone.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Table-scope test: forwards to is_in_scope_x with table_scopers as the scope
# table. With the default namespace (null) candidates are matched on tag name
# alone.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# "Has an element in select scope" test.
#
# Walks the stack of open elements starting at the current node (index 0).
# Returns true on reaching an element named tag_name (and, when namespace is
# not null, in that namespace). Returns false on reaching a scope boundary
# first, or when the stack is exhausted.
#
# Select scope is bounded by every element EXCEPT optgroup and option in the
# HTML namespace. The previous condition lacked an operator between the
# namespace test and the name tests, so CoffeeScript parsed it as a call to
# NS_HTML(...). It also read `t.ns`, while Node stores the value as
# `namespace`.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		# anything other than an HTML optgroup/option ends the search
		if t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt 'option')
			return false
	return false
433 # this checks for a particular element, not by name
434 el_is_in_scope = (el) ->
438 if standard_scopers[t.name] is t.namespace
443 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
444 reset_insertion_mode = ->
445 # 1. Let last be false.
447 # 2. Let node be the last node in the stack of open elements.
449 node = open_els[node_i]
450 # 3. Loop: If node is the first node in the stack of open elements,
451 # then set last to true, and, if the parser was originally created as
452 # part of the HTML fragment parsing algorithm (fragment case) set node
453 # to the context element.
455 if node_i is open_els.length - 1
457 # fixfull (fragment case)
459 # 4. If node is a select element, run these substeps:
460 if node.name is 'select'
461 # 1. If last is true, jump to the step below labeled done.
463 # 2. Let ancestor be node.
466 # 3. Loop: If ancestor is the first node in the stack of
467 # open elements, jump to the step below labeled done.
469 if ancestor_i is open_els.length - 1
471 # 4. Let ancestor be the node before ancestor in the stack
474 ancestor = open_els[ancestor_i]
475 # 5. If ancestor is a template node, jump to the step below
477 if ancestor.name is 'template'
479 # 6. If ancestor is a table node, switch the insertion mode
480 # to "in select in table" and abort these steps.
481 if ancestor.name is 'table'
482 insertion_mode = ins_mode_in_select_in_table
484 # 7. Jump back to the step labeled loop.
485 # 8. Done: Switch the insertion mode to "in select" and abort
487 insertion_mode = ins_mode_in_select
489 # 5. If node is a td or th element and last is false, then switch
490 # the insertion mode to "in cell" and abort these steps.
491 if (node.name is 'td' or node.name is 'th') and last is false
492 insertion_mode = ins_mode_in_cell
494 # 6. If node is a tr element, then switch the insertion mode to "in
495 # row" and abort these steps.
497 insertion_mode = ins_mode_in_row
499 # 7. If node is a tbody, thead, or tfoot element, then switch the
500 # insertion mode to "in table body" and abort these steps.
501 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
502 insertion_mode = ins_mode_in_table_body
504 # 8. If node is a caption element, then switch the insertion mode
505 # to "in caption" and abort these steps.
506 if node.name is 'caption'
507 insertion_mode = ins_mode_in_caption
509 # 9. If node is a colgroup element, then switch the insertion mode
510 # to "in column group" and abort these steps.
511 if node.name is 'colgroup'
512 insertion_mode = ins_mode_in_column_group
514 # 10. If node is a table element, then switch the insertion mode to
515 # "in table" and abort these steps.
516 if node.name is 'table'
517 insertion_mode = ins_mode_in_table
519 # 11. If node is a template element, then switch the insertion mode
520 # to the current template insertion mode and abort these steps.
521 # fixfull (template insertion mode stack)
523 # 12. If node is a head element and last is true, then switch the
524 # insertion mode to "in body" ("in body"! not "in head"!) and abort
525 # these steps. (fragment case)
526 if node.name is 'head' and last
527 insertion_mode = ins_mode_in_body
529 # 13. If node is a head element and last is false, then switch the
530 # insertion mode to "in head" and abort these steps.
531 if node.name is 'head' and last is false
532 insertion_mode = ins_mode_in_head
534 # 14. If node is a body element, then switch the insertion mode to
535 # "in body" and abort these steps.
536 if node.name is 'body'
537 insertion_mode = ins_mode_in_body
539 # 15. If node is a frameset element, then switch the insertion mode
540 # to "in frameset" and abort these steps. (fragment case)
541 if node.name is 'frameset'
542 insertion_mode = ins_mode_in_frameset
544 # 16. If node is an html element, run these substeps:
545 if node.name is 'html'
546 # 1. If the head element pointer is null, switch the insertion
547 # mode to "before head" and abort these steps. (fragment case)
548 # fixfull (fragment case)
550 # 2. Otherwise, the head element pointer is not null, switch
551 # the insertion mode to "after head" and abort these steps.
552 insertion_mode = ins_mode_in_body # FIXME fixfull
554 # 17. If last is true, then switch the insertion mode to "in body"
555 # and abort these steps. (fragment case)
557 insertion_mode = ins_mode_in_body
559 # 18. Let node now be the node before node in the stack of open
562 node = open_els[node_i]
563 # 19. Return to the step labeled loop.
565 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
566 # this implementation is structured (mostly) as described at the link above.
567 # capitalized comments are the "labels" described at the link above.
568 reconstruct_active_formatting_elements = ->
569 return if afe.length is 0
570 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
575 if i is afe.length - 1
578 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
583 el = afe[i].shallow_clone()
584 tree_insert_element el
589 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
590 # adoption agency algorithm
592 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
593 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
594 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
595 adoption_agency = (subject) ->
596 debug_log "adoption_agency()"
597 debug_log "tree: #{serialize_els tree.children, false, true}"
598 debug_log "open_els: #{serialize_els open_els, true, true}"
599 debug_log "afe: #{serialize_els afe, true, true}"
600 if open_els[0].name is subject
603 # remove it from the list of active formatting elements (if found)
608 debug_log "aaa: starting off with subject on top of stack, exiting"
615 # 5. Let formatting element be the last element in the list of
616 # active formatting elements that: is between the end of the list
617 # and the last scope marker in the list, if any, or the start of
618 # the list otherwise, and has the tag name subject.
620 for t, fe_of_afe in afe
621 if t.type is TYPE_AFE_MARKER
626 # If there is no such element, then abort these steps and instead
627 # act as described in the "any other end tag" entry above.
629 debug_log "aaa: fe not found in afe"
630 in_body_any_other_end_tag subject
632 # 6. If formatting element is not in the stack of open elements,
633 # then this is a parse error; remove the element from the list, and
636 for t, fe_of_open_els in open_els
641 debug_log "aaa: fe not found in open_els"
643 # "remove it from the list" must mean afe, since it's not in open_els
644 afe.splice fe_of_afe, 1
646 # 7. If formatting element is in the stack of open elements, but
647 # the element is not in scope, then this is a parse error; abort
649 unless el_is_in_scope fe
650 debug_log "aaa: fe not in scope"
653 # 8. If formatting element is not the current node, this is a parse
654 # error. (But do not abort these steps.)
655 unless open_els[0] is fe
658 # 9. Let furthest block be the topmost node in the stack of open
659 # elements that is lower in the stack than formatting element, and
660 # is an element in the special category. There might not be one.
662 fb_of_open_els = null
669 # and continue, to see if there's one that's more "topmost"
670 # 10. If there is no furthest block, then the UA must first pop all
671 # the nodes from the bottom of the stack of open elements, from the
672 # current node up to and including formatting element, then remove
673 # formatting element from the list of active formatting elements,
674 # and finally abort these steps.
676 debug_log "aaa: no fb"
680 afe.splice fe_of_afe, 1
682 # 11. Let common ancestor be the element immediately above
683 # formatting element in the stack of open elements.
684 ca = open_els[fe_of_open_els + 1] # common ancestor
686 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
687 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
688 bookmark = new_aaa_bookmark()
691 afe.splice i, 0, bookmark
693 node = last_node = fb
697 # 3. Let node be the element immediately above node in the
698 # stack of open elements, or if node is no longer in the stack
699 # of open elements (e.g. because it got removed by this
700 # algorithm), the element that was immediately above node in
701 # the stack of open elements before node was removed.
705 node_next = open_els[i + 1]
707 node = node_next ? node_above
708 debug_log "inner loop #{inner}"
709 debug_log "tree: #{serialize_els tree.children, false, true}"
710 debug_log "open_els: #{serialize_els open_els, true, true}"
711 debug_log "afe: #{serialize_els afe, true, true}"
712 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
713 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
714 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
715 debug_log "node: #{node.serialize true, true}"
716 # TODO make sure node_above gets re-set if/when node is removed from open_els
718 # 4. If node is formatting element, then go to the next step in
719 # the overall algorithm.
723 # 5. If inner loop counter is greater than three and node is in
724 # the list of active formatting elements, then remove node from
725 # the list of active formatting elements.
731 debug_log "max out inner"
736 # 6. If node is not in the list of active formatting elements,
737 # then remove node from the stack of open elements and then go
738 # back to the step labeled inner loop.
740 debug_log "not in afe"
743 node_above = open_els[i + 1]
747 debug_log "the bones"
748 # 7. create an element for the token for which the element node
749 # was created, in the HTML namespace, with common ancestor as
750 # the intended parent; replace the entry for node in the list
751 # of active formatting elements with an entry for the new
752 # element, replace the entry for node in the stack of open
753 # elements with an entry for the new element, and let node be
755 new_node = node.shallow_clone()
759 debug_log "replaced in afe"
763 node_above = open_els[i + 1]
764 open_els[i] = new_node
765 debug_log "replaced in open_els"
768 # 8. If last node is furthest block, then move the
769 # aforementioned bookmark to be immediately after the new node
770 # in the list of active formatting elements.
775 debug_log "removed bookmark"
779 # "after" means lower
780 afe.splice i, 0, bookmark # "after as <-
781 debug_log "placed bookmark after node"
782 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
784 # 9. Insert last node into node, first removing it from its
785 # previous parent node if any.
787 debug_log "last_node has parent"
788 for c, i in last_node.parent.children
790 debug_log "removing last_node from parent"
791 last_node.parent.children.splice i, 1
793 node.children.push last_node
794 last_node.parent = node
795 # 10. Let last node be node.
798 # 11. Return to the step labeled inner loop.
799 # 14. Insert whatever last node ended up being in the previous step
800 # at the appropriate place for inserting a node, but using common
801 # ancestor as the override target.
803 # JASON: In the case where fe is immediately followed by fb:
804 # * inner loop exits out early (node==fe)
806 # * last_node is still in the tree (not a duplicate)
808 debug_log "FEFIRST? last_node has parent"
809 for c, i in last_node.parent.children
811 debug_log "removing last_node from parent"
812 last_node.parent.children.splice i, 1
815 debug_log "after aaa inner loop"
816 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
817 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
818 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
819 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
820 debug_log "tree: #{serialize_els tree.children, false, true}"
825 # can't use standard insert token thing, because it's already in
826 # open_els and must stay at it's current position in open_els
827 dest = adjusted_insertion_location ca
828 dest[0].children.splice dest[1], 0, last_node
829 last_node.parent = dest[0]
832 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
833 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
834 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
835 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
836 debug_log "tree: #{serialize_els tree.children, false, true}"
838 # 15. Create an element for the token for which formatting element
839 # was created, in the HTML namespace, with furthest block as the
841 new_element = fe.shallow_clone() # FIXME intended parent thing
842 # 16. Take all of the child nodes of furthest block and append them
843 # to the element created in the last step.
844 while fb.children.length
845 t = fb.children.shift()
846 t.parent = new_element
847 new_element.children.push t
848 # 17. Append that new element to furthest block.
849 new_element.parent = fb
850 fb.children.push new_element
851 # 18. Remove formatting element from the list of active formatting
852 # elements, and insert the new element into the list of active
853 # formatting elements at the position of the aforementioned
863 # 19. Remove formatting element from the stack of open elements,
864 # and insert the new element into the stack of open elements
865 # immediately below the position of furthest block in that stack.
872 open_els.splice i, 0, new_element
874 # 20. Jump back to the step labeled outer loop.
875 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
876 debug_log "tree: #{serialize_els tree.children, false, true}"
877 debug_log "open_els: #{serialize_els open_els, true, true}"
878 debug_log "afe: #{serialize_els afe, true, true}"
881 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
883 generate_implied_end_tags 'p' # arg is exception
884 if open_els[0].name isnt 'p'
886 while open_els.length > 1 # just in case
887 el = open_els.shift()
890 close_p_if_in_button_scope = ->
891 if is_in_button_scope 'p'
894 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
895 # aka insert_a_character = (t) ->
896 insert_character = (t) ->
897 dest = adjusted_insertion_location()
898 # fixfull check for Document node
900 prev = dest[0].children[dest[1] - 1]
901 if prev.type is TYPE_TEXT
904 dest[0].children.splice dest[1], 0, t
907 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
908 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Work out where the next node should be inserted, following the spec steps
# quoted in the comments below.
#   override_target: optional node to use as the target instead of the
#                    current node.
# Returns [target, target_i]: the node to insert into, and the index within
# target.children at which to insert.
909 adjusted_insertion_location = (override_target = null) ->
910 # 1. If there was an override target specified, then let target be the
913 target = override_target
914 else # Otherwise, let target be the current node.
916 # 2. Determine the adjusted insertion location using the first matching
917 # steps from the following list:
919 # If foster parenting is enabled and target is a table, tbody, tfoot,
920 # thead, or tr element Foster parenting happens when content is
921 # misnested in tables.
922 if flag_foster_parenting and foster_parenting_targets[target.name]
923 loop # once. this is here so we can ``break`` to "abort these substeps"
924 # 1. Let last template be the last template element in the
925 # stack of open elements, if any.
927 last_template_i = null
928 for el, i in open_els
929 if el.name is 'template'
933 # 2. Let last table be the last table element in the stack of
934 # open elements, if any.
937 for el, i in open_els
938 if el.name is 'table'
942 # 3. If there is a last template and either there is no last
943 # table, or there is one, but last template is lower (more
944 # recently added) than last table in the stack of open
945 # elements, then: let adjusted insertion location be inside
946 # last template's template contents, after its last child (if
947 # any), and abort these substeps.
948 if last_template and (last_table is null or last_template_i < last_table_i)
# use the template node tested in the condition above; a bare `template`
# is not bound anywhere in this function
949 target = last_template # fixfull should be its contents
950 target_i = target.children.length
952 # 4. If there is no last table, then let adjusted insertion
953 # location be inside the first element in the stack of open
954 # elements (the html element), after its last child (if any),
955 # and abort these substeps. (fragment case)
956 if last_table is null
958 target = open_els[open_els.length - 1]
959 target_i = target.children.length
960 # 5. If last table has a parent element, then let adjusted
961 # insertion location be inside last table's parent element,
962 # immediately before last table, and abort these substeps.
963 if last_table.parent?
964 for c, i in last_table.parent.children
966 target = last_table.parent
970 # 6. Let previous element be the element immediately above last
971 # table in the stack of open elements.
973 # huh? how could it not have a parent?
# "above" means a higher index: the current node sits at index 0
974 previous_element = open_els[last_table_i + 1]
975 # 7. Let adjusted insertion location be inside previous
976 # element, after its last child (if any).
977 target = previous_element
978 target_i = target.children.length
979 # Note: These steps are involved in part because it's possible
980 # for elements, the table element in this case in particular,
981 # to have been moved by a script around in the DOM, or indeed
982 # removed from the DOM entirely, after the element was inserted
984 break # don't really loop
986 # Otherwise Let adjusted insertion location be inside target, after
987 # its last child (if any).
988 target_i = target.children.length
990 # 3. If the adjusted insertion location is inside a template element,
991 # let it instead be inside the template element's template contents,
992 # after its last child (if any).
995 # 4. Return the adjusted insertion location.
996 return [target, target_i]
998 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
999 # aka create_an_element_for_token
1000 token_to_element = (t, namespace, intended_parent) ->
1001 t.type = TYPE_TAG # not TYPE_START_TAG
1002 # convert attributes into a hash
1004 while t.attrs_a.length
1006 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1007 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
1009 # TODO 2. If the newly created element has an xmlns attribute in the
1010 # XMLNS namespace whose value is not exactly the same as the element's
1011 # namespace, that is a parse error. Similarly, if the newly created
1012 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1013 # value is not the XLink Namespace, that is a parse error.
1015 # fixfull: the spec says stuff about form pointers and ownerDocument
1019 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1020 insert_foreign_element = (token, namespace) ->
1021 ail = adjusted_insertion_location()
1024 el = token_to_element token, namespace, ail_el
1025 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1027 ail_el.children.splice ail_i, 0, el
1030 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1031 insert_html_element = insert_foreign_element # (token, namespace) ->
1033 # FIXME read and implement the "foster parenting" part
1034 # FIXME read spec, do this right
1035 # FIXME implement the override target thing
1036 # note: this assumes it's an open tag
1037 # FIXME what part of the spec is this?
1038 # TODO look through all callers of this, and see what they should really be doing.
1039 # eg probably insert_html_element for tokens
# Insert el at the adjusted insertion location.
#   el:              a node, or a start tag token (converted to an element
#                    via token_to_element before insertion)
#   override_target: optional node passed through to
#                    adjusted_insertion_location
#   namespace:       optional namespace for the element
1040 tree_insert_element = (el, override_target = null, namespace = null) ->
1042 el.namespace = namespace
# dest is a [node, index_within_children] pair
1043 dest = adjusted_insertion_location override_target
1044 if el.type is TYPE_START_TAG # means it's a "token"
1045 el = token_to_element el, namespace, dest[0]
1046 unless el.namespace?
# take the namespace from the destination node: dest itself is the pair, so
# it has no namespace property, and the value has to land on el, not on the
# local parameter
1047 el.namespace = dest[0].namespace
1048 # fixfull: Document nodes sometimes can't accept more children
1049 dest[0].children.splice dest[1], 0, el
1054 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1055 # position should be [node, index_within_children]
# Insert comment node t into the tree.
#   t:        the comment node to insert
#   position: optional [node, index_within_children] pair; when omitted the
#             adjusted insertion location is used
1056 tree_insert_comment = (t, position = null) ->
1057 position ?= adjusted_insertion_location()
# splice with a delete count of 0 inserts t at the index without removing anything
1058 position[0].children.splice position[1], 0, t
1061 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Start parsing a raw text element for token t: insert the element, point
# the tokenizer at tok_state_rawtext, remember the current insertion mode
# and switch to ins_mode_text.
1062 parse_generic_raw_text = (t) ->
1063 insert_html_element t
1064 tok_state = tok_state_rawtext
# must be saved before insertion_mode is overwritten on the next line
1065 original_insertion_mode = insertion_mode
1066 insertion_mode = ins_mode_text
# Start parsing an RCDATA element for token t: insert the element, point
# the tokenizer at tok_state_rcdata, remember the current insertion mode
# and switch to ins_mode_text.
1067 parse_generic_rcdata_text = (t) ->
1068 insert_html_element t
1069 tok_state = tok_state_rcdata
# must be saved before insertion_mode is overwritten on the next line
1070 original_insertion_mode = insertion_mode
1071 insertion_mode = ins_mode_text
1073 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1074 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1075 generate_implied_end_tags = (except = null) ->
1076 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1079 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1080 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1081 open_els.shift() # spec says this will be a 'head' node
1082 insertion_mode = ins_mode_after_head
# Tree construction handler for the "in head" insertion mode.
#   t: the token to process. The checks below dispatch on t.type and t.name;
#      anything not matched by an earlier check falls to
#      ins_mode_in_head_else at the bottom.
1084 ins_mode_in_head = (t) ->
1085 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1088 if t.type is TYPE_COMMENT
1089 tree_insert_comment t
1091 if t.type is TYPE_DOCTYPE
1094 if t.type is TYPE_START_TAG and t.name is 'html'
1097 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1098 el = insert_html_element t
1100 el.acknowledge_self_closing()
1102 if t.type is TYPE_START_TAG and t.name is 'meta'
1103 el = insert_html_element t
1105 el.acknowledge_self_closing()
1106 # fixfull encoding stuff
1108 if t.type is TYPE_START_TAG and t.name is 'title'
# the RCDATA helper defined in this file is parse_generic_rcdata_text
1109 parse_generic_rcdata_text t
1111 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1112 parse_generic_raw_text t
1114 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1115 insert_html_element t
1116 insertion_mode = in_head_noscript # FIXME implement
1118 if t.type is TYPE_START_TAG and t.name is 'script'
1119 ail = adjusted_insertion_location()
# ail is a [node, index] pair; the intended parent is the node
1120 el = token_to_element t, NS_HTML, ail[0]
1121 el.flag_parser_inserted true # FIXME implement
1122 # fixfull fragment case
1123 ail[0].children.splice ail[1], 0, el
1125 tok_state = tok_state_script_data
1126 original_insertion_mode = insertion_mode # make sure orig... is defined
1127 insertion_mode = ins_mode_text # FIXME implement
1129 if t.type is TYPE_END_TAG and t.name is 'head'
1130 open_els.shift() # will be a head element... spec says so
1131 insertion_mode = ins_mode_after_head
1133 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1134 ins_mode_in_head_else t
1136 if t.type is TYPE_START_TAG and t.name is 'template'
1137 insert_html_element t
1139 flag_frameset_ok = false
1140 insertion_mode = ins_mode_in_template
1141 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1143 if t.type is TYPE_END_TAG and t.name is 'template'
1144 if template_tag_is_open()
# parentheses are required: a bare name only references the function
1145 generate_implied_end_tags()
1146 if open_els[0].name isnt 'template'
1149 el = open_els.shift()
1150 if el.name is 'template'
1152 clear_afe_to_marker()
1153 template_insertion_modes.shift()
1154 reset_insertion_mode()
# start tags are TYPE_START_TAG everywhere else in this handler
1158 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1161 ins_mode_in_head_else t
1163 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1164 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1165 for node, i in open_els
1166 if node.name is name # FIXME check namespace too
1167 generate_implied_end_tags name # arg is exception
1168 parse_error() unless i is 0
1173 if special_elements[node.name]? # FIXME check namespac too
1176 ins_mode_in_body = (t) ->
1182 when "\t", "\u000a", "\u000c", "\u000d", ' '
1183 reconstruct_active_formatting_elements()
1186 reconstruct_active_formatting_elements()
1188 flag_frameset_ok = false
1190 tree_insert_comment t
1197 return if template_tag_is_open()
1198 root_attrs = open_els[open_els.length - 1].attrs
1200 root_attrs[k] = v unless root_attrs[k]?
1201 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1202 # FIXME also do this for </template> (end tag)
1203 return ins_mode_in_head t
1210 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1211 close_p_if_in_button_scope()
1212 insert_html_element t
1213 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1214 close_p_if_in_button_scope()
1215 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1218 insert_html_element t
1219 # TODO lots more to implement here
1221 # If the list of active formatting elements
1222 # contains an a element between the end of the list and
1223 # the last marker on the list (or the start of the list
1224 # if there is no marker on the list), then this is a
1225 # parse error; run the adoption agency algorithm for
1226 # the tag name "a", then remove that element from the
1227 # list of active formatting elements and the stack of
1228 # open elements if the adoption agency algorithm didn't
1229 # already remove it (it might not have if the element
1230 # is not in table scope).
1233 if el.type is TYPE_AFE_MARKER
1243 for el, i in open_els
1245 open_els.splice i, 1
1246 reconstruct_active_formatting_elements()
1247 el = insert_html_element t
1249 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1250 reconstruct_active_formatting_elements()
1251 el = insert_html_element t
1254 # fixfull quirksmode thing
1255 close_p_if_in_button_scope()
1256 insert_html_element t
1257 insertion_mode = ins_mode_in_table
1258 # TODO lots more to implement here
1259 else # any other start tag
1260 reconstruct_active_formatting_elements()
1261 insert_html_element t
1264 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1265 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1268 unless ok_tags[t.name]?
1271 # TODO stack of template insertion modes thing
1272 flag_parsing = false # stop parsing
1276 unless is_in_scope 'body'
1279 # TODO implement parse error and move to tree_after_body
1281 unless is_in_scope 'body' # weird, but it's what the spec says
1284 # TODO implement parse error and move to tree_after_body, reprocess
1285 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1286 unless is_in_scope t.name, NS_HTML
1289 generate_implied_end_tags()
1290 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1293 el = open_els.shift()
1294 if el.name is t.name and el.namespace is NS_HTML
1296 # TODO lots more close tags to implement here
1298 unless is_in_button_scope 'p'
1300 insert_html_element new_open_tag 'p'
1302 # TODO lots more close tags to implement here
1303 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1304 adoption_agency t.name
1305 # TODO lots more close tags to implement here
1307 in_body_any_other_end_tag t.name
1310 ins_mode_in_table_else = (t) ->
1312 flag_foster_parenting = true # FIXME
1314 flag_foster_parenting = false
1322 clear_to_table_stopers = {
1327 clear_stack_to_table_context = ->
1329 if clear_to_table_stopers[open_els[0].name]?
1333 clear_to_table_body_stopers = {
1340 clear_stack_to_table_body_context = ->
1342 if clear_to_table_body_stopers[open_els[0].name]?
1346 clear_to_table_row_stopers = {
1351 clear_stack_to_table_row_context = ->
1353 if clear_to_table_row_stopers[open_els[0].name]?
1357 clear_afe_to_marker = ->
1360 if el.type is TYPE_AFE_MARKER
1363 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1364 ins_mode_text = (t) ->
1365 if t.type is TYPE_TEXT
1368 if t.type is TYPE_EOF
1370 if open_els[0].name is 'script'
1371 open_els[0].flag 'already started', true
1373 insertion_mode = original_insertion_mode
1376 if t.type is TYPE_END_TAG and t.name is 'script'
1378 insertion_mode = original_insertion_mode
1379 # fixfull the spec seems to assume that I'm going to run the script
1380 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1382 if t.type is TYPE_END_TAG
1384 insertion_mode = original_insertion_mode
1386 console.log 'warning: end of ins_mode_text reached'
1388 # the functions below implement the tokenizer states described here:
1389 # http://www.w3.org/TR/html5/syntax.html#tokenization
1391 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1392 ins_mode_in_table = (t) ->
1395 if can_in_table[t.name]
1396 original_insertion_mode = insertion_mode
1397 insertion_mode = ins_mode_in_table_text
1400 ins_mode_in_table_else t
1402 tree_insert_comment t
1408 clear_stack_to_table_context()
1410 insert_html_element t
1411 insertion_mode = ins_mode_in_caption
1413 clear_stack_to_table_context()
1414 insert_html_element t
1415 insertion_mode = ins_mode_in_column_group
1417 clear_stack_to_table_context()
1418 insert_html_element new_open_tag 'colgroup'
1419 insertion_mode = ins_mode_in_column_group
1421 when 'tbody', 'tfoot', 'thead'
1422 clear_stack_to_table_context()
1423 insert_html_element t
1424 insertion_mode = ins_mode_in_table_body
1425 when 'td', 'th', 'tr'
1426 clear_stack_to_table_context()
1427 insert_html_element new_open_tag 'tbody'
1428 insertion_mode = ins_mode_in_table_body
1432 if is_in_table_scope 'table'
1434 el = open_els.shift()
1435 if el.name is 'table'
1437 reset_insertion_mode()
1439 when 'style', 'script', 'template'
1442 if token_is_input_hidden t
1443 ins_mode_in_table_else t
1446 el = insert_html_element t
1448 el.acknowledge_self_closing()
1451 if form_element_pointer?
1453 if template_tag_is_open()
1455 form_element_pointer = insert_html_element t
1458 ins_mode_in_table_else t
1462 if is_in_table_scope 'table'
1464 el = open_els.shift()
1465 if el.name is 'table'
1467 reset_insertion_mode()
1470 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1475 ins_mode_in_table_else t
1479 ins_mode_in_table_else t
1482 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1483 ins_mode_in_table_text = (t) ->
1490 console.log "unimplemented ins_mode_in_table_text"
1493 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1494 ins_mode_in_table_body = (t) ->
1495 if t.type is TYPE_START_TAG and t.name is 'tr'
1496 clear_stack_to_table_body_context()
1497 insert_html_element t
1499 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1501 clear_stack_to_table_body_context()
1502 insert_html_element new_open_tag 'tr'
1503 insertion_mode = ins_mode_in_row
1506 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1507 unless is_in_table_scope t.name # fixfull check namespace
1510 clear_stack_to_table_body_context()
1512 insertion_mode = ins_mode_in_table
1514 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1517 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1520 if table_scopers[el.name]
1525 clear_stack_to_table_body_context()
1527 insertion_mode = ins_mode_in_table
1530 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1536 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1537 ins_mode_in_row = (t) ->
1538 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1539 clear_stack_to_table_row_context()
1540 insert_html_element t
1541 insertion_mode = ins_mode_in_cell
1544 if t.type is TYPE_END_TAG and t.name is 'tr'
1545 if is_in_table_scope 'tr'
1546 clear_stack_to_table_row_context()
1548 insertion_mode = ins_mode_in_table_body
1552 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1553 if is_in_table_scope 'tr'
1554 clear_stack_to_table_row_context()
1556 insertion_mode = ins_mode_in_table_body
1561 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1562 if is_in_table_scope t.name # fixfull namespace
1563 if is_in_table_scope 'tr'
1564 clear_stack_to_table_row_context()
1566 insertion_mode = ins_mode_in_table_body
1571 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1577 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Close the current table cell: generate implied end tags, pop open_els
# until a td or th has been popped, clear the active formatting elements
# back to the last marker, and switch to ins_mode_in_row.
1579 generate_implied_end_tags()
# compare the node's name (not the node itself) so an open th is recognised
1580 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1583 el = open_els.shift()
1584 if el.name is 'td' or el.name is 'th'
1586 clear_afe_to_marker()
1587 insertion_mode = ins_mode_in_row
1589 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1590 ins_mode_in_cell = (t) ->
1591 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1592 if is_in_table_scope t.name
1593 generate_implied_end_tags()
1594 if open_els[0].name isnt t.name
1597 el = open_els.shift()
1598 if el.name is t.name
1600 clear_afe_to_marker()
1601 insertion_mode = ins_mode_in_row
1605 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1608 if el.name is 'td' or el.name is 'th'
1611 if table_scopers[el.name]
1619 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1622 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1623 if is_in_table_scope t.name # fixfull namespace
1632 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1634 switch c = txt.charAt(cur++)
1636 return new_text_node parse_character_reference()
1638 tok_state = tok_state_tag_open
1641 return new_text_node c
1643 return new_eof_token()
1645 return new_text_node c
1648 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1649 # not needed: tok_state_character_reference_in_data = ->
1650 # just call parse_character_reference()
1652 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
1653 tok_state_rcdata = ->
1654 switch c = txt.charAt(cur++)
1656 return new_text_node parse_character_reference()
1658 tok_state = tok_state_rcdata_less_than_sign
1661 return new_character_token "\ufffd"
1663 return new_eof_token()
1665 return new_character_token c
1668 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
1669 # not needed: tok_state_character_reference_in_rcdata = ->
1670 # just call parse_character_reference()
1672 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
1673 tok_state_rawtext = ->
1674 switch c = txt.charAt(cur++)
1676 tok_state = tok_state_rawtext_less_than_sign
1679 return new_character_token "\ufffd"
1681 return new_eof_token()
1683 return new_character_token c
1686 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
1687 tok_state_script_data = ->
1688 switch c = txt.charAt(cur++)
1690 tok_state = tok_state_script_data_less_than_sign
1693 return new_character_token "\ufffd"
1695 return new_eof_token()
1697 return new_character_token c
1700 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
1701 tok_state_plaintext = ->
1702 switch c = txt.charAt(cur++)
1705 return new_character_token "\ufffd"
1707 return new_eof_token()
1709 return new_character_token c
1713 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1714 tok_state_tag_open = ->
1715 switch c = txt.charAt(cur++)
1717 tok_state = tok_state_markup_declaration_open
1719 tok_state = tok_state_end_tag_open
1722 tok_state = tok_state_bogus_comment
1724 if lc_alpha.indexOf(c) > -1
1725 tok_cur_tag = new_open_tag c
1726 tok_state = tok_state_tag_name
1727 else if uc_alpha.indexOf(c) > -1
1728 tok_cur_tag = new_open_tag c.toLowerCase()
1729 tok_state = tok_state_tag_name
1732 tok_state = tok_state_data
1733 cur -= 1 # we didn't parse/handle the char after <
1734 return new_text_node '<'
1737 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1738 tok_state_end_tag_open = ->
1739 switch c = txt.charAt(cur++)
1742 tok_state = tok_state_data
1745 tok_state = tok_state_data
1746 return new_text_node '</'
1748 if uc_alpha.indexOf(c) > -1
1749 tok_cur_tag = new_end_tag c.toLowerCase()
1750 tok_state = tok_state_tag_name
1751 else if lc_alpha.indexOf(c) > -1
1752 tok_cur_tag = new_end_tag c
1753 tok_state = tok_state_tag_name
1756 tok_state = tok_state_bogus_comment
1759 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1760 tok_state_tag_name = ->
1761 switch c = txt.charAt(cur++)
1762 when "\t", "\n", "\u000c", ' '
1763 tok_state = tok_state_before_attribute_name
1765 tok_state = tok_state_self_closing_start_tag
1767 tok_state = tok_state_data
1773 tok_cur_tag.name += "\ufffd"
1776 tok_state = tok_state_data
1778 if uc_alpha.indexOf(c) > -1
1779 tok_cur_tag.name += c.toLowerCase()
1781 tok_cur_tag.name += c
1784 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
1785 tok_state_rcdata_less_than_sign = ->
1786 c = txt.charAt(cur++)
1788 temporary_buffer = ''
1789 tok_state = tok_state_rcdata_end_tag_open
1792 tok_state = tok_state_rcdata
1793 cur -= 1 # reconsume the input character
1794 return new_character_token '<'
1796 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
1797 tok_state_rcdata_end_tag_open = ->
1798 c = txt.charAt(cur++)
1799 if uc_alpha.indexOf(c) > -1
1800 tok_cur_tag = new_end_tag c.toLowerCase()
1801 temporary_buffer += c
1802 tok_state = tok_state_rcdata_end_tag_name
1804 if lc_alpha.indexOf(c) > -1
1805 tok_cur_tag = new_end_tag c
1806 temporary_buffer += c
1807 tok_state = tok_state_rcdata_end_tag_name
1810 tok_state = tok_state_rcdata
1811 cur -= 1 # reconsume the input character
1812 return new_character_token "</" # fixfull separate these
1814 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Decide whether token t is an "appropriate end tag token".
#   t: the tag token being built by the tokenizer
# Returns true when t is an end tag whose name equals the name of
# open_els[0], false otherwise.
# NOTE(review): open_els[0] is dereferenced without a guard; presumably the
# stack is never empty when the raw states call this. Confirm.
1815 is_appropriate_end_tag = (t) ->
1816 # spec says to check against "the tag name of the last start tag to
1817 # have been emitted from this tokenizer", but this is only called from
1818 # the various "raw" states, which I'm pretty sure all push the start
1819 # token onto open_els. TODO: verify this after the script data states
1821 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
1822 return t.type is TYPE_END_TAG and t.name is open_els[0].name
1824 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
1825 tok_state_rcdata_end_tag_name = ->
1826 c = txt.charAt(cur++)
1827 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1828 if is_appropriate_end_tag tok_cur_tag
1829 tok_state = tok_state_before_attribute_name
1831 # else fall through to "Anything else"
1833 if is_appropriate_end_tag tok_cur_tag
1834 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
1836 # else fall through to "Anything else"
1838 if is_appropriate_end_tag tok_cur_tag
1839 tok_state = tok_state_data
1841 # else fall through to "Anything else"
1842 if uc_alpha.indexOf(c) > -1
1843 tok_cur_tag.name += c.toLowerCase()
1844 temporary_buffer += c
1846 if lc_alpha.indexOf(c) > -1
1847 tok_cur_tag.name += c
1848 temporary_buffer += c
1851 tok_state = tok_state_rcdata
1852 cur -= 1 # reconsume the input character
1853 return new_character_token '</' + temporary_buffer # fixfull separate these
1855 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
1856 tok_state_rawtext_less_than_sign = ->
1857 c = txt.charAt(cur++)
1859 temporary_buffer = ''
1860 tok_state = tok_state_rawtext_end_tag_open
1863 tok_state = tok_state_rawtext
1864 cur -= 1 # reconsume the input character
1865 return new_character_token '<'
1867 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
1868 tok_state_rawtext_end_tag_open = ->
1869 c = txt.charAt(cur++)
1870 if uc_alpha.indexOf(c) > -1
1871 tok_cur_tag = new_end_tag c.toLowerCase()
1872 temporary_buffer += c
1873 tok_state = tok_state_rawtext_end_tag_name
1875 if lc_alpha.indexOf(c) > -1
1876 tok_cur_tag = new_end_tag c
1877 temporary_buffer += c
1878 tok_state = tok_state_rawtext_end_tag_name
1881 tok_state = tok_state_rawtext
1882 cur -= 1 # reconsume the input character
1883 return new_character_token "</" # fixfull separate these
1885 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
1886 tok_state_rawtext_end_tag_name = ->
1887 c = txt.charAt(cur++)
1888 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1889 if is_appropriate_end_tag tok_cur_tag
1890 tok_state = tok_state_before_attribute_name
1892 # else fall through to "Anything else"
1894 if is_appropriate_end_tag tok_cur_tag
1895 tok_state = tok_state_self_closing_start_tag
1897 # else fall through to "Anything else"
1899 if is_appropriate_end_tag tok_cur_tag
1900 tok_state = tok_state_data
1902 # else fall through to "Anything else"
1903 if uc_alpha.indexOf(c) > -1
1904 tok_cur_tag.name += c.toLowerCase()
1905 temporary_buffer += c
1907 if lc_alpha.indexOf(c) > -1
1908 tok_cur_tag.name += c
1909 temporary_buffer += c
1912 tok_state = tok_state_rawtext
1913 cur -= 1 # reconsume the input character
1914 return new_character_token '</' + temporary_buffer # fixfull separate these
1916 # TODO _all_ of the missing states here (17-33) are for parsing script tags
1918 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1919 tok_state_before_attribute_name = ->
1921 switch c = txt.charAt(cur++)
1922 when "\t", "\n", "\u000c", ' '
1925 tok_state = tok_state_self_closing_start_tag
1928 tok_state = tok_state_data
1934 attr_name = "\ufffd"
1935 when '"', "'", '<', '='
1940 tok_state = tok_state_data
1942 if uc_alpha.indexOf(c) > -1
1943 attr_name = c.toLowerCase()
1947 tok_cur_tag.attrs_a.unshift [attr_name, '']
1948 tok_state = tok_state_attribute_name
1951 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer "attribute name" state: consumes one character from txt and
# either switches tok_state or extends the name of the attribute currently
# being built, which is stored at tok_cur_tag.attrs_a[0][0].
1952 tok_state_attribute_name = ->
1953 switch c = txt.charAt(cur++)
1954 when "\t", "\n", "\u000c", ' '
1955 tok_state = tok_state_after_attribute_name
1957 tok_state = tok_state_self_closing_start_tag
1959 tok_state = tok_state_before_attribute_value
1961 tok_state = tok_state_data
# every character handled in this state is appended to the name; a plain
# assignment would throw away the characters collected so far
1967 tok_cur_tag.attrs_a[0][0] += "\ufffd"
1970 tok_cur_tag.attrs_a[0][0] += c
1973 tok_state = tok_state_data
1975 if uc_alpha.indexOf(c) > -1
# uppercase letters are lowercased, then appended like any other character
1976 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
1978 tok_cur_tag.attrs_a[0][0] += c
1981 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
1982 tok_state_after_attribute_name = ->
1983 c = txt.charAt(cur++)
1984 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1987 tok_state = tok_state_self_closing_start_tag
1990 tok_state = tok_state_before_attribute_value
1993 tok_state = tok_state_data
1995 if uc_alpha.indexOf(c) > -1
1996 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
1997 tok_state = tok_state_attribute_name
2001 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2002 tok_state = tok_state_attribute_name
2006 tok_state = tok_state_data
2007 cur -= 1 # reconsume
2009 if c is '"' or c is "'" or c is '<'
2011 # fall through to Anything else
2013 tok_cur_tag.attrs_a.unshift [c, '']
2014 tok_state = tok_state_attribute_name
2016 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2017 tok_state_before_attribute_value = ->
2018 switch c = txt.charAt(cur++)
2019 when "\t", "\n", "\u000c", ' '
2022 tok_state = tok_state_attribute_value_double_quoted
2024 tok_state = tok_state_attribute_value_unquoted
2027 tok_state = tok_state_attribute_value_single_quoted
2030 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2031 tok_state = tok_state_attribute_value_unquoted
2034 tok_state = tok_state_data
2040 tok_state = tok_state_data
2042 tok_cur_tag.attrs_a[0][1] += c
2043 tok_state = tok_state_attribute_value_unquoted
2046 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2047 tok_state_attribute_value_double_quoted = ->
2048 switch c = txt.charAt(cur++)
2050 tok_state = tok_state_after_attribute_value_quoted
2052 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2055 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2058 tok_state = tok_state_data
2060 tok_cur_tag.attrs_a[0][1] += c
2063 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2064 tok_state_attribute_value_single_quoted = ->
2065 switch c = txt.charAt(cur++)
2067 tok_state = tok_state_after_attribute_value_quoted
2069 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2072 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2075 tok_state = tok_state_data
2077 tok_cur_tag.attrs_a[0][1] += c
2080 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2081 tok_state_attribute_value_unquoted = ->
2082 switch c = txt.charAt(cur++)
2083 when "\t", "\n", "\u000c", ' '
2084 tok_state = tok_state_before_attribute_name
2086 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2088 tok_state = tok_state_data
2093 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2096 tok_state = tok_state_data
2098 # Parse Error if ', <, = or ` (backtick)
2099 tok_cur_tag.attrs_a[0][1] += c
2102 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2103 tok_state_after_attribute_value_quoted = ->
2104 switch c = txt.charAt(cur++)
2105 when "\t", "\n", "\u000c", ' '
2106 tok_state = tok_state_before_attribute_name
2108 tok_state = tok_state_self_closing_start_tag
2110 tok_state = tok_state_data
2116 tok_state = tok_state_data
2119 tok_state = tok_state_before_attribute_name
2120 cur -= 1 # we didn't handle that char
2123 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2124 # Don't set this as a state, just call it
2125 # returns a string (NOT a text node)
2126 parse_character_reference = (allowed_char = null, in_attr = false) ->
2127 if cur >= txt.length
2129 switch c = txt.charAt(cur)
2130 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2131 # explicitly not a parse error
2134 # there has to be "one or more" alnums between & and ; to be a parse error
2137 if cur + 1 >= txt.length
2139 if txt.charAt(cur + 1).toLowerCase() is 'x'
2148 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2152 if txt.charAt(start + i) is ';'
2154 # FIXME This is supposed to generate parse errors for some chars
2155 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2162 if alnum.indexOf(txt.charAt(cur + i)) is -1
2165 # exit early, because parse_error() below needs at least one alnum
2167 if txt.charAt(cur + i) is ';'
2168 i += 1 # include ';' terminator in value
2169 decoded = decode_named_char_ref txt.substr(cur, i)
2176 # no ';' terminator (only legacy char refs)
2178 for i in [2..max] # no prefix matches, so ok to check shortest first
2179 c = legacy_char_refs[txt.substr(cur, i)]
2182 if txt.charAt(cur + i) is '='
2183 # "because some legacy user agents will
2184 # misinterpret the markup in those cases"
2187 if alnum.indexOf(txt.charAt(cur + i)) > -1
2188 # this makes attributes forgiving about url args
2190 # ok, and besides the weird exceptions for attributes...
2191 # return the matching char
2192 cur += i # consume entity chars
2193 parse_error() # because no terminating ";"
2197 return # never reached
2199 # tree constructor initialization
2200 # see comments on TYPE_TAG/etc for the structure of this data
# synthetic root 'html' node; only its children are handed back to the caller
# (see the return statement at the end of this section)
2201 tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2203 afe = [] # active formatting elements
2204 template_insertion_modes = []
# tree construction begins in ins_mode_in_body
2205 insertion_mode = ins_mode_in_body
2206 original_insertion_mode = insertion_mode # TODO check spec
2207 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2208 flag_frameset_ok = true
2210 flag_foster_parenting = false
2211 form_element_pointer = null
2212 temporary_buffer = null
2214 # tokenizer initialization
# the tokenizer begins in tok_state_data
2215 tok_state = tok_state_data
# return the nodes collected under the synthetic root, not the root itself
2222 return tree.children
2224 # everything below is tests on the above
#
# test_equals: log whether `output` is strictly equal (`is`) to
# `expected_output`.
#   description: label for the check; it is only printed in the failure report
#   output, expected_output: the values compared; both are interpolated into
#       the "Actual" / "Expected" log lines
2225 test_equals = (description, output, expected_output) ->
2226 if output is expected_output
2227 console.log "passed." # don't say name, so smart consoles can merge all of these
2229 console.log "FAILED: \"#{description}\""
2230 console.log " Expected: #{expected_output}"
2231 console.log " Actual: #{output}"
# serialize_els: accumulates the string `serialized` from nodes' serialize()
# output.
#   shallow, show_ids: passed straight through to each node's serialize()
# NOTE(review): `t` is presumably each element of `els` in turn -- confirm.
2232 serialize_els = (els, shallow, show_ids) ->
2238 serialized += t.serialize shallow, show_ids
# test_parser: run one parser test case and log the outcome to the console.
#   args.name: label printed in the pass/fail line
#   args.html: input text handed to parse_html
#   args.expected: the serialization the parsed result must match exactly
2240 test_parser = (args) ->
2245 prev_node_id = 0 # reset counter
# NOTE(review): errors_cb presumably collects into parse_errors, which is
# reported below on failure -- confirm.
2246 parsed = parse_html args.html, errors_cb
# full (non-shallow) serialization, without ids
2247 serialized = serialize_els parsed, false, false
2248 if serialized isnt args.expected
# on a mismatch, go through the debug log via debug_log_each before reporting
2249 debug_log_each (str) ->
2251 console.log "FAILED: \"#{args.name}\""
2252 console.log " Input: #{args.html}"
2253 console.log " Correct: #{args.expected}"
2254 console.log " Output: #{serialized}"
2255 if parse_errors.length > 0
2256 console.log " parse errs: #{JSON.stringify parse_errors}"
2258 console.log " No parse errors"
2260 console.log "passed \"#{args.name}\""
# Test cases. Each `expected` string is compared with the serialized parse
# result: text nodes are written as text:"..." and elements as
# tag:"name",{attributes},[children].
2262 test_parser name: "empty", \
2265 test_parser name: "just text", \
2267 expected: 'text:"abc"'
2268 test_parser name: "named entity", \
2270 expected: 'text:"a&1234"'
# Character references in text that are missing their ';' or are unknown:
#   "&amp2"     legacy reference without ';' -> still decodes to "&"
#   "&&amp;3"   a bare "&" followed by a well-formed "&amp;"
#   "&aabbcc;"  unknown name -> left in the text untouched
# The input must contain real "amp" references, otherwise nothing is decoded
# and the case checks nothing.
2271 test_parser name: "broken named character references", \
2272 html: "1&amp2&&amp;3&aabbcc;",
2273 expected: 'text:"1&2&&3&aabbcc;"'
# Numeric references in the 0x80-0x9F range are overridden by the spec's
# replacement table (0x80 -> U+20AC "€", 0x83 -> U+0192 "ƒ"). The input has to
# use numeric references (hex and decimal forms) for the override to be
# exercised; literal "€"/"ƒ" characters would simply pass through.
2274 test_parser name: "numbered entity overrides", \
2275 html: "1&#x80;&#128; &#x83;",
2276 expected: 'text:"1€€ ƒ"'
# Start tags and attribute parsing. An element left open at end of input
# still collects the text that follows it.
2277 test_parser name: "open tag", \
2278 html: "foo<span>bar",
2279 expected: 'text:"foo",tag:"span",{},[text:"bar"]'
2280 test_parser name: "open tag with attributes", \
2281 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
2282 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
# Double-quoted, unquoted, single-quoted and value-less attributes. Note that
# the expected output lists attributes sorted by name, not in source order.
2283 test_parser name: "open tag with attributes of various quotings", \
2284 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
2285 expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
# Inside an attribute value, a legacy (no ';') character reference that is
# followed by '=' or by an alphanumeric must be left undecoded, so that query
# strings in URLs survive:
#   "&amp=2"       legacy "amp" followed by '='          -> kept as "&amp=2"
#   "&ampo=3"      legacy "amp" followed by alphanumeric -> kept as "&ampo=3"
#   "&amp;lt=foo"  properly terminated "&amp;"           -> "&lt=foo"
# The same input is checked with double-quoted (dq), single-quoted (sq) and
# unquoted (uq) attribute values.
2286 test_parser name: "attribute entity exceptions dq", \
2287 html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
2288 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
2289 test_parser name: "attribute entity exceptions sq", \
2290 html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
2291 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
2292 test_parser name: "attribute entity exceptions uq", \
2293 html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
2294 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
# End tags: properly matched, missing, and mis-nested.
2295 test_parser name: "matching closing tags", \
2296 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
2297 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
# </div> also closes the <span> that was still open inside it
2298 test_parser name: "missing closing tag inside", \
2299 html: "foo<div>bar<span>baz</div>qux",
2300 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
# the stray </span> inside <div> is ignored, so "34" and "56" end up as one text node
2301 test_parser name: "mis-matched closing tags", \
2302 html: "<span>12<div>34</span>56</div>78",
2303 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
# </b> arrives while <i> is still open: <i> is closed along with <b> and a
# new <i> is opened afterwards for "78"
2304 test_parser name: "mis-matched formatting elements", \
2305 html: "12<b>34<i>56</b>78</i>90",
2306 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
# the two examples below are named after sections 8.2.8.1 and 8.2.8.2
2307 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
2308 html: '<p>1<b>2<i>3</b>4</i>5</p>',
2309 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
2310 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
2311 html: '<b>1<p>2</b>3</p>',
2312 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
2313 test_parser name: "crazy formatting elements test", \
2314 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
2315 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
2316 # firefox does this:
2317 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
2318 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
# ("aaa" in the test names is short for the adoption agency algorithm)
2319 test_parser name: "html5lib aaa 1", \
2320 html: '<a><p></a></p>',
2321 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
2322 test_parser name: "html5lib aaa 2", \
2323 html: '<a>1<p>2</a>3</p>',
2324 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
2325 test_parser name: "html5lib aaa 3", \
2326 html: '<a>1<button>2</a>3</button>',
2327 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
2328 test_parser name: "html5lib aaa 4", \
2329 html: '<a>1<b>2</a>3</b>',
2330 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
2331 test_parser name: "html5lib aaa 5 (two divs deep)", \
2332 html: '<a>1<div>2<div>3</a>4</div>5</div>',
2333 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
# foster parenting: content that is not allowed directly inside <table> is
# placed before the table in the output
2334 test_parser name: "html5lib aaa 6 (foster parenting)", \
2335 html: '<table><a>1<p>2</a>3</p>',
2336 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
# cases 7-9: input ends right after </a>, with the other elements never closed
2337 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2338 html: '<b><b><a><p></a>',
2339 expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2340 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2341 html: '<b><a><b><p></a>',
2342 expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2343 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2344 html: '<a><b><b><p></a>',
2345 expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
# reopened formatting elements keep the attributes of the originals
2346 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2347 html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2348 expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
# a bare <td> gets the implied <tbody> and <tr> wrappers in the output
2349 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2350 html: '<table><a>1<td>2</td>3</table>',
2351 expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
# the foster-parented "A" and "C" end up merged into one text node before the table
2352 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2353 html: '<table>A<td>B</td>C</table>',
2354 expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2355 # TODO implement svg and namespacing
2356 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2357 # html: '<a><svg><tr><input></a>',
2358 # expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
# </a> arrives with ten <div>s still open below the <a>
2359 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2360 html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2361 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2362 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2363 html: '<div><a><b><u><i><code><div></a>',
2364 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2365 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2366 html: '<b><b><b><b>x</b></b></b></b>y',
2367 expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
# four <b> are open when the second <p> starts, but only three are reopened
# inside it (see the expected output)
2368 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2369 html: '<p><b><b><b><b><p>x',
2370 expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
# same as above, but the <b> elements carry identical attributes written in
# differing orders; they must still be treated as the same element
2371 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2372 html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2373 expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
# the "," after the closing quote becomes an attribute named "," with an empty value
2374 test_parser name: "junk after attribute close-quote", \
2375 html: '<p><b c="d", e="f">foo<p>x',
2376 expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
# NOTE(review): "aaa02" presumably refers to html5lib-tests' adoption02.dat -- confirm.
2377 test_parser name: "html5lib aaa02 1", \
2378 html: '<b>1<i>2<p>3</b>4',
2379 expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
# a second <a> start tag while an <a> is already open
2380 test_parser name: "html5lib aaa02 2", \
2381 html: '<a><div><style></style><address><a>',
2382 expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'