1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an obect of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
56 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
79 constructor: (type, args = {}) ->
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {}
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85 @children = args.children ? []
86 @namespace = args.namespace ? NS_HTML
87 @parent = args.parent ? null
91 @id = "#{++prev_node_id}"
92 shallow_clone: -> # return a new node that's the same except without the children or parent
93 # WARNING this doesn't work right on open tags that are still being parsed
95 attrs[k] = v for k, v of @attrs
96 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97 serialize: (shallow = false, show_ids = false) -> # for unit tests
102 ret += JSON.stringify @name
117 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
123 ret += c.serialize shallow, show_ids
127 ret += JSON.stringify @text
130 ret += JSON.stringify @text
136 when TYPE_AAA_BOOKMARK
137 ret += 'aaa_bookmark'
140 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
143 # helpers: (only take args that are normally known when parser creates nodes)
144 new_open_tag = (name) ->
145 return new Node TYPE_START_TAG, name: name
146 new_end_tag = (name) ->
147 return new Node TYPE_END_TAG, name: name
148 new_element = (name) ->
149 return new Node TYPE_TAG, name: name
150 new_text_node = (txt) ->
151 return new Node TYPE_TEXT, text: txt
152 new_comment_node = (txt) ->
153 return new Node TYPE_COMMENT, text: txt
155 return new Node TYPE_EOF
157 return new Node TYPE_AFE_MARKER
158 new_aaa_bookmark = ->
159 return new Node TYPE_AAA_BOOKMARK
161 lc_alpha = "abcdefghijklmnopqrstuvwxqz"
162 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXQZ"
163 digits = "0123456789"
164 alnum = lc_alpha + uc_alpha + digits
165 hex_chars = digits + "abcdefABCDEF"
167 # some SVG elements have dashes in them
168 tag_name_chars = alnum + "-"
170 # http://www.w3.org/TR/html5/infrastructure.html#space-character
171 space_chars = "\u0009\u000a\u000c\u000d\u0020"
173 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
174 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
176 # These are the character references that don't need a terminating semicolon
177 # min length: 2, max: 6, none are a prefix of any other.
179 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
180 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
181 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
182 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
183 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
184 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
185 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
186 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
187 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
188 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
189 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
190 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
191 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
192 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
193 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
194 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
195 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
199 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
200 raw_text_elements = ['script', 'style']
201 escapable_raw_text_elements = ['textarea', 'title']
202 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
204 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
205 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
206 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
207 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
208 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
209 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
210 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
211 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
212 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
213 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
214 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
215 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
216 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
217 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
221 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
223 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
224 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
225 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
226 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
227 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
228 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
229 'determinant', 'diff', 'divergence', 'divide', 'domain',
230 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
231 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
232 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
233 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
234 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
235 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
236 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
237 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
238 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
239 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
240 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
241 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
242 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
243 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
244 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
245 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
246 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
247 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
248 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
249 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
250 'vectorproduct', 'xor'
252 # foreign_elements = [svg_elements..., mathml_elements...]
253 #normal_elements = All other allowed HTML elements are normal elements.
257 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
258 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
259 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
260 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
261 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
262 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
263 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
264 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
265 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
266 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
267 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
268 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
269 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
270 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
271 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
272 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
273 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
274 wbr:NS_HTML, xmp:NS_HTML,
277 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
278 'annotation-xml':NS_MATHML,
281 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
284 formatting_elements = {
285 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
286 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
290 foster_parenting_targets = {
312 el_is_special = (e) ->
313 return special_elements[e.name]?
314 # FIXME it should really be:
315 #return special_elements[e.name] is e.namespace
317 # decode_named_char_ref()
319 # The list of named character references is _huge_ so ask the browser to decode
320 # for us instead of wasting bandwidth/space on including the table here.
322 # Pass without the "&" but with the ";" examples:
323 # for "&" pass "amp;"
324 # for "′" pass "x2032;"
327 textarea: document.createElement('textarea')
329 # TODO test this in IE8
330 decode_named_char_ref = (txt) ->
332 decoded = g_dncr.cache[txt]
333 return decoded if decoded?
334 g_dncr.textarea.innerHTML = txt
335 decoded = g_dncr.textarea.value
336 return null if decoded is txt
337 return g_dncr.cache[txt] = decoded
339 parse_html = (txt, parse_error_cb = null) ->
340 cur = 0 # index of next char in txt to be parsed
341 # declare tree and tokenizer variables so they're in scope below
343 open_els = [] # stack of open elements
344 insertion_mode = null
346 tok_cur_tag = null # partially parsed tag
347 flag_frameset_ok = null
349 flag_foster_parenting = null
350 form_element_pointer = null
351 afe = [] # active formatting elements
357 console.log "Parse error at character #{cur} of #{txt.length}"
359 afe_push = (new_el) ->
362 if el.name is new_el.name and el.namespace is new_el.namespace
364 continue unless new_el.attrs[k] is v
365 for k, v of new_el.attrs
366 continue unless el.attrs[k] is v
373 afe.unshift new_afe_marker()
375 # the functions below impliment the Tree Contstruction algorithm
376 # http://www.w3.org/TR/html5/syntax.html#tree-construction
378 # But first... the helpers
379 template_tag_is_open = ->
381 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
384 is_in_scope_x = (tag_name, scope, namespace) ->
386 if t.name is tag_name and (namespace is null or namespace is t.namespace)
388 if scope[t.name] is t.namespace
391 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
393 if t.name is tag_name and (namespace is null or namespace is t.namespace)
395 if scope[t.name] is t.namespace
397 if scope2[t.name] is t.namespace
400 standard_scopers = { # FIXME these are supposed to be namespace specific
401 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
402 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
403 template: NS_HTML, mi: NS_MATHML,
405 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
406 'annotation-xml': NS_MATHML,
408 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
410 button_scopers = button: NS_HTML
411 li_scopers = ol: NS_HTML, ul: NS_HTML
412 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
413 is_in_scope = (tag_name, namespace = null) ->
414 return is_in_scope_x tag_name, standard_scopers, namespace
415 is_in_button_scope = (tag_name, namespace = null) ->
416 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
417 is_in_table_scope = (tag_name, namespace = null) ->
418 return is_in_scope_x tag_name, table_scopers, namespace
419 is_in_select_scope = (tag_name, namespace = null) ->
421 if t.name is tag_name and (namespace is null or namespace is t.namespace)
423 if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
426 # this checks for a particular element, not by name
427 el_is_in_scope = (el) ->
431 if standard_scopers[t.name] is t.namespace
436 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
437 reset_insertion_mode = ->
438 # 1. Let last be false.
440 # 2. Let node be the last node in the stack of open elements.
442 node = open_els[node_i]
443 # 3. Loop: If node is the first node in the stack of open elements,
444 # then set last to true, and, if the parser was originally created as
445 # part of the HTML fragment parsing algorithm (fragment case) set node
446 # to the context element.
448 if node_i is open_els.length - 1
450 # fixfull (fragment case)
452 # 4. If node is a select element, run these substeps:
453 if node.name is 'select'
454 # 1. If last is true, jump to the step below labeled done.
456 # 2. Let ancestor be node.
459 # 3. Loop: If ancestor is the first node in the stack of
460 # open elements, jump to the step below labeled done.
462 if ancestor_i is open_els.length - 1
464 # 4. Let ancestor be the node before ancestor in the stack
467 ancestor = open_els[ancestor_i]
468 # 5. If ancestor is a template node, jump to the step below
470 if ancestor.name is 'template'
472 # 6. If ancestor is a table node, switch the insertion mode
473 # to "in select in table" and abort these steps.
474 if ancestor.name is 'table'
475 insertion_mode = ins_mode_in_select_in_table
477 # 7. Jump back to the step labeled loop.
478 # 8. Done: Switch the insertion mode to "in select" and abort
480 insertion_mode = ins_mode_in_select
482 # 5. If node is a td or th element and last is false, then switch
483 # the insertion mode to "in cell" and abort these steps.
484 if (node.name is 'td' or node.name is 'th') and last is false
485 insertion_mode = ins_mode_in_cell
487 # 6. If node is a tr element, then switch the insertion mode to "in
488 # row" and abort these steps.
490 insertion_mode = ins_mode_in_row
492 # 7. If node is a tbody, thead, or tfoot element, then switch the
493 # insertion mode to "in table body" and abort these steps.
494 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
495 insertion_mode = ins_mode_in_table_body
497 # 8. If node is a caption element, then switch the insertion mode
498 # to "in caption" and abort these steps.
499 if node.name is 'caption'
500 insertion_mode = ins_mode_in_caption
502 # 9. If node is a colgroup element, then switch the insertion mode
503 # to "in column group" and abort these steps.
504 if node.name is 'colgroup'
505 insertion_mode = ins_mode_in_column_group
507 # 10. If node is a table element, then switch the insertion mode to
508 # "in table" and abort these steps.
509 if node.name is 'table'
510 insertion_mode = ins_mode_in_table
512 # 11. If node is a template element, then switch the insertion mode
513 # to the current template insertion mode and abort these steps.
514 # fixfull (template insertion mode stack)
516 # 12. If node is a head element and last is true, then switch the
517 # insertion mode to "in body" ("in body"! not "in head"!) and abort
518 # these steps. (fragment case)
519 if node.name is 'head' and last
520 insertion_mode = ins_mode_in_body
522 # 13. If node is a head element and last is false, then switch the
523 # insertion mode to "in head" and abort these steps.
524 if node.name is 'head' and last is false
525 insertion_mode = ins_mode_in_head
527 # 14. If node is a body element, then switch the insertion mode to
528 # "in body" and abort these steps.
529 if node.name is 'body'
530 insertion_mode = ins_mode_in_body
532 # 15. If node is a frameset element, then switch the insertion mode
533 # to "in frameset" and abort these steps. (fragment case)
534 if node.name is 'frameset'
535 insertion_mode = ins_mode_in_frameset
537 # 16. If node is an html element, run these substeps:
538 if node.name is 'html'
539 # 1. If the head element pointer is null, switch the insertion
540 # mode to "before head" and abort these steps. (fragment case)
541 # fixfull (fragment case)
543 # 2. Otherwise, the head element pointer is not null, switch
544 # the insertion mode to "after head" and abort these steps.
545 insertion_mode = ins_mode_in_body # FIXME fixfull
547 # 17. If last is true, then switch the insertion mode to "in body"
548 # and abort these steps. (fragment case)
550 insertion_mode = ins_mode_in_body
552 # 18. Let node now be the node before node in the stack of open
555 node = open_els[node_i]
556 # 19. Return to the step labeled loop.
558 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
559 # this implementation is structured (mostly) as described at the link above.
560 # capitalized comments are the "labels" described at the link above.
561 reconstruct_active_formatting_elements = ->
562 return if afe.length is 0
563 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
568 if i is afe.length - 1
571 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
576 el = afe[i].shallow_clone()
577 tree_insert_element el
582 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
583 # adoption agency algorithm
585 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
586 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
587 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
588 adoption_agency = (subject) ->
589 debug_log "adoption_agency()"
590 debug_log "tree: #{serialize_els tree.children, false, true}"
591 debug_log "open_els: #{serialize_els open_els, true, true}"
592 debug_log "afe: #{serialize_els afe, true, true}"
593 if open_els[0].name is subject
596 # remove it from the list of active formatting elements (if found)
601 debug_log "aaa: starting off with subject on top of stack, exiting"
608 # 5. Let formatting element be the last element in the list of
609 # active formatting elements that: is between the end of the list
610 # and the last scope marker in the list, if any, or the start of
611 # the list otherwise, and has the tag name subject.
613 for t, fe_of_afe in afe
614 if t.type is TYPE_AFE_MARKER
619 # If there is no such element, then abort these steps and instead
620 # act as described in the "any other end tag" entry above.
622 debug_log "aaa: fe not found in afe"
623 in_body_any_other_end_tag subject
625 # 6. If formatting element is not in the stack of open elements,
626 # then this is a parse error; remove the element from the list, and
629 for t, fe_of_open_els in open_els
634 debug_log "aaa: fe not found in open_els"
636 # "remove it from the list" must mean afe, since it's not in open_els
637 afe.splice fe_of_afe, 1
639 # 7. If formatting element is in the stack of open elements, but
640 # the element is not in scope, then this is a parse error; abort
642 unless el_is_in_scope fe
643 debug_log "aaa: fe not in scope"
646 # 8. If formatting element is not the current node, this is a parse
647 # error. (But do not abort these steps.)
648 unless open_els[0] is fe
651 # 9. Let furthest block be the topmost node in the stack of open
652 # elements that is lower in the stack than formatting element, and
653 # is an element in the special category. There might not be one.
655 fb_of_open_els = null
662 # and continue, to see if there's one that's more "topmost"
663 # 10. If there is no furthest block, then the UA must first pop all
664 # the nodes from the bottom of the stack of open elements, from the
665 # current node up to and including formatting element, then remove
666 # formatting element from the list of active formatting elements,
667 # and finally abort these steps.
669 debug_log "aaa: no fb"
673 afe.splice fe_of_afe, 1
675 # 11. Let common ancestor be the element immediately above
676 # formatting element in the stack of open elements.
677 ca = open_els[fe_of_open_els + 1] # common ancestor
679 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
680 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
681 bookmark = new_aaa_bookmark()
684 afe.splice i, 0, bookmark
686 node = last_node = fb
690 # 3. Let node be the element immediately above node in the
691 # stack of open elements, or if node is no longer in the stack
692 # of open elements (e.g. because it got removed by this
693 # algorithm), the element that was immediately above node in
694 # the stack of open elements before node was removed.
698 node_next = open_els[i + 1]
700 node = node_next ? node_above
701 debug_log "inner loop #{inner}"
702 debug_log "tree: #{serialize_els tree.children, false, true}"
703 debug_log "open_els: #{serialize_els open_els, true, true}"
704 debug_log "afe: #{serialize_els afe, true, true}"
705 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
706 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
707 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
708 debug_log "node: #{node.serialize true, true}"
709 # TODO make sure node_above gets re-set if/when node is removed from open_els
711 # 4. If node is formatting element, then go to the next step in
712 # the overall algorithm.
716 # 5. If inner loop counter is greater than three and node is in
717 # the list of active formatting elements, then remove node from
718 # the list of active formatting elements.
724 debug_log "max out inner"
729 # 6. If node is not in the list of active formatting elements,
730 # then remove node from the stack of open elements and then go
731 # back to the step labeled inner loop.
733 debug_log "not in afe"
736 node_above = open_els[i + 1]
740 debug_log "the bones"
741 # 7. create an element for the token for which the element node
742 # was created, in the HTML namespace, with common ancestor as
743 # the intended parent; replace the entry for node in the list
744 # of active formatting elements with an entry for the new
745 # element, replace the entry for node in the stack of open
746 # elements with an entry for the new element, and let node be
748 new_node = node.shallow_clone()
752 debug_log "replaced in afe"
756 node_above = open_els[i + 1]
757 open_els[i] = new_node
758 debug_log "replaced in open_els"
761 # 8. If last node is furthest block, then move the
762 # aforementioned bookmark to be immediately after the new node
763 # in the list of active formatting elements.
768 debug_log "removed bookmark"
772 # "after" means lower
773 afe.splice i, 0, bookmark # "after as <-
774 debug_log "placed bookmark after node"
775 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
777 # 9. Insert last node into node, first removing it from its
778 # previous parent node if any.
780 debug_log "last_node has parent"
781 for c, i in last_node.parent.children
783 debug_log "removing last_node from parent"
784 last_node.parent.children.splice i, 1
786 node.children.push last_node
787 last_node.parent = node
788 # 10. Let last node be node.
791 # 11. Return to the step labeled inner loop.
792 # 14. Insert whatever last node ended up being in the previous step
793 # at the appropriate place for inserting a node, but using common
794 # ancestor as the override target.
796 # JASON: In the case where fe is immediately followed by fb:
797 # * inner loop exits out early (node==fe)
799 # * last_node is still in the tree (not a duplicate)
801 debug_log "FEFIRST? last_node has parent"
802 for c, i in last_node.parent.children
804 debug_log "removing last_node from parent"
805 last_node.parent.children.splice i, 1
808 debug_log "after aaa inner loop"
809 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
810 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
811 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
812 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
813 debug_log "tree: #{serialize_els tree.children, false, true}"
818 # can't use standard insert token thing, because it's already in
819 # open_els and must stay at it's current position in open_els
820 dest = adjusted_insertion_location ca
821 dest[0].children.splice dest[1], 0, last_node
822 last_node.parent = dest[0]
825 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
826 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
827 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
828 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
829 debug_log "tree: #{serialize_els tree.children, false, true}"
831 # 15. Create an element for the token for which formatting element
832 # was created, in the HTML namespace, with furthest block as the
834 new_element = fe.shallow_clone() # FIXME intended parent thing
835 # 16. Take all of the child nodes of furthest block and append them
836 # to the element created in the last step.
837 while fb.children.length
838 t = fb.children.shift()
839 t.parent = new_element
840 new_element.children.push t
841 # 17. Append that new element to furthest block.
842 new_element.parent = fb
843 fb.children.push new_element
844 # 18. Remove formatting element from the list of active formatting
845 # elements, and insert the new element into the list of active
846 # formatting elements at the position of the aforementioned
856 # 19. Remove formatting element from the stack of open elements,
857 # and insert the new element into the stack of open elements
858 # immediately below the position of furthest block in that stack.
865 open_els.splice i, 0, new_element
867 # 20. Jump back to the step labeled outer loop.
868 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
869 debug_log "tree: #{serialize_els tree.children, false, true}"
870 debug_log "open_els: #{serialize_els open_els, true, true}"
871 debug_log "afe: #{serialize_els afe, true, true}"
874 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
875 # FIXME test this (particularly emplied end tags)
877 generate_implied_end_tags 'p' # arg is exception
878 if open_els[0].name isnt 'p'
880 while open_els.length > 1 # just in case
881 el = open_els.shift()
884 close_p_if_in_button_scope = ->
885 if is_in_button_scope 'p'
888 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
889 tree_insert_text = (t) ->
890 dest = adjusted_insertion_location()
891 # fixfull check for Document node
893 prev = dest[0].children[dest[1] - 1]
894 if prev.type is TYPE_TEXT
897 dest[0].children.splice dest[1], 0, t
900 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
901 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
902 adjusted_insertion_location = (override_target = null) ->
903 # 1. If there was an override target specified, then let target be the
906 target = override_target
907 else # Otherwise, let target be the current node.
909 # 2. Determine the adjusted insertion location using the first matching
910 # steps from the following list:
912 # If foster parenting is enabled and target is a table, tbody, tfoot,
913 # thead, or tr element Foster parenting happens when content is
914 # misnested in tables.
915 if flag_foster_parenting and foster_parenting_targets[target.name]
916 loop # once. this is here so we can ``break`` to "abort these substeps"
917 # 1. Let last template be the last template element in the
918 # stack of open elements, if any.
920 last_template_i = null
921 for el, i in open_els
922 if el.name is 'template'
926 # 2. Let last table be the last table element in the stack of
927 # open elements, if any.
930 for el, i in open_els
931 if el.name is 'table'
935 # 3. If there is a last template and either there is no last
936 # table, or there is one, but last template is lower (more
937 # recently added) than last table in the stack of open
938 # elements, then: let adjusted insertion location be inside
939 # last template's template contents, after its last child (if
940 # any), and abort these substeps.
941 if last_template and (last_table is null or last_template_i < last_table_i)
942 target = template # fixfull should be it's contents
943 target_i = target.children.length
945 # 4. If there is no last table, then let adjusted insertion
946 # location be inside the first element in the stack of open
947 # elements (the html element), after its last child (if any),
948 # and abort these substeps. (fragment case)
949 if last_table is null
951 target = open_els[open_els.length - 1]
952 target_i = target.children.length
953 # 5. If last table has a parent element, then let adjusted
954 # insertion location be inside last table's parent element,
955 # immediately before last table, and abort these substeps.
956 if last_table.parent?
957 for c, i in last_table.parent.children
959 target = last_table.parent
963 # 6. Let previous element be the element immediately above last
964 # table in the stack of open elements.
966 # huh? how could it not have a parent?
967 previous_element = open_els[last_table_i + 1]
968 # 7. Let adjusted insertion location be inside previous
969 # element, after its last child (if any).
970 target = previous_element
971 target_i = target.children.length
972 # Note: These steps are involved in part because it's possible
973 # for elements, the table element in this case in particular,
974 # to have been moved by a script around in the DOM, or indeed
975 # removed from the DOM entirely, after the element was inserted
977 break # don't really loop
979 # Otherwise Let adjusted insertion location be inside target, after
980 # its last child (if any).
981 target_i = target.children.length
983 # 3. If the adjusted insertion location is inside a template element,
984 # let it instead be inside the template element's template contents,
985 # after its last child (if any).
988 # 4. Return the adjusted insertion location.
989 return [target, target_i]
991 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
992 # aka create_an_element_for_token
993 token_to_element = (t, namespace, intended_parent) ->
994 t.type = TYPE_TAG # not TYPE_START_TAG
995 # convert attributes into a hash
997 while t.attrs_a.length
999 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1000 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
1002 # TODO 2. If the newly created element has an xmlns attribute in the
1003 # XMLNS namespace whose value is not exactly the same as the element's
1004 # namespace, that is a parse error. Similarly, if the newly created
1005 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1006 # value is not the XLink Namespace, that is a parse error.
1008 # fixfull: the spec says stuff about form pointers and ownerDocument
1012 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1013 insert_foreign_element = (token, namespace) ->
1014 ail = adjusted_insertion_location()
1017 el = token_to_element token, namespace, ail_el
1018 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1020 ail_el.children.splice ail_i, 0, el
1023 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1024 insert_html_element = insert_foreign_element # (token, namespace) ->
1026 # FIXME read implement "foster parenting" part
1027 # FIXME read spec, do this right
1028 # FIXME implement the override target thing
1029 # note: this assumes it's an open tag
1030 # FIXME what part of the spec is this?
1031 # TODO look through all callers of this, and see what they should really be doing.
1032 # eg probably insert_html_element for tokens
1033 tree_insert_element = (el, override_target = null, namespace = null) ->
1035 el.namespace = namespace
1036 dest = adjusted_insertion_location override_target
1037 if el.type is TYPE_START_TAG # means it's a "token"
1038 el = token_to_element el, namespace, dest[0]
1039 unless el.namespace?
1040 namespace = dest.namespace
1041 # fixfull: Document nodes sometimes can't accept more chidren
1042 dest[0].children.splice dest[1], 0, el
1047 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1048 # position should be [node, index_within_children]
1049 tree_insert_a_comment = (t, position = null) ->
1050 position ?= adjusted_insertion_location()
1051 position[0].children.splice position[1], 0, t
1053 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1054 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1055 generate_implied_end_tags = (except = null) ->
1056 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1059 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1060 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1061 for node, i in open_els
1062 if node.name is name # FIXME check namespace too
1063 generate_implied_end_tags name # arg is exception
1064 parse_error() unless i is 0
1069 if special_elements[node.name]? # FIXME check namespac too
1072 ins_mode_in_body = (t) ->
1078 when "\t", "\u000a", "\u000c", "\u000d", ' '
1079 reconstruct_active_formatting_elements()
1082 reconstruct_active_formatting_elements()
1084 flag_frameset_ok = false
1086 tree_insert_a_comment t
1093 return if template_tag_is_open()
1094 root_attrs = open_els[open_els.length - 1].attrs
1096 root_attrs[k] = v unless root_attrs[k]?
1097 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1098 # FIXME also do this for </template> (end tag)
1099 return tree_in_head t
1106 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1107 close_p_if_in_button_scope()
1108 insert_html_element t
1109 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1110 close_p_if_in_button_scope()
1111 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1114 insert_html_element t
1115 # TODO lots more to implement here
1117 # If the list of active formatting elements
1118 # contains an a element between the end of the list and
1119 # the last marker on the list (or the start of the list
1120 # if there is no marker on the list), then this is a
1121 # parse error; run the adoption agency algorithm for
1122 # the tag name "a", then remove that element from the
1123 # list of active formatting elements and the stack of
1124 # open elements if the adoption agency algorithm didn't
1125 # already remove it (it might not have if the element
1126 # is not in table scope).
1129 if el.type is TYPE_AFE_MARKER
1139 for el, i in open_els
1141 open_els.splice i, 1
1142 reconstruct_active_formatting_elements()
1143 el = insert_html_element t
1145 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1146 reconstruct_active_formatting_elements()
1147 el = insert_html_element t
1150 # fixfull quirksmode thing
1151 close_p_if_in_button_scope()
1152 insert_html_element t
1153 insertion_mode = ins_mode_in_table
1154 # TODO lots more to implement here
1155 else # any other start tag
1156 reconstruct_active_formatting_elements()
1157 insert_html_element t
1160 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1161 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1164 unless ok_tags[t.name]?
1167 # TODO stack of template insertion modes thing
1168 flag_parsing = false # stop parsing
1172 unless is_in_scope 'body'
1175 # TODO implement parse error and move to tree_after_body
1177 unless is_in_scope 'body' # weird, but it's what the spec says
1180 # TODO implement parse error and move to tree_after_body, reprocess
1181 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1182 unless is_in_scope t.name, NS_HTML
1185 generate_implied_end_tags()
1186 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1189 el = open_els.shift()
1190 if el.name is t.name and el.namespace is NS_HTML
1192 # TODO lots more close tags to implement here
1194 unless is_in_button_scope 'p'
1196 insert_html_element new_open_tag 'p'
1198 # TODO lots more close tags to implement here
1199 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1200 adoption_agency t.name
1201 # TODO lots more close tags to implement here
1203 in_body_any_other_end_tag t.name
1206 ins_mode_in_table_else = (t) ->
1208 flag_foster_parenting = true # FIXME
1210 flag_foster_parenting = false
1218 clear_to_table_stopers = {
1223 clear_stack_to_table_context = ->
1225 if clear_to_table_stopers[open_els[0].name]?
1229 clear_to_table_body_stopers = {
1236 clear_stack_to_table_body_context = ->
1238 if clear_to_table_body_stopers[open_els[0].name]?
1242 clear_to_table_row_stopers = {
1247 clear_stack_to_table_row_context = ->
1249 if clear_to_table_row_stopers[open_els[0].name]?
1253 clear_afe_to_marker = ->
1256 if el.type is TYPE_AFE_MARKER
1258 ins_mode_in_table = (t) ->
1261 if can_in_table[t.name]
1262 original_insertion_mode = insertion_mode
1263 insertion_mode = ins_mode_in_table_text
1266 ins_mode_in_table_else t
1268 tree_insert_a_comment t
1274 clear_stack_to_table_context()
1276 insert_html_element t
1277 insertion_mode = ins_mode_in_caption
1279 clear_stack_to_table_context()
1280 insert_html_element t
1281 insertion_mode = ins_mode_in_column_group
1283 clear_stack_to_table_context()
1284 insert_html_element new_open_tag 'colgroup'
1285 insertion_mode = ins_mode_in_column_group
1287 when 'tbody', 'tfoot', 'thead'
1288 clear_stack_to_table_context()
1289 insert_html_element t
1290 insertion_mode = ins_mode_in_table_body
1291 when 'td', 'th', 'tr'
1292 clear_stack_to_table_context()
1293 insert_html_element new_open_tag 'tbody'
1294 insertion_mode = ins_mode_in_table_body
1298 if is_in_table_scope 'table'
1300 el = open_els.shift()
1301 if el.name is 'table'
1303 reset_insertion_mode()
1305 when 'style', 'script', 'template'
1308 if token_is_input_hidden t
1309 ins_mode_in_table_else t
1312 insert_html_element t
1314 # fixfull acknowledge sef-closing flag
1317 if form_element_pointer?
1319 if template_tag_is_open()
1321 form_element_pointer = insert_html_element t
1324 ins_mode_in_table_else t
1328 if is_in_table_scope 'table'
1330 el = open_els.shift()
1331 if el.name is 'table'
1333 reset_insertion_mode()
1336 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1341 ins_mode_in_table_else t
1345 ins_mode_in_table_else t
1348 ins_mode_in_table_text = (t) ->
1355 console.log "unimplemented ins_mode_in_table_text"
1358 ins_mode_in_table_body = (t) ->
1359 if t.type is TYPE_START_TAG and t.name is 'tr'
1360 clear_stack_to_table_body_context()
1361 insert_html_element t
1363 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1365 clear_stack_to_table_body_context()
1366 insert_html_element new_open_tag 'tr'
1367 insertion_mode = ins_mode_in_row
1370 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1371 unless is_in_table_scope t.name # fixfull check namespace
1374 clear_stack_to_table_body_context()
1376 insertion_mode = ins_mode_in_table
1378 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1381 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1384 if table_scopers[el.name]
1389 clear_stack_to_table_body_context()
1391 insertion_mode = ins_mode_in_table
1394 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1400 ins_mode_in_row = (t) ->
1401 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1402 clear_stack_to_table_row_context()
1403 insert_html_element t
1404 insertion_mode = ins_mode_in_cell
1407 if t.type is TYPE_END_TAG and t.name is 'tr'
1408 if is_in_table_scope 'tr'
1409 clear_stack_to_table_row_context()
1411 insertion_mode = ins_mode_in_table_body
1415 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1416 if is_in_table_scope 'tr'
1417 clear_stack_to_table_row_context()
1419 insertion_mode = ins_mode_in_table_body
1424 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1425 if is_in_table_scope t.name # fixfull namespace
1426 if is_in_table_scope 'tr'
1427 clear_stack_to_table_row_context()
1429 insertion_mode = ins_mode_in_table_body
1434 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1440 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1442 generate_implied_end_tags()
1443 unless open_els[0].name is 'td' or open_els[0] is 'th'
1446 el = open_els.shift()
1447 if el.name is 'td' or el.name is 'th'
1449 clear_afe_to_marker()
1450 insertion_mode = ins_mode_in_row
1452 # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1453 ins_mode_in_cell = (t) ->
1454 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1455 if is_in_table_scope t.name
1456 generate_implied_end_tags()
1457 if open_els[0].name isnt t.name
1460 el = open_els.shift()
1461 if el.name is t.name
1463 clear_afe_to_marker()
1464 insertion_mode = ins_mode_in_row
1468 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1471 if el.name is 'td' or el.name is 'th'
1474 if table_scopers[el.name]
1482 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1485 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1486 if is_in_table_scope t.name # fixfull namespace
1496 # the functions below implement the tokenizer stats described here:
1497 # http://www.w3.org/TR/html5/syntax.html#tokenization
1499 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1501 switch c = txt.charAt(cur++)
1503 return new_text_node tokenize_character_reference()
1505 tok_state = tok_state_tag_open
1508 return new_text_node c
1510 return new_eof_token()
1512 return new_text_node c
1515 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1516 # not needed: tok_state_character_reference_in_data = ->
1517 # just call tok_state_character_reference_in_data()
1519 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1520 tok_state_tag_open = ->
1521 switch c = txt.charAt(cur++)
1523 tok_state = tok_state_markup_declaration_open
1525 tok_state = tok_state_end_tag_open
1528 tok_state = tok_state_bogus_comment
1530 if lc_alpha.indexOf(c) > -1
1531 tok_cur_tag = new_open_tag c
1532 tok_state = tok_state_tag_name
1533 else if uc_alpha.indexOf(c) > -1
1534 tok_cur_tag = new_open_tag c.toLowerCase()
1535 tok_state = tok_state_tag_name
1538 tok_state = tok_state_data
1539 cur -= 1 # we didn't parse/handle the char after <
1540 return new_text_node '<'
1543 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1544 tok_state_end_tag_open = ->
1545 switch c = txt.charAt(cur++)
1548 tok_state = tok_state_data
1551 tok_state = tok_state_data
1552 return new_text_node '</'
1554 if uc_alpha.indexOf(c) > -1
1555 tok_cur_tag = new_end_tag c.toLowerCase()
1556 tok_state = tok_state_tag_name
1557 else if lc_alpha.indexOf(c) > -1
1558 tok_cur_tag = new_end_tag c
1559 tok_state = tok_state_tag_name
1562 tok_state = tok_state_bogus_comment
1565 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1566 tok_state_tag_name = ->
1567 switch c = txt.charAt(cur++)
1568 when "\t", "\n", "\u000c", ' '
1569 tok_state = tok_state_before_attribute_name
1571 tok_state = tok_state_self_closing_start_tag
1573 tok_state = tok_state_data
1579 tok_cur_tag.name += "\ufffd"
1582 tok_state = tok_state_data
1584 if uc_alpha.indexOf(c) > -1
1585 tok_cur_tag.name += c.toLowerCase()
1587 tok_cur_tag.name += c
1590 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1591 tok_state_before_attribute_name = ->
1593 switch c = txt.charAt(cur++)
1594 when "\t", "\n", "\u000c", ' '
1597 tok_state = tok_state_self_closing_start_tag
1600 tok_state = tok_state_data
1606 attr_name = "\ufffd"
1607 when '"', "'", '<', '='
1612 tok_state = tok_state_data
1614 if uc_alpha.indexOf(c) > -1
1615 attr_name = c.toLowerCase()
1619 tok_cur_tag.attrs_a.unshift [attr_name, '']
1620 tok_state = tok_state_attribute_name
1623 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1624 tok_state_attribute_name = ->
1625 switch c = txt.charAt(cur++)
1626 when "\t", "\n", "\u000c", ' '
1627 tok_state = tok_state_after_attribute_name
1629 tok_state = tok_state_self_closing_start_tag
1631 tok_state = tok_state_before_attribute_value
1633 tok_state = tok_state_data
1639 tok_cur_tag.attrs_a[0][0] = "\ufffd"
1642 tok_cur_tag.attrs_a[0][0] = c
1645 tok_state = tok_state_data
1647 if uc_alpha.indexOf(c) > -1
1648 tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
1650 tok_cur_tag.attrs_a[0][0] += c
1653 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
1654 tok_state_after_attribute_name = ->
1655 c = txt.charAt(cur++)
1656 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1659 tok_state = tok_state_self_closing_start_tag
1662 tok_state = tok_state_before_attribute_value
1665 tok_state = tok_state_data
1667 if uc_alpha.indexOf(c) > -1
1668 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
1669 tok_state = tok_state_attribute_name
1673 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
1674 tok_state = tok_state_attribute_name
1678 tok_state = tok_state_data
1679 cur -= 1 # reconsume
1681 if c is '"' or c is "'" or c is '<'
1683 # fall through to Anything else
1685 tok_cur_tag.attrs_a.unshift [c, '']
1686 tok_state = tok_state_attribute_name
1688 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
1689 tok_state_before_attribute_value = ->
1690 switch c = txt.charAt(cur++)
1691 when "\t", "\n", "\u000c", ' '
1694 tok_state = tok_state_attribute_value_double_quoted
1696 tok_state = tok_state_attribute_value_unquoted
1699 tok_state = tok_state_attribute_value_single_quoted
1702 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1703 tok_state = tok_state_attribute_value_unquoted
1706 tok_state = tok_state_data
1712 tok_state = tok_state_data
1714 tok_cur_tag.attrs_a[0][1] += c
1715 tok_state = tok_state_attribute_value_unquoted
1718 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
1719 tok_state_attribute_value_double_quoted = ->
1720 switch c = txt.charAt(cur++)
1722 tok_state = tok_state_after_attribute_value_quoted
1724 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
1727 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1730 tok_state = tok_state_data
1732 tok_cur_tag.attrs_a[0][1] += c
1735 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
1736 tok_state_attribute_value_single_quoted = ->
1737 switch c = txt.charAt(cur++)
1739 tok_state = tok_state_after_attribute_value_quoted
1741 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
1744 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1747 tok_state = tok_state_data
1749 tok_cur_tag.attrs_a[0][1] += c
1752 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
1753 tok_state_attribute_value_unquoted = ->
1754 switch c = txt.charAt(cur++)
1755 when "\t", "\n", "\u000c", ' '
1756 tok_state = tok_state_before_attribute_name
1758 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
1760 tok_state = tok_state_data
1765 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1768 tok_state = tok_state_data
1770 # Parse Error if ', <, = or ` (backtick)
1771 tok_cur_tag.attrs_a[0][1] += c
1774 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
1775 tok_state_after_attribute_value_quoted = ->
1776 switch c = txt.charAt(cur++)
1777 when "\t", "\n", "\u000c", ' '
1778 tok_state = tok_state_before_attribute_name
1780 tok_state = tok_state_self_closing_start_tag
1782 tok_state = tok_state_data
1788 tok_state = tok_state_data
1791 tok_state = tok_state_before_attribute_name
1792 cur -= 1 # we didn't handle that char
1795 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1796 # Don't set this as a state, just call it
1797 # returns a string (NOT a text node)
1798 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1799 if cur >= txt.length
1801 switch c = txt.charAt(cur)
1802 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1803 # explicitly not a parse error
1806 # there has to be "one or more" alnums between & and ; to be a parse error
1809 if cur + 1 >= txt.length
1811 if txt.charAt(cur + 1).toLowerCase() is 'x'
1820 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1824 if txt.charAt(start + i) is ';'
1826 # FIXME This is supposed to generate parse errors for some chars
1827 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1834 if alnum.indexOf(txt.charAt(cur + i)) is -1
1837 # exit early, because parse_error() below needs at least one alnum
1839 if txt.charAt(cur + i) is ';'
1840 i += 1 # include ';' terminator in value
1841 decoded = decode_named_char_ref txt.substr(cur, i)
1848 # no ';' terminator (only legacy char refs)
1850 for i in [2..max] # no prefix matches, so ok to check shortest first
1851 c = legacy_char_refs[txt.substr(cur, i)]
1854 if txt.charAt(cur + i) is '='
1855 # "because some legacy user agents will
1856 # misinterpret the markup in those cases"
1859 if alnum.indexOf(txt.charAt(cur + i)) > -1
1860 # this makes attributes forgiving about url args
1862 # ok, and besides the weird exceptions for attributes...
1863 # return the matching char
1864 cur += i # consume entity chars
1865 parse_error() # because no terminating ";"
1869 return # never reached
1871 # tree constructor initialization
1872 # see comments on TYPE_TAG/etc for the structure of this data
1873 tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
1875 insertion_mode = ins_mode_in_body
1876 flag_frameset_ok = true
1878 flag_foster_parenting = false
1879 form_element_pointer = null
1880 afe = [] # active formatting elements
1882 # tokenizer initialization
1883 tok_state = tok_state_data
1890 return tree.children
1892 # everything below is tests on the above
1893 test_equals = (description, output, expected_output) ->
1894 if output is expected_output
1895 console.log "passed." # don't say name, so smart consoles can merge all of these
1897 console.log "FAILED: \"#{description}\""
1898 console.log " Expected: #{expected_output}"
1899 console.log " Actual: #{output}"
1900 serialize_els = (els, shallow, show_ids) ->
1906 serialized += t.serialize shallow, show_ids
1908 test_parser = (args) ->
1913 prev_node_id = 0 # reset counter
1914 parsed = parse_html args.html, errors_cb
1915 serialized = serialize_els parsed, false, false
1916 if serialized isnt args.expected
1917 debug_log_each (str) ->
1919 console.log "FAILED: \"#{args.name}\""
1920 console.log " Input: #{args.html}"
1921 console.log " Correct: #{args.expected}"
1922 console.log " Output: #{serialized}"
1923 if parse_errors.length > 0
1924 console.log " parse errs: #{JSON.stringify parse_errors}"
1926 console.log " No parse errors"
1928 console.log "passed \"#{args.name}\""
1930 test_parser name: "empty", \
1933 test_parser name: "just text", \
1935 expected: 'text:"abc"'
1936 test_parser name: "named entity", \
1938 expected: 'text:"a&1234"'
1939 test_parser name: "broken named character references", \
1940 html: "1&2&&3&aabbcc;",
1941 expected: 'text:"1&2&&3&aabbcc;"'
1942 test_parser name: "numbered entity overrides", \
1943 html: "1€€ ƒ",
1944 expected: 'text:"1€€ ƒ"'
1945 test_parser name: "open tag", \
1946 html: "foo<span>bar",
1947 expected: 'text:"foo",tag:"span",{},[text:"bar"]'
1948 test_parser name: "open tag with attributes", \
1949 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1950 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
1951 test_parser name: "open tag with attributes of various quotings", \
1952 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1953 expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
1954 test_parser name: "attribute entity exceptions dq", \
1955 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
1956 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
1957 test_parser name: "attribute entity exceptions sq", \
1958 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
1959 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
1960 test_parser name: "attribute entity exceptions uq", \
1961 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
1962 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
1963 test_parser name: "matching closing tags", \
1964 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1965 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
1966 test_parser name: "missing closing tag inside", \
1967 html: "foo<div>bar<span>baz</div>qux",
1968 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
1969 test_parser name: "mis-matched closing tags", \
1970 html: "<span>12<div>34</span>56</div>78",
1971 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
1972 test_parser name: "mis-matched formatting elements", \
1973 html: "12<b>34<i>56</b>78</i>90",
1974 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
1975 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
1976 html: '<p>1<b>2<i>3</b>4</i>5</p>',
1977 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
1978 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
1979 html: '<b>1<p>2</b>3</p>',
1980 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
1981 test_parser name: "crazy formatting elements test", \
1982 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1983 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1984 # firefox does this:
1985 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
1986 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
1987 test_parser name: "html5lib aaa 1", \
1988 html: '<a><p></a></p>',
1989 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
1990 test_parser name: "html5lib aaa 2", \
1991 html: '<a>1<p>2</a>3</p>',
1992 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
1993 test_parser name: "html5lib aaa 3", \
1994 html: '<a>1<button>2</a>3</button>',
1995 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
1996 test_parser name: "html5lib aaa 4", \
1997 html: '<a>1<b>2</a>3</b>',
1998 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
1999 test_parser name: "html5lib aaa 5 (two divs deep)", \
2000 html: '<a>1<div>2<div>3</a>4</div>5</div>',
2001 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
2002 test_parser name: "html5lib aaa 6 (foster parenting)", \
2003 html: '<table><a>1<p>2</a>3</p>',
2004 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
2005 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2006 html: '<b><b><a><p></a>',
2007 expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2008 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2009 html: '<b><a><b><p></a>',
2010 expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2011 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2012 html: '<a><b><b><p></a>',
2013 expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2014 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2015 html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2016 expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
2017 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2018 html: '<table><a>1<td>2</td>3</table>',
2019 expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
2020 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2021 html: '<table>A<td>B</td>C</table>',
2022 expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2023 # TODO implement svg and namespacing
2024 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2025 # html: '<a><svg><tr><input></a>',
2026 # expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
2027 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2028 html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2029 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2030 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2031 html: '<div><a><b><u><i><code><div></a>',
2032 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2033 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2034 html: '<b><b><b><b>x</b></b></b></b>y',
2035 expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
2036 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2037 html: '<p><b><b><b><b><p>x',
2038 expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
2039 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2040 html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2041 expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
2042 test_parser name: "junk after attribute close-quote", \
2043 html: '<p><b c="d", e="f">foo<p>x',
2044 expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'