1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an object of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
56 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
79 constructor: (type, args = {}) ->
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {}
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85 @children = args.children ? []
86 @namespace = args.namespace ? NS_HTML
87 @parent = args.parent ? null
91 @id = "#{++prev_node_id}"
92 shallow_clone: -> # return a new node that's the same except without the children or parent
93 # WARNING this doesn't work right on open tags that are still being parsed
95 attrs[k] = v for k, v of @attrs
96 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97 serialize: (shallow = false, show_ids = false) -> # for unit tests
102 ret += JSON.stringify @name
108 ret += JSON.stringify @attrs
114 ret += c.serialize shallow, show_ids
118 ret += JSON.stringify @text
121 ret += JSON.stringify @text
127 when TYPE_AAA_BOOKMARK
128 ret += 'aaa_bookmark'
131 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
134 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) carrying only the tag name.
# Start-tag tokens come from the tokenizer and are not meant to stay in the
# finished tree.
new_open_tag = (name) ->
  new Node(TYPE_START_TAG, {name: name})
# Build an end-tag token (TYPE_END_TAG) carrying only the tag name.
new_end_tag = (name) ->
  new Node(TYPE_END_TAG, {name: name})
# Build an element node (TYPE_TAG) with the given tag name; every other Node
# field is left to the constructor's defaults.
new_element = (name) ->
  new Node(TYPE_TAG, {name: name})
# Build a text node (TYPE_TEXT) whose text is `txt`.
new_text_node = (txt) ->
  new Node(TYPE_TEXT, {text: txt})
# Build a comment node (TYPE_COMMENT) whose text is `txt`.
new_comment_node = (txt) ->
  new Node(TYPE_COMMENT, {text: txt})
146 return new Node TYPE_EOF
148 return new Node TYPE_AFE_MARKER
# Build a placeholder node of type TYPE_AAA_BOOKMARK (used by the adoption
# agency algorithm to remember a position in the active formatting elements).
new_aaa_bookmark = ->
  new Node(TYPE_AAA_BOOKMARK)
# Character classes used when scanning input. Each is a plain string so that
# membership can be tested with indexOf.
#
# NOTE: both alphabets previously ended in "...wxqz" / "...WXQZ", which left
# out 'y' / 'Y' (and listed 'q' / 'Q' twice). Everything derived from them
# (alnum, tag_name_chars) inherited the gap.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"

# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
167 # These are the character references that don't need a terminating semicolon
168 # min length: 2, max: 6, none are a prefix of any other.
170 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
171 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
172 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
173 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
174 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
175 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
176 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
177 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
178 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
179 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
180 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
181 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
182 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
183 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
184 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
185 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
186 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists for three element categories, grouped by how the element's
# content is treated.

# Elements that never have content or an end tag.
void_elements = [
  'area', 'base', 'br', 'col', 'embed'
  'hr', 'img', 'input', 'keygen', 'link'
  'meta', 'param', 'source', 'track', 'wbr'
]

# Elements whose content is raw text.
raw_text_elements = [
  'script'
  'style'
]

# Elements whose content is escapable raw text.
escapable_raw_text_elements = [
  'textarea'
  'title'
]
193 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
195 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
196 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
197 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
198 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
199 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
200 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
201 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
202 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
203 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
204 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
205 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
206 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
207 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
208 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
212 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
214 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
215 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
216 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
217 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
218 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
219 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
220 'determinant', 'diff', 'divergence', 'divide', 'domain',
221 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
222 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
223 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
224 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
225 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
226 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
227 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
228 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
229 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
230 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
231 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
232 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
233 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
234 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
235 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
236 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
237 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
238 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
239 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
240 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
241 'vectorproduct', 'xor'
243 # foreign_elements = [svg_elements..., mathml_elements...]
244 #normal_elements = All other allowed HTML elements are normal elements.
248 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
249 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
250 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
251 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
252 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
253 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
254 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
255 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
256 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
257 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
258 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
259 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
260 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
261 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
262 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
263 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
264 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
265 wbr:NS_HTML, xmp:NS_HTML,
268 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
269 'annotation-xml':NS_MATHML,
272 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
275 formatting_elements = {
276 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
277 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
281 foster_parenting_targets = {
# Report whether element `e` belongs to the "special" category, i.e. whether
# its tag name has any entry in special_elements. Only e.name is consulted;
# e.namespace is ignored.
#
# FIXME it should really compare namespaces as well:
#   special_elements[e.name] is e.namespace
# NOTE(review): the special_elements rows visible in this file list `title`
# under both NS_HTML and NS_SVG, so a one-namespace-per-name table may not
# support that check as-is; confirm before switching.
el_is_special = (e) ->
  entry = special_elements[e.name]
  entry?
308 # decode_named_char_ref()
310 # The list of named character references is _huge_ so ask the browser to decode
311 # for us instead of wasting bandwidth/space on including the table here.
313 # Pass without the "&" but with the ";" examples:
314 # for "&" pass "amp;"
315 # for "′" pass "x2032;"
318 textarea: document.createElement('textarea')
320 # TODO test this in IE8
321 decode_named_char_ref = (txt) ->
323 decoded = g_dncr.cache[txt]
324 return decoded if decoded?
325 g_dncr.textarea.innerHTML = txt
326 decoded = g_dncr.textarea.value
327 return null if decoded is txt
328 return g_dncr.cache[txt] = decoded
330 parse_html = (txt, parse_error_cb = null) ->
331 cur = 0 # index of next char in txt to be parsed
332 # declare tree and tokenizer variables so they're in scope below
334 open_els = [] # stack of open elements
335 insertion_mode = null
337 tok_cur_tag = null # partially parsed tag
338 flag_frameset_ok = null
340 flag_foster_parenting = null
341 form_element_pointer = null
342 afe = [] # active formatting elements
348 console.log "Parse error at character #{cur} of #{txt.length}"
351 # the functions below implement the Tree Construction algorithm
352 # http://www.w3.org/TR/html5/syntax.html#tree-construction
354 # But first... the helpers
355 template_tag_is_open = ->
357 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
360 is_in_scope_x = (tag_name, scope, namespace) ->
362 if t.name is tag_name and (namespace is null or namespace is t.namespace)
364 if scope[t.name] is t.namespace
367 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
369 if t.name is tag_name and (namespace is null or namespace is t.namespace)
371 if scope[t.name] is t.namespace
373 if scope2[t.name] is t.namespace
376 standard_scopers = { # FIXME these are supposed to be namespace specific
377 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
378 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
379 template: NS_HTML, mi: NS_MATHML,
381 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
382 'annotation-xml': NS_MATHML,
384 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Additional scope-boundary tables, each mapping tag name -> namespace.
# button_scopers is combined with standard_scopers by is_in_button_scope;
# table_scopers is the only boundary set consulted by is_in_table_scope.
# li_scopers has no user in the portion of the file visible here.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# "Has an element in scope": look for an open element named `tag_name`, using
# standard_scopers as the set of scope boundaries.
#   tag_name  - element name to look for
#   namespace - namespace to require; null (the default) accepts any namespace
# Returns whatever is_in_scope_x returns.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# "Has an element in button scope": same as is_in_scope, but button_scopers
# is passed as a second boundary set alongside standard_scopers.
#   tag_name  - element name to look for
#   namespace - namespace to require; null (the default) accepts any namespace
# Returns whatever is_in_scope_x_y returns.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# "Has an element in table scope": look for an open element named `tag_name`,
# using table_scopers (html, table, template) as the only scope boundaries.
#   tag_name  - element name to look for
#   namespace - namespace to require; null (the default) accepts any namespace
# Returns whatever is_in_scope_x returns.
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
395 is_in_select_scope = (tag_name, namespace = null) ->
397 if t.name is tag_name and (namespace is null or namespace is t.namespace)
399 if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
402 # this checks for a particular element, not by name
403 el_is_in_scope = (el) ->
407 if standard_scopers[t.name] is t.namespace
412 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
413 reset_insertion_mode = ->
414 # 1. Let last be false.
416 # 2. Let node be the last node in the stack of open elements.
418 node = open_els[node_i]
419 # 3. Loop: If node is the first node in the stack of open elements,
420 # then set last to true, and, if the parser was originally created as
421 # part of the HTML fragment parsing algorithm (fragment case) set node
422 # to the context element.
424 if node_i is open_els.length - 1
426 # fixfull (fragment case)
428 # 4. If node is a select element, run these substeps:
429 if node.name is 'select'
430 # 1. If last is true, jump to the step below labeled done.
432 # 2. Let ancestor be node.
435 # 3. Loop: If ancestor is the first node in the stack of
436 # open elements, jump to the step below labeled done.
438 if ancestor_i is open_els.length - 1
440 # 4. Let ancestor be the node before ancestor in the stack
443 ancestor = open_els[ancestor_i]
444 # 5. If ancestor is a template node, jump to the step below
446 if ancestor.name is 'template'
448 # 6. If ancestor is a table node, switch the insertion mode
449 # to "in select in table" and abort these steps.
450 if ancestor.name is 'table'
451 insertion_mode = ins_mode_in_select_in_table
453 # 7. Jump back to the step labeled loop.
454 # 8. Done: Switch the insertion mode to "in select" and abort
456 insertion_mode = ins_mode_in_select
458 # 5. If node is a td or th element and last is false, then switch
459 # the insertion mode to "in cell" and abort these steps.
460 if (node.name is 'td' or node.name is 'th') and last is false
461 insertion_mode = ins_mode_in_cell
463 # 6. If node is a tr element, then switch the insertion mode to "in
464 # row" and abort these steps.
466 insertion_mode = ins_mode_in_row
468 # 7. If node is a tbody, thead, or tfoot element, then switch the
469 # insertion mode to "in table body" and abort these steps.
470 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
471 insertion_mode = ins_mode_in_table_body
473 # 8. If node is a caption element, then switch the insertion mode
474 # to "in caption" and abort these steps.
475 if node.name is 'caption'
476 insertion_mode = ins_mode_in_caption
478 # 9. If node is a colgroup element, then switch the insertion mode
479 # to "in column group" and abort these steps.
480 if node.name is 'colgroup'
481 insertion_mode = ins_mode_in_column_group
483 # 10. If node is a table element, then switch the insertion mode to
484 # "in table" and abort these steps.
485 if node.name is 'table'
486 insertion_mode = ins_mode_in_table
488 # 11. If node is a template element, then switch the insertion mode
489 # to the current template insertion mode and abort these steps.
490 # fixfull (template insertion mode stack)
492 # 12. If node is a head element and last is true, then switch the
493 # insertion mode to "in body" ("in body"! not "in head"!) and abort
494 # these steps. (fragment case)
495 if node.name is 'head' and last
496 insertion_mode = ins_mode_in_body
498 # 13. If node is a head element and last is false, then switch the
499 # insertion mode to "in head" and abort these steps.
500 if node.name is 'head' and last is false
501 insertion_mode = ins_mode_in_head
503 # 14. If node is a body element, then switch the insertion mode to
504 # "in body" and abort these steps.
505 if node.name is 'body'
506 insertion_mode = ins_mode_in_body
508 # 15. If node is a frameset element, then switch the insertion mode
509 # to "in frameset" and abort these steps. (fragment case)
510 if node.name is 'frameset'
511 insertion_mode = ins_mode_in_frameset
513 # 16. If node is an html element, run these substeps:
514 if node.name is 'html'
515 # 1. If the head element pointer is null, switch the insertion
516 # mode to "before head" and abort these steps. (fragment case)
517 # fixfull (fragment case)
519 # 2. Otherwise, the head element pointer is not null, switch
520 # the insertion mode to "after head" and abort these steps.
521 insertion_mode = ins_mode_in_body # FIXME fixfull
523 # 17. If last is true, then switch the insertion mode to "in body"
524 # and abort these steps. (fragment case)
526 insertion_mode = ins_mode_in_body
528 # 18. Let node now be the node before node in the stack of open
531 node = open_els[node_i]
532 # 19. Return to the step labeled loop.
534 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
535 # this implementation is structured (mostly) as described at the link above.
536 # capitalized comments are the "labels" described at the link above.
537 reconstruct_active_formatting_elements = ->
538 return if afe.length is 0
539 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
544 if i is afe.length - 1
547 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
552 el = afe[i].shallow_clone()
553 tree_insert_element el
558 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
559 # adoption agency algorithm
561 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
562 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
563 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
564 adoption_agency = (subject) ->
565 debug_log "adoption_agency()"
566 debug_log "tree: #{serialize_els tree.children, false, true}"
567 debug_log "open_els: #{serialize_els open_els, true, true}"
568 debug_log "afe: #{serialize_els afe, true, true}"
569 if open_els[0].name is subject
572 # remove it from the list of active formatting elements (if found)
577 debug_log "aaa: starting off with subject on top of stack, exiting"
584 # 5. Let formatting element be the last element in the list of
585 # active formatting elements that: is between the end of the list
586 # and the last scope marker in the list, if any, or the start of
587 # the list otherwise, and has the tag name subject.
589 for t, fe_of_afe in afe
590 if t.type is TYPE_AFE_MARKER
595 # If there is no such element, then abort these steps and instead
596 # act as described in the "any other end tag" entry above.
598 debug_log "aaa: fe not found in afe"
599 in_body_any_other_end_tag subject
601 # 6. If formatting element is not in the stack of open elements,
602 # then this is a parse error; remove the element from the list, and
605 for t, fe_of_open_els in open_els
610 debug_log "aaa: fe not found in open_els"
612 # "remove it from the list" must mean afe, since it's not in open_els
613 afe.splice fe_of_afe, 1
615 # 7. If formatting element is in the stack of open elements, but
616 # the element is not in scope, then this is a parse error; abort
618 unless el_is_in_scope fe
619 debug_log "aaa: fe not in scope"
622 # 8. If formatting element is not the current node, this is a parse
623 # error. (But do not abort these steps.)
624 unless open_els[0] is fe
627 # 9. Let furthest block be the topmost node in the stack of open
628 # elements that is lower in the stack than formatting element, and
629 # is an element in the special category. There might not be one.
631 fb_of_open_els = null
638 # and continue, to see if there's one that's more "topmost"
639 # 10. If there is no furthest block, then the UA must first pop all
640 # the nodes from the bottom of the stack of open elements, from the
641 # current node up to and including formatting element, then remove
642 # formatting element from the list of active formatting elements,
643 # and finally abort these steps.
645 debug_log "aaa: no fb"
649 afe.splice fe_of_afe, 1
651 # 11. Let common ancestor be the element immediately above
652 # formatting element in the stack of open elements.
653 ca = open_els[fe_of_open_els + 1] # common ancestor
655 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
656 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
657 bookmark = new_aaa_bookmark()
660 afe.splice i, 0, bookmark
662 node = last_node = fb
666 # 3. Let node be the element immediately above node in the
667 # stack of open elements, or if node is no longer in the stack
668 # of open elements (e.g. because it got removed by this
669 # algorithm), the element that was immediately above node in
670 # the stack of open elements before node was removed.
674 node_next = open_els[i + 1]
676 node = node_next ? node_above
677 debug_log "inner loop #{inner}"
678 debug_log "tree: #{serialize_els tree.children, false, true}"
679 debug_log "open_els: #{serialize_els open_els, true, true}"
680 debug_log "afe: #{serialize_els afe, true, true}"
681 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
682 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
683 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
684 debug_log "node: #{node.serialize true, true}"
685 # TODO make sure node_above gets re-set if/when node is removed from open_els
687 # 4. If node is formatting element, then go to the next step in
688 # the overall algorithm.
692 # 5. If inner loop counter is greater than three and node is in
693 # the list of active formatting elements, then remove node from
694 # the list of active formatting elements.
700 debug_log "max out inner"
705 # 6. If node is not in the list of active formatting elements,
706 # then remove node from the stack of open elements and then go
707 # back to the step labeled inner loop.
709 debug_log "not in afe"
712 node_above = open_els[i + 1]
716 debug_log "the bones"
717 # 7. create an element for the token for which the element node
718 # was created, in the HTML namespace, with common ancestor as
719 # the intended parent; replace the entry for node in the list
720 # of active formatting elements with an entry for the new
721 # element, replace the entry for node in the stack of open
722 # elements with an entry for the new element, and let node be
724 new_node = node.shallow_clone()
728 debug_log "replaced in afe"
732 node_above = open_els[i + 1]
733 open_els[i] = new_node
734 debug_log "replaced in open_els"
737 # 8. If last node is furthest block, then move the
738 # aforementioned bookmark to be immediately after the new node
739 # in the list of active formatting elements.
744 debug_log "removed bookmark"
748 # "after" means lower
749 afe.splice i, 0, bookmark # "after as <-
750 debug_log "placed bookmark after node"
751 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
753 # 9. Insert last node into node, first removing it from its
754 # previous parent node if any.
756 debug_log "last_node has parent"
757 for c, i in last_node.parent.children
759 debug_log "removing last_node from parent"
760 last_node.parent.children.splice i, 1
762 node.children.push last_node
763 last_node.parent = node
764 # 10. Let last node be node.
767 # 11. Return to the step labeled inner loop.
768 # 14. Insert whatever last node ended up being in the previous step
769 # at the appropriate place for inserting a node, but using common
770 # ancestor as the override target.
772 # JASON: In the case where fe is immediately followed by fb:
773 # * inner loop exits out early (node==fe)
775 # * last_node is still in the tree (not a duplicate)
777 debug_log "FEFIRST? last_node has parent"
778 for c, i in last_node.parent.children
780 debug_log "removing last_node from parent"
781 last_node.parent.children.splice i, 1
784 debug_log "after aaa inner loop"
785 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
786 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
787 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
788 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
789 debug_log "tree: #{serialize_els tree.children, false, true}"
794 # can't use standard insert token thing, because it's already in
795 # open_els and must stay at its current position in open_els
796 dest = adjusted_insertion_location ca
797 dest[0].children.splice dest[1], 0, last_node
798 last_node.parent = dest[0]
801 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
802 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
803 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
804 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
805 debug_log "tree: #{serialize_els tree.children, false, true}"
807 # 15. Create an element for the token for which formatting element
808 # was created, in the HTML namespace, with furthest block as the
810 new_element = fe.shallow_clone() # FIXME intended parent thing
811 # 16. Take all of the child nodes of furthest block and append them
812 # to the element created in the last step.
813 while fb.children.length
814 t = fb.children.shift()
815 t.parent = new_element
816 new_element.children.push t
817 # 17. Append that new element to furthest block.
818 new_element.parent = fb
819 fb.children.push new_element
820 # 18. Remove formatting element from the list of active formatting
821 # elements, and insert the new element into the list of active
822 # formatting elements at the position of the aforementioned
832 # 19. Remove formatting element from the stack of open elements,
833 # and insert the new element into the stack of open elements
834 # immediately below the position of furthest block in that stack.
841 open_els.splice i, 0, new_element
843 # 20. Jump back to the step labeled outer loop.
844 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
845 debug_log "tree: #{serialize_els tree.children, false, true}"
846 debug_log "open_els: #{serialize_els open_els, true, true}"
847 debug_log "afe: #{serialize_els afe, true, true}"
850 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
851 # FIXME test this (particularly implied end tags)
853 generate_implied_end_tags 'p' # arg is exception
854 if open_els[0].name isnt 'p'
856 while open_els.length > 1 # just in case
857 el = open_els.shift()
860 close_p_if_in_button_scope = ->
861 if is_in_button_scope 'p'
864 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
865 tree_insert_text = (t) ->
866 dest = adjusted_insertion_location()
867 # fixfull check for Document node
869 prev = dest[0].children[dest[1] - 1]
870 if prev.type is TYPE_TEXT
873 dest[0].children.splice dest[1], 0, t
876 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
877 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
878 adjusted_insertion_location = (override_target = null) ->
879 # 1. If there was an override target specified, then let target be the
882 target = override_target
883 else # Otherwise, let target be the current node.
885 # 2. Determine the adjusted insertion location using the first matching
886 # steps from the following list:
888 # If foster parenting is enabled and target is a table, tbody, tfoot,
889 # thead, or tr element Foster parenting happens when content is
890 # misnested in tables.
891 if flag_foster_parenting and foster_parenting_targets[target.name]
892 loop # once. this is here so we can ``break`` to "abort these substeps"
893 # 1. Let last template be the last template element in the
894 # stack of open elements, if any.
896 last_template_i = null
897 for el, i in open_els
898 if el.name is 'template'
902 # 2. Let last table be the last table element in the stack of
903 # open elements, if any.
906 for el, i in open_els
907 if el.name is 'table'
911 # 3. If there is a last template and either there is no last
912 # table, or there is one, but last template is lower (more
913 # recently added) than last table in the stack of open
914 # elements, then: let adjusted insertion location be inside
915 # last template's template contents, after its last child (if
916 # any), and abort these substeps.
917 if last_template and (last_table is null or last_template_i < last_table_i)
918 target = template # fixfull should be it's contents
919 target_i = target.children.length
921 # 4. If there is no last table, then let adjusted insertion
922 # location be inside the first element in the stack of open
923 # elements (the html element), after its last child (if any),
924 # and abort these substeps. (fragment case)
925 if last_table is null
927 target = open_els[open_els.length - 1]
928 target_i = target.children.length
929 # 5. If last table has a parent element, then let adjusted
930 # insertion location be inside last table's parent element,
931 # immediately before last table, and abort these substeps.
932 if last_table.parent?
933 for c, i in last_table.parent.children
935 target = last_table.parent
939 # 6. Let previous element be the element immediately above last
940 # table in the stack of open elements.
942 # huh? how could it not have a parent?
943 previous_element = open_els[last_table_i + 1]
944 # 7. Let adjusted insertion location be inside previous
945 # element, after its last child (if any).
946 target = previous_element
947 target_i = target.children.length
948 # Note: These steps are involved in part because it's possible
949 # for elements, the table element in this case in particular,
950 # to have been moved by a script around in the DOM, or indeed
951 # removed from the DOM entirely, after the element was inserted
953 break # don't really loop
955 # Otherwise Let adjusted insertion location be inside target, after
956 # its last child (if any).
957 target_i = target.children.length
959 # 3. If the adjusted insertion location is inside a template element,
960 # let it instead be inside the template element's template contents,
961 # after its last child (if any).
964 # 4. Return the adjusted insertion location.
965 return [target, target_i]
967 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
968 # aka create_an_element_for_token: turns tag token t into a Node in the given namespace
969 token_to_element = (t, namespace, intended_parent) ->
970 t.type = TYPE_TAG # not TYPE_START_TAG (note: this mutates the token passed in)
971 # convert attributes (t.attrs_a, an array of [name, value] pairs) into a hash
973 while t.attrs_a.length
975 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs (as written, a pair processed later overwrites an earlier one with the same name)
976 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
978 # TODO 2. If the newly created element has an xmlns attribute in the
979 # XMLNS namespace whose value is not exactly the same as the element's
980 # namespace, that is a parse error. Similarly, if the newly created
981 # element has an xmlns:xlink attribute in the XMLNS namespace whose
982 # value is not the XLink Namespace, that is a parse error.
984 # fixfull: the spec says stuff about form pointers and ownerDocument
988 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
989 insert_foreign_element = (token, namespace) ->
990 ail = adjusted_insertion_location()
993 el = token_to_element token, namespace, ail_el
994 # TODO skip this next step if it's broken (eg ail_el is document with child already)
996 ail_el.children.splice ail_i, 0, el
999 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1000 insert_html_element = insert_foreign_element # (token, namespace) ->
1002 # FIXME read/implement the "foster parenting" part
1003 # FIXME read spec, do this right
1004 # FIXME implement the override target thing
1005 # note: inserts el (a Node, or a start tag token, which is converted first) at the adjusted insertion location; this assumes it's an open tag
1006 # FIXME what part of the spec is this?
1007 # TODO look through all callers of this, and see what they should really be doing.
1008 # eg probably insert_html_element for tokens
1009 tree_insert_element = (el, override_target = null, namespace = null) ->
1011 el.namespace = namespace
1012 dest = adjusted_insertion_location override_target # dest is [node, index_within_children]
1013 if el.type is TYPE_START_TAG # means it's a "token"
1014 el = token_to_element el, namespace, dest[0]
1015 unless el.namespace? # no namespace yet: inherit the one of the node we are inserting into
1016 el.namespace = dest[0].namespace
1017 # fixfull: Document nodes sometimes can't accept more children
1018 dest[0].children.splice dest[1], 0, el
1023 # spec: http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1024 # inserts t at position, a [node, index_within_children] pair; without one, the adjusted insertion location is used
1025 tree_insert_a_comment = (t, position = null) ->
1026 position = adjusted_insertion_location() unless position?
1027 position[0].children.splice position[1], 0, t
1029 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1030 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1031 generate_implied_end_tags = (except = null) ->
1032 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1035 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1036 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1037 for node, i in open_els # index 0 is the current node, so this walks from the current node upwards
1038 if node.name is name # FIXME check namespace too
1039 generate_implied_end_tags name # arg is the exception: generation stops at an element called name
1040 parse_error() unless i is 0 # the matching node is not the current node
1045 if special_elements[node.name]? # FIXME check namespace too
1048 ins_mode_in_body = (t) ->
1054 when "\t", "\u000a", "\u000c", "\u000d", ' '
1055 reconstruct_active_formatting_elements()
1058 reconstruct_active_formatting_elements()
1060 flag_frameset_ok = false
1062 tree_insert_a_comment t
1069 return if template_tag_is_open()
1070 root_attrs = open_els[open_els.length - 1].attrs
1072 root_attrs[k] = v unless root_attrs[k]?
1073 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1074 # FIXME also do this for </template> (end tag)
1075 return tree_in_head t
1082 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1083 close_p_if_in_button_scope()
1084 insert_html_element t
1085 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1086 close_p_if_in_button_scope()
1087 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1090 insert_html_element t
1091 # TODO lots more to implement here
1093 # If the list of active formatting elements
1094 # contains an a element between the end of the list and
1095 # the last marker on the list (or the start of the list
1096 # if there is no marker on the list), then this is a
1097 # parse error; run the adoption agency algorithm for
1098 # the tag name "a", then remove that element from the
1099 # list of active formatting elements and the stack of
1100 # open elements if the adoption agency algorithm didn't
1101 # already remove it (it might not have if the element
1102 # is not in table scope).
1105 if el.type is TYPE_AFE_MARKER
1115 for el, i in open_els
1117 open_els.splice i, 1
1118 reconstruct_active_formatting_elements()
1119 el = insert_html_element t
1121 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1122 reconstruct_active_formatting_elements()
1123 el = insert_html_element t
1126 # fixfull quirksmode thing
1127 close_p_if_in_button_scope()
1128 insert_html_element t
1129 insertion_mode = ins_mode_in_table
1130 # TODO lots more to implement here
1131 else # any other start tag
1132 reconstruct_active_formatting_elements()
1133 insert_html_element t
1136 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1137 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1140 unless ok_tags[t.name]?
1143 # TODO stack of template insertion modes thing
1144 flag_parsing = false # stop parsing
1148 unless is_in_scope 'body'
1151 # TODO implement parse error and move to tree_after_body
1153 unless is_in_scope 'body' # weird, but it's what the spec says
1156 # TODO implement parse error and move to tree_after_body, reprocess
1157 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1158 unless is_in_scope t.name, NS_HTML
1161 generate_implied_end_tags()
1162 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1165 el = open_els.shift()
1166 if el.name is t.name and el.namespace is NS_HTML
1168 # TODO lots more close tags to implement here
1170 unless is_in_button_scope 'p'
1172 insert_html_element new_open_tag 'p'
1174 # TODO lots more close tags to implement here
1175 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1176 adoption_agency t.name
1177 # TODO lots more close tags to implement here
1179 in_body_any_other_end_tag t.name
1182 ins_mode_in_table_else = (t) ->
1184 flag_foster_parenting = true # FIXME
1186 flag_foster_parenting = false
1194 clear_to_table_stopers = {
1199 clear_stack_to_table_context = ->
1201 if clear_to_table_stopers[open_els[0].name]?
1205 clear_to_table_body_stopers = {
1212 clear_stack_to_table_body_context = ->
1214 if clear_to_table_body_stopers[open_els[0].name]?
1218 clear_to_table_row_stopers = {
1223 clear_stack_to_table_row_context = ->
1225 if clear_to_table_row_stopers[open_els[0].name]?
1229 clear_afe_to_marker = ->
1232 if el.type is TYPE_AFE_MARKER
1234 ins_mode_in_table = (t) ->
1237 if can_in_table[t.name]
1238 original_insertion_mode = insertion_mode
1239 insertion_mode = ins_mode_in_table_text
1242 ins_mode_in_table_else t
1244 tree_insert_a_comment t
1250 clear_stack_to_table_context()
1251 afe.unshift new_afe_marker()
1252 insert_html_element t
1253 insertion_mode = ins_mode_in_caption
1255 clear_stack_to_table_context()
1256 insert_html_element t
1257 insertion_mode = ins_mode_in_column_group
1259 clear_stack_to_table_context()
1260 insert_html_element new_open_tag 'colgroup'
1261 insertion_mode = ins_mode_in_column_group
1263 when 'tbody', 'tfoot', 'thead'
1264 clear_stack_to_table_context()
1265 insert_html_element t
1266 insertion_mode = ins_mode_in_table_body
1267 when 'td', 'th', 'tr'
1268 clear_stack_to_table_context()
1269 insert_html_element new_open_tag 'tbody'
1270 insertion_mode = ins_mode_in_table_body
1274 if is_in_table_scope 'table'
1276 el = open_els.shift()
1277 if el.name is 'table'
1279 reset_insertion_mode()
1281 when 'style', 'script', 'template'
1284 if token_is_input_hidden t
1285 ins_mode_in_table_else t
1288 insert_html_element t
1290 # fixfull acknowledge self-closing flag
1293 if form_element_pointer?
1295 if template_tag_is_open()
1297 form_element_pointer = insert_html_element t
1300 ins_mode_in_table_else t
1304 if is_in_table_scope 'table'
1306 el = open_els.shift()
1307 if el.name is 'table'
1309 reset_insertion_mode()
1312 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1317 ins_mode_in_table_else t
1321 ins_mode_in_table_else t
1324 ins_mode_in_table_text = (t) ->
1331 console.log "unimplemented ins_mode_in_table_text"
1334 ins_mode_in_table_body = (t) ->
1335 if t.type is TYPE_START_TAG and t.name is 'tr'
1336 clear_stack_to_table_body_context()
1337 insert_html_element t
1339 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1341 clear_stack_to_table_body_context()
1342 insert_html_element new_open_tag 'tr'
1343 insertion_mode = ins_mode_in_row
1346 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1347 unless is_in_table_scope t.name # fixfull check namespace
1350 clear_stack_to_table_body_context()
1352 insertion_mode = ins_mode_in_table
1354 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1357 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1360 if table_scopers[el.name]
1365 clear_stack_to_table_body_context()
1367 insertion_mode = ins_mode_in_table
1370 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1376 ins_mode_in_row = (t) ->
1377 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1378 clear_stack_to_table_row_context()
1379 insert_html_element t
1380 insertion_mode = ins_mode_in_cell
1381 afe.unshift new_afe_marker()
1383 if t.type is TYPE_END_TAG and t.name is 'tr'
1384 if is_in_table_scope 'tr'
1385 clear_stack_to_table_row_context()
1387 insertion_mode = ins_mode_in_table_body
1391 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1392 if is_in_table_scope 'tr'
1393 clear_stack_to_table_row_context()
1395 insertion_mode = ins_mode_in_table_body
1400 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1401 if is_in_table_scope t.name # fixfull namespace
1402 if is_in_table_scope 'tr'
1403 clear_stack_to_table_row_context()
1405 insertion_mode = ins_mode_in_table_body
1410 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1416 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1418 generate_implied_end_tags()
1419 unless open_els[0].name is 'td' or open_els[0].name is 'th' # compare the current node's name (the node object itself never equals a string)
1422 el = open_els.shift() # take the current node off the stack of open elements
1423 if el.name is 'td' or el.name is 'th'
1425 clear_afe_to_marker()
1426 insertion_mode = ins_mode_in_row
1428 # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1429 ins_mode_in_cell = (t) ->
1430 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1431 if is_in_table_scope t.name
1432 generate_implied_end_tags()
1433 if open_els[0].name isnt t.name
1436 el = open_els.shift()
1437 if el.name is t.name
1439 clear_afe_to_marker()
1440 insertion_mode = ins_mode_in_row
1444 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1447 if el.name is 'td' or el.name is 'th'
1450 if table_scopers[el.name]
1458 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1461 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1462 if is_in_table_scope t.name # fixfull namespace
1472 # the functions below implement the tokenizer states described here:
1473 # http://www.w3.org/TR/html5/syntax.html#tokenization
1475 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1477 switch c = txt.charAt(cur++)
1479 return new_text_node tokenize_character_reference()
1481 tok_state = tok_state_tag_open
1484 return new_text_node c
1486 return new_eof_token()
1488 return new_text_node c
1491 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1492 # not needed: tok_state_character_reference_in_data = ->
1493 # just call tokenize_character_reference() and wrap the result in a text node
1495 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1496 tok_state_tag_open = ->
1497 switch c = txt.charAt(cur++)
1499 tok_state = tok_state_markup_declaration_open
1501 tok_state = tok_state_end_tag_open
1504 tok_state = tok_state_bogus_comment
1506 if lc_alpha.indexOf(c) > -1
1507 tok_cur_tag = new_open_tag c
1508 tok_state = tok_state_tag_name
1509 else if uc_alpha.indexOf(c) > -1
1510 tok_cur_tag = new_open_tag c.toLowerCase()
1511 tok_state = tok_state_tag_name
1514 tok_state = tok_state_data
1515 cur -= 1 # we didn't parse/handle the char after <
1516 return new_text_node '<'
1519 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1520 tok_state_end_tag_open = ->
1521 switch c = txt.charAt(cur++)
1524 tok_state = tok_state_data
1527 tok_state = tok_state_data
1528 return new_text_node '</'
1530 if uc_alpha.indexOf(c) > -1
1531 tok_cur_tag = new_end_tag c.toLowerCase()
1532 tok_state = tok_state_tag_name
1533 else if lc_alpha.indexOf(c) > -1
1534 tok_cur_tag = new_end_tag c
1535 tok_state = tok_state_tag_name
1538 tok_state = tok_state_bogus_comment
1541 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1542 tok_state_tag_name = ->
1543 switch c = txt.charAt(cur++)
1544 when "\t", "\n", "\u000c", ' '
1545 tok_state = tok_state_before_attribute_name
1547 tok_state = tok_state_self_closing_start_tag
1549 tok_state = tok_state_data
1555 tok_cur_tag.name += "\ufffd"
1558 tok_state = tok_state_data
1560 if uc_alpha.indexOf(c) > -1
1561 tok_cur_tag.name += c.toLowerCase()
1563 tok_cur_tag.name += c
1566 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1567 tok_state_before_attribute_name = ->
1569 switch c = txt.charAt(cur++)
1570 when "\t", "\n", "\u000c", ' '
1573 tok_state = tok_state_self_closing_start_tag
1576 tok_state = tok_state_data
1582 attr_name = "\ufffd"
1583 when '"', "'", '<', '='
1588 tok_state = tok_state_data
1590 if uc_alpha.indexOf(c) > -1
1591 attr_name = c.toLowerCase()
1595 tok_cur_tag.attrs_a.unshift [attr_name, '']
1596 tok_state = tok_state_attribute_name
1599 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1600 tok_state_attribute_name = ->
1601 switch c = txt.charAt(cur++)
1602 when "\t", "\n", "\u000c", ' '
1603 tok_state = tok_state_after_attribute_name
1605 tok_state = tok_state_self_closing_start_tag
1607 tok_state = tok_state_before_attribute_value
1609 tok_state = tok_state_data
1615 tok_cur_tag.attrs_a[0][0] += "\ufffd" # append: the start of the name is already stored in attrs_a[0][0]
1618 tok_cur_tag.attrs_a[0][0] += c # append, same as the "anything else" case
1621 tok_state = tok_state_data
1623 if uc_alpha.indexOf(c) > -1
1624 tok_cur_tag.attrs_a[0][0] += c.toLowerCase() # append the lowercased char instead of replacing the name
1626 tok_cur_tag.attrs_a[0][0] += c
1629 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
1630 tok_state_before_attribute_value = ->
1631 switch c = txt.charAt(cur++)
1632 when "\t", "\n", "\u000c", ' '
1635 tok_state = tok_state_attribute_value_double_quoted
1637 tok_state = tok_state_attribute_value_unquoted
1640 tok_state = tok_state_attribute_value_single_quoted
1643 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1644 tok_state = tok_state_attribute_value_unquoted
1647 tok_state = tok_state_data
1653 tok_state = tok_state_data
1655 tok_cur_tag.attrs_a[0][1] += c
1656 tok_state = tok_state_attribute_value_unquoted
1659 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
1660 tok_state_attribute_value_double_quoted = ->
1661 switch c = txt.charAt(cur++)
1663 tok_state = tok_state_after_attribute_value_quoted
1665 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
1668 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1671 tok_state = tok_state_data
1673 tok_cur_tag.attrs_a[0][1] += c
1676 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
1677 tok_state_attribute_value_single_quoted = ->
1678 switch c = txt.charAt(cur++)
1680 tok_state = tok_state_after_attribute_value_quoted
1682 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
1685 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1688 tok_state = tok_state_data
1690 tok_cur_tag.attrs_a[0][1] += c
1693 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
1694 tok_state_attribute_value_unquoted = ->
1695 switch c = txt.charAt(cur++)
1696 when "\t", "\n", "\u000c", ' '
1697 tok_state = tok_state_before_attribute_name
1699 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
1701 tok_state = tok_state_data
1706 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1709 tok_state = tok_state_data
1711 # Parse Error if ', <, = or ` (backtick)
1712 tok_cur_tag.attrs_a[0][1] += c
1715 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
1716 tok_state_after_attribute_value_quoted = ->
1717 switch c = txt.charAt(cur++)
1718 when "\t", "\n", "\u000c", ' '
1719 tok_state = tok_state_before_attribute_name
1721 tok_state = tok_state_self_closing_start_tag
1723 tok_state = tok_state_data
1729 tok_state = tok_state_data
1732 tok_state = tok_state_before_attribute_name
1733 cur -= 1 # we didn't handle that char
1736 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1737 # Don't set this as a state, just call it
1738 # returns a string (NOT a text node)
1739 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1740 if cur >= txt.length
1742 switch c = txt.charAt(cur)
1743 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1744 # explicitly not a parse error
1747 # there has to be "one or more" alnums between & and ; to be a parse error
1750 if cur + 1 >= txt.length
1752 if txt.charAt(cur + 1).toLowerCase() is 'x'
1761 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1765 if txt.charAt(start + i) is ';'
1767 # FIXME This is supposed to generate parse errors for some chars
1768 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1775 if alnum.indexOf(txt.charAt(cur + i)) is -1
1778 # exit early, because parse_error() below needs at least one alnum
1780 if txt.charAt(cur + i) is ';'
1781 i += 1 # include ';' terminator in value
1782 decoded = decode_named_char_ref txt.substr(cur, i)
1789 # no ';' terminator (only legacy char refs)
1791 for i in [2..max] # no prefix matches, so ok to check shortest first
1792 c = legacy_char_refs[txt.substr(cur, i)]
1795 if txt.charAt(cur + i) is '='
1796 # "because some legacy user agents will
1797 # misinterpret the markup in those cases"
1800 if alnum.indexOf(txt.charAt(cur + i)) > -1
1801 # this makes attributes forgiving about url args
1803 # ok, and besides the weird exceptions for attributes...
1804 # return the matching char
1805 cur += i # consume entity chars
1806 parse_error() # because no terminating ";"
1810 return # never reached
1812 # tree constructor initialization
1813 # see comments on TYPE_TAG/etc for the structure of this data
1814 tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
1816 insertion_mode = ins_mode_in_body
1817 flag_frameset_ok = true
1819 flag_foster_parenting = false
1820 form_element_pointer = null
1821 afe = [] # active formatting elements
1823 # tokenizer initialization
1824 tok_state = tok_state_data
1831 return tree.children
1833 # everything below is tests on the above
1834 test_equals = (description, output, expected_output) ->
1835 if output is expected_output
1836 console.log "passed." # don't say name, so smart consoles can merge all of these
1838 console.log "FAILED: \"#{description}\""
1839 console.log " Expected: #{expected_output}"
1840 console.log " Actual: #{output}"
1841 serialize_els = (els, shallow, show_ids) ->
1847 serialized += t.serialize shallow, show_ids
1849 test_parser = (args) ->
1854 prev_node_id = 0 # reset counter
1855 parsed = parse_html args.html, errors_cb
1856 serialized = serialize_els parsed, false, false
1857 if serialized isnt args.expected
1858 debug_log_each (str) ->
1860 console.log "FAILED: \"#{args.name}\""
1861 console.log " Input: #{args.html}"
1862 console.log " Correct: #{args.expected}"
1863 console.log " Output: #{serialized}"
1864 if parse_errors.length > 0
1865 console.log " parse errs: #{JSON.stringify parse_errors}"
1867 console.log " No parse errors"
1869 console.log "passed \"#{args.name}\""
1871 test_parser name: "empty", \
1874 test_parser name: "just text", \
1876 expected: 'text:"abc"'
1877 test_parser name: "named entity", \
1879 expected: 'text:"a&1234"'
1880 test_parser name: "broken named character references", \
1881 html: "1&2&&3&aabbcc;",
1882 expected: 'text:"1&2&&3&aabbcc;"'
1883 test_parser name: "numbered entity overrides", \
1884 html: "1&#x80;&#x80; &#x83;",
1885 expected: 'text:"1€€ ƒ"'
1886 test_parser name: "open tag", \
1887 html: "foo<span>bar",
1888 expected: 'text:"foo",tag:"span",{},[text:"bar"]'
1889 test_parser name: "open tag with attributes", \
1890 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1891 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
1892 test_parser name: "open tag with attributes of various quotings", \
1893 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1894 expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]'
1895 test_parser name: "attribute entity exceptions dq", \
1896 html: "foo<a href=\"foo?t=1&amp=2&ampo=3&amp;lt=foo\">bar",
1897 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1898 test_parser name: "attribute entity exceptions sq", \
1899 html: "foo<a href='foo?t=1&amp=2&ampo=3&amp;lt=foo'>bar",
1900 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1901 test_parser name: "attribute entity exceptions uq", \
1902 html: "foo<a href=foo?t=1&amp=2&ampo=3&amp;lt=foo>bar",
1903 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&amp=2&ampo=3&lt=foo"},[text:"bar"]'
1904 test_parser name: "matching closing tags", \
1905 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1906 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
1907 test_parser name: "missing closing tag inside", \
1908 html: "foo<div>bar<span>baz</div>qux",
1909 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
1910 test_parser name: "mis-matched closing tags", \
1911 html: "<span>12<div>34</span>56</div>78",
1912 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
1913 test_parser name: "mis-matched formatting elements", \
1914 html: "12<b>34<i>56</b>78</i>90",
1915 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
1916 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
1917 html: '<p>1<b>2<i>3</b>4</i>5</p>',
1918 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
1919 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
1920 html: '<b>1<p>2</b>3</p>',
1921 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
1922 test_parser name: "crazy formatting elements test", \
1923 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1924 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1925 # firefox does this:
1926 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
1927 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
1928 test_parser name: "html5lib aaa 1", \
1929 html: '<a><p></a></p>',
1930 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
1931 test_parser name: "html5lib aaa 2", \
1932 html: '<a>1<p>2</a>3</p>',
1933 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
1934 test_parser name: "html5lib aaa 3", \
1935 html: '<a>1<button>2</a>3</button>',
1936 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
1937 test_parser name: "html5lib aaa 4", \
1938 html: '<a>1<b>2</a>3</b>',
1939 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
1940 test_parser name: "html5lib aaa 5 (two divs deep)", \
1941 html: '<a>1<div>2<div>3</a>4</div>5</div>',
1942 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
1943 test_parser name: "html5lib aaa 6 (foster parenting)", \
1944 html: '<table><a>1<p>2</a>3</p>',
1945 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
1946 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
1947 html: '<b><b><a><p></a>',
1948 expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
1949 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
1950 html: '<b><a><b><p></a>',
1951 expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
1952 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
1953 html: '<a><b><b><p></a>',
1954 expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
1955 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
1956 html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
1957 expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
1958 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
1959 html: '<table><a>1<td>2</td>3</table>',
1960 expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
1961 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
1962 html: '<table>A<td>B</td>C</table>',
1963 expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
1964 # TODO implement svg and namespacing
1965 #test_parser name: "html5lib aaa 13 (svg tr input)", \
1966 # html: '<a><svg><tr><input></a>',
1967 # expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
1968 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
1969 html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
1970 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
1971 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
1972 html: '<div><a><b><u><i><code><div></a>',
1973 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'