1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # The spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an object of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
56 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
79 constructor: (type, args = {}) ->
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {}
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85 @children = args.children ? []
86 @namespace = args.namespace ? NS_HTML
87 @parent = args.parent ? null
91 @id = "#{++prev_node_id}"
92 shallow_clone: -> # return a new node that's the same except without the children or parent
93 # WARNING this doesn't work right on open tags that are still being parsed
95 attrs[k] = v for k, v of @attrs
96 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97 serialize: (shallow = false, show_ids = false) -> # for unit tests
102 ret += JSON.stringify @name
108 ret += JSON.stringify @attrs
114 ret += c.serialize shallow, show_ids
118 ret += JSON.stringify @text
121 ret += JSON.stringify @text
127 when TYPE_AAA_BOOKMARK
128 ret += 'aaa_bookmark'
131 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
134 # helpers: (only take args that are normally known when parser creates nodes)
135 new_open_tag = (name) -> # helper: build a start-tag token Node that carries only the tag name
136 return new Node TYPE_START_TAG, name: name
137 new_end_tag = (name) -> # helper: build an end-tag token Node that carries only the tag name
138 return new Node TYPE_END_TAG, name: name
139 new_element = (name) -> # helper: build a TYPE_TAG element Node with the given name; other fields use the Node constructor defaults
140 return new Node TYPE_TAG, name: name
141 new_text_node = (txt) -> # helper: build a TYPE_TEXT Node whose text is txt
142 return new Node TYPE_TEXT, text: txt
143 new_comment_node = (txt) -> # helper: build a TYPE_COMMENT Node whose text is txt
144 return new Node TYPE_COMMENT, text: txt
146 return new Node TYPE_EOF
148 return new Node TYPE_AFE_MARKER
149 new_aaa_bookmark = -> # helper: build the placeholder Node that adoption_agency splices into afe as its bookmark
150 return new Node TYPE_AAA_BOOKMARK
152 lc_alpha = "abcdefghijklmnopqrstuvwxyz" # all 26 lowercase ASCII letters (was "...wxqz": 'y' missing, 'q' duplicated)
153 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # all 26 uppercase ASCII letters (was "...WXQZ": 'Y' missing, 'Q' duplicated)
154 digits = "0123456789" # ASCII decimal digits
155 alnum = lc_alpha + uc_alpha + digits # ASCII letters and digits
156 hex_chars = digits + "abcdefABCDEF" # characters accepted as hexadecimal digits, either case
158 # some SVG elements have dashes in them
159 tag_name_chars = alnum + "-"
161 # http://www.w3.org/TR/html5/infrastructure.html#space-character
162 space_chars = "\u0009\u000a\u000c\u000d\u0020"
164 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
165 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
167 # These are the character references that don't need a terminating semicolon
168 # min length: 2, max: 6, none are a prefix of any other.
170 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
171 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
172 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
173 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
174 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
175 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
176 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
177 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
178 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
179 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
180 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
181 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
182 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
183 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
184 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
185 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
186 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
190 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
191 raw_text_elements = ['script', 'style']
192 escapable_raw_text_elements = ['textarea', 'title']
193 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
195 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
196 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
197 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
198 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
199 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
200 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
201 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
202 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
203 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
204 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
205 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
206 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
207 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
208 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
212 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
214 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
215 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
216 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
217 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
218 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
219 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
220 'determinant', 'diff', 'divergence', 'divide', 'domain',
221 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
222 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
223 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
224 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
225 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
226 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
227 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
228 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
229 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
230 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
231 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
232 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
233 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
234 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
235 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
236 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
237 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
238 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
239 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
240 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
241 'vectorproduct', 'xor'
243 # foreign_elements = [svg_elements..., mathml_elements...]
244 #normal_elements = All other allowed HTML elements are normal elements.
248 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
249 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
250 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
251 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
252 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
253 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
254 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
255 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
256 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
257 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
258 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
259 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
260 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
261 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
262 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
263 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
264 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
265 wbr:NS_HTML, xmp:NS_HTML,
268 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
269 'annotation-xml':NS_MATHML,
272 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
275 formatting_elements = {
276 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
277 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
281 foster_parenting_targets = {
303 el_is_special = (e) -> # is element e in the "special" category? decided by tag name alone
304 return special_elements[e.name]? # true when e.name is a key of special_elements; e.namespace is not compared
305 # FIXME it should really be:
306 #return special_elements[e.name] is e.namespace
308 # decode_named_char_ref()
310 # The list of named character references is _huge_ so ask the browser to decode
311 # for us instead of wasting bandwidth/space on including the table here.
313 # Pass without the "&" but with the ";" examples:
314 # for "&amp;" pass "amp;"
315 # for "′" pass "x2032;"
318 textarea: document.createElement('textarea')
320 # TODO test this in IE8
321 decode_named_char_ref = (txt) ->
323 decoded = g_dncr.cache[txt]
324 return decoded if decoded?
325 g_dncr.textarea.innerHTML = txt
326 decoded = g_dncr.textarea.value
327 return null if decoded is txt
328 return g_dncr.cache[txt] = decoded
330 parse_html = (txt, parse_error_cb = null) ->
331 cur = 0 # index of next char in txt to be parsed
332 # declare tree and tokenizer variables so they're in scope below
334 open_els = [] # stack of open elements
335 insertion_mode = null
337 tok_cur_tag = null # partially parsed tag
338 flag_frameset_ok = null
340 flag_foster_parenting = null
341 form_element_pointer = null
342 afe = [] # active formatting elements
348 console.log "Parse error at character #{cur} of #{txt.length}"
351 # the functions below implement the Tree Construction algorithm
352 # http://www.w3.org/TR/html5/syntax.html#tree-construction
354 # But first... the helpers
355 template_tag_is_open = ->
357 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
360 is_in_scope_x = (tag_name, scope, namespace) ->
362 if t.name is tag_name and (namespace is null or namespace is t.namespace)
364 if scope[t.name] is t.namespace
367 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
369 if t.name is tag_name and (namespace is null or namespace is t.namespace)
371 if scope[t.name] is t.namespace
373 if scope2[t.name] is t.namespace
376 standard_scopers = { # FIXME these are supposed to be namespace specific
377 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
378 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
379 template: NS_HTML, mi: NS_MATHML,
381 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
382 'annotation-xml': NS_MATHML,
384 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
386 button_scopers = button: NS_HTML
387 li_scopers = ol: NS_HTML, ul: NS_HTML
388 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
389 is_in_scope = (tag_name, namespace = null) -> # scope test using standard_scopers; a null namespace matches any namespace
390 return is_in_scope_x tag_name, standard_scopers, namespace
391 is_in_button_scope = (tag_name, namespace = null) -> # scope test using standard_scopers plus button_scopers
392 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
393 is_in_table_scope = (tag_name, namespace = null) -> # scope test using only table_scopers (html, table, template)
394 return is_in_scope_x tag_name, table_scopers, namespace
395 is_in_select_scope = (tag_name, namespace = null) -> # "select scope" test; a null namespace matches any namespace
397 if t.name is tag_name and (namespace is null or namespace is t.namespace)
399 if t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt 'option') # every element except HTML optgroup/option bounds select scope; was `t.ns isnt NS_HTML t.name ...` (wrong property, missing operator)
402 # this checks for a particular element, not by name
403 el_is_in_scope = (el) ->
407 if standard_scopers[t.name] is t.namespace
412 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
413 reset_insertion_mode = ->
414 # 1. Let last be false.
416 # 2. Let node be the last node in the stack of open elements.
418 node = open_els[node_i]
419 # 3. Loop: If node is the first node in the stack of open elements,
420 # then set last to true, and, if the parser was originally created as
421 # part of the HTML fragment parsing algorithm (fragment case) set node
422 # to the context element.
424 if node_i is open_els.length - 1
426 # fixfull (fragment case)
428 # 4. If node is a select element, run these substeps:
429 if node.name is 'select'
430 # 1. If last is true, jump to the step below labeled done.
432 # 2. Let ancestor be node.
435 # 3. Loop: If ancestor is the first node in the stack of
436 # open elements, jump to the step below labeled done.
438 if ancestor_i is open_els.length - 1
440 # 4. Let ancestor be the node before ancestor in the stack
443 ancestor = open_els[ancestor_i]
444 # 5. If ancestor is a template node, jump to the step below
446 if ancestor.name is 'template'
448 # 6. If ancestor is a table node, switch the insertion mode
449 # to "in select in table" and abort these steps.
450 if ancestor.name is 'table'
451 insertion_mode = ins_mode_in_select_in_table
453 # 7. Jump back to the step labeled loop.
454 # 8. Done: Switch the insertion mode to "in select" and abort
456 insertion_mode = ins_mode_in_select
458 # 5. If node is a td or th element and last is false, then switch
459 # the insertion mode to "in cell" and abort these steps.
460 if (node.name is 'td' or node.name is 'th') and last is false
461 insertion_mode = ins_mode_in_cell
463 # 6. If node is a tr element, then switch the insertion mode to "in
464 # row" and abort these steps.
466 insertion_mode = ins_mode_in_row
468 # 7. If node is a tbody, thead, or tfoot element, then switch the
469 # insertion mode to "in table body" and abort these steps.
470 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
471 insertion_mode = ins_mode_in_table_body
473 # 8. If node is a caption element, then switch the insertion mode
474 # to "in caption" and abort these steps.
475 if node.name is 'caption'
476 insertion_mode = ins_mode_in_caption
478 # 9. If node is a colgroup element, then switch the insertion mode
479 # to "in column group" and abort these steps.
480 if node.name is 'colgroup'
481 insertion_mode = ins_mode_in_column_group
483 # 10. If node is a table element, then switch the insertion mode to
484 # "in table" and abort these steps.
485 if node.name is 'table'
486 insertion_mode = ins_mode_in_table
488 # 11. If node is a template element, then switch the insertion mode
489 # to the current template insertion mode and abort these steps.
490 # fixfull (template insertion mode stack)
492 # 12. If node is a head element and last is true, then switch the
493 # insertion mode to "in body" ("in body"! not "in head"!) and abort
494 # these steps. (fragment case)
495 if node.name is 'head' and last
496 insertion_mode = ins_mode_in_body
498 # 13. If node is a head element and last is false, then switch the
499 # insertion mode to "in head" and abort these steps.
500 if node.name is 'head' and last is false
501 insertion_mode = ins_mode_in_head
503 # 14. If node is a body element, then switch the insertion mode to
504 # "in body" and abort these steps.
505 if node.name is 'body'
506 insertion_mode = ins_mode_in_body
508 # 15. If node is a frameset element, then switch the insertion mode
509 # to "in frameset" and abort these steps. (fragment case)
510 if node.name is 'frameset'
511 insertion_mode = ins_mode_in_frameset
513 # 16. If node is an html element, run these substeps:
514 if node.name is 'html'
515 # 1. If the head element pointer is null, switch the insertion
516 # mode to "before head" and abort these steps. (fragment case)
517 # fixfull (fragment case)
519 # 2. Otherwise, the head element pointer is not null, switch
520 # the insertion mode to "after head" and abort these steps.
521 insertion_mode = ins_mode_in_body # FIXME fixfull
523 # 17. If last is true, then switch the insertion mode to "in body"
524 # and abort these steps. (fragment case)
526 insertion_mode = ins_mode_in_body
528 # 18. Let node now be the node before node in the stack of open
531 node = open_els[node_i]
532 # 19. Return to the step labeled loop.
534 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
535 # this implementation is structured (mostly) as described at the link above.
536 # capitalized comments are the "labels" described at the link above.
537 reconstruct_active_formatting_elements = ->
538 return if afe.length is 0
539 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
544 if i is afe.length - 1
547 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
552 el = afe[i].shallow_clone()
553 tree_insert_element el
558 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
559 # adoption agency algorithm
561 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
562 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
563 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
564 adoption_agency = (subject) ->
565 if open_els[0].name is subject
568 # remove it from the list of active formatting elements (if found)
579 # 5. Let formatting element be the last element in the list of
580 # active formatting elements that: is between the end of the list
581 # and the last scope marker in the list, if any, or the start of
582 # the list otherwise, and has the tag name subject.
584 for t, fe_of_afe in afe
585 if t.type is TYPE_AFE_MARKER
590 # If there is no such element, then abort these steps and instead
591 # act as described in the "any other end tag" entry above.
593 in_body_any_other_end_tag subject
595 # 6. If formatting element is not in the stack of open elements,
596 # then this is a parse error; remove the element from the list, and
599 for t, fe_of_open_els in open_els
605 # "remove it from the list" must mean afe, since it's not in open_els
606 afe.splice fe_of_afe, 1
608 # 7. If formatting element is in the stack of open elements, but
609 # the element is not in scope, then this is a parse error; abort
611 unless el_is_in_scope fe
614 # 8. If formatting element is not the current node, this is a parse
615 # error. (But do not abort these steps.)
616 unless open_els[0] is fe
619 # 9. Let furthest block be the topmost node in the stack of open
620 # elements that is lower in the stack than formatting element, and
621 # is an element in the special category. There might not be one.
623 fb_of_open_els = null
630 # and continue, to see if there's one that's more "topmost"
631 # 10. If there is no furthest block, then the UA must first pop all
632 # the nodes from the bottom of the stack of open elements, from the
633 # current node up to and including formatting element, then remove
634 # formatting element from the list of active formatting elements,
635 # and finally abort these steps.
640 afe.splice fe_of_afe, 1
642 # 11. Let common ancestor be the element immediately above
643 # formatting element in the stack of open elements.
644 ca = open_els[fe_of_open_els + 1] # common ancestor
646 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
647 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
648 bookmark = new_aaa_bookmark()
651 afe.splice i, 0, bookmark
653 node = last_node = fb
657 # 3. Let node be the element immediately above node in the
658 # stack of open elements, or if node is no longer in the stack
659 # of open elements (e.g. because it got removed by this
660 # algorithm), the element that was immediately above node in
661 # the stack of open elements before node was removed.
665 node_next = open_els[i + 1]
667 node = node_next ? node_above
668 debug_log "inner loop #{inner}"
669 debug_log "open_els: #{serialize_els open_els, true, true}"
670 debug_log "tree: #{serialize_els tree.children, false, true}"
671 debug_log "afe: #{serialize_els afe, true, true}"
672 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
673 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
674 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
675 debug_log "node: #{node.serialize true, true}"
676 # TODO make sure node_above gets re-set if/when node is removed from open_els
678 # 4. If node is formatting element, then go to the next step in
679 # the overall algorithm.
683 # 5. If inner loop counter is greater than three and node is in
684 # the list of active formatting elements, then remove node from
685 # the list of active formatting elements.
691 debug_log "max out inner"
696 # 6. If node is not in the list of active formatting elements,
697 # then remove node from the stack of open elements and then go
698 # back to the step labeled inner loop.
700 debug_log "not in afe"
703 node_above = open_els[i + 1]
707 debug_log "the bones"
708 # 7. create an element for the token for which the element node
709 # was created, in the HTML namespace, with common ancestor as
710 # the intended parent; replace the entry for node in the list
711 # of active formatting elements with an entry for the new
712 # element, replace the entry for node in the stack of open
713 # elements with an entry for the new element, and let node be
715 new_node = node.shallow_clone()
719 debug_log "replaced in afe"
723 node_above = open_els[i + 1]
724 open_els[i] = new_node
725 debug_log "replaced in open_els"
728 # 8. If last node is furthest block, then move the
729 # aforementioned bookmark to be immediately after the new node
730 # in the list of active formatting elements.
735 debug_log "removed bookmark"
739 # "after" means lower
740 afe.splice i, 0, bookmark # "after as <-
741 debug_log "placed bookmark after node"
742 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
744 # 9. Insert last node into node, first removing it from its
745 # previous parent node if any.
747 debug_log "last_node has parent"
748 for c, i in last_node.parent.children
750 debug_log "removing last_node from parent"
751 last_node.parent.children.splice i, 1
753 node.children.push last_node
754 last_node.parent = node
755 # 10. Let last node be node.
758 # 11. Return to the step labeled inner loop.
759 # 14. Insert whatever last node ended up being in the previous step
760 # at the appropriate place for inserting a node, but using common
761 # ancestor as the override target.
763 # JASON: In the case where fe is immediately followed by fb:
764 # * inner loop exits out early (node==fe)
766 # * last_node is still in the tree (not a duplicate)
768 debug_log "FEFIRST? last_node has parent"
769 for c, i in last_node.parent.children
771 debug_log "removing last_node from parent"
772 last_node.parent.children.splice i, 1
775 debug_log "after aaa inner loop"
776 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
777 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
778 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
779 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
780 debug_log "tree: #{serialize_els tree.children, false, true}"
785 # can't use standard insert token thing, because it's already in
786 # open_els and must stay at its current position in open_els
787 dest = adjusted_insertion_location ca
788 dest[0].children.splice dest[1], 0, last_node
789 last_node.parent = dest[0]
792 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
793 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
794 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
795 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
796 debug_log "tree: #{serialize_els tree.children, false, true}"
798 # 15. Create an element for the token for which formatting element
799 # was created, in the HTML namespace, with furthest block as the
801 new_element = fe.shallow_clone() # FIXME intended parent thing
802 # 16. Take all of the child nodes of furthest block and append them
803 # to the element created in the last step.
804 while fb.children.length
805 t = fb.children.shift()
806 t.parent = new_element
807 new_element.children.push t
808 # 17. Append that new element to furthest block.
809 new_element.parent = fb
810 fb.children.push new_element
811 # 18. Remove formatting element from the list of active formatting
812 # elements, and insert the new element into the list of active
813 # formatting elements at the position of the aforementioned
823 # 19. Remove formatting element from the stack of open elements,
824 # and insert the new element into the stack of open elements
825 # immediately below the position of furthest block in that stack.
832 open_els.splice i, 0, new_element
834 # 20. Jump back to the step labeled outer loop.
835 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
836 debug_log "tree: #{serialize_els tree.children, false, true}"
837 debug_log "open_els: #{serialize_els open_els, true, true}"
838 debug_log "afe: #{serialize_els afe, true, true}"
841 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
842 # FIXME test this (particularly implied end tags)
844 generate_implied_end_tags 'p' # arg is exception
845 if open_els[0].name isnt 'p'
847 while open_els.length > 1 # just in case
851 close_p_if_in_button_scope = ->
852 if is_in_button_scope 'p'
855 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
856 tree_insert_text = (t) ->
857 dest = adjusted_insertion_location()
859 prev = dest[0].children[dest[1] - 1]
860 if prev.type is TYPE_TEXT
863 dest[0].children.splice dest[1], 0, t
866 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
867 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
868 adjusted_insertion_location = (override_target = null) ->
869 # 1. If there was an override target specified, then let target be the
872 target = override_target
873 else # Otherwise, let target be the current node.
875 # 2. Determine the adjusted insertion location using the first matching
876 # steps from the following list:
878 # If foster parenting is enabled and target is a table, tbody, tfoot,
879 # thead, or tr element Foster parenting happens when content is
880 # misnested in tables.
881 if flag_foster_parenting and foster_parenting_targets[target.name]
882 loop # once. this is here so we can ``break`` to "abort these substeps"
883 # 1. Let last template be the last template element in the
884 # stack of open elements, if any.
886 last_template_i = null
887 for el, i in open_els
888 if el.name is 'template'
892 # 2. Let last table be the last table element in the stack of
893 # open elements, if any.
896 for el, i in open_els
897 if el.name is 'table'
901 # 3. If there is a last template and either there is no last
902 # table, or there is one, but last template is lower (more
903 # recently added) than last table in the stack of open
904 # elements, then: let adjusted insertion location be inside
905 # last template's template contents, after its last child (if
906 # any), and abort these substeps.
907 if last_template and (last_table is null or last_template_i < last_table_i)
908 target = template # fixfull should be it's contents
909 target_i = target.children.length
911 # 4. If there is no last table, then let adjusted insertion
912 # location be inside the first element in the stack of open
913 # elements (the html element), after its last child (if any),
914 # and abort these substeps. (fragment case)
915 if last_table is null
917 target = open_els[open_els.length - 1]
918 target_i = target.children.length
919 # 5. If last table has a parent element, then let adjusted
920 # insertion location be inside last table's parent element,
921 # immediately before last table, and abort these substeps.
922 if last_table.parent?
923 for c, i in last_table.parent.children
925 target = last_table.parent
929 # 6. Let previous element be the element immediately above last
930 # table in the stack of open elements.
932 # huh? how could it not have a parent?
933 previous_element = open_els[last_table_i + 1]
934 # 7. Let adjusted insertion location be inside previous
935 # element, after its last child (if any).
936 target = previous_element
937 target_i = target.children.length
938 # Note: These steps are involved in part because it's possible
939 # for elements, the table element in this case in particular,
940 # to have been moved by a script around in the DOM, or indeed
941 # removed from the DOM entirely, after the element was inserted
943 break # don't really loop
945 # Otherwise Let adjusted insertion location be inside target, after
946 # its last child (if any).
947 target_i = target.children.length
949 # 3. If the adjusted insertion location is inside a template element,
950 # let it instead be inside the template element's template contents,
951 # after its last child (if any).
954 # 4. Return the adjusted insertion location.
955 return [target, target_i]
957 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
958 # aka create_an_element_for_token
959 token_to_element = (t, namespace, intended_parent) ->
960 t.type = TYPE_TAG # not TYPE_START_TAG
961 # convert attributes into a hash
963 while t.attrs_a.length
965 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
966 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
968 # TODO 2. If the newly created element has an xmlns attribute in the
969 # XMLNS namespace whose value is not exactly the same as the element's
970 # namespace, that is a parse error. Similarly, if the newly created
971 # element has an xmlns:xlink attribute in the XMLNS namespace whose
972 # value is not the XLink Namespace, that is a parse error.
974 # fixfull: the spec says stuff about form pointers and ownerDocument
978 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
979 insert_foreign_element = (token, namespace) ->
980 ail = adjusted_insertion_location()
983 el = token_to_element token, namespace, ail_el
984 # TODO skip this next step if it's broken (eg ail_el is document with child already)
986 ail_el.children.splice ail_i, 0, el
# http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Insert an HTML element for `token`. The spec defines this as "insert a
# foreign element" in the HTML namespace, so `namespace` defaults to NS_HTML
# when the caller passes only the token. An explicitly supplied namespace is
# still honoured, which keeps the old (token, namespace) calling convention
# working.
# Returns whatever insert_foreign_element returns.
insert_html_element = (token, namespace = NS_HTML) ->
	insert_foreign_element token, namespace
992 # FIXME implement the "foster parenting" part
993 # FIXME read spec, do this right
994 # FIXME implement the override target thing
995 # note: this assumes it's an open tag
996 # FIXME what part of the spec is this?
997 # TODO look through all callers of this, and see what they should really be doing.
998 # eg probably insert_html_element for tokens
999 tree_insert_element = (el, override_target = null, namespace = null) ->
1001 el.namespace = namespace
1002 dest = adjusted_insertion_location override_target
1003 if el.type is TYPE_START_TAG # means it's a "token"
1004 el = token_to_element el, namespace, dest[0]
1005 unless el.namespace?
1006 namespace = dest.namespace
1007 # fixfull: Document nodes sometimes can't accept more children
1008 dest[0].children.splice dest[1], 0, el
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# Splice the comment node `t` into a parent's children.
# position: optional [node, index_within_children] pair; when omitted (null or
#           undefined) the pair comes from adjusted_insertion_location().
# Returns the (empty) array produced by the splice call.
tree_insert_a_comment = (t, position = null) ->
	[parent, index] = position ? adjusted_insertion_location()
	parent.children.splice index, 0, t
# 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
# http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Pop nodes off the stack of open elements for as long as the current node
# (open_els[0]) is listed in end_tag_implied.
# except: optional tag name; popping stops as soon as the current node has
#         this name.
generate_implied_end_tags = (except = null) ->
	# Look the node up by its tag name, not by the node object itself.
	# NOTE(review): assumes end_tag_implied is keyed by tag name like the
	# file's other lookup tables (special_elements, table_scopers) -- confirm.
	# The existence check keeps an empty stack from throwing.
	while (current = open_els[0])? and end_tag_implied[current.name] and current.name isnt except
		open_els.shift()
1025 # 8.2.5.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1026 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1027 for node, i in open_els
1028 if node.name is name # FIXME check namespace too
1029 generate_implied_end_tags name # arg is exception
1030 parse_error() unless i is 0
1035 if special_elements[node.name]? # FIXME check namespac too
1038 ins_mode_in_body = (t) ->
1044 when "\t", "\u000a", "\u000c", "\u000d", ' '
1045 reconstruct_active_formatting_elements()
1048 reconstruct_active_formatting_elements()
1050 flag_frameset_ok = false
1052 tree_insert_a_comment t
1059 return if template_tag_is_open()
1060 root_attrs = open_els[open_els.length - 1].attrs
1062 root_attrs[k] = v unless root_attrs[k]?
1063 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1064 # FIXME also do this for </template> (end tag)
1065 return tree_in_head t
1072 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1073 close_p_if_in_button_scope()
1074 insert_html_element t
1075 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1076 close_p_if_in_button_scope()
1077 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1080 insert_html_element t
1081 # TODO lots more to implement here
1083 # If the list of active formatting elements
1084 # contains an a element between the end of the list and
1085 # the last marker on the list (or the start of the list
1086 # if there is no marker on the list), then this is a
1087 # parse error; run the adoption agency algorithm for
1088 # the tag name "a", then remove that element from the
1089 # list of active formatting elements and the stack of
1090 # open elements if the adoption agency algorithm didn't
1091 # already remove it (it might not have if the element
1092 # is not in table scope).
1095 if el.type is TYPE_AFE_MARKER
1105 for el, i in open_els
1107 open_els.splice i, 1
1108 reconstruct_active_formatting_elements()
1109 el = tree_insert_element t
1111 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1112 reconstruct_active_formatting_elements()
1113 el = tree_insert_element t
1116 # fixfull quirksmode thing
1117 close_p_if_in_button_scope()
1118 insert_html_element t
1119 insertion_mode = ins_mode_in_table
1120 # TODO lots more to implement here
1121 else # any other start tag
1122 reconstruct_active_formatting_elements()
1123 tree_insert_element t
1126 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1127 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1130 unless ok_tags[t.name]?
1133 # TODO stack of template insertion modes thing
1134 flag_parsing = false # stop parsing
1138 unless is_in_scope 'body'
1141 # TODO implement parse error and move to tree_after_body
1143 unless is_in_scope 'body' # weird, but it's what the spec says
1146 # TODO implement parse error and move to tree_after_body, reprocess
1147 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1148 unless is_in_scope t.name, NS_HTML
1151 generate_implied_end_tags()
1152 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1155 el = open_els.shift()
1156 if el.name is t.name and el.namespace is NS_HTML
1158 # TODO lots more close tags to implement here
1160 unless is_in_button_scope 'p'
1162 insert_html_element new_open_tag 'p'
1164 # TODO lots more close tags to implement here
1165 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1166 adoption_agency t.name
1167 # TODO lots more close tags to implement here
1169 in_body_any_other_end_tag t.name
1172 ins_mode_in_table_else = (t) ->
1174 flag_foster_parenting = true # FIXME
1176 flag_foster_parenting = false
1184 clear_to_table_stopers = {
1189 clear_stack_to_table_context = ->
1191 if clear_to_table_stopers[open_els[0].name]?
1195 clear_to_table_body_stopers = {
1202 clear_stack_to_table_body_context = ->
1204 if clear_to_table_body_stopers[open_els[0].name]?
1208 clear_to_table_row_stopers = {
1213 clear_stack_to_table_row_context = ->
1215 if clear_to_table_row_stopers[open_els[0].name]?
1219 clear_afe_to_marker = ->
1222 if el.type is TYPE_AFE_MARKER
1224 ins_mode_in_table = (t) ->
1227 if can_in_table[t.name]
1228 original_insertion_mode = insertion_mode
1229 insertion_mode = ins_mode_in_table_text
1232 ins_mode_in_table_else t
1234 tree_insert_a_comment t
1240 clear_stack_to_table_context()
1241 afe.unshift new_afe_marker()
1242 insert_html_element t
1243 insertion_mode = ins_mode_in_caption
1245 clear_stack_to_table_context()
1246 insert_html_element t
1247 insertion_mode = ins_mode_in_column_group
1249 clear_stack_to_table_context()
1250 insert_html_element new_open_tag 'colgroup'
1251 insertion_mode = ins_mode_in_column_group
1253 when 'tbody', 'tfoot', 'thead'
1254 clear_stack_to_table_context()
1255 insert_html_element t
1256 insertion_mode = ins_mode_in_table_body
1257 when 'td', 'th', 'tr'
1258 clear_stack_to_table_context()
1259 insert_html_element new_open_tag 'tbody'
1260 insertion_mode = ins_mode_in_table_body
1264 if is_in_table_scope 'table'
1266 el = open_els.shift()
1267 if el.name is 'table'
1269 reset_insertion_mode()
1271 when 'style', 'script', 'template'
1274 if token_is_input_hidden t
1275 ins_mode_in_table_else t
1278 insert_html_element t
1280 # fixfull acknowledge self-closing flag
1283 if form_element_pointer?
1285 if template_tag_is_open()
1287 form_element_pointer = insert_html_element t
1290 ins_mode_in_table_else t
1294 if is_in_table_scope 'table'
1296 el = open_els.shift()
1297 if el.name is 'table'
1299 reset_insertion_mode()
1302 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1307 ins_mode_in_table_else t
1311 ins_mode_in_table_else t
1314 ins_mode_in_table_text = (t) ->
1321 console.log "unimplemented ins_mode_in_table_text"
1324 ins_mode_in_table_body = (t) ->
1325 if t.type is TYPE_START_TAG and t.name is 'tr'
1326 clear_stack_to_table_body_context()
1327 insert_html_element t
1329 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1331 clear_stack_to_table_body_context()
1332 insert_html_element new_open_tag 'tr'
1333 insertion_mode = ins_mode_in_row
1336 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1337 unless is_in_table_scope t.name # fixfull check namespace
1340 clear_stack_to_table_body_context()
1342 insertion_mode = ins_mode_in_table
1344 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1347 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1350 if table_scopers[el.name]
1355 clear_stack_to_table_body_context()
1357 insertion_mode = ins_mode_in_table
1360 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1366 ins_mode_in_row = (t) ->
1367 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1368 clear_stack_to_table_row_context()
1369 insert_html_element t
1370 insertion_mode = ins_mode_in_cell
1371 afe.unshift new_afe_marker()
1373 if t.type is TYPE_END_TAG and t.name is 'tr'
1374 if is_in_table_scope 'tr'
1375 clear_stack_to_table_row_context()
1377 insertion_mode = ins_mode_in_table_body
1381 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1382 if is_in_table_scope 'tr'
1383 clear_stack_to_table_row_context()
1385 insertion_mode = ins_mode_in_table_body
1390 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1391 if is_in_table_scope t.name # fixfull namespace
1392 if is_in_table_scope 'tr'
1393 clear_stack_to_table_row_context()
1395 insertion_mode = ins_mode_in_table_body
1400 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
# http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Close the table cell that is currently open:
#   1. generate implied end tags
#   2. report a parse error unless the current node is a td or th
#   3. pop open elements up to and including the first td or th
#   4. call clear_afe_to_marker()
#   5. switch the insertion mode to "in row"
close_the_cell = ->
	generate_implied_end_tags()
	# both comparisons must look at the current node's tag name
	unless open_els[0].name is 'td' or open_els[0].name is 'th'
		parse_error()
	loop
		el = open_els.shift()
		if el.name is 'td' or el.name is 'th'
			break
	clear_afe_to_marker()
	insertion_mode = ins_mode_in_row
1418 # http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1419 ins_mode_in_cell = (t) ->
1420 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1421 if is_in_table_scope t.name
1422 generate_implied_end_tags()
1423 if open_els[0].name isnt t.name
1426 el = open_els.shift()
1427 if el.name is t.name
1429 clear_afe_to_marker()
1430 insertion_mode = ins_mode_in_row
1434 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1437 if el.name is 'td' or el.name is 'th'
1440 if table_scopers[el.name]
1448 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1451 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1452 if is_in_table_scope t.name # fixfull namespace
1462 # the functions below implement the tokenizer states described here:
1463 # http://www.w3.org/TR/html5/syntax.html#tokenization
1465 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1467 switch c = txt.charAt(cur++)
1469 return new_text_node tokenize_character_reference()
1471 tok_state = tok_state_tag_open
1474 return new_text_node c
1476 return new_eof_token()
1478 return new_text_node c
1481 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1482 # not needed: tok_state_character_reference_in_data = ->
1483 # just call tokenize_character_reference()
1485 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1486 tok_state_tag_open = ->
1487 switch c = txt.charAt(cur++)
1489 tok_state = tok_state_markup_declaration_open
1491 tok_state = tok_state_end_tag_open
1494 tok_state = tok_state_bogus_comment
1496 if lc_alpha.indexOf(c) > -1
1497 tok_cur_tag = new_open_tag c
1498 tok_state = tok_state_tag_name
1499 else if uc_alpha.indexOf(c) > -1
1500 tok_cur_tag = new_open_tag c.toLowerCase()
1501 tok_state = tok_state_tag_name
1504 tok_state = tok_state_data
1505 cur -= 1 # we didn't parse/handle the char after <
1506 return new_text_node '<'
1509 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1510 tok_state_end_tag_open = ->
1511 switch c = txt.charAt(cur++)
1514 tok_state = tok_state_data
1517 tok_state = tok_state_data
1518 return new_text_node '</'
1520 if uc_alpha.indexOf(c) > -1
1521 tok_cur_tag = new_end_tag c.toLowerCase()
1522 tok_state = tok_state_tag_name
1523 else if lc_alpha.indexOf(c) > -1
1524 tok_cur_tag = new_end_tag c
1525 tok_state = tok_state_tag_name
1528 tok_state = tok_state_bogus_comment
1531 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1532 tok_state_tag_name = ->
1533 switch c = txt.charAt(cur++)
1534 when "\t", "\n", "\u000c", ' '
1535 tok_state = tok_state_before_attribute_name
1537 tok_state = tok_state_self_closing_start_tag
1539 tok_state = tok_state_data
1545 tok_cur_tag.name += "\ufffd"
1548 tok_state = tok_state_data
1550 if uc_alpha.indexOf(c) > -1
1551 tok_cur_tag.name += c.toLowerCase()
1553 tok_cur_tag.name += c
1556 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
1557 tok_state_before_attribute_name = ->
1559 switch c = txt.charAt(cur++)
1560 when "\t", "\n", "\u000c", ' '
1563 tok_state = tok_state_self_closing_start_tag
1566 tok_state = tok_state_data
1572 attr_name = "\ufffd"
1573 when '"', "'", '<', '='
1578 tok_state = tok_state_data
1580 if uc_alpha.indexOf(c) > -1
1581 attr_name = c.toLowerCase()
1585 tok_cur_tag.attrs_a.unshift [attr_name, '']
1586 tok_state = tok_state_attribute_name
1589 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
1590 tok_state_attribute_name = ->
1591 switch c = txt.charAt(cur++)
1592 when "\t", "\n", "\u000c", ' '
1593 tok_state = tok_state_after_attribute_name
1595 tok_state = tok_state_self_closing_start_tag
1597 tok_state = tok_state_before_attribute_value
1599 tok_state = tok_state_data
1605 tok_cur_tag.attrs_a[0][0] = "\ufffd"
1608 tok_cur_tag.attrs_a[0][0] = c
1611 tok_state = tok_state_data
1613 if uc_alpha.indexOf(c) > -1
1614 tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
1616 tok_cur_tag.attrs_a[0][0] += c
1619 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
1620 tok_state_before_attribute_value = ->
1621 switch c = txt.charAt(cur++)
1622 when "\t", "\n", "\u000c", ' '
1625 tok_state = tok_state_attribute_value_double_quoted
1627 tok_state = tok_state_attribute_value_unquoted
1630 tok_state = tok_state_attribute_value_single_quoted
1633 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1634 tok_state = tok_state_attribute_value_unquoted
1637 tok_state = tok_state_data
1643 tok_state = tok_state_data
1645 tok_cur_tag.attrs_a[0][1] += c
1646 tok_state = tok_state_attribute_value_unquoted
1649 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
1650 tok_state_attribute_value_double_quoted = ->
1651 switch c = txt.charAt(cur++)
1653 tok_state = tok_state_after_attribute_value_quoted
1655 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '"', true
1658 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1661 tok_state = tok_state_data
1663 tok_cur_tag.attrs_a[0][1] += c
1666 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
1667 tok_state_attribute_value_single_quoted = ->
1668 switch c = txt.charAt(cur++)
1670 tok_state = tok_state_after_attribute_value_quoted
1672 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference "'", true
1675 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1678 tok_state = tok_state_data
1680 tok_cur_tag.attrs_a[0][1] += c
1683 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
1684 tok_state_attribute_value_unquoted = ->
1685 switch c = txt.charAt(cur++)
1686 when "\t", "\n", "\u000c", ' '
1687 tok_state = tok_state_before_attribute_name
1689 tok_cur_tag.attrs_a[0][1] += tokenize_character_reference '>', true
1691 tok_state = tok_state_data
1696 tok_cur_tag.attrs_a[0][1] += "\ufffd"
1699 tok_state = tok_state_data
1701 # Parse Error if ', <, = or ` (backtick)
1702 tok_cur_tag.attrs_a[0][1] += c
1705 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
1706 tok_state_after_attribute_value_quoted = ->
1707 switch c = txt.charAt(cur++)
1708 when "\t", "\n", "\u000c", ' '
1709 tok_state = tok_state_before_attribute_name
1711 tok_state = tok_state_self_closing_start_tag
1713 tok_state = tok_state_data
1719 tok_state = tok_state_data
1722 tok_state = tok_state_before_attribute_name
1723 cur -= 1 # we didn't handle that char
1726 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
1727 # Don't set this as a state, just call it
1728 # returns a string (NOT a text node)
1729 tokenize_character_reference = (allowed_char = null, in_attr = false) ->
1730 if cur >= txt.length
1732 switch c = txt.charAt(cur)
1733 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
1734 # explicitly not a parse error
1737 # there has to be "one or more" alnums between & and ; to be a parse error
1740 if cur + 1 >= txt.length
1742 if txt.charAt(cur + 1).toLowerCase() is 'x'
1751 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
1755 if txt.charAt(start + i) is ';'
1757 # FIXME This is supposed to generate parse errors for some chars
1758 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
1765 if alnum.indexOf(txt.charAt(cur + i)) is -1
1768 # exit early, because parse_error() below needs at least one alnum
1770 if txt.charAt(cur + i) is ';'
1771 i += 1 # include ';' terminator in value
1772 decoded = decode_named_char_ref txt.substr(cur, i)
1779 # no ';' terminator (only legacy char refs)
1781 for i in [2..max] # no prefix matches, so ok to check shortest first
1782 c = legacy_char_refs[txt.substr(cur, i)]
1785 if txt.charAt(cur + i) is '='
1786 # "because some legacy user agents will
1787 # misinterpret the markup in those cases"
1790 if alnum.indexOf(txt.charAt(cur + i)) > -1
1791 # this makes attributes forgiving about url args
1793 # ok, and besides the weird exceptions for attributes...
1794 # return the matching char
1795 cur += i # consume entity chars
1796 parse_error() # because no terminating ";"
1800 return # never reached
1802 # tree constructor initialization
1803 # see comments on TYPE_TAG/etc for the structure of this data
1804 tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
1806 insertion_mode = ins_mode_in_body
1807 flag_frameset_ok = true
1809 flag_foster_parenting = false
1810 form_element_pointer = null
1811 afe = [] # active formatting elements
1813 # tokenizer initialization
1814 tok_state = tok_state_data
1821 return tree.children
1823 # everything below is tests on the above
# Tiny assertion helper for the tests below.
# Compares `output` with `expected_output` using strict equality and logs the
# outcome to the console; on a mismatch it logs `description` followed by the
# expected and the actual value.
test_equals = (description, output, expected_output) ->
	if output isnt expected_output
		console.log "FAILED: \"#{description}\""
		console.log " Expected: #{expected_output}"
		return console.log " Actual: #{output}"
	console.log "passed." # don't say name, so smart consoles can merge all of these
1831 serialize_els = (els, shallow, show_ids) ->
1837 serialized += t.serialize shallow, show_ids
1839 test_parser = (args) ->
1844 prev_node_id = 0 # reset counter
1845 parsed = parse_html args.html, errors_cb
1846 serialized = serialize_els parsed, false, false
1847 if serialized isnt args.expected # or parse_errors.length isnt args.errors
1848 debug_log_each (str) ->
1850 console.log "FAILED: \"#{args.name}\""
1852 console.log "passed \"#{args.name}\""
1853 if serialized isnt args.expected
1854 console.log " Input: #{args.html}"
1855 console.log " Correct: #{args.expected}"
1856 console.log " Output: #{serialized}"
1857 if parse_errors.length isnt args.errors
1858 console.log " Expected #{args.errors} parse errors, but got these: #{JSON.stringify parse_errors}"
1860 test_parser name: "empty", \
1864 test_parser name: "just text", \
1866 expected: 'text:"abc"',
1868 test_parser name: "named entity", \
1870 expected: 'text:"a&1234"',
1872 test_parser name: "broken named character references", \
1873 html: "1&2&&3&aabbcc;",
1874 expected: 'text:"1&2&&3&aabbcc;"',
1876 test_parser name: "numbered entity overrides", \
1877 html: "1€€ ƒ",
1878 expected: 'text:"1€€ ƒ"',
1880 test_parser name: "open tag", \
1881 html: "foo<span>bar",
1882 expected: 'text:"foo",tag:"span",{},[text:"bar"]',
1883 errors: 1 # no close tag
1884 test_parser name: "open tag with attributes", \
1885 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
1886 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]',
1887 errors: 1 # no close tag
1888 test_parser name: "open tag with attributes of various quotings", \
1889 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
1890 expected: 'text:"foo",tag:"span",{"abc":"def","g":"hij","klm":"nopqrstuv\\"","autofocus":""},[text:"bar"]',
1891 errors: 1 # no close tag
1892 test_parser name: "attribute entity exceptions dq", \
1893 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
1894 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1895 errors: 2 # no close tag, &= in attr
1896 test_parser name: "attribute entity exceptions sq", \
1897 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
1898 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1899 errors: 2 # no close tag, &= in attr
1900 test_parser name: "attribute entity exceptions uq", \
1901 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
1902 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]',
1903 errors: 2 # no close tag, &= in attr
1904 test_parser name: "matching closing tags", \
1905 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
1906 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"',
1908 test_parser name: "missing closing tag inside", \
1909 html: "foo<div>bar<span>baz</div>qux",
1910 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"',
1911 errors: 1 # close tag mismatch
1912 test_parser name: "mis-matched closing tags", \
1913 html: "<span>12<div>34</span>56</div>78",
1914 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]',
1915 errors: 2 # misplaced </span>, no </span> at the end
1916 test_parser name: "mis-matched formatting elements", \
1917 html: "12<b>34<i>56</b>78</i>90",
1918 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"',
1919 errors: 1 # no idea how many their should be
1920 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
1921 html: '<p>1<b>2<i>3</b>4</i>5</p>',
1922 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]',
1924 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
1925 html: '<b>1<p>2</b>3</p>',
1926 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]',
1928 test_parser name: "crazy formatting elements test", \
1929 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
1930 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
1931 # firefox does this:
1932 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"',
1933 errors: 6 # no idea how many there should be
1934 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
1935 test_parser name: "html5lib aaa 1", \
1936 html: '<a><p></a></p>',
1937 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]',
1939 test_parser name: "html5lib aaa 2", \
1940 html: '<a>1<p>2</a>3</p>',
1941 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]',
1943 test_parser name: "html5lib aaa 3", \
1944 html: '<a>1<button>2</a>3</button>',
1945 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]',
1947 test_parser name: "html5lib aaa 4", \
1948 html: '<a>1<b>2</a>3</b>',
1949 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]',
1951 test_parser name: "html5lib aaa 5 (two divs deep)", \
1952 html: '<a>1<div>2<div>3</a>4</div>5</div>',
1953 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]',
1955 test_parser name: "html5lib aaa 6 (foster parenting)", \
1956 html: '<table><a>1<p>2</a>3</p>',
1957 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]',
1959 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
1960 html: '<table><a>1<td>2</td>3</table>',
1961 expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]',