1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an object of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # element node: name, {attributes}, [children]
53 TYPE_TEXT = 1 # text node: "text"
56 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_AFE_MARKER = 7 # marker entry in the active formatting elements list: http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # bookmark spliced into the afe list by the adoption agency algorithm: http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
79 constructor: (type, args = {}) -> # build a node/token; every key of args is optional and has a default
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {} # attributes keyed by name (copied key by key in shallow_clone)
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only. NOTE(review): reads args.attr_k although the field is attrs_a -- confirm the intended key
85 @children = args.children ? [] # child nodes
86 @namespace = args.namespace ? NS_HTML # HTML namespace unless the caller says otherwise
87 @parent = args.parent ? null # parent node; null when none was given
91 @id = "#{++prev_node_id}" # id string built from the incremented prev_node_id counter
# Return a new Node with the same type, name, text and namespace and a copy of
# the attributes, but without the children or parent.
# WARNING this doesn't work right on open tags that are still being parsed
shallow_clone: ->
  # start from a fresh object so the clone never shares (or mutates) @attrs
  attrs = {}
  attrs[k] = v for k, v of @attrs
  return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
# Record the 'did_self_close' flag on this node and hand back whatever
# @flag returns.
acknowledge_self_closing: ->
  @flag('did_self_close', yes)
101 serialize: (shallow = false, show_ids = false) -> # for unit tests
106 ret += JSON.stringify @name
121 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
127 ret += c.serialize shallow, show_ids
131 ret += JSON.stringify @text
134 ret += JSON.stringify @text
140 when TYPE_AAA_BOOKMARK
141 ret += 'aaa_bookmark'
144 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
147 # helpers: (only take args that are normally known when parser creates nodes)
# Start-tag token (TYPE_START_TAG) for the tag called `name`.
new_open_tag = (name) ->
  new Node(TYPE_START_TAG, {name: name})
# End-tag token (TYPE_END_TAG) for the tag called `name`.
new_end_tag = (name) ->
  new Node(TYPE_END_TAG, {name: name})
# Element node (TYPE_TAG) called `name`, with default attributes and children.
new_element = (name) ->
  new Node(TYPE_TAG, {name: name})
# Text node (TYPE_TEXT) holding `txt`. new_character_token is the very same
# function under a second name.
new_text_node = (txt) ->
  new Node(TYPE_TEXT, {text: txt})
new_character_token = new_text_node
# Comment node (TYPE_COMMENT) whose text is `txt`.
new_comment_node = (txt) ->
  new Node(TYPE_COMMENT, {text: txt})
160 return new Node TYPE_EOF
162 return new Node TYPE_AFE_MARKER
# Placeholder node of type TYPE_AAA_BOOKMARK (no name, text or attributes).
new_aaa_bookmark = ->
  new Node(TYPE_AAA_BOOKMARK)
# Character classes, each written as a string listing its member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = [lc_alpha, uc_alpha, digits].join ''
hex_chars = "#{digits}abcdefABCDEF"

# tag names may contain dashes (some SVG elements have them)
tag_name_chars = "#{alnum}-"

# space characters as defined at
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"

# Unicode whitespace as listed at
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
181 # These are the character references that don't need a terminating semicolon
182 # min length: 2, max: 6, none are a prefix of any other.
184 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
185 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
186 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
187 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
188 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
189 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
190 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
191 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
192 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
193 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
194 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
195 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
196 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
197 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
198 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
199 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
200 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element categories, each given as a list of tag names.
void_elements = [
  'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
  'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
  'script'
  'style'
]
escapable_raw_text_elements = [
  'textarea'
  'title'
]
207 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
209 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
210 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
211 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
212 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
213 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
214 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
215 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
216 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
217 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
218 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
219 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
220 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
221 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
222 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
226 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
228 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
229 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
230 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
231 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
232 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
233 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
234 'determinant', 'diff', 'divergence', 'divide', 'domain',
235 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
236 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
237 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
238 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
239 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
240 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
241 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
242 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
243 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
244 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
245 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
246 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
247 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
248 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
249 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
250 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
251 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
252 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
253 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
254 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
255 'vectorproduct', 'xor'
257 # foreign_elements = [svg_elements..., mathml_elements...]
258 #normal_elements = All other allowed HTML elements are normal elements.
262 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
263 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
264 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
265 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
266 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
267 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
268 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
269 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
270 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
271 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
272 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
273 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
274 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
275 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
276 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
277 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
278 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
279 wbr:NS_HTML, xmp:NS_HTML,
282 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
283 'annotation-xml':NS_MATHML,
286 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
289 formatting_elements = {
290 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
291 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
295 foster_parenting_targets = {
# True when special_elements lists e's name under e's own namespace.
el_is_special = (e) ->
  {name, namespace} = e
  special_elements[name] is namespace
320 # decode_named_char_ref()
322 # The list of named character references is _huge_ so ask the browser to decode
323 # for us instead of wasting bandwidth/space on including the table here.
325 # Pass without the "&" but with the ";" examples:
326 # for "&amp;" pass "amp;"
327 # for "&#x2032;" pass "x2032;"
330 textarea: document.createElement('textarea')
332 # TODO test this in IE8
# Decode a character reference by letting the browser parse it.
#   txt: the reference without its leading "&" but with the trailing ";"
# Returns the decoded text, or null when the browser handed the text back
# unchanged. Successful results are memoised in g_dncr.cache.
decode_named_char_ref = (txt) ->
  # callers omit the "&"; without it the textarea just echoes txt back and
  # every lookup would come out as null
  ref = "&#{txt}"
  decoded = g_dncr.cache[ref]
  return decoded if decoded?
  g_dncr.textarea.innerHTML = ref
  decoded = g_dncr.textarea.value
  return null if decoded is ref
  return g_dncr.cache[ref] = decoded
342 parse_html = (txt, parse_error_cb = null) ->
343 cur = 0 # index of next char in txt to be parsed
344 # declare tree and tokenizer variables so they're in scope below
346 open_els = null # stack of open elements
347 afe = null # active formatting elements
348 template_insertion_modes = null
349 insertion_mode = null
350 original_insertion_mode = null
352 tok_cur_tag = null # partially parsed tag
353 flag_scripting = null
354 flag_frameset_ok = null
356 flag_foster_parenting = null
357 form_element_pointer = null
358 temporary_buffer = null
359 pending_table_character_tokens = null
365 console.log "Parse error at character #{cur} of #{txt.length}"
367 afe_push = (new_el) ->
370 if el.name is new_el.name and el.namespace is new_el.namespace
372 continue unless new_el.attrs[k] is v
373 for k, v of new_el.attrs
374 continue unless el.attrs[k] is v
381 afe.unshift new_afe_marker()
383 # the functions below implement the Tree Construction algorithm
384 # http://www.w3.org/TR/html5/syntax.html#tree-construction
386 # But first... the helpers
387 template_tag_is_open = ->
389 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
# Walk the stack of open elements from the current node (index 0) upward.
#   tag_name:  element name being looked for
#   scope:     boundary table, element name -> namespace
#   namespace: required namespace of the match, or null to accept any
# Returns true on reaching a matching element, false on first reaching an
# element listed in `scope` or when the stack runs out.
is_in_scope_x = (tag_name, scope, namespace) ->
  for t in open_els
    if t.name is tag_name and (namespace is null or namespace is t.namespace)
      return true
    # a boundary element hides everything above it
    if scope[t.name] is t.namespace
      return false
  return false
# Same walk as is_in_scope_x, but with two boundary tables: an element listed
# in either `scope` or `scope2` (element name -> namespace) ends the search.
#   namespace: required namespace of the match, or null to accept any
# Returns true on reaching a matching element, false on hitting a boundary or
# when the stack of open elements runs out.
is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
  for t in open_els
    if t.name is tag_name and (namespace is null or namespace is t.namespace)
      return true
    if scope[t.name] is t.namespace
      return false
    if scope2[t.name] is t.namespace
      return false
  return false
# Boundary elements for the plain "has an element in scope" check, mapped to
# the namespace in which each one acts as a boundary.
standard_scopers = { # FIXME these are supposed to be namespace specific
  # HTML
  applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
  td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
  template: NS_HTML,
  # MathML
  mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
  mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
  # SVG
  foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
}
# More boundary tables (element name -> namespace). is_in_button_scope uses
# button_scopers together with standard_scopers; is_in_table_scope uses
# table_scopers on its own.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check for tag_name with standard_scopers as the boundary table;
# delegates to is_in_scope_x. namespace null means "any namespace".
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check for tag_name bounded by standard_scopers plus button_scopers;
# delegates to is_in_scope_x_y. namespace null means "any namespace".
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check for tag_name bounded only by table_scopers; delegates to
# is_in_scope_x. namespace null means "any namespace".
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
# "Has an element in select scope": walk open_els from the current node
# (index 0) upward.
#   tag_name:  element name being looked for
#   namespace: required namespace of the match, or null to accept any
# Returns true on reaching a matching element. In select scope every element
# is a boundary except HTML optgroup and HTML option, so meeting anything
# else first (or running out of stack) returns false.
is_in_select_scope = (tag_name, namespace = null) ->
  for t in open_els
    if t.name is tag_name and (namespace is null or namespace is t.namespace)
      return true
    # nodes keep their namespace in `namespace` (there is no `ns` field), and
    # the namespace test must be joined to the name tests with `or`
    if t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt 'option')
      return false
  return false
# this checks for a particular element, not by name
# Walks open_els from the current node (index 0) upward: true on reaching
# `el` itself, false on first reaching a standard_scopers boundary or when
# the stack runs out.
el_is_in_scope = (el) ->
  for t in open_els
    if t is el
      return true
    if standard_scopers[t.name] is t.namespace
      return false
  return false
444 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
445 reset_insertion_mode = ->
446 # 1. Let last be false.
448 # 2. Let node be the last node in the stack of open elements.
450 node = open_els[node_i]
451 # 3. Loop: If node is the first node in the stack of open elements,
452 # then set last to true, and, if the parser was originally created as
453 # part of the HTML fragment parsing algorithm (fragment case) set node
454 # to the context element.
456 if node_i is open_els.length - 1
458 # fixfull (fragment case)
460 # 4. If node is a select element, run these substeps:
461 if node.name is 'select'
462 # 1. If last is true, jump to the step below labeled done.
464 # 2. Let ancestor be node.
467 # 3. Loop: If ancestor is the first node in the stack of
468 # open elements, jump to the step below labeled done.
470 if ancestor_i is open_els.length - 1
472 # 4. Let ancestor be the node before ancestor in the stack
475 ancestor = open_els[ancestor_i]
476 # 5. If ancestor is a template node, jump to the step below
478 if ancestor.name is 'template'
480 # 6. If ancestor is a table node, switch the insertion mode
481 # to "in select in table" and abort these steps.
482 if ancestor.name is 'table'
483 insertion_mode = ins_mode_in_select_in_table
485 # 7. Jump back to the step labeled loop.
486 # 8. Done: Switch the insertion mode to "in select" and abort
488 insertion_mode = ins_mode_in_select
490 # 5. If node is a td or th element and last is false, then switch
491 # the insertion mode to "in cell" and abort these steps.
492 if (node.name is 'td' or node.name is 'th') and last is false
493 insertion_mode = ins_mode_in_cell
495 # 6. If node is a tr element, then switch the insertion mode to "in
496 # row" and abort these steps.
498 insertion_mode = ins_mode_in_row
500 # 7. If node is a tbody, thead, or tfoot element, then switch the
501 # insertion mode to "in table body" and abort these steps.
502 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
503 insertion_mode = ins_mode_in_table_body
505 # 8. If node is a caption element, then switch the insertion mode
506 # to "in caption" and abort these steps.
507 if node.name is 'caption'
508 insertion_mode = ins_mode_in_caption
510 # 9. If node is a colgroup element, then switch the insertion mode
511 # to "in column group" and abort these steps.
512 if node.name is 'colgroup'
513 insertion_mode = ins_mode_in_column_group
515 # 10. If node is a table element, then switch the insertion mode to
516 # "in table" and abort these steps.
517 if node.name is 'table'
518 insertion_mode = ins_mode_in_table
520 # 11. If node is a template element, then switch the insertion mode
521 # to the current template insertion mode and abort these steps.
522 # fixfull (template insertion mode stack)
524 # 12. If node is a head element and last is true, then switch the
525 # insertion mode to "in body" ("in body"! not "in head"!) and abort
526 # these steps. (fragment case)
527 if node.name is 'head' and last
528 insertion_mode = ins_mode_in_body
530 # 13. If node is a head element and last is false, then switch the
531 # insertion mode to "in head" and abort these steps.
532 if node.name is 'head' and last is false
533 insertion_mode = ins_mode_in_head
535 # 14. If node is a body element, then switch the insertion mode to
536 # "in body" and abort these steps.
537 if node.name is 'body'
538 insertion_mode = ins_mode_in_body
540 # 15. If node is a frameset element, then switch the insertion mode
541 # to "in frameset" and abort these steps. (fragment case)
542 if node.name is 'frameset'
543 insertion_mode = ins_mode_in_frameset
545 # 16. If node is an html element, run these substeps:
546 if node.name is 'html'
547 # 1. If the head element pointer is null, switch the insertion
548 # mode to "before head" and abort these steps. (fragment case)
549 # fixfull (fragment case)
551 # 2. Otherwise, the head element pointer is not null, switch
552 # the insertion mode to "after head" and abort these steps.
553 insertion_mode = ins_mode_in_body # FIXME fixfull
555 # 17. If last is true, then switch the insertion mode to "in body"
556 # and abort these steps. (fragment case)
558 insertion_mode = ins_mode_in_body
560 # 18. Let node now be the node before node in the stack of open
563 node = open_els[node_i]
564 # 19. Return to the step labeled loop.
566 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
567 # this implementation is structured (mostly) as described at the link above.
568 # capitalized comments are the "labels" described at the link above.
569 reconstruct_active_formatting_elements = ->
570 return if afe.length is 0
571 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
576 if i is afe.length - 1
579 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
584 el = afe[i].shallow_clone()
585 tree_insert_element el
590 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
591 # adoption agency algorithm
593 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
594 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
595 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
596 adoption_agency = (subject) ->
597 debug_log "adoption_agency()"
598 debug_log "tree: #{serialize_els tree.children, false, true}"
599 debug_log "open_els: #{serialize_els open_els, true, true}"
600 debug_log "afe: #{serialize_els afe, true, true}"
601 if open_els[0].name is subject
604 # remove it from the list of active formatting elements (if found)
609 debug_log "aaa: starting off with subject on top of stack, exiting"
616 # 5. Let formatting element be the last element in the list of
617 # active formatting elements that: is between the end of the list
618 # and the last scope marker in the list, if any, or the start of
619 # the list otherwise, and has the tag name subject.
621 for t, fe_of_afe in afe
622 if t.type is TYPE_AFE_MARKER
627 # If there is no such element, then abort these steps and instead
628 # act as described in the "any other end tag" entry above.
630 debug_log "aaa: fe not found in afe"
631 in_body_any_other_end_tag subject
633 # 6. If formatting element is not in the stack of open elements,
634 # then this is a parse error; remove the element from the list, and
637 for t, fe_of_open_els in open_els
642 debug_log "aaa: fe not found in open_els"
644 # "remove it from the list" must mean afe, since it's not in open_els
645 afe.splice fe_of_afe, 1
647 # 7. If formatting element is in the stack of open elements, but
648 # the element is not in scope, then this is a parse error; abort
650 unless el_is_in_scope fe
651 debug_log "aaa: fe not in scope"
654 # 8. If formatting element is not the current node, this is a parse
655 # error. (But do not abort these steps.)
656 unless open_els[0] is fe
659 # 9. Let furthest block be the topmost node in the stack of open
660 # elements that is lower in the stack than formatting element, and
661 # is an element in the special category. There might not be one.
663 fb_of_open_els = null
670 # and continue, to see if there's one that's more "topmost"
671 # 10. If there is no furthest block, then the UA must first pop all
672 # the nodes from the bottom of the stack of open elements, from the
673 # current node up to and including formatting element, then remove
674 # formatting element from the list of active formatting elements,
675 # and finally abort these steps.
677 debug_log "aaa: no fb"
681 afe.splice fe_of_afe, 1
683 # 11. Let common ancestor be the element immediately above
684 # formatting element in the stack of open elements.
685 ca = open_els[fe_of_open_els + 1] # common ancestor
687 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
688 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
689 bookmark = new_aaa_bookmark()
692 afe.splice i, 0, bookmark
694 node = last_node = fb
698 # 3. Let node be the element immediately above node in the
699 # stack of open elements, or if node is no longer in the stack
700 # of open elements (e.g. because it got removed by this
701 # algorithm), the element that was immediately above node in
702 # the stack of open elements before node was removed.
706 node_next = open_els[i + 1]
708 node = node_next ? node_above
709 debug_log "inner loop #{inner}"
710 debug_log "tree: #{serialize_els tree.children, false, true}"
711 debug_log "open_els: #{serialize_els open_els, true, true}"
712 debug_log "afe: #{serialize_els afe, true, true}"
713 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
714 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
715 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
716 debug_log "node: #{node.serialize true, true}"
717 # TODO make sure node_above gets re-set if/when node is removed from open_els
719 # 4. If node is formatting element, then go to the next step in
720 # the overall algorithm.
724 # 5. If inner loop counter is greater than three and node is in
725 # the list of active formatting elements, then remove node from
726 # the list of active formatting elements.
732 debug_log "max out inner"
737 # 6. If node is not in the list of active formatting elements,
738 # then remove node from the stack of open elements and then go
739 # back to the step labeled inner loop.
741 debug_log "not in afe"
744 node_above = open_els[i + 1]
748 debug_log "the bones"
749 # 7. create an element for the token for which the element node
750 # was created, in the HTML namespace, with common ancestor as
751 # the intended parent; replace the entry for node in the list
752 # of active formatting elements with an entry for the new
753 # element, replace the entry for node in the stack of open
754 # elements with an entry for the new element, and let node be
756 new_node = node.shallow_clone()
760 debug_log "replaced in afe"
764 node_above = open_els[i + 1]
765 open_els[i] = new_node
766 debug_log "replaced in open_els"
769 # 8. If last node is furthest block, then move the
770 # aforementioned bookmark to be immediately after the new node
771 # in the list of active formatting elements.
776 debug_log "removed bookmark"
780 # "after" means lower
781 afe.splice i, 0, bookmark # "after as <-
782 debug_log "placed bookmark after node"
783 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
785 # 9. Insert last node into node, first removing it from its
786 # previous parent node if any.
788 debug_log "last_node has parent"
789 for c, i in last_node.parent.children
791 debug_log "removing last_node from parent"
792 last_node.parent.children.splice i, 1
794 node.children.push last_node
795 last_node.parent = node
796 # 10. Let last node be node.
799 # 11. Return to the step labeled inner loop.
800 # 14. Insert whatever last node ended up being in the previous step
801 # at the appropriate place for inserting a node, but using common
802 # ancestor as the override target.
804 # JASON: In the case where fe is immediately followed by fb:
805 # * inner loop exits out early (node==fe)
807 # * last_node is still in the tree (not a duplicate)
809 debug_log "FEFIRST? last_node has parent"
810 for c, i in last_node.parent.children
812 debug_log "removing last_node from parent"
813 last_node.parent.children.splice i, 1
816 debug_log "after aaa inner loop"
817 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
818 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
819 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
820 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
821 debug_log "tree: #{serialize_els tree.children, false, true}"
826 # can't use standard insert token thing, because it's already in
827 # open_els and must stay at its current position in open_els
828 dest = adjusted_insertion_location ca
829 dest[0].children.splice dest[1], 0, last_node
830 last_node.parent = dest[0]
833 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
834 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
835 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
836 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
837 debug_log "tree: #{serialize_els tree.children, false, true}"
839 # 15. Create an element for the token for which formatting element
840 # was created, in the HTML namespace, with furthest block as the
842 new_element = fe.shallow_clone() # FIXME intended parent thing
843 # 16. Take all of the child nodes of furthest block and append them
844 # to the element created in the last step.
845 while fb.children.length
846 t = fb.children.shift()
847 t.parent = new_element
848 new_element.children.push t
849 # 17. Append that new element to furthest block.
850 new_element.parent = fb
851 fb.children.push new_element
852 # 18. Remove formatting element from the list of active formatting
853 # elements, and insert the new element into the list of active
854 # formatting elements at the position of the aforementioned
864 # 19. Remove formatting element from the stack of open elements,
865 # and insert the new element into the stack of open elements
866 # immediately below the position of furthest block in that stack.
873 open_els.splice i, 0, new_element
875 # 20. Jump back to the step labeled outer loop.
876 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
877 debug_log "tree: #{serialize_els tree.children, false, true}"
878 debug_log "open_els: #{serialize_els open_els, true, true}"
879 debug_log "afe: #{serialize_els afe, true, true}"
882 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
884 generate_implied_end_tags 'p' # arg is exception
885 if open_els[0].name isnt 'p'
887 while open_els.length > 1 # just in case
888 el = open_els.shift()
891 close_p_if_in_button_scope = ->
892 if is_in_button_scope 'p'
895 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
896 # aka insert_a_character = (t) ->
897 insert_character = (t) ->
898 dest = adjusted_insertion_location()
899 # fixfull check for Document node
901 prev = dest[0].children[dest[1] - 1]
902 if prev.type is TYPE_TEXT
905 dest[0].children.splice dest[1], 0, t
908 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
909 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
910 adjusted_insertion_location = (override_target = null) -> # returns [node to insert into, index within its children]
911 # 1. If there was an override target specified, then let target be the
914 target = override_target
915 else # Otherwise, let target be the current node.
917 # 2. Determine the adjusted insertion location using the first matching
918 # steps from the following list:
920 # If foster parenting is enabled and target is a table, tbody, tfoot,
921 # thead, or tr element Foster parenting happens when content is
922 # misnested in tables.
923 if flag_foster_parenting and foster_parenting_targets[target.name]
924 loop # once. this is here so we can ``break`` to "abort these substeps"
925 # 1. Let last template be the last template element in the
926 # stack of open elements, if any.
928 last_template_i = null
929 for el, i in open_els
930 if el.name is 'template'
934 # 2. Let last table be the last table element in the stack of
935 # open elements, if any.
938 for el, i in open_els
939 if el.name is 'table'
943 # 3. If there is a last template and either there is no last
944 # table, or there is one, but last template is lower (more
945 # recently added) than last table in the stack of open
946 # elements, then: let adjusted insertion location be inside
947 # last template's template contents, after its last child (if
948 # any), and abort these substeps.
949 if last_template and (last_table is null or last_template_i < last_table_i) # a smaller index is lower in the stack (index 0 is the current node)
950 target = template # fixfull should be its contents. NOTE(review): `template` is never assigned in the visible code; `last_template` looks intended -- confirm
951 target_i = target.children.length
953 # 4. If there is no last table, then let adjusted insertion
954 # location be inside the first element in the stack of open
955 # elements (the html element), after its last child (if any),
956 # and abort these substeps. (fragment case)
957 if last_table is null
959 target = open_els[open_els.length - 1] # highest index = first/topmost element of the stack
960 target_i = target.children.length
961 # 5. If last table has a parent element, then let adjusted
962 # insertion location be inside last table's parent element,
963 # immediately before last table, and abort these substeps.
964 if last_table.parent?
965 for c, i in last_table.parent.children
967 target = last_table.parent
971 # 6. Let previous element be the element immediately above last
972 # table in the stack of open elements.
974 # huh? how could it not have a parent?
975 previous_element = open_els[last_table_i + 1] # index + 1 is the entry above
976 # 7. Let adjusted insertion location be inside previous
977 # element, after its last child (if any).
978 target = previous_element
979 target_i = target.children.length
980 # Note: These steps are involved in part because it's possible
981 # for elements, the table element in this case in particular,
982 # to have been moved by a script around in the DOM, or indeed
983 # removed from the DOM entirely, after the element was inserted
985 break # don't really loop
987 # Otherwise Let adjusted insertion location be inside target, after
988 # its last child (if any).
989 target_i = target.children.length
991 # 3. If the adjusted insertion location is inside a template element,
992 # let it instead be inside the template element's template contents,
993 # after its last child (if any).
996 # 4. Return the adjusted insertion location.
997 return [target, target_i] # callers read these as dest[0] (parent) and dest[1] (child index)
999 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1000 # aka create_an_element_for_token
# Build an element Node from tag token t.
# t: tag token; its type is rewritten to TYPE_TAG and its attrs_a list of
# [name, value] pairs is folded into a hash keyed by attribute name.
# namespace: stored on the new Node as-is.
# intended_parent: not referenced in the lines shown here.
1001 token_to_element = (t, namespace, intended_parent) ->
1002 t.type = TYPE_TAG # not TYPE_START_TAG
1003 # convert attributes into a hash
1005 while t.attrs_a.length
1007 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1008 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
1010 # TODO 2. If the newly created element has an xmlns attribute in the
1011 # XMLNS namespace whose value is not exactly the same as the element's
1012 # namespace, that is a parse error. Similarly, if the newly created
1013 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1014 # value is not the XLink Namespace, that is a parse error.
1016 # fixfull: the spec says stuff about form pointers and ownerDocument
1020 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Create an element for `token` in `namespace` and splice it into the children
# of the node chosen by adjusted_insertion_location(), at the chosen index.
1021 insert_foreign_element = (token, namespace) ->
1022 ail = adjusted_insertion_location()
1025 el = token_to_element token, namespace, ail_el
1026 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1028 ail_el.children.splice ail_i, 0, el
1031 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1032 insert_html_element = insert_foreign_element # (token, namespace) ->
1034 # FIXME read implement "foster parenting" part
1035 # FIXME read spec, do this right
1036 # FIXME implement the override target thing
1037 # note: this assumes it's an open tag
1038 # FIXME what part of the spec is this?
1039 # TODO look through all callers of this, and see what they should really be doing.
1040 # eg probably insert_html_element for tokens
# Insert el at the adjusted insertion location.
# el: an element Node, or a start-tag token (type TYPE_START_TAG), which is
# first converted with token_to_element.
# override_target: passed through to adjusted_insertion_location.
# namespace: namespace to put on the element when supplied.
1041 tree_insert_element = (el, override_target = null, namespace = null) ->
1043 el.namespace = namespace
1044 dest = adjusted_insertion_location override_target
1045 if el.type is TYPE_START_TAG # means it's a "token"
1046 el = token_to_element el, namespace, dest[0]
1047 unless el.namespace?
# dest is [parent, index], so the parent is dest[0]. The old line assigned
# dest.namespace (always undefined on the array) to the local `namespace`,
# which left the element without a namespace.
# NOTE(review): inheriting the parent's namespace is the presumed intent - confirm.
1048 el.namespace = dest[0].namespace
1049 # fixfull: Document nodes sometimes can't accept more children
1050 dest[0].children.splice dest[1], 0, el
1055 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1056 # position should be [node, index_within_children]
# Splice comment node t into position[0].children at index position[1].
# When no position is given, the adjusted insertion location is used.
1057 insert_comment = (t, position = null) ->
1058 position ?= adjusted_insertion_location()
1059 position[0].children.splice position[1], 0, t
1062 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Insert an element for start tag token t, switch the tokenizer to the
# rawtext state, then switch to the text insertion mode.
1063 parse_generic_raw_text = (t) ->
1064 insert_html_element t
1065 tok_state = tok_state_rawtext
# save the current mode first so the text mode can switch back to it
1066 original_insertion_mode = insertion_mode
1067 insertion_mode = ins_mode_text
# Same as parse_generic_raw_text, except the tokenizer is switched to the
# rcdata state.
1068 parse_generic_rcdata_text = (t) ->
1069 insert_html_element t
1070 tok_state = tok_state_rcdata
# save the current mode first so the text mode can switch back to it
1071 original_insertion_mode = insertion_mode
1072 insertion_mode = ins_mode_text
1074 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1075 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Runs while the current node (open_els[0]) has a name listed in
# end_tag_implied and that name is not `except`.
# except: optional element name that stops the loop when it is the current node.
1076 generate_implied_end_tags = (except = null) ->
1077 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1080 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1081 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1082 open_els.shift() # spec says this will be a 'head' node
1083 insertion_mode = ins_mode_after_head
# Tree construction handler for the "in head" insertion mode (spec section
# linked above). t is the token to process; each `if` below tests for one of
# the token kinds the spec lists for this mode.
1085 ins_mode_in_head = (t) ->
1086 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1089 if t.type is TYPE_COMMENT
1092 if t.type is TYPE_DOCTYPE
1095 if t.type is TYPE_START_TAG and t.name is 'html'
1098 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1099 el = insert_html_element t
1101 el.acknowledge_self_closing()
1103 if t.type is TYPE_START_TAG and t.name is 'meta'
1104 el = insert_html_element t
1106 el.acknowledge_self_closing()
1107 # fixfull encoding stuff
1109 if t.type is TYPE_START_TAG and t.name is 'title'
# the rcdata helper in this file is named parse_generic_rcdata_text
1110 parse_generic_rcdata_text t
1112 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1113 parse_generic_raw_text t
1115 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1116 insert_html_element t
1117 insertion_mode = in_head_noscript # FIXME implement
1119 if t.type is TYPE_START_TAG and t.name is 'script'
1120 ail = adjusted_insertion_location()
# ail is [node, index]; the intended parent is the node
1121 el = token_to_element t, NS_HTML, ail[0]
1122 el.flag_parser_inserted true # FIXME implement
1123 # fixfull fragment case
1124 ail[0].children.splice ail[1], 0, el
1126 tok_state = tok_state_script_data
1127 original_insertion_mode = insertion_mode # make sure orig... is defined
1128 insertion_mode = ins_mode_text # FIXME implement
1130 if t.type is TYPE_END_TAG and t.name is 'head'
1131 open_els.shift() # will be a head element... spec says so
1132 insertion_mode = ins_mode_after_head
1134 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1135 ins_mode_in_head_else t
1137 if t.type is TYPE_START_TAG and t.name is 'template'
1138 insert_html_element t
1140 flag_frameset_ok = false
1141 insertion_mode = ins_mode_in_template
1142 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1144 if t.type is TYPE_END_TAG and t.name is 'template'
1145 if template_tag_is_open()
# parentheses are required: a bare name only references the function
1146 generate_implied_end_tags()
1147 if open_els[0].name isnt 'template'
1150 el = open_els.shift()
1151 if el.name is 'template'
1153 clear_afe_to_marker()
1154 template_insertion_modes.shift()
1155 reset_insertion_mode()
# start tag tokens carry TYPE_START_TAG everywhere else in this file
1159 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1162 ins_mode_in_head_else t
1164 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1165 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1166 for node, i in open_els
1167 if node.name is name # FIXME check namespace too
1168 generate_implied_end_tags name # arg is exception
1169 parse_error() unless i is 0
1174 if special_elements[node.name]? # FIXME check namespac too
1177 ins_mode_in_body = (t) ->
1183 when "\t", "\u000a", "\u000c", "\u000d", ' '
1184 reconstruct_active_formatting_elements()
1187 reconstruct_active_formatting_elements()
1189 flag_frameset_ok = false
1198 return if template_tag_is_open()
1199 root_attrs = open_els[open_els.length - 1].attrs
1201 root_attrs[k] = v unless root_attrs[k]?
1202 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1203 # FIXME also do this for </template> (end tag)
1204 return ins_mode_in_head t
1211 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1212 close_p_if_in_button_scope()
1213 insert_html_element t
1214 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1215 close_p_if_in_button_scope()
1216 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1219 insert_html_element t
1220 # TODO lots more to implement here
1222 # If the list of active formatting elements
1223 # contains an a element between the end of the list and
1224 # the last marker on the list (or the start of the list
1225 # if there is no marker on the list), then this is a
1226 # parse error; run the adoption agency algorithm for
1227 # the tag name "a", then remove that element from the
1228 # list of active formatting elements and the stack of
1229 # open elements if the adoption agency algorithm didn't
1230 # already remove it (it might not have if the element
1231 # is not in table scope).
1234 if el.type is TYPE_AFE_MARKER
1244 for el, i in open_els
1246 open_els.splice i, 1
1247 reconstruct_active_formatting_elements()
1248 el = insert_html_element t
1250 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1251 reconstruct_active_formatting_elements()
1252 el = insert_html_element t
1255 # fixfull quirksmode thing
1256 close_p_if_in_button_scope()
1257 insert_html_element t
1258 insertion_mode = ins_mode_in_table
1259 # TODO lots more to implement here
1260 else # any other start tag
1261 reconstruct_active_formatting_elements()
1262 insert_html_element t
1265 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1266 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1269 unless ok_tags[t.name]?
1272 # TODO stack of template insertion modes thing
1273 flag_parsing = false # stop parsing
1277 unless is_in_scope 'body'
1280 # TODO implement parse error and move to tree_after_body
1282 unless is_in_scope 'body' # weird, but it's what the spec says
1285 # TODO implement parse error and move to tree_after_body, reprocess
1286 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1287 unless is_in_scope t.name, NS_HTML
1290 generate_implied_end_tags()
1291 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1294 el = open_els.shift()
1295 if el.name is t.name and el.namespace is NS_HTML
1297 # TODO lots more close tags to implement here
1299 unless is_in_button_scope 'p'
1301 insert_html_element new_open_tag 'p'
1303 # TODO lots more close tags to implement here
1304 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1305 adoption_agency t.name
1306 # TODO lots more close tags to implement here
1308 in_body_any_other_end_tag t.name
1311 ins_mode_in_table_else = (t) ->
1313 flag_foster_parenting = true # FIXME
1315 flag_foster_parenting = false
1323 clear_to_table_stopers = {
1328 clear_stack_to_table_context = ->
1330 if clear_to_table_stopers[open_els[0].name]?
1334 clear_to_table_body_stopers = {
1341 clear_stack_to_table_body_context = ->
1343 if clear_to_table_body_stopers[open_els[0].name]?
1347 clear_to_table_row_stopers = {
1352 clear_stack_to_table_row_context = ->
1354 if clear_to_table_row_stopers[open_els[0].name]?
1358 clear_afe_to_marker = ->
1361 if el.type is TYPE_AFE_MARKER
1364 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1365 ins_mode_text = (t) ->
1366 if t.type is TYPE_TEXT
1369 if t.type is TYPE_EOF
1371 if open_els[0].name is 'script'
1372 open_els[0].flag 'already started', true
1374 insertion_mode = original_insertion_mode
1377 if t.type is TYPE_END_TAG and t.name is 'script'
1379 insertion_mode = original_insertion_mode
1380 # fixfull the spec seems to assume that I'm going to run the script
1381 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1383 if t.type is TYPE_END_TAG
1385 insertion_mode = original_insertion_mode
1387 console.log 'warning: end of ins_mode_text reached'
1389 # the functions below implement the tokenizer states described here:
1390 # http://www.w3.org/TR/html5/syntax.html#tokenization
1392 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1393 ins_mode_in_table = (t) ->
1396 if can_in_table[t.name]
1397 original_insertion_mode = insertion_mode
1398 insertion_mode = ins_mode_in_table_text
1401 ins_mode_in_table_else t
1409 clear_stack_to_table_context()
1411 insert_html_element t
1412 insertion_mode = ins_mode_in_caption
1414 clear_stack_to_table_context()
1415 insert_html_element t
1416 insertion_mode = ins_mode_in_column_group
1418 clear_stack_to_table_context()
1419 insert_html_element new_open_tag 'colgroup'
1420 insertion_mode = ins_mode_in_column_group
1422 when 'tbody', 'tfoot', 'thead'
1423 clear_stack_to_table_context()
1424 insert_html_element t
1425 insertion_mode = ins_mode_in_table_body
1426 when 'td', 'th', 'tr'
1427 clear_stack_to_table_context()
1428 insert_html_element new_open_tag 'tbody'
1429 insertion_mode = ins_mode_in_table_body
1433 if is_in_table_scope 'table'
1435 el = open_els.shift()
1436 if el.name is 'table'
1438 reset_insertion_mode()
1440 when 'style', 'script', 'template'
1443 if token_is_input_hidden t
1444 ins_mode_in_table_else t
1447 el = insert_html_element t
1449 el.acknowledge_self_closing()
1452 if form_element_pointer?
1454 if template_tag_is_open()
1456 form_element_pointer = insert_html_element t
1459 ins_mode_in_table_else t
1463 if is_in_table_scope 'table'
1465 el = open_els.shift()
1466 if el.name is 'table'
1468 reset_insertion_mode()
1471 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1476 ins_mode_in_table_else t
1480 ins_mode_in_table_else t
1483 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Tree construction handler for the "in table text" insertion mode (spec
# section linked above). Text tokens are collected in
# pending_table_character_tokens; afterwards the collected tokens are either
# inserted as characters or handed to ins_mode_in_table_else, the list is
# emptied and the insertion mode is restored from original_insertion_mode.
1484 ins_mode_in_table_text = (t) ->
1485 if t.type is TYPE_TEXT and t.text is "\u0000"
1486 # huh? I thought the tokenizer didn't emit these
1489 if t.type is TYPE_TEXT
1490 pending_table_character_tokens.push t
1494 for old in pending_table_character_tokens
1495 unless space_chars.indexOf(old.text) > -1
1499 for old in pending_table_character_tokens
1500 insert_character old
1502 for old in pending_table_character_tokens
# the "anything else" handler of the in-table mode is ins_mode_in_table_else
1503 ins_mode_in_table_else old
1504 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1505 insertion_mode = original_insertion_mode
1508 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Tree construction handler for the "in caption" insertion mode (spec section
# linked above). When the caption is closed - by its own end tag or by one of
# the table-structure tokens tested below - elements are popped from open_els
# until the caption, the active formatting elements are cleared to the last
# marker, and the insertion mode goes back to "in table".
1509 ins_mode_in_caption = (t) ->
1510 if t.type is TYPE_END_TAG and t.name is 'caption'
1511 if is_in_table_scope 'caption'
1512 generate_implied_end_tags()
1513 if open_els[0].name isnt 'caption'
1516 el = open_els.shift()
1517 if el.name is 'caption'
1519 clear_afe_to_marker()
# the in-table handler is named ins_mode_in_table
1520 insertion_mode = ins_mode_in_table
1525 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1527 if is_in_table_scope 'caption'
1529 el = open_els.shift()
1530 if el.name is 'caption'
1532 clear_afe_to_marker()
1533 insertion_mode = ins_mode_in_table
1535 # else fragment case
1537 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1543 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1544 ins_mode_in_column_group = (t) ->
1545 if t.type is TYPE_TEXT and space_chars.indexOf(t.text) > -1
1548 if t.type is TYPE_COMMENT
1551 if t.type is TYPE_DOCTYPE
1554 if t.type is TYPE_START_TAG and t.name is 'html'
1557 if t.type is TYPE_START_TAG and t.name is 'col'
1558 el = insert_html_element t
1560 el.acknowledge_self_closing()
1562 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1563 if open_els[0].name is 'colgroup'
1565 insertion_mode = ins_mode_in_table
1569 if t.type is TYPE_END_TAG and t.name is 'col'
1572 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1575 if t.type is TYPE_EOF
1579 if open_els[0].name isnt 'colgroup'
1583 insertion_mode = ins_mode_in_table
1587 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1588 ins_mode_in_table_body = (t) ->
1589 if t.type is TYPE_START_TAG and t.name is 'tr'
1590 clear_stack_to_table_body_context()
1591 insert_html_element t
1592 insertion_mode = ins_mode_in_row
1594 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1596 clear_stack_to_table_body_context()
1597 insert_html_element new_open_tag 'tr'
1598 insertion_mode = ins_mode_in_row
1601 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1602 unless is_in_table_scope t.name # fixfull check namespace
1605 clear_stack_to_table_body_context()
1607 insertion_mode = ins_mode_in_table
1609 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1612 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1615 if table_scopers[el.name]
1620 clear_stack_to_table_body_context()
1622 insertion_mode = ins_mode_in_table
1625 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1631 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1632 ins_mode_in_row = (t) ->
1633 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1634 clear_stack_to_table_row_context()
1635 insert_html_element t
1636 insertion_mode = ins_mode_in_cell
1639 if t.type is TYPE_END_TAG and t.name is 'tr'
1640 if is_in_table_scope 'tr'
1641 clear_stack_to_table_row_context()
1643 insertion_mode = ins_mode_in_table_body
1647 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1648 if is_in_table_scope 'tr'
1649 clear_stack_to_table_row_context()
1651 insertion_mode = ins_mode_in_table_body
1656 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1657 if is_in_table_scope t.name # fixfull namespace
1658 if is_in_table_scope 'tr'
1659 clear_stack_to_table_row_context()
1661 insertion_mode = ins_mode_in_table_body
1666 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1672 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Close the current table cell: generate implied end tags, pop open_els until
# a td or th has been removed, clear the active formatting elements to the
# last marker and switch to the "in row" insertion mode.
1674 generate_implied_end_tags()
# compare the node's name on both sides; a node never equals the string 'th'
1675 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1678 el = open_els.shift()
1679 if el.name is 'td' or el.name is 'th'
1681 clear_afe_to_marker()
1682 insertion_mode = ins_mode_in_row
1684 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1685 ins_mode_in_cell = (t) ->
1686 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1687 if is_in_table_scope t.name
1688 generate_implied_end_tags()
1689 if open_els[0].name isnt t.name
1692 el = open_els.shift()
1693 if el.name is t.name
1695 clear_afe_to_marker()
1696 insertion_mode = ins_mode_in_row
1700 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1703 if el.name is 'td' or el.name is 'th'
1706 if table_scopers[el.name]
1714 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1717 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1718 if is_in_table_scope t.name # fixfull namespace
1727 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1729 switch c = txt.charAt(cur++)
1731 return new_text_node parse_character_reference()
1733 tok_state = tok_state_tag_open
1736 return new_text_node c
1738 return new_eof_token()
1740 return new_text_node c
1743 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1744 # not needed: tok_state_character_reference_in_data = ->
1745 # just call parse_character_reference()
1747 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
1748 tok_state_rcdata = ->
1749 switch c = txt.charAt(cur++)
1751 return new_text_node parse_character_reference()
1753 tok_state = tok_state_rcdata_less_than_sign
1756 return new_character_token "\ufffd"
1758 return new_eof_token()
1760 return new_character_token c
1763 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
1764 # not needed: tok_state_character_reference_in_rcdata = ->
1765 # just call parse_character_reference()
1767 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
1768 tok_state_rawtext = ->
1769 switch c = txt.charAt(cur++)
1771 tok_state = tok_state_rawtext_less_than_sign
1774 return new_character_token "\ufffd"
1776 return new_eof_token()
1778 return new_character_token c
1781 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
1782 tok_state_script_data = ->
1783 switch c = txt.charAt(cur++)
1785 tok_state = tok_state_script_data_less_than_sign
1788 return new_character_token "\ufffd"
1790 return new_eof_token()
1792 return new_character_token c
1795 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
1796 tok_state_plaintext = ->
1797 switch c = txt.charAt(cur++)
1800 return new_character_token "\ufffd"
1802 return new_eof_token()
1804 return new_character_token c
1808 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1809 tok_state_tag_open = ->
1810 switch c = txt.charAt(cur++)
1812 tok_state = tok_state_markup_declaration_open
1814 tok_state = tok_state_end_tag_open
1817 tok_state = tok_state_bogus_comment
1819 if lc_alpha.indexOf(c) > -1
1820 tok_cur_tag = new_open_tag c
1821 tok_state = tok_state_tag_name
1822 else if uc_alpha.indexOf(c) > -1
1823 tok_cur_tag = new_open_tag c.toLowerCase()
1824 tok_state = tok_state_tag_name
1827 tok_state = tok_state_data
1828 cur -= 1 # we didn't parse/handle the char after <
1829 return new_text_node '<'
1832 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1833 tok_state_end_tag_open = ->
1834 switch c = txt.charAt(cur++)
1837 tok_state = tok_state_data
1840 tok_state = tok_state_data
1841 return new_text_node '</'
1843 if uc_alpha.indexOf(c) > -1
1844 tok_cur_tag = new_end_tag c.toLowerCase()
1845 tok_state = tok_state_tag_name
1846 else if lc_alpha.indexOf(c) > -1
1847 tok_cur_tag = new_end_tag c
1848 tok_state = tok_state_tag_name
1851 tok_state = tok_state_bogus_comment
1854 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1855 tok_state_tag_name = ->
1856 switch c = txt.charAt(cur++)
1857 when "\t", "\n", "\u000c", ' '
1858 tok_state = tok_state_before_attribute_name
1860 tok_state = tok_state_self_closing_start_tag
1862 tok_state = tok_state_data
1868 tok_cur_tag.name += "\ufffd"
1871 tok_state = tok_state_data
1873 if uc_alpha.indexOf(c) > -1
1874 tok_cur_tag.name += c.toLowerCase()
1876 tok_cur_tag.name += c
1879 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
1880 tok_state_rcdata_less_than_sign = ->
1881 c = txt.charAt(cur++)
1883 temporary_buffer = ''
1884 tok_state = tok_state_rcdata_end_tag_open
1887 tok_state = tok_state_rcdata
1888 cur -= 1 # reconsume the input character
1889 return new_character_token '<'
1891 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
1892 tok_state_rcdata_end_tag_open = ->
1893 c = txt.charAt(cur++)
1894 if uc_alpha.indexOf(c) > -1
1895 tok_cur_tag = new_end_tag c.toLowerCase()
1896 temporary_buffer += c
1897 tok_state = tok_state_rcdata_end_tag_name
1899 if lc_alpha.indexOf(c) > -1
1900 tok_cur_tag = new_end_tag c
1901 temporary_buffer += c
1902 tok_state = tok_state_rcdata_end_tag_name
1905 tok_state = tok_state_rcdata
1906 cur -= 1 # reconsume the input character
1907 return new_character_token "</" # fixfull separate these
1909 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when t is an end tag token whose name equals the name of the
# current node (open_els[0]). Also writes a debug log line with the token and
# the serialized stack of open elements.
1910 is_appropriate_end_tag = (t) ->
1911 # spec says to check against "the tag name of the last start tag to
1912 # have been emitted from this tokenizer", but this is only called from
1913 # the various "raw" states, which I'm pretty sure all push the start
1914 # token onto open_els. TODO: verify this after the script data states
1916 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
1917 return t.type is TYPE_END_TAG and t.name is open_els[0].name
1919 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
1920 tok_state_rcdata_end_tag_name = ->
1921 c = txt.charAt(cur++)
1922 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1923 if is_appropriate_end_tag tok_cur_tag
1924 tok_state = tok_state_before_attribute_name
1926 # else fall through to "Anything else"
1928 if is_appropriate_end_tag tok_cur_tag
1929 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
1931 # else fall through to "Anything else"
1933 if is_appropriate_end_tag tok_cur_tag
1934 tok_state = tok_state_data
1936 # else fall through to "Anything else"
1937 if uc_alpha.indexOf(c) > -1
1938 tok_cur_tag.name += c.toLowerCase()
1939 temporary_buffer += c
1941 if lc_alpha.indexOf(c) > -1
1942 tok_cur_tag.name += c
1943 temporary_buffer += c
1946 tok_state = tok_state_rcdata
1947 cur -= 1 # reconsume the input character
1948 return new_character_token '</' + temporary_buffer # fixfull separate these
1950 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
1951 tok_state_rawtext_less_than_sign = ->
1952 c = txt.charAt(cur++)
1954 temporary_buffer = ''
1955 tok_state = tok_state_rawtext_end_tag_open
1958 tok_state = tok_state_rawtext
1959 cur -= 1 # reconsume the input character
1960 return new_character_token '<'
1962 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
1963 tok_state_rawtext_end_tag_open = ->
1964 c = txt.charAt(cur++)
1965 if uc_alpha.indexOf(c) > -1
1966 tok_cur_tag = new_end_tag c.toLowerCase()
1967 temporary_buffer += c
1968 tok_state = tok_state_rawtext_end_tag_name
1970 if lc_alpha.indexOf(c) > -1
1971 tok_cur_tag = new_end_tag c
1972 temporary_buffer += c
1973 tok_state = tok_state_rawtext_end_tag_name
1976 tok_state = tok_state_rawtext
1977 cur -= 1 # reconsume the input character
1978 return new_character_token "</" # fixfull separate these
1980 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
1981 tok_state_rawtext_end_tag_name = ->
1982 c = txt.charAt(cur++)
1983 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1984 if is_appropriate_end_tag tok_cur_tag
1985 tok_state = tok_state_before_attribute_name
1987 # else fall through to "Anything else"
1989 if is_appropriate_end_tag tok_cur_tag
1990 tok_state = tok_state_self_closing_start_tag
1992 # else fall through to "Anything else"
1994 if is_appropriate_end_tag tok_cur_tag
1995 tok_state = tok_state_data
1997 # else fall through to "Anything else"
1998 if uc_alpha.indexOf(c) > -1
1999 tok_cur_tag.name += c.toLowerCase()
2000 temporary_buffer += c
2002 if lc_alpha.indexOf(c) > -1
2003 tok_cur_tag.name += c
2004 temporary_buffer += c
2007 tok_state = tok_state_rawtext
2008 cur -= 1 # reconsume the input character
2009 return new_character_token '</' + temporary_buffer # fixfull separate these
2011 # TODO _all_ of the missing states here (17-33) are for parsing script tags
2013 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2014 tok_state_before_attribute_name = ->
2016 switch c = txt.charAt(cur++)
2017 when "\t", "\n", "\u000c", ' '
2020 tok_state = tok_state_self_closing_start_tag
2023 tok_state = tok_state_data
2029 attr_name = "\ufffd"
2030 when '"', "'", '<', '='
2035 tok_state = tok_state_data
2037 if uc_alpha.indexOf(c) > -1
2038 attr_name = c.toLowerCase()
2042 tok_cur_tag.attrs_a.unshift [attr_name, '']
2043 tok_state = tok_state_attribute_name
2046 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer "attribute name" state (spec section linked above). Consumes one
# character from txt and either switches tok_state or extends the name of the
# attribute currently being built, which is tok_cur_tag.attrs_a[0][0].
# Every branch that adds a character must append (+=): the first character of
# the name was stored when the attribute was created, so plain assignment
# would discard everything read so far.
2047 tok_state_attribute_name = ->
2048 switch c = txt.charAt(cur++)
2049 when "\t", "\n", "\u000c", ' '
2050 tok_state = tok_state_after_attribute_name
2052 tok_state = tok_state_self_closing_start_tag
2054 tok_state = tok_state_before_attribute_value
2056 tok_state = tok_state_data
2062 tok_cur_tag.attrs_a[0][0] += "\ufffd"
2065 tok_cur_tag.attrs_a[0][0] += c
2068 tok_state = tok_state_data
2070 if uc_alpha.indexOf(c) > -1
2071 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
2073 tok_cur_tag.attrs_a[0][0] += c
2076 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2077 tok_state_after_attribute_name = ->
2078 c = txt.charAt(cur++)
2079 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2082 tok_state = tok_state_self_closing_start_tag
2085 tok_state = tok_state_before_attribute_value
2088 tok_state = tok_state_data
2090 if uc_alpha.indexOf(c) > -1
2091 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2092 tok_state = tok_state_attribute_name
2096 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2097 tok_state = tok_state_attribute_name
2101 tok_state = tok_state_data
2102 cur -= 1 # reconsume
2104 if c is '"' or c is "'" or c is '<'
2106 # fall through to Anything else
2108 tok_cur_tag.attrs_a.unshift [c, '']
2109 tok_state = tok_state_attribute_name
2111 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2112 tok_state_before_attribute_value = ->
2113 switch c = txt.charAt(cur++)
2114 when "\t", "\n", "\u000c", ' '
2117 tok_state = tok_state_attribute_value_double_quoted
2119 tok_state = tok_state_attribute_value_unquoted
2122 tok_state = tok_state_attribute_value_single_quoted
2125 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2126 tok_state = tok_state_attribute_value_unquoted
2129 tok_state = tok_state_data
2135 tok_state = tok_state_data
2137 tok_cur_tag.attrs_a[0][1] += c
2138 tok_state = tok_state_attribute_value_unquoted
2141 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2142 tok_state_attribute_value_double_quoted = ->
2143 switch c = txt.charAt(cur++)
2145 tok_state = tok_state_after_attribute_value_quoted
2147 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2150 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2153 tok_state = tok_state_data
2155 tok_cur_tag.attrs_a[0][1] += c
2158 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2159 tok_state_attribute_value_single_quoted = ->
2160 switch c = txt.charAt(cur++)
2162 tok_state = tok_state_after_attribute_value_quoted
2164 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2167 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2170 tok_state = tok_state_data
2172 tok_cur_tag.attrs_a[0][1] += c
2175 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2176 tok_state_attribute_value_unquoted = ->
2177 switch c = txt.charAt(cur++)
2178 when "\t", "\n", "\u000c", ' '
2179 tok_state = tok_state_before_attribute_name
2181 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2183 tok_state = tok_state_data
2188 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2191 tok_state = tok_state_data
2193 # Parse Error if ', <, = or ` (backtick)
2194 tok_cur_tag.attrs_a[0][1] += c
2197 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state handler that runs after a quoted attribute value has been
# read. Reads one character from txt (post-incrementing cur) and switches on
# it, selecting the before-attribute-name state, the self-closing-start-tag
# state or the data state.
2198 tok_state_after_attribute_value_quoted = ->
2199 switch c = txt.charAt(cur++)
# whitespace: tab, line feed, form feed, space
2200 when "\t", "\n", "\u000c", ' '
2201 tok_state = tok_state_before_attribute_name
2203 tok_state = tok_state_self_closing_start_tag
2205 tok_state = tok_state_data
2211 tok_state = tok_state_data
# switch to the before-attribute-name state and step cur back by one, undoing
# the cur++ above, so that state reads this same character again
2214 tok_state = tok_state_before_attribute_name
2215 cur -= 1 # we didn't handle that char
2218 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2219 # Don't set this as a state, just call it
2220 # returns a string (NOT a text node)
#
# Works from the current position cur in txt. The attribute-value states call
# it and append the returned string to the attribute value.
# NOTE(review): presumably the caller has already consumed the "&" -- confirm.
#
# Parameters:
#   allowed_char - extra character handled together with whitespace, "<" and
#                  "&" in the first switch below (defaults to null). The
#                  attribute-value states pass '"', "'" or '>'.
#   in_attr      - defaults to false; the attribute-value states pass true.
#                  NOTE(review): presumably gates the attribute-only
#                  exceptions near the end of the function -- confirm.
2221 parse_character_reference = (allowed_char = null, in_attr = false) ->
# nothing left to read
2222 if cur >= txt.length
# look at the next character without consuming it (cur is not advanced here)
2224 switch c = txt.charAt(cur)
2225 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2226 # explicitly not a parse error
2229 # there has to be "one or more" alnums between & and ; to be a parse error
2232 if cur + 1 >= txt.length
# an "x" in either case right after the current character
# NOTE(review): presumably selects hexadecimal digits for charset -- confirm
2234 if txt.charAt(cur + 1).toLowerCase() is 'x'
# keep going while start + i is inside txt and the character there is in charset
2243 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
# a ';' directly after the scanned characters
2247 if txt.charAt(start + i) is ';'
2249 # FIXME This is supposed to generate parse errors for some chars
# the scanned characters are lower-cased, prefixed, and decoded through
# decode_named_char_ref
2250 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
# the character at cur + i is not in alnum
2257 if alnum.indexOf(txt.charAt(cur + i)) is -1
2260 # exit early, because parse_error() below needs at least one alnum
2262 if txt.charAt(cur + i) is ';'
2263 i += 1 # include ';' terminator in value
2264 decoded = decode_named_char_ref txt.substr(cur, i)
2271 # no ';' terminator (only legacy char refs)
# look up progressively longer substrings starting at cur in legacy_char_refs
2273 for i in [2..max] # no prefix matches, so ok to check shortest first
2274 c = legacy_char_refs[txt.substr(cur, i)]
# the character right after the candidate name is '='
2277 if txt.charAt(cur + i) is '='
2278 # "because some legacy user agents will
2279 # misinterpret the markup in those cases"
# the character right after the candidate name is in alnum
2282 if alnum.indexOf(txt.charAt(cur + i)) > -1
2283 # this makes attributes forgiving about url args
2285 # ok, and besides the weird exceptions for attributes...
2286 # return the matching char
2287 cur += i # consume entity chars
2288 parse_error() # because no terminating ";"
2292 return # never reached
# Initial parser state, followed by the return of the finished result.
2294 # tree constructor initialization
2295 # see comments on TYPE_TAG/etc for the structure of this data
# root 'html' tag node; only its children are returned at the end
2296 tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2298 afe = [] # active formatting elements
2299 template_insertion_modes = []
# parsing starts out in the "in body" insertion mode
2300 insertion_mode = ins_mode_in_body
2301 original_insertion_mode = insertion_mode # TODO check spec
2302 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2303 flag_frameset_ok = true
2305 flag_foster_parenting = false
2306 form_element_pointer = null
2307 temporary_buffer = null
2308 pending_table_character_tokens = []
2310 # tokenizer initialization
# the tokenizer starts in the data state
2311 tok_state = tok_state_data
# the result is the array of the root's children, not the root node itself
2318 return tree.children
2320 # everything below is tests on the above
# Compares output with expected_output using `is` and reports on the console:
# "passed." when they match, and a FAILED line carrying the description plus
# the expected and actual values.
#   description     - label printed in the FAILED message
#   output          - value produced by the code under test
#   expected_output - value that output is compared against
2321 test_equals = (description, output, expected_output) ->
2322 if output is expected_output
2323 console.log "passed." # don't say name, so smart consoles can merge all of these
2325 console.log "FAILED: \"#{description}\""
2326 console.log " Expected: #{expected_output}"
2327 console.log " Actual: #{output}"
# Builds one string out of els by appending the result of each element's own
# serialize(shallow, show_ids) call; shallow and show_ids are passed through
# unchanged.
# NOTE(review): the expected strings in the tests show top-level nodes
# separated by commas; presumably the separator is added here -- confirm.
2328 serialize_els = (els, shallow, show_ids) ->
2334 serialized += t.serialize shallow, show_ids
# Runs one parser test case and reports the outcome on the console.
#   args.name     - label used in the passed/FAILED messages
#   args.html     - input string handed to parse_html
#   args.expected - string the serialized parse result is compared with
# NOTE(review): presumably errors_cb collects into parse_errors -- confirm.
2336 test_parser = (args) ->
2341 prev_node_id = 0 # reset counter
2342 parsed = parse_html args.html, errors_cb
# serialize with both shallow and show_ids set to false
2343 serialized = serialize_els parsed, false, false
# on a mismatch, log the input, the expected string and the actual output
2344 if serialized isnt args.expected
2345 debug_log_each (str) ->
2347 console.log "FAILED: \"#{args.name}\""
2348 console.log " Input: #{args.html}"
2349 console.log " Correct: #{args.expected}"
2350 console.log " Output: #{serialized}"
# also report whether any parse errors were recorded
2351 if parse_errors.length > 0
2352 console.log " parse errs: #{JSON.stringify parse_errors}"
2354 console.log " No parse errors"
2356 console.log "passed \"#{args.name}\""
# Test cases: plain text and named character references in text content.
# Each call passes a name plus, where given, the input html and the expected
# serialization.
2358 test_parser name: "empty", \
2361 test_parser name: "just text", \
2363 expected: 'text:"abc"'
2364 test_parser name: "named entity", \
2366 expected: 'text:"a&1234"'
# input that must come through unchanged: expected text equals the html string
2367 test_parser name: "broken named character references", \
2368 html: "1&2&&3&aabbcc;",
2369 expected: 'text:"1&2&&3&aabbcc;"'
# Test case: numeric character references.
# NOTE(review): going by the name, this presumably covers numeric references
# that get replaced by other characters -- confirm the html literal below
# still contains the intended references.
2370 test_parser name: "numbered entity overrides", \
2371 html: "1€€ ƒ",
2372 expected: 'text:"1€€ ƒ"'
# Test cases: start tags and attributes. In the expected strings a tag is
# written as tag:"name",{attributes},[children].
2373 test_parser name: "open tag", \
2374 html: "foo<span>bar",
2375 expected: 'text:"foo",tag:"span",{},[text:"bar"]'
2376 test_parser name: "open tag with attributes", \
2377 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
2378 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
# double-quoted, unquoted, single-quoted and value-less attributes in one tag
2379 test_parser name: "open tag with attributes of various quotings", \
2380 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
2381 expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
# the next three cases put the same href value in a double-quoted (dq),
# single-quoted (sq) and unquoted (uq) attribute
2382 test_parser name: "attribute entity exceptions dq", \
2383 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
2384 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2385 test_parser name: "attribute entity exceptions sq", \
2386 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
2387 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2388 test_parser name: "attribute entity exceptions uq", \
2389 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
2390 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
# Test cases: end tags, including missing and mis-nested ones.
2391 test_parser name: "matching closing tags", \
2392 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
2393 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
2394 test_parser name: "missing closing tag inside", \
2395 html: "foo<div>bar<span>baz</div>qux",
2396 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
2397 test_parser name: "mis-matched closing tags", \
2398 html: "<span>12<div>34</span>56</div>78",
2399 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
2400 test_parser name: "mis-matched formatting elements", \
2401 html: "12<b>34<i>56</b>78</i>90",
2402 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
# the two cases below are named after spec sections 8.2.8.1 and 8.2.8.2
2403 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
2404 html: '<p>1<b>2<i>3</b>4</i>5</p>',
2405 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
2406 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
2407 html: '<b>1<p>2</b>3</p>',
2408 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
# the expectation used for this case is the result attributed to firefox; the
# one attributed to chrome is kept in the comment for comparison
2409 test_parser name: "crazy formatting elements test", \
2410 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
2411 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
2412 # firefox does this:
2413 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
2414 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
# NOTE(review): "aaa" in the names below presumably stands for the adoption
# agency algorithm, given the adoption01.dat source -- confirm.
2415 test_parser name: "html5lib aaa 1", \
2416 html: '<a><p></a></p>',
2417 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
2418 test_parser name: "html5lib aaa 2", \
2419 html: '<a>1<p>2</a>3</p>',
2420 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
2421 test_parser name: "html5lib aaa 3", \
2422 html: '<a>1<button>2</a>3</button>',
2423 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
2424 test_parser name: "html5lib aaa 4", \
2425 html: '<a>1<b>2</a>3</b>',
2426 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
2427 test_parser name: "html5lib aaa 5 (two divs deep)", \
2428 html: '<a>1<div>2<div>3</a>4</div>5</div>',
2429 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
# in the table cases below, content that appears directly inside <table> is
# expected to be serialized before the table element
2430 test_parser name: "html5lib aaa 6 (foster parenting)", \
2431 html: '<table><a>1<p>2</a>3</p>',
2432 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
2433 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2434 html: '<b><b><a><p></a>',
2435 expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2436 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2437 html: '<b><a><b><p></a>',
2438 expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2439 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2440 html: '<a><b><b><p></a>',
2441 expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2442 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2443 html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2444 expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
2445 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2446 html: '<table><a>1<td>2</td>3</table>',
2447 expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
2448 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2449 html: '<table>A<td>B</td>C</table>',
2450 expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
# Remaining "html5lib aaa" cases. Number 13 is disabled (commented out) until
# the TODO below is done.
2451 # TODO implement svg and namespacing
2452 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2453 # html: '<a><svg><tr><input></a>',
2454 # expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
2455 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2456 html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2457 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2458 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2459 html: '<div><a><b><u><i><code><div></a>',
2460 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2461 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2462 html: '<b><b><b><b>x</b></b></b></b>y',
2463 expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
# four nested <b> go in, but only three are expected inside the second <p>
2464 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2465 html: '<p><b><b><b><b><p>x',
2466 expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
# same as above, with the two attributes written in differing orders
2467 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2468 html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2469 expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
# the stray "," after the closing quote is expected to become an attribute
# named "," with an empty value
2470 test_parser name: "junk after attribute close-quote", \
2471 html: '<p><b c="d", e="f">foo<p>x',
2472 expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
# NOTE(review): "aaa02" presumably refers to html5lib's adoption02.dat -- confirm
2473 test_parser name: "html5lib aaa02 1", \
2474 html: '<b>1<i>2<p>3</b>4',
2475 expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
2476 test_parser name: "html5lib aaa02 2", \
2477 html: '<a><div><style></style><address><a>',
2478 expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
# Test cases: table parsing.
# NOTE(review): presumably taken from html5lib-tests' tables01.dat, like the
# adoption cases above come from adoption01.dat -- confirm.
# Several expectations show tbody/tr/colgroup elements that are absent from
# the input html.
2479 test_parser name: "html5lib tables 1", \
2480 html: '<table><th>',
2481 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
2482 test_parser name: "html5lib tables 2", \
2483 html: '<table><td>',
2484 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
2485 test_parser name: "html5lib tables 3", \
2486 html: "<table><col foo='bar'>",
2487 expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
2488 test_parser name: "html5lib tables 4", \
2489 html: '<table><colgroup></html>foo',
2490 expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
2491 test_parser name: "html5lib tables 5", \
2492 html: '<table></table><p>foo',
2493 expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
# the run of stray end tags is expected to leave no trace in the output
2494 test_parser name: "html5lib tables 6", \
2495 html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
2496 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
# in cases 7-10 the <select> is expected to be serialized before the table
2497 test_parser name: "html5lib tables 7", \
2498 html: '<table><select><option>3</select></table>',
2499 expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
2500 test_parser name: "html5lib tables 8", \
2501 html: '<table><select><table></table></select></table>',
2502 expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
2503 test_parser name: "html5lib tables 9", \
2504 html: '<table><select></table>',
2505 expected: 'tag:"select",{},[],tag:"table",{},[]'
2506 test_parser name: "html5lib tables 10", \
2507 html: '<table><select><option>A<tr><td>B</td></tr></table>',
2508 expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2509 test_parser name: "html5lib tables 11", \
2510 html: '<table><td></body></caption></col></colgroup></html>foo',
2511 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2512 test_parser name: "html5lib tables 12", \
2513 html: '<table><td>A</table>B',
2514 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
2515 test_parser name: "html5lib tables 13", \
2516 html: '<table><tr><caption>',
2517 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
2518 test_parser name: "html5lib tables 14", \
2519 html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
2520 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2521 test_parser name: "html5lib tables 15", \
2522 html: '<table><td><tr>',
2523 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
2524 test_parser name: "html5lib tables 16", \
2525 html: '<table><td><button><td>',
2526 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
# case 17 is disabled (commented out) until the TODO below is done
2527 # TODO implement svg parsing
2528 #test_parser name: "html5lib tables 17", \
2529 # html: '<table><tr><td><svg><desc><td>',
2530 # expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'