1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # Each node is an object of the Node class. Here are the Node types:
52 TYPE_TAG = 0 # name, {attributes}, [children]
53 TYPE_TEXT = 1 # "text"
56 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
57 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
58 TYPE_END_TAG = 5 # name
60 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
61 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
73 debug_log_each = (cb) ->
74 for str in g_debug_log
79 constructor: (type, args = {}) ->
80 @type = type # one of the TYPE_* constants above
81 @name = args.name ? '' # tag name
82 @text = args.text ? '' # contents for text/comment nodes
83 @attrs = args.attrs ? {}
84 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
85 @children = args.children ? []
86 @namespace = args.namespace ? NS_HTML
87 @parent = args.parent ? null
91 @id = "#{++prev_node_id}"
92 shallow_clone: -> # return a new node that's the same except without the children or parent
93 # WARNING this doesn't work right on open tags that are still being parsed
95 attrs[k] = v for k, v of @attrs
96 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id
97 acknowledge_self_closing: ->
98 @flag 'did_self_close', true
101 serialize: (shallow = false, show_ids = false) -> # for unit tests
106 ret += JSON.stringify @name
121 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
127 ret += c.serialize shallow, show_ids
131 ret += JSON.stringify @text
134 ret += JSON.stringify @text
140 when TYPE_AAA_BOOKMARK
141 ret += 'aaa_bookmark'
144 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
147 # helpers: (only take args that are normally known when parser creates nodes)
# Build a TYPE_START_TAG node carrying only the given tag name.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name})
# Build a TYPE_END_TAG node carrying only the given tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name})
# Build a TYPE_TAG (element) node carrying only the given tag name.
new_element = (name) ->
	new Node(TYPE_TAG, {name})
# Build a TYPE_TEXT node whose text is `txt`.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are represented by the very same constructor as text nodes.
new_character_token = new_text_node
# Build a TYPE_COMMENT node whose text is `txt`.
new_comment_node = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
160 return new Node TYPE_EOF
162 return new Node TYPE_AFE_MARKER
# Build a TYPE_AAA_BOOKMARK node (no name, text or attributes supplied).
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character classes, stored as plain strings of their member characters.
digits = "0123456789"
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
# letters (both cases) followed by digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# digits followed by hex letters in both cases
hex_chars = "#{digits}abcdefABCDEF"
172 # some SVG elements have dashes in them
173 tag_name_chars = alnum + "-"
175 # http://www.w3.org/TR/html5/infrastructure.html#space-character
176 space_chars = "\u0009\u000a\u000c\u000d\u0020"
178 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
179 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
181 # These are the character references that don't need a terminating semicolon
182 # min length: 2, max: 6, none are a prefix of any other.
184 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
185 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
186 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
187 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
188 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
189 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
190 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
191 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
192 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
193 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
194 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
195 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
196 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
197 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
198 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
199 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
200 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists, each an array of lowercase tag names.
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
raw_text_elements = 'script style'.split ' '
escapable_raw_text_elements = 'textarea title'.split ' '
207 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
209 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
210 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
211 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
212 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
213 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
214 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
215 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
216 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
217 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
218 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
219 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
220 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
221 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
222 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
226 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
228 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
229 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
230 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
231 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
232 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
233 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
234 'determinant', 'diff', 'divergence', 'divide', 'domain',
235 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
236 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
237 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
238 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
239 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
240 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
241 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
242 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
243 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
244 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
245 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
246 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
247 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
248 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
249 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
250 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
251 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
252 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
253 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
254 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
255 'vectorproduct', 'xor'
257 # foreign_elements = [svg_elements..., mathml_elements...]
258 #normal_elements = All other allowed HTML elements are normal elements.
262 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
263 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
264 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
265 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
266 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
267 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
268 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
269 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
270 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
271 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
272 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
273 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
274 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
275 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
276 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
277 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
278 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
279 wbr:NS_HTML, xmp:NS_HTML,
282 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
283 'annotation-xml':NS_MATHML,
286 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
289 formatting_elements = {
290 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
291 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
295 foster_parenting_targets = {
# True when special_elements has an entry for e's tag name.
# Only the name is consulted; e.namespace is not compared (see FIXME).
el_is_special = (e) ->
	# FIXME it should really be:
	#return special_elements[e.name] is e.namespace
	special_elements[e.name]?
322 # decode_named_char_ref()
324 # The list of named character references is _huge_ so ask the browser to decode
325 # for us instead of wasting bandwidth/space on including the table here.
327 # Pass without the "&" but with the ";" examples:
328 # for "&" pass "amp;"
329 # for "′" pass "x2032;"
332 textarea: document.createElement('textarea')
334 # TODO test this in IE8
335 decode_named_char_ref = (txt) ->
337 decoded = g_dncr.cache[txt]
338 return decoded if decoded?
339 g_dncr.textarea.innerHTML = txt
340 decoded = g_dncr.textarea.value
341 return null if decoded is txt
342 return g_dncr.cache[txt] = decoded
344 parse_html = (txt, parse_error_cb = null) ->
345 cur = 0 # index of next char in txt to be parsed
346 # declare tree and tokenizer variables so they're in scope below
348 open_els = null # stack of open elements
349 afe = null # active formatting elements
350 template_insertion_modes = null
351 insertion_mode = null
352 original_insertion_mode = null
354 tok_cur_tag = null # partially parsed tag
355 flag_scripting = null
356 flag_frameset_ok = null
358 flag_foster_parenting = null
359 form_element_pointer = null
360 temporary_buffer = null
361 pending_table_character_tokens = null
367 console.log "Parse error at character #{cur} of #{txt.length}"
369 afe_push = (new_el) ->
372 if el.name is new_el.name and el.namespace is new_el.namespace
374 continue unless new_el.attrs[k] is v
375 for k, v of new_el.attrs
376 continue unless el.attrs[k] is v
383 afe.unshift new_afe_marker()
385 # the functions below implement the Tree Construction algorithm
386 # http://www.w3.org/TR/html5/syntax.html#tree-construction
388 # But first... the helpers
389 template_tag_is_open = ->
391 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
394 is_in_scope_x = (tag_name, scope, namespace) ->
396 if t.name is tag_name and (namespace is null or namespace is t.namespace)
398 if scope[t.name] is t.namespace
401 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
403 if t.name is tag_name and (namespace is null or namespace is t.namespace)
405 if scope[t.name] is t.namespace
407 if scope2[t.name] is t.namespace
410 standard_scopers = { # FIXME these are supposed to be namespace specific
411 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
412 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
413 template: NS_HTML, mi: NS_MATHML,
415 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
416 'annotation-xml': NS_MATHML,
418 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-boundary tables: tag name -> namespace the name must have.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope test using standard_scopers as the boundary table.
# namespace: when null (the default) is_in_scope_x matches on tag name alone.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope test using standard_scopers plus button_scopers as boundary tables.
# namespace: when null (the default) is_in_scope_x_y matches on tag name alone.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope test using table_scopers as the boundary table.
# namespace: when null (the default) is_in_scope_x matches on tag name alone.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# "Have a particular element in select scope" test.
#
# tag_name:  name of the element being looked for
# namespace: optional; when null any namespace matches
#
# Returns true if a matching element is met before a scope boundary, false
# otherwise. For select scope every element is a boundary EXCEPT <optgroup>
# and <option> in the HTML namespace.
#
# Fixes over the previous condition:
#  * it read `t.ns`, but Node stores the namespace in `namespace`
#  * an operator was missing after NS_HTML, so CoffeeScript compiled the rest
#    of the line as a call `NS_HTML(...)`
# NOTE(review): the loop header and return lines were not visible in the
# reviewed excerpt; they are reconstructed here to mirror is_in_scope_x
# (open_els index 0 is the current node) -- confirm against the full file.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		passes_through = t.namespace is NS_HTML and (t.name is 'optgroup' or t.name is 'option')
		return false unless passes_through
	return false
436 # this checks for a particular element, not by name
437 el_is_in_scope = (el) ->
441 if standard_scopers[t.name] is t.namespace
446 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
447 reset_insertion_mode = ->
448 # 1. Let last be false.
450 # 2. Let node be the last node in the stack of open elements.
452 node = open_els[node_i]
453 # 3. Loop: If node is the first node in the stack of open elements,
454 # then set last to true, and, if the parser was originally created as
455 # part of the HTML fragment parsing algorithm (fragment case) set node
456 # to the context element.
458 if node_i is open_els.length - 1
460 # fixfull (fragment case)
462 # 4. If node is a select element, run these substeps:
463 if node.name is 'select'
464 # 1. If last is true, jump to the step below labeled done.
466 # 2. Let ancestor be node.
469 # 3. Loop: If ancestor is the first node in the stack of
470 # open elements, jump to the step below labeled done.
472 if ancestor_i is open_els.length - 1
474 # 4. Let ancestor be the node before ancestor in the stack
477 ancestor = open_els[ancestor_i]
478 # 5. If ancestor is a template node, jump to the step below
480 if ancestor.name is 'template'
482 # 6. If ancestor is a table node, switch the insertion mode
483 # to "in select in table" and abort these steps.
484 if ancestor.name is 'table'
485 insertion_mode = ins_mode_in_select_in_table
487 # 7. Jump back to the step labeled loop.
488 # 8. Done: Switch the insertion mode to "in select" and abort
490 insertion_mode = ins_mode_in_select
492 # 5. If node is a td or th element and last is false, then switch
493 # the insertion mode to "in cell" and abort these steps.
494 if (node.name is 'td' or node.name is 'th') and last is false
495 insertion_mode = ins_mode_in_cell
497 # 6. If node is a tr element, then switch the insertion mode to "in
498 # row" and abort these steps.
500 insertion_mode = ins_mode_in_row
502 # 7. If node is a tbody, thead, or tfoot element, then switch the
503 # insertion mode to "in table body" and abort these steps.
504 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
505 insertion_mode = ins_mode_in_table_body
507 # 8. If node is a caption element, then switch the insertion mode
508 # to "in caption" and abort these steps.
509 if node.name is 'caption'
510 insertion_mode = ins_mode_in_caption
512 # 9. If node is a colgroup element, then switch the insertion mode
513 # to "in column group" and abort these steps.
514 if node.name is 'colgroup'
515 insertion_mode = ins_mode_in_column_group
517 # 10. If node is a table element, then switch the insertion mode to
518 # "in table" and abort these steps.
519 if node.name is 'table'
520 insertion_mode = ins_mode_in_table
522 # 11. If node is a template element, then switch the insertion mode
523 # to the current template insertion mode and abort these steps.
524 # fixfull (template insertion mode stack)
526 # 12. If node is a head element and last is true, then switch the
527 # insertion mode to "in body" ("in body"! not "in head"!) and abort
528 # these steps. (fragment case)
529 if node.name is 'head' and last
530 insertion_mode = ins_mode_in_body
532 # 13. If node is a head element and last is false, then switch the
533 # insertion mode to "in head" and abort these steps.
534 if node.name is 'head' and last is false
535 insertion_mode = ins_mode_in_head
537 # 14. If node is a body element, then switch the insertion mode to
538 # "in body" and abort these steps.
539 if node.name is 'body'
540 insertion_mode = ins_mode_in_body
542 # 15. If node is a frameset element, then switch the insertion mode
543 # to "in frameset" and abort these steps. (fragment case)
544 if node.name is 'frameset'
545 insertion_mode = ins_mode_in_frameset
547 # 16. If node is an html element, run these substeps:
548 if node.name is 'html'
549 # 1. If the head element pointer is null, switch the insertion
550 # mode to "before head" and abort these steps. (fragment case)
551 # fixfull (fragment case)
553 # 2. Otherwise, the head element pointer is not null, switch
554 # the insertion mode to "after head" and abort these steps.
555 insertion_mode = ins_mode_in_body # FIXME fixfull
557 # 17. If last is true, then switch the insertion mode to "in body"
558 # and abort these steps. (fragment case)
560 insertion_mode = ins_mode_in_body
562 # 18. Let node now be the node before node in the stack of open
565 node = open_els[node_i]
566 # 19. Return to the step labeled loop.
568 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
569 # this implementation is structured (mostly) as described at the link above.
570 # capitalized comments are the "labels" described at the link above.
571 reconstruct_active_formatting_elements = ->
572 return if afe.length is 0
573 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
578 if i is afe.length - 1
581 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
586 el = afe[i].shallow_clone()
587 tree_insert_element el
592 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
593 # adoption agency algorithm
595 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
596 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
597 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
598 adoption_agency = (subject) ->
599 debug_log "adoption_agency()"
600 debug_log "tree: #{serialize_els tree.children, false, true}"
601 debug_log "open_els: #{serialize_els open_els, true, true}"
602 debug_log "afe: #{serialize_els afe, true, true}"
603 if open_els[0].name is subject
606 # remove it from the list of active formatting elements (if found)
611 debug_log "aaa: starting off with subject on top of stack, exiting"
618 # 5. Let formatting element be the last element in the list of
619 # active formatting elements that: is between the end of the list
620 # and the last scope marker in the list, if any, or the start of
621 # the list otherwise, and has the tag name subject.
623 for t, fe_of_afe in afe
624 if t.type is TYPE_AFE_MARKER
629 # If there is no such element, then abort these steps and instead
630 # act as described in the "any other end tag" entry above.
632 debug_log "aaa: fe not found in afe"
633 in_body_any_other_end_tag subject
635 # 6. If formatting element is not in the stack of open elements,
636 # then this is a parse error; remove the element from the list, and
639 for t, fe_of_open_els in open_els
644 debug_log "aaa: fe not found in open_els"
646 # "remove it from the list" must mean afe, since it's not in open_els
647 afe.splice fe_of_afe, 1
649 # 7. If formatting element is in the stack of open elements, but
650 # the element is not in scope, then this is a parse error; abort
652 unless el_is_in_scope fe
653 debug_log "aaa: fe not in scope"
656 # 8. If formatting element is not the current node, this is a parse
657 # error. (But do not abort these steps.)
658 unless open_els[0] is fe
661 # 9. Let furthest block be the topmost node in the stack of open
662 # elements that is lower in the stack than formatting element, and
663 # is an element in the special category. There might not be one.
665 fb_of_open_els = null
672 # and continue, to see if there's one that's more "topmost"
673 # 10. If there is no furthest block, then the UA must first pop all
674 # the nodes from the bottom of the stack of open elements, from the
675 # current node up to and including formatting element, then remove
676 # formatting element from the list of active formatting elements,
677 # and finally abort these steps.
679 debug_log "aaa: no fb"
683 afe.splice fe_of_afe, 1
685 # 11. Let common ancestor be the element immediately above
686 # formatting element in the stack of open elements.
687 ca = open_els[fe_of_open_els + 1] # common ancestor
689 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
690 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
691 bookmark = new_aaa_bookmark()
694 afe.splice i, 0, bookmark
696 node = last_node = fb
700 # 3. Let node be the element immediately above node in the
701 # stack of open elements, or if node is no longer in the stack
702 # of open elements (e.g. because it got removed by this
703 # algorithm), the element that was immediately above node in
704 # the stack of open elements before node was removed.
708 node_next = open_els[i + 1]
710 node = node_next ? node_above
711 debug_log "inner loop #{inner}"
712 debug_log "tree: #{serialize_els tree.children, false, true}"
713 debug_log "open_els: #{serialize_els open_els, true, true}"
714 debug_log "afe: #{serialize_els afe, true, true}"
715 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
716 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
717 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
718 debug_log "node: #{node.serialize true, true}"
719 # TODO make sure node_above gets re-set if/when node is removed from open_els
721 # 4. If node is formatting element, then go to the next step in
722 # the overall algorithm.
726 # 5. If inner loop counter is greater than three and node is in
727 # the list of active formatting elements, then remove node from
728 # the list of active formatting elements.
734 debug_log "max out inner"
739 # 6. If node is not in the list of active formatting elements,
740 # then remove node from the stack of open elements and then go
741 # back to the step labeled inner loop.
743 debug_log "not in afe"
746 node_above = open_els[i + 1]
750 debug_log "the bones"
751 # 7. create an element for the token for which the element node
752 # was created, in the HTML namespace, with common ancestor as
753 # the intended parent; replace the entry for node in the list
754 # of active formatting elements with an entry for the new
755 # element, replace the entry for node in the stack of open
756 # elements with an entry for the new element, and let node be
758 new_node = node.shallow_clone()
762 debug_log "replaced in afe"
766 node_above = open_els[i + 1]
767 open_els[i] = new_node
768 debug_log "replaced in open_els"
771 # 8. If last node is furthest block, then move the
772 # aforementioned bookmark to be immediately after the new node
773 # in the list of active formatting elements.
778 debug_log "removed bookmark"
782 # "after" means lower
783 afe.splice i, 0, bookmark # "after as <-
784 debug_log "placed bookmark after node"
785 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
787 # 9. Insert last node into node, first removing it from its
788 # previous parent node if any.
790 debug_log "last_node has parent"
791 for c, i in last_node.parent.children
793 debug_log "removing last_node from parent"
794 last_node.parent.children.splice i, 1
796 node.children.push last_node
797 last_node.parent = node
798 # 10. Let last node be node.
801 # 11. Return to the step labeled inner loop.
802 # 14. Insert whatever last node ended up being in the previous step
803 # at the appropriate place for inserting a node, but using common
804 # ancestor as the override target.
806 # JASON: In the case where fe is immediately followed by fb:
807 # * inner loop exits out early (node==fe)
809 # * last_node is still in the tree (not a duplicate)
811 debug_log "FEFIRST? last_node has parent"
812 for c, i in last_node.parent.children
814 debug_log "removing last_node from parent"
815 last_node.parent.children.splice i, 1
818 debug_log "after aaa inner loop"
819 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
820 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
821 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
822 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
823 debug_log "tree: #{serialize_els tree.children, false, true}"
828 # can't use standard insert token thing, because it's already in
829 # open_els and must stay at its current position in open_els
830 dest = adjusted_insertion_location ca
831 dest[0].children.splice dest[1], 0, last_node
832 last_node.parent = dest[0]
835 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
836 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
837 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
838 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
839 debug_log "tree: #{serialize_els tree.children, false, true}"
841 # 15. Create an element for the token for which formatting element
842 # was created, in the HTML namespace, with furthest block as the
844 new_element = fe.shallow_clone() # FIXME intended parent thing
845 # 16. Take all of the child nodes of furthest block and append them
846 # to the element created in the last step.
847 while fb.children.length
848 t = fb.children.shift()
849 t.parent = new_element
850 new_element.children.push t
851 # 17. Append that new element to furthest block.
852 new_element.parent = fb
853 fb.children.push new_element
854 # 18. Remove formatting element from the list of active formatting
855 # elements, and insert the new element into the list of active
856 # formatting elements at the position of the aforementioned
866 # 19. Remove formatting element from the stack of open elements,
867 # and insert the new element into the stack of open elements
868 # immediately below the position of furthest block in that stack.
875 open_els.splice i, 0, new_element
877 # 20. Jump back to the step labeled outer loop.
878 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
879 debug_log "tree: #{serialize_els tree.children, false, true}"
880 debug_log "open_els: #{serialize_els open_els, true, true}"
881 debug_log "afe: #{serialize_els afe, true, true}"
884 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
886 generate_implied_end_tags 'p' # arg is exception
887 if open_els[0].name isnt 'p'
889 while open_els.length > 1 # just in case
890 el = open_els.shift()
893 close_p_if_in_button_scope = ->
894 if is_in_button_scope 'p'
897 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
898 # aka insert_a_character = (t) ->
899 insert_character = (t) ->
900 dest = adjusted_insertion_location()
901 # fixfull check for Document node
903 prev = dest[0].children[dest[1] - 1]
904 if prev.type is TYPE_TEXT
907 dest[0].children.splice dest[1], 0, t
910 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
911 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Returns the insertion point as a two-element array [target, target_i]:
# the node to insert into and the index within target.children.
912 adjusted_insertion_location = (override_target = null) ->
913 # 1. If there was an override target specified, then let target be the
916 target = override_target
917 else # Otherwise, let target be the current node.
919 # 2. Determine the adjusted insertion location using the first matching
920 # steps from the following list:
922 # If foster parenting is enabled and target is a table, tbody, tfoot,
923 # thead, or tr element Foster parenting happens when content is
924 # misnested in tables.
925 if flag_foster_parenting and foster_parenting_targets[target.name]
926 loop # once. this is here so we can ``break`` to "abort these substeps"
927 # 1. Let last template be the last template element in the
928 # stack of open elements, if any.
930 last_template_i = null
931 for el, i in open_els
932 if el.name is 'template'
936 # 2. Let last table be the last table element in the stack of
937 # open elements, if any.
940 for el, i in open_els
941 if el.name is 'table'
945 # 3. If there is a last template and either there is no last
946 # table, or there is one, but last template is lower (more
947 # recently added) than last table in the stack of open
948 # elements, then: let adjusted insertion location be inside
949 # last template's template contents, after its last child (if
950 # any), and abort these substeps.
951 if last_template and (last_table is null or last_template_i < last_table_i)
952 target = template # fixfull should be its contents. NOTE(review): `template` is not assigned in the visible part of this function; presumably last_template was meant. Confirm.
953 target_i = target.children.length
955 # 4. If there is no last table, then let adjusted insertion
956 # location be inside the first element in the stack of open
957 # elements (the html element), after its last child (if any),
958 # and abort these substeps. (fragment case)
959 if last_table is null
961 target = open_els[open_els.length - 1]
962 target_i = target.children.length
963 # 5. If last table has a parent element, then let adjusted
964 # insertion location be inside last table's parent element,
965 # immediately before last table, and abort these substeps.
966 if last_table.parent?
967 for c, i in last_table.parent.children
969 target = last_table.parent
973 # 6. Let previous element be the element immediately above last
974 # table in the stack of open elements.
976 # huh? how could it not have a parent?
977 previous_element = open_els[last_table_i + 1]
978 # 7. Let adjusted insertion location be inside previous
979 # element, after its last child (if any).
980 target = previous_element
981 target_i = target.children.length
982 # Note: These steps are involved in part because it's possible
983 # for elements, the table element in this case in particular,
984 # to have been moved by a script around in the DOM, or indeed
985 # removed from the DOM entirely, after the element was inserted
987 break # don't really loop
989 # Otherwise Let adjusted insertion location be inside target, after
990 # its last child (if any).
991 target_i = target.children.length
993 # 3. If the adjusted insertion location is inside a template element,
994 # let it instead be inside the template element's template contents,
995 # after its last child (if any).
998 # 4. Return the adjusted insertion location.
999 return [target, target_i]
1001 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1002 # aka create_an_element_for_token
1003 token_to_element = (t, namespace, intended_parent) ->
1004 t.type = TYPE_TAG # not TYPE_START_TAG
1005 # convert attributes into a hash
1007 while t.attrs_a.length
1009 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1010 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs
1012 # TODO 2. If the newly created element has an xmlns attribute in the
1013 # XMLNS namespace whose value is not exactly the same as the element's
1014 # namespace, that is a parse error. Similarly, if the newly created
1015 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1016 # value is not the XLink Namespace, that is a parse error.
1018 # fixfull: the spec says stuff about form pointers and ownerDocument
1022 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1023 insert_foreign_element = (token, namespace) ->
1024 ail = adjusted_insertion_location()
1027 el = token_to_element token, namespace, ail_el
1028 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1030 ail_el.children.splice ail_i, 0, el
1033 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Alias: inserting an HTML element reuses insert_foreign_element unchanged,
# so it accepts the same (token, namespace) arguments.
# NOTE(review): the visible callers in this file pass only the token, which
# leaves namespace undefined. Confirm whether NS_HTML should be supplied here.
1034 insert_html_element = insert_foreign_element # (token, namespace) ->
1036 # FIXME read and implement the "foster parenting" part
1037 # FIXME read spec, do this right
1038 # FIXME implement the override target thing
1039 # note: this assumes it's an open tag
1040 # FIXME what part of the spec is this?
1041 # TODO look through all callers of this, and see what they should really be doing.
1042 # eg probably insert_html_element for tokens
# Inserts el (an element, or a start tag token that is first converted with
# token_to_element) at the adjusted insertion location.
# el: node or start-tag token
# override_target: forwarded to adjusted_insertion_location
# namespace: namespace to give the element
1043 tree_insert_element = (el, override_target = null, namespace = null) ->
1045 el.namespace = namespace
1046 dest = adjusted_insertion_location override_target # dest is [node, index]
1047 if el.type is TYPE_START_TAG # means it's a "token"
1048 el = token_to_element el, namespace, dest[0]
1049 unless el.namespace?
1050 namespace = dest.namespace # NOTE(review): dest is the [node, index] array, so dest.namespace looks undefined, and this sets the local `namespace` rather than el.namespace. Presumably `el.namespace = dest[0].namespace` was meant. Confirm.
1051 # fixfull: Document nodes sometimes can't accept more children
1052 dest[0].children.splice dest[1], 0, el
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# t: the comment to splice into the tree
# position: optional [node, index_within_children] pair; when it is missing,
# the adjusted insertion location is used instead
insert_comment = (t, position = null) ->
	[parent, index] = position ? adjusted_insertion_location()
	parent.children.splice index, 0, t
# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an HTML element for token t, points the tokenizer at the RAWTEXT
# state, and switches the tree builder to the text insertion mode after
# saving the mode to come back to.
parse_generic_raw_text = (t) ->
	insert_html_element t
	# remember the current mode before it gets replaced below
	original_insertion_mode = insertion_mode
	tok_state = tok_state_rawtext
	insertion_mode = ins_mode_text
# RCDATA counterpart of parse_generic_raw_text: inserts an HTML element for
# token t, points the tokenizer at the RCDATA state, saves the current
# insertion mode in original_insertion_mode and switches to the text
# insertion mode.
parse_generic_rcdata_text = (t) ->
	insert_html_element t
	tok_state = tok_state_rcdata
	original_insertion_mode = insertion_mode
	insertion_mode = ins_mode_text
# ins_mode_in_head invokes this algorithm as parse_generic_rcdata_element
# (for <title>); expose that name too so the call resolves instead of
# failing at runtime.
parse_generic_rcdata_element = parse_generic_rcdata_text
1076 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1077 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1078 generate_implied_end_tags = (except = null) ->
1079 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1082 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1083 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1084 open_els.shift() # spec says this will be a 'head' node
1085 insertion_mode = ins_mode_after_head
# Tree-construction handler for the "in head" insertion mode: inspects the
# token's type and name and dispatches to the matching branch below.
# t: the token being processed
1087 ins_mode_in_head = (t) ->
1088 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1091 if t.type is TYPE_COMMENT
1094 if t.type is TYPE_DOCTYPE
1097 if t.type is TYPE_START_TAG and t.name is 'html'
1100 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1101 el = insert_html_element t
1103 el.acknowledge_self_closing()
1105 if t.type is TYPE_START_TAG and t.name is 'meta'
1106 el = insert_html_element t
1108 el.acknowledge_self_closing()
1109 # fixfull encoding stuff
1111 if t.type is TYPE_START_TAG and t.name is 'title'
1112 parse_generic_rcdata_element t # NOTE(review): the helper defined earlier in this file is spelled parse_generic_rcdata_text. Confirm that this name resolves.
1114 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1115 parse_generic_raw_text t
1117 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1118 insert_html_element t
1119 insertion_mode = in_head_noscript # FIXME implement
1121 if t.type is TYPE_START_TAG and t.name is 'script'
1122 ail = adjusted_insertion_location() # [node, index]
1123 el = token_to_element t, NS_HTML, ail # NOTE(review): passes the whole [node, index] pair as intended_parent; tree_insert_element passes dest[0]. Confirm.
1124 el.flag_parser_inserted true # FIXME implement
1125 # fixfull fragment case
1126 ail[0].children.splice ail[1], 0, el
1128 tok_state = tok_state_script_data
1129 original_insertion_mode = insertion_mode # make sure orig... is defined
1130 insertion_mode = ins_mode_text # FIXME implement
1132 if t.type is TYPE_END_TAG and t.name is 'head'
1133 open_els.shift() # will be a head element... spec says so
1134 insertion_mode = ins_mode_after_head
1136 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1137 ins_mode_in_head_else t
1139 if t.type is TYPE_START_TAG and t.name is 'template'
1140 insert_html_element t
1142 flag_frameset_ok = false
1143 insertion_mode = ins_mode_in_template
1144 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1146 if t.type is TYPE_END_TAG and t.name is 'template'
1147 if template_tag_is_open()
1148 generate_implied_end_tags # NOTE(review): a bare identifier with no arguments or parentheses is only a reference in CoffeeScript, so nothing is called here; other call sites use generate_implied_end_tags(). Confirm.
1149 if open_els[0].name isnt 'template'
1152 el = open_els.shift()
1153 if el.name is 'template'
1155 clear_afe_to_marker()
1156 template_insertion_modes.shift()
1157 reset_insertion_mode()
1161 if (t.type is TYPE_OPEN_TAG and t.name is 'head') or t.type is TYPE_END_TAG # NOTE(review): every other branch tests TYPE_START_TAG. Confirm that TYPE_OPEN_TAG exists.
1164 ins_mode_in_head_else t
1166 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1167 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1168 for node, i in open_els
1169 if node.name is name # FIXME check namespace too
1170 generate_implied_end_tags name # arg is exception
1171 parse_error() unless i is 0
1176 if special_elements[node.name]? # FIXME check namespac too
1179 ins_mode_in_body = (t) ->
1185 when "\t", "\u000a", "\u000c", "\u000d", ' '
1186 reconstruct_active_formatting_elements()
1189 reconstruct_active_formatting_elements()
1191 flag_frameset_ok = false
1200 return if template_tag_is_open()
1201 root_attrs = open_els[open_els.length - 1].attrs
1203 root_attrs[k] = v unless root_attrs[k]?
1204 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1205 # FIXME also do this for </template> (end tag)
1206 return ins_mode_in_head t
1213 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1214 close_p_if_in_button_scope()
1215 insert_html_element t
1216 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1217 close_p_if_in_button_scope()
1218 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1221 insert_html_element t
1222 # TODO lots more to implement here
1224 # If the list of active formatting elements
1225 # contains an a element between the end of the list and
1226 # the last marker on the list (or the start of the list
1227 # if there is no marker on the list), then this is a
1228 # parse error; run the adoption agency algorithm for
1229 # the tag name "a", then remove that element from the
1230 # list of active formatting elements and the stack of
1231 # open elements if the adoption agency algorithm didn't
1232 # already remove it (it might not have if the element
1233 # is not in table scope).
1236 if el.type is TYPE_AFE_MARKER
1246 for el, i in open_els
1248 open_els.splice i, 1
1249 reconstruct_active_formatting_elements()
1250 el = insert_html_element t
1252 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1253 reconstruct_active_formatting_elements()
1254 el = insert_html_element t
1257 # fixfull quirksmode thing
1258 close_p_if_in_button_scope()
1259 insert_html_element t
1260 insertion_mode = ins_mode_in_table
1261 # TODO lots more to implement here
1262 else # any other start tag
1263 reconstruct_active_formatting_elements()
1264 insert_html_element t
1267 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1268 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1271 unless ok_tags[t.name]?
1274 # TODO stack of template insertion modes thing
1275 flag_parsing = false # stop parsing
1279 unless is_in_scope 'body'
1282 # TODO implement parse error and move to tree_after_body
1284 unless is_in_scope 'body' # weird, but it's what the spec says
1287 # TODO implement parse error and move to tree_after_body, reprocess
1288 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1289 unless is_in_scope t.name, NS_HTML
1292 generate_implied_end_tags()
1293 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1296 el = open_els.shift()
1297 if el.name is t.name and el.namespace is NS_HTML
1299 # TODO lots more close tags to implement here
1301 unless is_in_button_scope 'p'
1303 insert_html_element new_open_tag 'p'
1305 # TODO lots more close tags to implement here
1306 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1307 adoption_agency t.name
1308 # TODO lots more close tags to implement here
1310 in_body_any_other_end_tag t.name
1313 ins_mode_in_table_else = (t) ->
1315 flag_foster_parenting = true # FIXME
1317 flag_foster_parenting = false
1325 clear_to_table_stopers = {
1330 clear_stack_to_table_context = ->
1332 if clear_to_table_stopers[open_els[0].name]?
1336 clear_to_table_body_stopers = {
1343 clear_stack_to_table_body_context = ->
1345 if clear_to_table_body_stopers[open_els[0].name]?
1349 clear_to_table_row_stopers = {
1354 clear_stack_to_table_row_context = ->
1356 if clear_to_table_row_stopers[open_els[0].name]?
1360 clear_afe_to_marker = ->
1363 if el.type is TYPE_AFE_MARKER
1366 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1367 ins_mode_text = (t) ->
1368 if t.type is TYPE_TEXT
1371 if t.type is TYPE_EOF
1373 if open_els[0].name is 'script'
1374 open_els[0].flag 'already started', true
1376 insertion_mode = original_insertion_mode
1379 if t.type is TYPE_END_TAG and t.name is 'script'
1381 insertion_mode = original_insertion_mode
1382 # fixfull the spec seems to assume that I'm going to run the script
1383 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1385 if t.type is TYPE_END_TAG
1387 insertion_mode = original_insertion_mode
1389 console.log 'warning: end of ins_mode_text reached'
1391 # the functions below implement the tokenizer states described here:
1392 # http://www.w3.org/TR/html5/syntax.html#tokenization
1394 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1395 ins_mode_in_table = (t) ->
1398 if can_in_table[t.name]
1399 original_insertion_mode = insertion_mode
1400 insertion_mode = ins_mode_in_table_text
1403 ins_mode_in_table_else t
1411 clear_stack_to_table_context()
1413 insert_html_element t
1414 insertion_mode = ins_mode_in_caption
1416 clear_stack_to_table_context()
1417 insert_html_element t
1418 insertion_mode = ins_mode_in_column_group
1420 clear_stack_to_table_context()
1421 insert_html_element new_open_tag 'colgroup'
1422 insertion_mode = ins_mode_in_column_group
1424 when 'tbody', 'tfoot', 'thead'
1425 clear_stack_to_table_context()
1426 insert_html_element t
1427 insertion_mode = ins_mode_in_table_body
1428 when 'td', 'th', 'tr'
1429 clear_stack_to_table_context()
1430 insert_html_element new_open_tag 'tbody'
1431 insertion_mode = ins_mode_in_table_body
1435 if is_in_table_scope 'table'
1437 el = open_els.shift()
1438 if el.name is 'table'
1440 reset_insertion_mode()
1442 when 'style', 'script', 'template'
1445 if token_is_input_hidden t
1446 ins_mode_in_table_else t
1449 el = insert_html_element t
1451 el.acknowledge_self_closing()
1454 if form_element_pointer?
1456 if template_tag_is_open()
1458 form_element_pointer = insert_html_element t
1461 ins_mode_in_table_else t
1465 if is_in_table_scope 'table'
1467 el = open_els.shift()
1468 if el.name is 'table'
1470 reset_insertion_mode()
1473 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1478 ins_mode_in_table_else t
1482 ins_mode_in_table_else t
1485 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Handler for the "in table text" insertion mode. Text tokens are collected
# in pending_table_character_tokens; the visible tail of the function walks
# that list, empties it and restores original_insertion_mode.
1486 ins_mode_in_table_text = (t) ->
1487 if t.type is TYPE_TEXT and t.text is "\u0000"
1488 # huh? I thought the tokenizer didn't emit these
1491 if t.type is TYPE_TEXT
1492 pending_table_character_tokens.push t
1496 for old in pending_table_character_tokens
1497 unless space_chars.indexOf(old.text) > -1
1501 for old in pending_table_character_tokens
1502 insert_character old
1504 for old in pending_table_character_tokens
1505 ins_mode_table_else old # NOTE(review): the helper defined in this file is ins_mode_in_table_else. Confirm that this name resolves.
1506 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1507 insertion_mode = original_insertion_mode
1510 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Handler for the "in caption" insertion mode. When a caption is closed, the
# stack of open elements is popped through the caption element, the active
# formatting list is cleared to its last marker and the mode is switched.
1511 ins_mode_in_caption = (t) ->
1512 if t.type is TYPE_END_TAG and t.name is 'caption'
1513 if is_in_table_scope 'caption'
1514 generate_implied_end_tags()
1515 if open_els[0].name isnt 'caption'
1518 el = open_els.shift()
1519 if el.name is 'caption'
1521 clear_afe_to_marker()
1522 insertion_mode = in_table # NOTE(review): other handlers assign ins_mode_in_table. Confirm that in_table is defined.
1527 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1529 if is_in_table_scope 'caption'
1531 el = open_els.shift()
1532 if el.name is 'caption'
1534 clear_afe_to_marker()
1535 insertion_mode = in_table # NOTE(review): same as above; ins_mode_in_table is presumably intended.
1537 # else fragment case
1539 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1545 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1546 ins_mode_in_column_group = (t) ->
1547 if t.type is TYPE_TEXT and space_chars.indexOf(t.text) > -1
1550 if t.type is TYPE_COMMENT
1553 if t.type is TYPE_DOCTYPE
1556 if t.type is TYPE_START_TAG and t.name is 'html'
1559 if t.type is TYPE_START_TAG and t.name is 'col'
1560 el = insert_html_element t
1562 el.acknowledge_self_closing()
1564 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1565 if open_els[0].name is 'colgroup'
1567 insertion_mode = ins_mode_in_table
1571 if t.type is TYPE_END_TAG and t.name is 'col'
1574 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1577 if t.type is TYPE_EOF
1581 if open_els[0].name isnt 'colgroup'
1585 insertion_mode = ins_mode_in_table
1589 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1590 ins_mode_in_table_body = (t) ->
1591 if t.type is TYPE_START_TAG and t.name is 'tr'
1592 clear_stack_to_table_body_context()
1593 insert_html_element t
1594 insertion_mode = ins_mode_in_row
1596 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1598 clear_stack_to_table_body_context()
1599 insert_html_element new_open_tag 'tr'
1600 insertion_mode = ins_mode_in_row
1603 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1604 unless is_in_table_scope t.name # fixfull check namespace
1607 clear_stack_to_table_body_context()
1609 insertion_mode = ins_mode_in_table
1611 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1614 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1617 if table_scopers[el.name]
1622 clear_stack_to_table_body_context()
1624 insertion_mode = ins_mode_in_table
1627 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1633 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1634 ins_mode_in_row = (t) ->
1635 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1636 clear_stack_to_table_row_context()
1637 insert_html_element t
1638 insertion_mode = ins_mode_in_cell
1641 if t.type is TYPE_END_TAG and t.name is 'tr'
1642 if is_in_table_scope 'tr'
1643 clear_stack_to_table_row_context()
1645 insertion_mode = ins_mode_in_table_body
1649 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1650 if is_in_table_scope 'tr'
1651 clear_stack_to_table_row_context()
1653 insertion_mode = ins_mode_in_table_body
1658 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1659 if is_in_table_scope t.name # fixfull namespace
1660 if is_in_table_scope 'tr'
1661 clear_stack_to_table_row_context()
1663 insertion_mode = ins_mode_in_table_body
1668 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1674 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Closes the current table cell: generates implied end tags, pops the stack
# of open elements through a td/th, clears the active formatting list to its
# last marker and switches to the "in row" insertion mode.
1676 generate_implied_end_tags()
1677 unless open_els[0].name is 'td' or open_els[0] is 'th' # NOTE(review): the second test compares the element itself to 'th'; presumably open_els[0].name was meant. Confirm.
1680 el = open_els.shift()
1681 if el.name is 'td' or el.name is 'th'
1683 clear_afe_to_marker()
1684 insertion_mode = ins_mode_in_row
1686 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1687 ins_mode_in_cell = (t) ->
1688 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1689 if is_in_table_scope t.name
1690 generate_implied_end_tags()
1691 if open_els[0].name isnt t.name
1694 el = open_els.shift()
1695 if el.name is t.name
1697 clear_afe_to_marker()
1698 insertion_mode = ins_mode_in_row
1702 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1705 if el.name is 'td' or el.name is 'th'
1708 if table_scopers[el.name]
1716 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1719 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1720 if is_in_table_scope t.name # fixfull namespace
1729 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
1731 switch c = txt.charAt(cur++)
1733 return new_text_node parse_character_reference()
1735 tok_state = tok_state_tag_open
1738 return new_text_node c
1740 return new_eof_token()
1742 return new_text_node c
1745 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
1746 # not needed: tok_state_character_reference_in_data = ->
1747 # just call parse_character_reference()
1749 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
1750 tok_state_rcdata = ->
1751 switch c = txt.charAt(cur++)
1753 return new_text_node parse_character_reference()
1755 tok_state = tok_state_rcdata_less_than_sign
1758 return new_character_token "\ufffd"
1760 return new_eof_token()
1762 return new_character_token c
1765 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
1766 # not needed: tok_state_character_reference_in_rcdata = ->
1767 # just call parse_character_reference()
1769 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
1770 tok_state_rawtext = ->
1771 switch c = txt.charAt(cur++)
1773 tok_state = tok_state_rawtext_less_than_sign
1776 return new_character_token "\ufffd"
1778 return new_eof_token()
1780 return new_character_token c
1783 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
1784 tok_state_script_data = ->
1785 switch c = txt.charAt(cur++)
1787 tok_state = tok_state_script_data_less_than_sign
1790 return new_character_token "\ufffd"
1792 return new_eof_token()
1794 return new_character_token c
1797 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
1798 tok_state_plaintext = ->
1799 switch c = txt.charAt(cur++)
1802 return new_character_token "\ufffd"
1804 return new_eof_token()
1806 return new_character_token c
1810 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
1811 tok_state_tag_open = ->
1812 switch c = txt.charAt(cur++)
1814 tok_state = tok_state_markup_declaration_open
1816 tok_state = tok_state_end_tag_open
1819 tok_state = tok_state_bogus_comment
1821 if lc_alpha.indexOf(c) > -1
1822 tok_cur_tag = new_open_tag c
1823 tok_state = tok_state_tag_name
1824 else if uc_alpha.indexOf(c) > -1
1825 tok_cur_tag = new_open_tag c.toLowerCase()
1826 tok_state = tok_state_tag_name
1829 tok_state = tok_state_data
1830 cur -= 1 # we didn't parse/handle the char after <
1831 return new_text_node '<'
1834 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
1835 tok_state_end_tag_open = ->
1836 switch c = txt.charAt(cur++)
1839 tok_state = tok_state_data
1842 tok_state = tok_state_data
1843 return new_text_node '</'
1845 if uc_alpha.indexOf(c) > -1
1846 tok_cur_tag = new_end_tag c.toLowerCase()
1847 tok_state = tok_state_tag_name
1848 else if lc_alpha.indexOf(c) > -1
1849 tok_cur_tag = new_end_tag c
1850 tok_state = tok_state_tag_name
1853 tok_state = tok_state_bogus_comment
1856 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
1857 tok_state_tag_name = ->
1858 switch c = txt.charAt(cur++)
1859 when "\t", "\n", "\u000c", ' '
1860 tok_state = tok_state_before_attribute_name
1862 tok_state = tok_state_self_closing_start_tag
1864 tok_state = tok_state_data
1870 tok_cur_tag.name += "\ufffd"
1873 tok_state = tok_state_data
1875 if uc_alpha.indexOf(c) > -1
1876 tok_cur_tag.name += c.toLowerCase()
1878 tok_cur_tag.name += c
1881 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
1882 tok_state_rcdata_less_than_sign = ->
1883 c = txt.charAt(cur++)
1885 temporary_buffer = ''
1886 tok_state = tok_state_rcdata_end_tag_open
1889 tok_state = tok_state_rcdata
1890 cur -= 1 # reconsume the input character
1891 return new_character_token '<'
1893 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
1894 tok_state_rcdata_end_tag_open = ->
1895 c = txt.charAt(cur++)
1896 if uc_alpha.indexOf(c) > -1
1897 tok_cur_tag = new_end_tag c.toLowerCase()
1898 temporary_buffer += c
1899 tok_state = tok_state_rcdata_end_tag_name
1901 if lc_alpha.indexOf(c) > -1
1902 tok_cur_tag = new_end_tag c
1903 temporary_buffer += c
1904 tok_state = tok_state_rcdata_end_tag_name
1907 tok_state = tok_state_rcdata
1908 cur -= 1 # reconsume the input character
1909 return new_character_token "</" # fixfull separate these
1911 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Decides whether token t is an "appropriate end tag token".
# The spec compares against the name of the last start tag emitted by the
# tokenizer. This implementation compares against the current node
# (open_els[0]) instead, on the assumption that every caller is one of the
# "raw" tokenizer states and that those push their start tag onto open_els.
# TODO: re-check that assumption once the script data states exist.
# Returns true only for an end tag whose name equals the current node's name.
is_appropriate_end_tag = (t) ->
	debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
	return false unless t.type is TYPE_END_TAG
	return t.name is open_els[0].name
1921 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
1922 tok_state_rcdata_end_tag_name = ->
1923 c = txt.charAt(cur++)
1924 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1925 if is_appropriate_end_tag tok_cur_tag
1926 tok_state = tok_state_before_attribute_name
1928 # else fall through to "Anything else"
1930 if is_appropriate_end_tag tok_cur_tag
1931 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
1933 # else fall through to "Anything else"
1935 if is_appropriate_end_tag tok_cur_tag
1936 tok_state = tok_state_data
1938 # else fall through to "Anything else"
1939 if uc_alpha.indexOf(c) > -1
1940 tok_cur_tag.name += c.toLowerCase()
1941 temporary_buffer += c
1943 if lc_alpha.indexOf(c) > -1
1944 tok_cur_tag.name += c
1945 temporary_buffer += c
1948 tok_state = tok_state_rcdata
1949 cur -= 1 # reconsume the input character
1950 return new_character_token '</' + temporary_buffer # fixfull separate these
1952 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
1953 tok_state_rawtext_less_than_sign = ->
1954 c = txt.charAt(cur++)
1956 temporary_buffer = ''
1957 tok_state = tok_state_rawtext_end_tag_open
1960 tok_state = tok_state_rawtext
1961 cur -= 1 # reconsume the input character
1962 return new_character_token '<'
1964 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
1965 tok_state_rawtext_end_tag_open = ->
1966 c = txt.charAt(cur++)
1967 if uc_alpha.indexOf(c) > -1
1968 tok_cur_tag = new_end_tag c.toLowerCase()
1969 temporary_buffer += c
1970 tok_state = tok_state_rawtext_end_tag_name
1972 if lc_alpha.indexOf(c) > -1
1973 tok_cur_tag = new_end_tag c
1974 temporary_buffer += c
1975 tok_state = tok_state_rawtext_end_tag_name
1978 tok_state = tok_state_rawtext
1979 cur -= 1 # reconsume the input character
1980 return new_character_token "</" # fixfull separate these
1982 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
1983 tok_state_rawtext_end_tag_name = ->
1984 c = txt.charAt(cur++)
1985 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
1986 if is_appropriate_end_tag tok_cur_tag
1987 tok_state = tok_state_before_attribute_name
1989 # else fall through to "Anything else"
1991 if is_appropriate_end_tag tok_cur_tag
1992 tok_state = tok_state_self_closing_start_tag
1994 # else fall through to "Anything else"
1996 if is_appropriate_end_tag tok_cur_tag
1997 tok_state = tok_state_data
1999 # else fall through to "Anything else"
2000 if uc_alpha.indexOf(c) > -1
2001 tok_cur_tag.name += c.toLowerCase()
2002 temporary_buffer += c
2004 if lc_alpha.indexOf(c) > -1
2005 tok_cur_tag.name += c
2006 temporary_buffer += c
2009 tok_state = tok_state_rawtext
2010 cur -= 1 # reconsume the input character
2011 return new_character_token '</' + temporary_buffer # fixfull separate these
2013 # TODO _all_ of the missing states here (17-33) are for parsing script tags
2015 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2016 tok_state_before_attribute_name = ->
2018 switch c = txt.charAt(cur++)
2019 when "\t", "\n", "\u000c", ' '
2022 tok_state = tok_state_self_closing_start_tag
2025 tok_state = tok_state_data
2031 attr_name = "\ufffd"
2032 when '"', "'", '<', '='
2037 tok_state = tok_state_data
2039 if uc_alpha.indexOf(c) > -1
2040 attr_name = c.toLowerCase()
2044 tok_cur_tag.attrs_a.unshift [attr_name, '']
2045 tok_state = tok_state_attribute_name
2048 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state that consumes one character of an attribute name. The
# attribute being built is tok_cur_tag.attrs_a[0], a [name, value] pair.
2049 tok_state_attribute_name = ->
2050 switch c = txt.charAt(cur++)
2051 when "\t", "\n", "\u000c", ' '
2052 tok_state = tok_state_after_attribute_name
2054 tok_state = tok_state_self_closing_start_tag
2056 tok_state = tok_state_before_attribute_value
2058 tok_state = tok_state_data
2064 tok_cur_tag.attrs_a[0][0] = "\ufffd" # NOTE(review): overwrites the name collected so far; the last branch of this function appends with +=. Confirm whether += was meant.
2067 tok_cur_tag.attrs_a[0][0] = c # NOTE(review): same concern, `=` replaces the name instead of appending.
2070 tok_state = tok_state_data
2072 if uc_alpha.indexOf(c) > -1
2073 tok_cur_tag.attrs_a[0][0] = c.toLowerCase() # NOTE(review): same concern; an uppercase letter would discard the earlier characters of the name.
2075 tok_cur_tag.attrs_a[0][0] += c
2078 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2079 tok_state_after_attribute_name = ->
2080 c = txt.charAt(cur++)
2081 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2084 tok_state = tok_state_self_closing_start_tag
2087 tok_state = tok_state_before_attribute_value
2090 tok_state = tok_state_data
2092 if uc_alpha.indexOf(c) > -1
2093 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2094 tok_state = tok_state_attribute_name
2098 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2099 tok_state = tok_state_attribute_name
2103 tok_state = tok_state_data
2104 cur -= 1 # reconsume
2106 if c is '"' or c is "'" or c is '<'
2108 # fall through to Anything else
2110 tok_cur_tag.attrs_a.unshift [c, '']
2111 tok_state = tok_state_attribute_name
2113 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2114 tok_state_before_attribute_value = ->
2115 switch c = txt.charAt(cur++)
2116 when "\t", "\n", "\u000c", ' '
2119 tok_state = tok_state_attribute_value_double_quoted
2121 tok_state = tok_state_attribute_value_unquoted
2124 tok_state = tok_state_attribute_value_single_quoted
2127 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2128 tok_state = tok_state_attribute_value_unquoted
2131 tok_state = tok_state_data
2137 tok_state = tok_state_data
2139 tok_cur_tag.attrs_a[0][1] += c
2140 tok_state = tok_state_attribute_value_unquoted
2143 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2144 tok_state_attribute_value_double_quoted = ->
2145 switch c = txt.charAt(cur++)
2147 tok_state = tok_state_after_attribute_value_quoted
2149 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2152 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2155 tok_state = tok_state_data
2157 tok_cur_tag.attrs_a[0][1] += c
2160 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2161 tok_state_attribute_value_single_quoted = ->
2162 switch c = txt.charAt(cur++)
2164 tok_state = tok_state_after_attribute_value_quoted
2166 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2169 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2172 tok_state = tok_state_data
2174 tok_cur_tag.attrs_a[0][1] += c
2177 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2178 tok_state_attribute_value_unquoted = ->
2179 switch c = txt.charAt(cur++)
2180 when "\t", "\n", "\u000c", ' '
2181 tok_state = tok_state_before_attribute_name
2183 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2185 tok_state = tok_state_data
2190 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2193 tok_state = tok_state_data
2195 # Parse Error if ', <, = or ` (backtick)
2196 tok_cur_tag.attrs_a[0][1] += c
2199 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state handler: reads the character that follows a closing quote
# of an attribute value (advancing cur). Tab, LF, FF or space moves to the
# before-attribute-name state; other visible branches move to the
# self-closing-start-tag state or back to the data state. In the final
# branch the character is not consumed: cur is stepped back by one so the
# before-attribute-name state reads the same character again.
2200 tok_state_after_attribute_value_quoted = ->
2201 switch c = txt.charAt(cur++)
2202 when "\t", "\n", "\u000c", ' '
2203 tok_state = tok_state_before_attribute_name
2205 tok_state = tok_state_self_closing_start_tag
2207 tok_state = tok_state_data
2213 tok_state = tok_state_data
2216 tok_state = tok_state_before_attribute_name
2217 cur -= 1 # we didn't handle that char
2220 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2221 # Don't set this as a state, just call it
2222 # returns a string (NOT a text node)
#
# Works directly on the shared tokenizer input: it inspects txt starting at
# cur (the first switch peeks with charAt(cur), without advancing) and moves
# cur forward past whatever it decides to consume.
#
# Parameters:
#   allowed_char - optional extra character (default null) that is treated
#     like whitespace, '<', '&' and end of input by the first `when` below.
#     The attribute-value states pass the character that would end the value
#     ('"', "'" or '>').
#   in_attr - default false; the attribute-value states pass true.
#     NOTE(review): presumably this enables the attribute-only exceptions
#     commented on near the end of the function - confirm against the full
#     source.
#
# The '' entry in the first `when` is what charAt returns when the index is
# past the end of txt.
# Lookups are done with decode_named_char_ref and, for references that lack
# the ';' terminator, the legacy_char_refs table.
2223 parse_character_reference = (allowed_char = null, in_attr = false) ->
2224 if cur >= txt.length
2226 switch c = txt.charAt(cur)
2227 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2228 # explicitly not a parse error
2231 # there has to be "one or more" alnums between & and ; to be a parse error
2234 if cur + 1 >= txt.length
2236 if txt.charAt(cur + 1).toLowerCase() is 'x'
2245 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2249 if txt.charAt(start + i) is ';'
2251 # FIXME This is supposed to generate parse errors for some chars
2252 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2259 if alnum.indexOf(txt.charAt(cur + i)) is -1
2262 # exit early, because parse_error() below needs at least one alnum
2264 if txt.charAt(cur + i) is ';'
2265 i += 1 # include ';' terminator in value
2266 decoded = decode_named_char_ref txt.substr(cur, i)
2273 # no ';' terminator (only legacy char refs)
2275 for i in [2..max] # no prefix matches, so ok to check shortest first
2276 c = legacy_char_refs[txt.substr(cur, i)]
2279 if txt.charAt(cur + i) is '='
2280 # "because some legacy user agents will
2281 # misinterpret the markup in those cases"
2284 if alnum.indexOf(txt.charAt(cur + i)) > -1
2285 # this makes attributes forgiving about url args
2287 # ok, and besides the weird exceptions for attributes...
2288 # return the matching char
2289 cur += i # consume entity chars
2290 parse_error() # because no terminating ";"
2294 return # never reached
2296 # tree constructor initialization
2297 # see comments on TYPE_TAG/etc for the structure of this data
# `tree` is a synthetic 'html' element used as the root while parsing; the
# root itself is never handed back, only its children (see the return below).
2298 tree = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2300 afe = [] # active formatting elements
2301 template_insertion_modes = []
# parsing of a snippet starts directly in the "in body" insertion mode
2302 insertion_mode = ins_mode_in_body
2303 original_insertion_mode = insertion_mode # TODO check spec
2304 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2305 flag_frameset_ok = true
2307 flag_foster_parenting = false
2308 form_element_pointer = null
2309 temporary_buffer = null
2310 pending_table_character_tokens = []
2312 # tokenizer initialization
# the tokenizer starts in the data state
2313 tok_state = tok_state_data
# result: the array of top-level nodes that were attached to the root
2320 return tree.children
2322 # everything below is tests on the above
# Minimal assertion helper for the tests below.
#   description     - label printed only when the check fails
#   output          - actual value
#   expected_output - value it is compared against with `is` (strict equality)
# Logs "passed." on a match; otherwise logs the description together with the
# expected and actual values.
2323 test_equals = (description, output, expected_output) ->
2324 if output is expected_output
2325 console.log "passed." # don't say name, so smart consoles can merge all of these
2327 console.log "FAILED: \"#{description}\""
2328 console.log " Expected: #{expected_output}"
2329 console.log " Actual: #{output}"
# Builds the string form of a list of nodes that the tests compare against.
# Each node contributes the result of its own serialize(shallow, show_ids)
# call, which is appended to the accumulated `serialized` string.
# NOTE(review): how `els` is iterated and how entries are separated is not
# visible here - confirm against the full source.
2330 serialize_els = (els, shallow, show_ids) ->
2336 serialized += t.serialize shallow, show_ids
# Runs one parser test case.
#   args.name     - label used in the log output
#   args.html     - markup handed to parse_html
#   args.expected - expected result of serialize_els(parsed, false, false)
# The node id counter (prev_node_id) is reset before each run. When the
# serialized output differs from args.expected, the input, the expected and
# actual serializations and any collected parse errors are logged; otherwise
# a "passed" line with the test name is logged.
2338 test_parser = (args) ->
2343 prev_node_id = 0 # reset counter
2344 parsed = parse_html args.html, errors_cb
2345 serialized = serialize_els parsed, false, false
2346 if serialized isnt args.expected
2347 debug_log_each (str) ->
2349 console.log "FAILED: \"#{args.name}\""
2350 console.log " Input: #{args.html}"
2351 console.log " Correct: #{args.expected}"
2352 console.log " Output: #{serialized}"
2353 if parse_errors.length > 0
2354 console.log " parse errs: #{JSON.stringify parse_errors}"
2356 console.log " No parse errors"
2358 console.log "passed \"#{args.name}\""
# Test cases: plain text and character references.
# Each call parses `html` and compares the serialized result with `expected`.
2360 test_parser name: "empty", \
2363 test_parser name: "just text", \
2365 expected: 'text:"abc"'
2366 test_parser name: "named entity", \
2368 expected: 'text:"a&1234"'
2369 test_parser name: "broken named character references", \
2370 html: "1&2&&3&aabbcc;",
2371 expected: 'text:"1&2&&3&aabbcc;"'
2372 test_parser name: "numbered entity overrides", \
2373 html: "1€€ ƒ",
2374 expected: 'text:"1€€ ƒ"'
# Test cases: start tags and attribute parsing (double-quoted, single-quoted
# and unquoted values, plus character references inside attribute values).
2375 test_parser name: "open tag", \
2376 html: "foo<span>bar",
2377 expected: 'text:"foo",tag:"span",{},[text:"bar"]'
2378 test_parser name: "open tag with attributes", \
2379 html: "foo<span style=\"foo: bar\" title=\"hi\">bar",
2380 expected: 'text:"foo",tag:"span",{"style":"foo: bar","title":"hi"},[text:"bar"]'
2381 test_parser name: "open tag with attributes of various quotings", \
2382 html: "foo<span abc=\"def\" g=hij klm='nopqrstuv\"' autofocus>bar",
2383 expected: 'text:"foo",tag:"span",{"abc":"def","autofocus":"","g":"hij","klm":"nopqrstuv\\""},[text:"bar"]'
2384 test_parser name: "attribute entity exceptions dq", \
2385 html: "foo<a href=\"foo?t=1&=2&o=3&lt=foo\">bar",
2386 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2387 test_parser name: "attribute entity exceptions sq", \
2388 html: "foo<a href='foo?t=1&=2&o=3&lt=foo'>bar",
2389 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
2390 test_parser name: "attribute entity exceptions uq", \
2391 html: "foo<a href=foo?t=1&=2&o=3&lt=foo>bar",
2392 expected: 'text:"foo",tag:"a",{"href":"foo?t=1&=2&o=3<=foo"},[text:"bar"]'
# Test cases: closing tags - matching, missing and mis-nested - including
# mis-nested formatting elements.
2393 test_parser name: "matching closing tags", \
2394 html: "foo<a href=\"hi\">hi</a><div>1<div>foo</div>2</div>bar",
2395 expected: 'text:"foo",tag:"a",{"href":"hi"},[text:"hi"],tag:"div",{},[text:"1",tag:"div",{},[text:"foo"],text:"2"],text:"bar"'
2396 test_parser name: "missing closing tag inside", \
2397 html: "foo<div>bar<span>baz</div>qux",
2398 expected: 'text:"foo",tag:"div",{},[text:"bar",tag:"span",{},[text:"baz"]],text:"qux"'
2399 test_parser name: "mis-matched closing tags", \
2400 html: "<span>12<div>34</span>56</div>78",
2401 expected: 'tag:"span",{},[text:"12",tag:"div",{},[text:"3456"],text:"78"]'
2402 test_parser name: "mis-matched formatting elements", \
2403 html: "12<b>34<i>56</b>78</i>90",
2404 expected: 'text:"12",tag:"b",{},[text:"34",tag:"i",{},[text:"56"]],tag:"i",{},[text:"78"],text:"90"'
2405 test_parser name: "8.2.8.1 Misnested tags: <b><i></b></i>", \
2406 html: '<p>1<b>2<i>3</b>4</i>5</p>',
2407 expected: 'tag:"p",{},[text:"1",tag:"b",{},[text:"2",tag:"i",{},[text:"3"]],tag:"i",{},[text:"4"],text:"5"]'
2408 test_parser name: "8.2.8.2 Misnested tags: <b><p></b></p>", \
2409 html: '<b>1<p>2</b>3</p>',
2410 expected: 'tag:"b",{},[text:"1"],tag:"p",{},[tag:"b",{},[text:"2"],text:"3"]'
2411 test_parser name: "crazy formatting elements test", \
2412 html: "<b><i><a><s><tt><div></b>first</b></div></tt></s></a>second</i>",
2413 # chrome does this: expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]],text:"second"]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]]'
2414 # firefox does this:
2415 expected: 'tag:"b",{},[tag:"i",{},[tag:"a",{},[tag:"s",{},[tag:"tt",{},[]]]]],tag:"a",{},[tag:"s",{},[tag:"tt",{},[tag:"div",{},[tag:"b",{},[],text:"first"]]]],text:"second"'
2416 # tests from https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/adoption01.dat
# NOTE(review): "aaa" in the test names presumably abbreviates the spec's
# "adoption agency algorithm" - confirm.
2417 test_parser name: "html5lib aaa 1", \
2418 html: '<a><p></a></p>',
2419 expected: 'tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]'
2420 test_parser name: "html5lib aaa 2", \
2421 html: '<a>1<p>2</a>3</p>',
2422 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"]'
2423 test_parser name: "html5lib aaa 3", \
2424 html: '<a>1<button>2</a>3</button>',
2425 expected: 'tag:"a",{},[text:"1"],tag:"button",{},[tag:"a",{},[text:"2"],text:"3"]'
2426 test_parser name: "html5lib aaa 4", \
2427 html: '<a>1<b>2</a>3</b>',
2428 expected: 'tag:"a",{},[text:"1",tag:"b",{},[text:"2"]],tag:"b",{},[text:"3"]'
2429 test_parser name: "html5lib aaa 5 (two divs deep)", \
2430 html: '<a>1<div>2<div>3</a>4</div>5</div>',
2431 expected: 'tag:"a",{},[text:"1"],tag:"div",{},[tag:"a",{},[text:"2"],tag:"div",{},[tag:"a",{},[text:"3"],text:"4"],text:"5"]'
2432 test_parser name: "html5lib aaa 6 (foster parenting)", \
2433 html: '<table><a>1<p>2</a>3</p>',
2434 expected: 'tag:"a",{},[text:"1"],tag:"p",{},[tag:"a",{},[text:"2"],text:"3"],tag:"table",{},[]'
2435 test_parser name: "html5lib aaa 7 (aaa, eof) 1", \
2436 html: '<b><b><a><p></a>',
2437 expected: 'tag:"b",{},[tag:"b",{},[tag:"a",{},[],tag:"p",{},[tag:"a",{},[]]]]'
2438 test_parser name: "html5lib aaa 8 (aaa, eof) 2", \
2439 html: '<b><a><b><p></a>',
2440 expected: 'tag:"b",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2441 test_parser name: "html5lib aaa 9 (aaa, eof) 3", \
2442 html: '<a><b><b><p></a>',
2443 expected: 'tag:"a",{},[tag:"b",{},[tag:"b",{},[]]],tag:"b",{},[tag:"b",{},[tag:"p",{},[tag:"a",{},[]]]]'
2444 test_parser name: "html5lib aaa 10 (formatting, nesting, attrs, aaa)", \
2445 html: '<p>1<s id="A">2<b id="B">3</p>4</s>5</b>',
2446 expected: 'tag:"p",{},[text:"1",tag:"s",{"id":"A"},[text:"2",tag:"b",{"id":"B"},[text:"3"]]],tag:"s",{"id":"A"},[tag:"b",{"id":"B"},[text:"4"]],tag:"b",{"id":"B"},[text:"5"]'
2447 test_parser name: "html5lib aaa 11 (table with foster parenting, formatting el and td)", \
2448 html: '<table><a>1<td>2</td>3</table>',
2449 expected: 'tag:"a",{},[text:"1"],tag:"a",{},[text:"3"],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"2"]]]]'
2450 test_parser name: "html5lib aaa 12 (table with foster parenting, split text)", \
2451 html: '<table>A<td>B</td>C</table>',
2452 expected: 'text:"AC",tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2453 # TODO implement svg and namespacing
2454 #test_parser name: "html5lib aaa 13 (svg tr input)", \
2455 # html: '<a><svg><tr><input></a>',
2456 # expected: 'tag:"a",{},[svg:"svg",{},[svg:"tr",{},[svg:"input"]]]'
2457 test_parser name: "html5lib aaa 14 (deep ?outer aaa)", \
2458 html: '<div><a><b><div><div><div><div><div><div><div><div><div><div></a>',
2459 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[]],tag:"b",{},[tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"div",{},[tag:"div",{},[]]]]]]]]]]]]]'
2460 test_parser name: "html5lib aaa 15 (deep ?inner aaa)", \
2461 html: '<div><a><b><u><i><code><div></a>',
2462 expected: 'tag:"div",{},[tag:"a",{},[tag:"b",{},[tag:"u",{},[tag:"i",{},[tag:"code",{},[]]]]],tag:"u",{},[tag:"i",{},[tag:"code",{},[tag:"div",{},[tag:"a",{},[]]]]]]'
2463 test_parser name: "html5lib aaa 16 (correctly nested 4b)", \
2464 html: '<b><b><b><b>x</b></b></b></b>y',
2465 expected: 'tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]],text:"y"'
2466 test_parser name: "html5lib aaa 17 (formatting, implied /p, noah's ark)", \
2467 html: '<p><b><b><b><b><p>x',
2468 expected: 'tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[]]]]],tag:"p",{},[tag:"b",{},[tag:"b",{},[tag:"b",{},[text:"x"]]]]'
2469 test_parser name: "variation on html5lib aaa 17 (with attributes in various orders)", \
2470 html: '<p><b c="d" e="f"><b e="f" c="d"><b e="f" c="d"><b c="d" e="f"><p>x',
2471 expected: 'tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[]]]]],tag:"p",{},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[tag:"b",{"c":"d","e":"f"},[text:"x"]]]]'
2472 test_parser name: "junk after attribute close-quote", \
2473 html: '<p><b c="d", e="f">foo<p>x',
2474 expected: 'tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"foo"]],tag:"p",{},[tag:"b",{",":"","c":"d","e":"f"},[text:"x"]]'
# Test cases named "html5lib aaa02": further mis-nested formatting element
# inputs, checked the same way as the cases above.
2475 test_parser name: "html5lib aaa02 1", \
2476 html: '<b>1<i>2<p>3</b>4',
2477 expected: 'tag:"b",{},[text:"1",tag:"i",{},[text:"2"]],tag:"i",{},[tag:"p",{},[tag:"b",{},[text:"3"],text:"4"]]'
2478 test_parser name: "html5lib aaa02 2", \
2479 html: '<a><div><style></style><address><a>',
2480 expected: 'tag:"a",{},[],tag:"div",{},[tag:"a",{},[tag:"style",{},[]],tag:"address",{},[tag:"a",{},[],tag:"a",{},[]]]'
# Test cases named "html5lib tables": table parsing, covering implied
# tbody/tr/colgroup elements, stray end tags inside tables, and content
# (text, select) that ends up before the table in the expected output.
2481 test_parser name: "html5lib tables 1", \
2482 html: '<table><th>',
2483 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"th",{},[]]]]'
2484 test_parser name: "html5lib tables 2", \
2485 html: '<table><td>',
2486 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
2487 test_parser name: "html5lib tables 3", \
2488 html: "<table><col foo='bar'>",
2489 expected: 'tag:"table",{},[tag:"colgroup",{},[tag:"col",{"foo":"bar"},[]]]'
2490 test_parser name: "html5lib tables 4", \
2491 html: '<table><colgroup></html>foo',
2492 expected: 'text:"foo",tag:"table",{},[tag:"colgroup",{},[]]'
2493 test_parser name: "html5lib tables 5", \
2494 html: '<table></table><p>foo',
2495 expected: 'tag:"table",{},[],tag:"p",{},[text:"foo"]'
2496 test_parser name: "html5lib tables 6", \
2497 html: '<table></body></caption></col></colgroup></html></tbody></td></tfoot></th></thead></tr><td>',
2498 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]]]]'
2499 test_parser name: "html5lib tables 7", \
2500 html: '<table><select><option>3</select></table>',
2501 expected: 'tag:"select",{},[tag:"option",{},[text:"3"]],tag:"table",{},[]'
2502 test_parser name: "html5lib tables 8", \
2503 html: '<table><select><table></table></select></table>',
2504 expected: 'tag:"select",{},[],tag:"table",{},[],tag:"table",{},[]'
2505 test_parser name: "html5lib tables 9", \
2506 html: '<table><select></table>',
2507 expected: 'tag:"select",{},[],tag:"table",{},[]'
2508 test_parser name: "html5lib tables 10", \
2509 html: '<table><select><option>A<tr><td>B</td></tr></table>',
2510 expected: 'tag:"select",{},[tag:"option",{},[text:"A"]],tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"B"]]]]'
2511 test_parser name: "html5lib tables 11", \
2512 html: '<table><td></body></caption></col></colgroup></html>foo',
2513 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2514 test_parser name: "html5lib tables 12", \
2515 html: '<table><td>A</table>B',
2516 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"A"]]]],text:"B"'
2517 test_parser name: "html5lib tables 13", \
2518 html: '<table><tr><caption>',
2519 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[]],tag:"caption",{},[]]'
2520 test_parser name: "html5lib tables 14", \
2521 html: '<table><tr></body></caption></col></colgroup></html></td></th><td>foo',
2522 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[text:"foo"]]]]'
2523 test_parser name: "html5lib tables 15", \
2524 html: '<table><td><tr>',
2525 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[]],tag:"tr",{},[]]]'
2526 test_parser name: "html5lib tables 16", \
2527 html: '<table><td><button><td>',
2528 expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[tag:"button",{},[]],tag:"td",{},[]]]]'
2529 # TODO implement svg parsing
2530 #test_parser name: "html5lib tables 17", \
2531 # html: '<table><tr><td><svg><desc><td>',
2532 # expected: 'tag:"table",{},[tag:"tbody",{},[tag:"tr",{},[tag:"td",{},[svg:"svg",{},[svg:"desc",{},[]]],tag:"td",{},[]]]]'