1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains them. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
98 @id = "#{++prev_node_id}"
99 acknowledge_self_closing: ->
101 @token.flag 'did_self_close'
103 @flag 'did_self_close', true
106 serialize: (shallow = false, show_ids = false) -> # for unit tests
111 ret += JSON.stringify @name
126 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
132 ret += c.serialize shallow, show_ids
136 ret += JSON.stringify @text
139 ret += JSON.stringify @text
141 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
144 when TYPE_AAA_BOOKMARK
145 ret += 'aaa_bookmark'
148 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
151 # helpers: (only take args that are normally known when parser creates nodes)
# Builds a TYPE_START_TAG node carrying only the tag name; every other
# Node field keeps the constructor's default.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name: name})
# Builds a TYPE_END_TAG node carrying only the tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name: name})
# Builds a TYPE_TAG (element) node with the given tag name and default
# values for everything else.
new_element = (name) ->
	new Node(TYPE_TAG, {name: name})
# Builds a TYPE_TEXT node whose text is txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are represented by the very same node type, so this is
# simply a second name for new_text_node.
new_character_token = new_text_node
# Builds a TYPE_COMMENT node whose text is txt.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Builds a TYPE_DOCTYPE node with the given name.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name: name})
166 return new Node TYPE_EOF
168 return new Node TYPE_AFE_MARKER
# Builds a TYPE_AAA_BOOKMARK node (no name, text or attributes are passed).
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character-class strings. Membership of a single character is tested with
# indexOf (see is_uc_alpha / is_lc_alpha below).
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = lc_alpha.toUpperCase()
digits = '0123456789'
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# True only when str is exactly one character long and that character is
# one of A-Z.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1

# True only when str is exactly one character long and that character is
# one of a-z.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
183 # some SVG elements have dashes in them
184 tag_name_chars = alnum + "-"
186 # http://www.w3.org/TR/html5/infrastructure.html#space-character
187 space_chars = "\u0009\u000a\u000c\u000d\u0020"
189 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a text token holding exactly one character and that
# character is listed in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	txt = t.text
	txt.length is 1 and space_chars.indexOf(txt) > -1
193 is_input_hidden_tok = (t) ->
194 return unless t.type is TYPE_START_TAG
197 if a[1].toLowerCase() is 'hidden'
202 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
203 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
205 # These are the character references that don't need a terminating semicolon
206 # min length: 2, max: 6, none are a prefix of any other.
208 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
209 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
210 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
211 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
212 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
213 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
214 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
215 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
216 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
217 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
218 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
219 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
220 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
221 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
222 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
223 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
224 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
228 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
229 raw_text_elements = ['script', 'style']
230 escapable_raw_text_elements = ['textarea', 'title']
231 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
233 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
234 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
235 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
236 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
237 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
238 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
239 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
240 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
241 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
242 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
243 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
244 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
245 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
246 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
250 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
252 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
253 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
254 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
255 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
256 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
257 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
258 'determinant', 'diff', 'divergence', 'divide', 'domain',
259 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
260 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
261 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
262 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
263 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
264 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
265 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
266 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
267 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
268 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
269 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
270 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
271 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
272 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
273 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
274 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
275 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
276 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
277 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
278 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
279 'vectorproduct', 'xor'
281 # foreign_elements = [svg_elements..., mathml_elements...]
282 #normal_elements = All other allowed HTML elements are normal elements.
286 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
287 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
288 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
289 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
290 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
291 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
292 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
293 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
294 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
295 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
296 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
297 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
298 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
299 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
300 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
301 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
302 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
303 wbr:NS_HTML, xmp:NS_HTML,
306 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
307 'annotation-xml':NS_MATHML,
310 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
313 formatting_elements = {
314 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
315 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
320 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
323 foster_parenting_targets = {
# True when special_elements maps the element's name to the element's own
# namespace.
el_is_special = (e) ->
	listed_namespace = special_elements[e.name]
	listed_namespace is e.namespace
# The three HTML elements (address, div, p) that el_is_special_not_adp
# excludes from the special_elements check.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML

# True when el matches an entry in special_elements (same name and namespace)
# and does not match an entry in adp_els.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
352 # decode_named_char_ref()
354 # The list of named character references is _huge_ so ask the browser to decode
355 # for us instead of wasting bandwidth/space on including the table here.
357 # Pass without the "&" but with the ";" examples:
358 # for "&" pass "amp;"
359 # for "′" pass "x2032;"
362 textarea: document.createElement('textarea')
364 # TODO test this in IE8
365 decode_named_char_ref = (txt) ->
367 decoded = g_dncr.cache[txt]
368 return decoded if decoded?
369 g_dncr.textarea.innerHTML = txt
370 decoded = g_dncr.textarea.value
371 return null if decoded is txt
372 return g_dncr.cache[txt] = decoded
374 parse_html = (txt, parse_error_cb = null) ->
375 cur = 0 # index of next char in txt to be parsed
376 # declare doc and tokenizer variables so they're in scope below
378 open_els = null # stack of open elements
379 afe = null # active formatting elements
380 template_ins_modes = null
382 original_ins_mode = null
384 tok_cur_tag = null # partially parsed tag
385 flag_scripting = null
386 flag_frameset_ok = null
388 flag_foster_parenting = null
389 form_element_pointer = null
390 temporary_buffer = null
391 pending_table_character_tokens = null
392 head_element_pointer = null
393 flag_fragment_parsing = null
394 context_element = null
403 console.log "Parse error at character #{cur} of #{txt.length}"
405 afe_push = (new_el) ->
408 if el.name is new_el.name and el.namespace is new_el.namespace
410 continue unless new_el.attrs[k] is v
411 for k, v of new_el.attrs
412 continue unless el.attrs[k] is v
419 afe.unshift new_afe_marker()
421 # the functions below implement the Tree Construction algorithm
422 # http://www.w3.org/TR/html5/syntax.html#tree-construction
424 # But first... the helpers
425 template_tag_is_open = ->
427 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
430 is_in_scope_x = (tag_name, scope, namespace) ->
432 if t.name is tag_name and (namespace is null or namespace is t.namespace)
434 if scope[t.name] is t.namespace
437 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
439 if t.name is tag_name and (namespace is null or namespace is t.namespace)
441 if scope[t.name] is t.namespace
443 if scope2[t.name] is t.namespace
447 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
448 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
449 template: NS_HTML, mi: NS_MATHML,
451 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
452 'annotation-xml': NS_MATHML,
454 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
456 button_scopers = button: NS_HTML
457 li_scopers = ol: NS_HTML, ul: NS_HTML
458 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Scope test using standard_scopers as the set of scope boundaries.
# namespace defaults to null and is handed to is_in_scope_x unchanged.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope test using both standard_scopers and button_scopers as scope
# boundaries; the two-table lookup is done by is_in_scope_x_y.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope test using table_scopers (html, table, template) as the only scope
# boundaries.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
465 is_in_select_scope = (tag_name, namespace = null) ->
467 if t.name is tag_name and (namespace is null or namespace is t.namespace)
469 if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
472 # this checks for a particular element, not by name
473 el_is_in_scope = (el) ->
477 if standard_scopers[t.name] is t.namespace
481 clear_to_table_stopers = {
486 clear_stack_to_table_context = ->
488 if clear_to_table_stopers[open_els[0].name]?
492 clear_to_table_body_stopers = {
499 clear_stack_to_table_body_context = ->
501 if clear_to_table_body_stopers[open_els[0].name]?
505 clear_to_table_row_stopers = {
510 clear_stack_to_table_row_context = ->
512 if clear_to_table_row_stopers[open_els[0].name]?
516 clear_afe_to_marker = ->
518 return unless afe.length > 0 # this happens in fragment case, ?spec error
520 if el.type is TYPE_AFE_MARKER
525 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
527 # 1. Let last be false.
529 # 2. Let node be the last node in the stack of open elements.
531 node = open_els[node_i]
532 # 3. Loop: If node is the first node in the stack of open elements,
533 # then set last to true, and, if the parser was originally created as
534 # part of the HTML fragment parsing algorithm (fragment case) set node
535 # to the context element.
537 if node_i is open_els.length - 1
539 # fixfull (fragment case)
541 # 4. If node is a select element, run these substeps:
542 if node.name is 'select'
543 # 1. If last is true, jump to the step below labeled done.
545 # 2. Let ancestor be node.
548 # 3. Loop: If ancestor is the first node in the stack of
549 # open elements, jump to the step below labeled done.
551 if ancestor_i is open_els.length - 1
553 # 4. Let ancestor be the node before ancestor in the stack
556 ancestor = open_els[ancestor_i]
557 # 5. If ancestor is a template node, jump to the step below
559 if ancestor.name is 'template'
561 # 6. If ancestor is a table node, switch the insertion mode
562 # to "in select in table" and abort these steps.
563 if ancestor.name is 'table'
564 ins_mode = ins_mode_in_select_in_table
566 # 7. Jump back to the step labeled loop.
567 # 8. Done: Switch the insertion mode to "in select" and abort
569 ins_mode = ins_mode_in_select
571 # 5. If node is a td or th element and last is false, then switch
572 # the insertion mode to "in cell" and abort these steps.
573 if (node.name is 'td' or node.name is 'th') and last is false
574 ins_mode = ins_mode_in_cell
576 # 6. If node is a tr element, then switch the insertion mode to "in
577 # row" and abort these steps.
579 ins_mode = ins_mode_in_row
581 # 7. If node is a tbody, thead, or tfoot element, then switch the
582 # insertion mode to "in table body" and abort these steps.
583 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
584 ins_mode = ins_mode_in_table_body
586 # 8. If node is a caption element, then switch the insertion mode
587 # to "in caption" and abort these steps.
588 if node.name is 'caption'
589 ins_mode = ins_mode_in_caption
591 # 9. If node is a colgroup element, then switch the insertion mode
592 # to "in column group" and abort these steps.
593 if node.name is 'colgroup'
594 ins_mode = ins_mode_in_column_group
596 # 10. If node is a table element, then switch the insertion mode to
597 # "in table" and abort these steps.
598 if node.name is 'table'
599 ins_mode = ins_mode_in_table
601 # 11. If node is a template element, then switch the insertion mode
602 # to the current template insertion mode and abort these steps.
603 # fixfull (template insertion mode stack)
605 # 12. If node is a head element and last is true, then switch the
606 # insertion mode to "in body" ("in body"! not "in head"!) and abort
607 # these steps. (fragment case)
608 if node.name is 'head' and last
609 ins_mode = ins_mode_in_body
611 # 13. If node is a head element and last is false, then switch the
612 # insertion mode to "in head" and abort these steps.
613 if node.name is 'head' and last is false
614 ins_mode = ins_mode_in_head
616 # 14. If node is a body element, then switch the insertion mode to
617 # "in body" and abort these steps.
618 if node.name is 'body'
619 ins_mode = ins_mode_in_body
621 # 15. If node is a frameset element, then switch the insertion mode
622 # to "in frameset" and abort these steps. (fragment case)
623 if node.name is 'frameset'
624 ins_mode = ins_mode_in_frameset
626 # 16. If node is an html element, run these substeps:
627 if node.name is 'html'
628 # 1. If the head element pointer is null, switch the insertion
629 # mode to "before head" and abort these steps. (fragment case)
630 if head_element_pointer is null
631 ins_mode = ins_mode_before_head
633 # 2. Otherwise, the head element pointer is not null,
634 # switch the insertion mode to "after head" and abort these
636 ins_mode = ins_mode_after_head
638 # 17. If last is true, then switch the insertion mode to "in body"
639 # and abort these steps. (fragment case)
641 ins_mode = ins_mode_in_body
643 # 18. Let node now be the node before node in the stack of open
646 node = open_els[node_i]
647 # 19. Return to the step labeled loop.
651 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
652 adjusted_current_node = ->
653 if open_els.length is 1 and flag_fragment_parsing
654 return context_element
657 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
658 # this implementation is structured (mostly) as described at the link above.
659 # capitalized comments are the "labels" described at the link above.
660 reconstruct_active_formatting_elements = ->
661 return if afe.length is 0
662 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
667 if i is afe.length - 1
670 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
675 el = insert_html_element afe[i].token
680 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
681 # adoption agency algorithm
683 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
684 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
685 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
686 adoption_agency = (subject) ->
687 debug_log "adoption_agency()"
688 debug_log "tree: #{serialize_els doc.children, false, true}"
689 debug_log "open_els: #{serialize_els open_els, true, true}"
690 debug_log "afe: #{serialize_els afe, true, true}"
691 if open_els[0].name is subject
694 # remove it from the list of active formatting elements (if found)
699 debug_log "aaa: starting off with subject on top of stack, exiting"
706 # 5. Let formatting element be the last element in the list of
707 # active formatting elements that: is between the end of the list
708 # and the last scope marker in the list, if any, or the start of
709 # the list otherwise, and has the tag name subject.
711 for t, fe_of_afe in afe
712 if t.type is TYPE_AFE_MARKER
717 # If there is no such element, then abort these steps and instead
718 # act as described in the "any other end tag" entry above.
720 debug_log "aaa: fe not found in afe"
721 in_body_any_other_end_tag subject
723 # 6. If formatting element is not in the stack of open elements,
724 # then this is a parse error; remove the element from the list, and
727 for t, fe_of_open_els in open_els
732 debug_log "aaa: fe not found in open_els"
734 # "remove it from the list" must mean afe, since it's not in open_els
735 afe.splice fe_of_afe, 1
737 # 7. If formatting element is in the stack of open elements, but
738 # the element is not in scope, then this is a parse error; abort
740 unless el_is_in_scope fe
741 debug_log "aaa: fe not in scope"
744 # 8. If formatting element is not the current node, this is a parse
745 # error. (But do not abort these steps.)
746 unless open_els[0] is fe
749 # 9. Let furthest block be the topmost node in the stack of open
750 # elements that is lower in the stack than formatting element, and
751 # is an element in the special category. There might not be one.
753 fb_of_open_els = null
760 # and continue, to see if there's one that's more "topmost"
761 # 10. If there is no furthest block, then the UA must first pop all
762 # the nodes from the bottom of the stack of open elements, from the
763 # current node up to and including formatting element, then remove
764 # formatting element from the list of active formatting elements,
765 # and finally abort these steps.
767 debug_log "aaa: no fb"
771 afe.splice fe_of_afe, 1
773 # 11. Let common ancestor be the element immediately above
774 # formatting element in the stack of open elements.
775 ca = open_els[fe_of_open_els + 1] # common ancestor
777 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
778 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
779 bookmark = new_aaa_bookmark()
782 afe.splice i, 0, bookmark
784 node = last_node = fb
788 # 3. Let node be the element immediately above node in the
789 # stack of open elements, or if node is no longer in the stack
790 # of open elements (e.g. because it got removed by this
791 # algorithm), the element that was immediately above node in
792 # the stack of open elements before node was removed.
796 node_next = open_els[i + 1]
798 node = node_next ? node_above
799 debug_log "inner loop #{inner}"
800 debug_log "tree: #{serialize_els doc.children, false, true}"
801 debug_log "open_els: #{serialize_els open_els, true, true}"
802 debug_log "afe: #{serialize_els afe, true, true}"
803 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
804 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
805 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
806 debug_log "node: #{node.serialize true, true}"
807 # TODO make sure node_above gets re-set if/when node is removed from open_els
809 # 4. If node is formatting element, then go to the next step in
810 # the overall algorithm.
814 # 5. If inner loop counter is greater than three and node is in
815 # the list of active formatting elements, then remove node from
816 # the list of active formatting elements.
822 debug_log "max out inner"
827 # 6. If node is not in the list of active formatting elements,
828 # then remove node from the stack of open elements and then go
829 # back to the step labeled inner loop.
831 debug_log "not in afe"
834 node_above = open_els[i + 1]
838 debug_log "the bones"
839 # 7. create an element for the token for which the element node
840 # was created, in the HTML namespace, with common ancestor as
841 # the intended parent; replace the entry for node in the list
842 # of active formatting elements with an entry for the new
843 # element, replace the entry for node in the stack of open
844 # elements with an entry for the new element, and let node be
846 new_node = token_to_element node.token, NS_HTML, ca
850 debug_log "replaced in afe"
854 node_above = open_els[i + 1]
855 open_els[i] = new_node
856 debug_log "replaced in open_els"
859 # 8. If last node is furthest block, then move the
860 # aforementioned bookmark to be immediately after the new node
861 # in the list of active formatting elements.
866 debug_log "removed bookmark"
870 # "after" means lower
871 afe.splice i, 0, bookmark # "after as <-
872 debug_log "placed bookmark after node"
873 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
875 # 9. Insert last node into node, first removing it from its
876 # previous parent node if any.
878 debug_log "last_node has parent"
879 for c, i in last_node.parent.children
881 debug_log "removing last_node from parent"
882 last_node.parent.children.splice i, 1
884 node.children.push last_node
885 last_node.parent = node
886 # 10. Let last node be node.
889 # 11. Return to the step labeled inner loop.
890 # 14. Insert whatever last node ended up being in the previous step
891 # at the appropriate place for inserting a node, but using common
892 # ancestor as the override target.
894 # In the case where fe is immediately followed by fb:
895 # * inner loop exits out early (node==fe)
897 # * last_node is still in the tree (not a duplicate)
899 debug_log "FEFIRST? last_node has parent"
900 for c, i in last_node.parent.children
902 debug_log "removing last_node from parent"
903 last_node.parent.children.splice i, 1
906 debug_log "after aaa inner loop"
907 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
908 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
909 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
910 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
911 debug_log "tree: #{serialize_els doc.children, false, true}"
916 # can't use standard insert token thing, because it's already in
917 # open_els and must stay at its current position in open_els
918 dest = adjusted_insertion_location ca
919 dest[0].children.splice dest[1], 0, last_node
920 last_node.parent = dest[0]
923 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
924 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
925 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
926 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
927 debug_log "tree: #{serialize_els doc.children, false, true}"
929 # 15. Create an element for the token for which formatting element
930 # was created, in the HTML namespace, with furthest block as the
932 new_element = token_to_element fe.token, NS_HTML, fb
933 # 16. Take all of the child nodes of furthest block and append them
934 # to the element created in the last step.
935 while fb.children.length
936 t = fb.children.shift()
937 t.parent = new_element
938 new_element.children.push t
939 # 17. Append that new element to furthest block.
940 new_element.parent = fb
941 fb.children.push new_element
942 # 18. Remove formatting element from the list of active formatting
943 # elements, and insert the new element into the list of active
944 # formatting elements at the position of the aforementioned
954 # 19. Remove formatting element from the stack of open elements,
955 # and insert the new element into the stack of open elements
956 # immediately below the position of furthest block in that stack.
963 open_els.splice i, 0, new_element
965 # 20. Jump back to the step labeled outer loop.
966 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
967 debug_log "tree: #{serialize_els doc.children, false, true}"
968 debug_log "open_els: #{serialize_els open_els, true, true}"
969 debug_log "afe: #{serialize_els afe, true, true}"
972 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
974 generate_implied_end_tags 'p' # arg is exception
975 if open_els[0].name isnt 'p'
977 while open_els.length > 1 # just in case
978 el = open_els.shift()
981 close_p_if_in_button_scope = ->
982 if is_in_button_scope 'p'
985 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
986 # aka insert_a_character = (t) ->
987 insert_character = (t) ->
988 dest = adjusted_insertion_location()
989 # fixfull check for Document node
991 prev = dest[0].children[dest[1] - 1]
992 if prev.type is TYPE_TEXT
995 dest[0].children.splice dest[1], 0, t
998 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
999 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1000 adjusted_insertion_location = (override_target = null) ->
1001 # 1. If there was an override target specified, then let target be the
1004 target = override_target
1005 else # Otherwise, let target be the current node.
1006 target = open_els[0]
1007 # 2. Determine the adjusted insertion location using the first matching
1008 # steps from the following list:
1010 # If foster parenting is enabled and target is a table, tbody, tfoot,
1011 # thead, or tr element Foster parenting happens when content is
1012 # misnested in tables.
1013 if flag_foster_parenting and foster_parenting_targets[target.name]
1014 loop # once. this is here so we can ``break`` to "abort these substeps"
1015 # 1. Let last template be the last template element in the
1016 # stack of open elements, if any.
1017 last_template = null
1018 last_template_i = null
1019 for el, i in open_els
1020 if el.name is 'template'
1024 # 2. Let last table be the last table element in the stack of
1025 # open elements, if any.
1028 for el, i in open_els
1029 if el.name is 'table'
1033 # 3. If there is a last template and either there is no last
1034 # table, or there is one, but last template is lower (more
1035 # recently added) than last table in the stack of open
1036 # elements, then: let adjusted insertion location be inside
1037 # last template's template contents, after its last child (if
1038 # any), and abort these substeps.
1039 if last_template and (last_table is null or last_template_i < last_table_i)
1040 target = last_template # fixfull should be it's contents
1041 target_i = target.children.length
1043 # 4. If there is no last table, then let adjusted insertion
1044 # location be inside the first element in the stack of open
1045 # elements (the html element), after its last child (if any),
1046 # and abort these substeps. (fragment case)
1047 if last_table is null
1049 target = open_els[open_els.length - 1]
1050 target_i = target.children.length
1051 # 5. If last table has a parent element, then let adjusted
1052 # insertion location be inside last table's parent element,
1053 # immediately before last table, and abort these substeps.
1054 if last_table.parent?
1055 for c, i in last_table.parent.children
1057 target = last_table.parent
1061 # 6. Let previous element be the element immediately above last
1062 # table in the stack of open elements.
1064 # huh? how could it not have a parent?
1065 previous_element = open_els[last_table_i + 1]
1066 # 7. Let adjusted insertion location be inside previous
1067 # element, after its last child (if any).
1068 target = previous_element
1069 target_i = target.children.length
1070 # Note: These steps are involved in part because it's possible
1071 # for elements, the table element in this case in particular,
1072 # to have been moved by a script around in the DOM, or indeed
1073 # removed from the DOM entirely, after the element was inserted
1075 break # don't really loop
1077 # Otherwise Let adjusted insertion location be inside target, after
1078 # its last child (if any).
1079 target_i = target.children.length
1081 # 3. If the adjusted insertion location is inside a template element,
1082 # let it instead be inside the template element's template contents,
1083 # after its last child (if any).
1084 # fixfull (template)
1086 # 4. Return the adjusted insertion location.
1087 return [target, target_i]
1089 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1090 # aka create_an_element_for_token
1091 token_to_element = (t, namespace, intended_parent) -> # builds a TYPE_TAG Node named after token t, in the given namespace
1092 # convert attributes into a hash
1095 attrs[a[0]] = a[1] # a[0] is used as the key, a[1] as the value. TODO check what to do with duplicate attrs
1096 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1098 # TODO 2. If the newly created element has an xmlns attribute in the
1099 # XMLNS namespace whose value is not exactly the same as the element's
1100 # namespace, that is a parse error. Similarly, if the newly created
1101 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1102 # value is not the XLink Namespace, that is a parse error.
1104 # fixfull: the spec says stuff about form pointers and ownerDocument
1108 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1109 insert_foreign_element = (token, namespace) ->
1110 ail = adjusted_insertion_location()
1113 el = token_to_element token, namespace, ail_el
1114 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1116 ail_el.children.splice ail_i, 0, el
1119 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1120 insert_html_element = insert_foreign_element # (token, namespace) ->
1122 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1123 # position should be [node, index_within_children]
1124 insert_comment = (t, position = null) -> # splices comment token t into the children of position[0] at index position[1]
1125 position ?= adjusted_insertion_location() # no position given: use the adjusted insertion location
1126 position[0].children.splice position[1], 0, t
1129 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1130 parse_generic_raw_text = (t) -> # inserts an element for t, then switches the tokenizer to rawtext and the insertion mode to text
1131 insert_html_element t
1132 tok_state = tok_state_rawtext
1133 original_ins_mode = ins_mode # saved so ins_mode_text can switch back to it
1134 ins_mode = ins_mode_text
1135 parse_generic_rcdata_text = (t) -> # inserts an element for t, then switches the tokenizer to rcdata and the insertion mode to text
1136 insert_html_element t
1137 tok_state = tok_state_rcdata
1138 original_ins_mode = ins_mode # saved so ins_mode_text can switch back to it
1139 ins_mode = ins_mode_text
1141 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1142 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1143 generate_implied_end_tags = (except = null) -> # except: optional tag name at which the loop stops
1144 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except # runs while the current node's name is in end_tag_implied and is not `except`
1147 # 8.2.5.4 The rules for parsing tokens in HTML content
1148 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1150 # 8.2.5.4.1 The "initial" insertion mode
1151 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1152 ins_mode_initial = (t) ->
1155 if t.type is TYPE_COMMENT
1159 if t.type is TYPE_DOCTYPE
1160 # FIXME check identifiers, set quirks, etc
1163 ins_mode = ins_mode_before_html
1166 #fixfull (iframe, quirks)
1167 ins_mode = ins_mode_before_html
1168 ins_mode t # reprocess the token
1171 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1172 ins_mode_before_html = (t) ->
1173 if t.type is TYPE_DOCTYPE
1176 if t.type is TYPE_COMMENT
1181 if t.type is TYPE_START_TAG and t.name is 'html'
1182 el = token_to_element t, NS_HTML, doc
1183 doc.children.push el
1184 open_els.unshift(el)
1185 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1186 ins_mode = ins_mode_before_head
1188 if t.type is TYPE_END_TAG
1189 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1190 # fall through to "anything else"
1195 html_tok = new_open_tag 'html'
1196 el = token_to_element html_tok, NS_HTML, doc
1197 doc.children.push el
1199 # ?fixfull browsing context
1200 ins_mode = ins_mode_before_head
1204 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1205 ins_mode_before_head = (t) ->
1208 if t.type is TYPE_COMMENT
1211 if t.type is TYPE_DOCTYPE
1214 if t.type is TYPE_START_TAG and t.name is 'html'
1217 if t.type is TYPE_START_TAG and t.name is 'head'
1218 el = insert_html_element t
1219 head_element_pointer = el
1220 ins_mode = ins_mode_in_head
1221 if t.type is TYPE_END_TAG
1222 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1223 # fall through to Anything else below
1228 head_tok = new_open_tag 'head'
1229 el = insert_html_element head_tok
1230 head_element_pointer = el
1231 ins_mode = ins_mode_in_head
1232 ins_mode t # reprocess current token
1234 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1235 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1236 open_els.shift() # spec says this will be a 'head' node
1237 ins_mode = ins_mode_after_head
1239 ins_mode_in_head = (t) ->
1240 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ') # CR counts as whitespace here too, matching the check in ins_mode_in_head_noscript
1243 if t.type is TYPE_COMMENT
1246 if t.type is TYPE_DOCTYPE
1249 if t.type is TYPE_START_TAG and t.name is 'html'
1252 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1253 el = insert_html_element t
1255 t.acknowledge_self_closing()
1257 if t.type is TYPE_START_TAG and t.name is 'meta'
1258 el = insert_html_element t
1260 t.acknowledge_self_closing()
1261 # fixfull encoding stuff
1263 if t.type is TYPE_START_TAG and t.name is 'title'
1264 parse_generic_rcdata_text t
1266 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1267 parse_generic_raw_text t
1269 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1270 insert_html_element t
1271 ins_mode = ins_mode_in_head_noscript
1273 if t.type is TYPE_START_TAG and t.name is 'script'
1274 ail = adjusted_insertion_location()
1275 el = token_to_element t, NS_HTML, ail[0] # intended parent is the node, not the whole [node, index] pair
1276 el.flag 'parser-inserted', true
1277 # fixfull fragment case
1278 ail[0].children.splice ail[1], 0, el
1280 tok_state = tok_state_script_data
1281 original_ins_mode = ins_mode # make sure orig... is defined
1282 ins_mode = ins_mode_text
1284 if t.type is TYPE_END_TAG and t.name is 'head'
1285 open_els.shift() # will be a head element... spec says so
1286 ins_mode = ins_mode_after_head
1288 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1289 ins_mode_in_head_else t
1291 if t.type is TYPE_START_TAG and t.name is 'template'
1292 insert_html_element t
1294 flag_frameset_ok = false
1295 ins_mode = ins_mode_in_template
1296 template_ins_modes.unshift ins_mode_in_template
1298 if t.type is TYPE_END_TAG and t.name is 'template'
1299 if template_tag_is_open()
1300 generate_implied_end_tags() # parentheses required: a bare name only references the function, it does not call it
1301 if open_els[0].name isnt 'template'
1304 el = open_els.shift()
1305 if el.name is 'template'
1307 clear_afe_to_marker()
1308 template_ins_modes.shift()
1313 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1316 ins_mode_in_head_else t
1318 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1319 ins_mode_in_head_noscript_else = (t) ->
1322 ins_mode = ins_mode_in_head
1324 ins_mode_in_head_noscript = (t) ->
1325 if t.type is TYPE_DOCTYPE
1328 if t.type is TYPE_START_TAG
1331 if t.type is TYPE_END_TAG and t.name is 'noscript'
1333 ins_mode = ins_mode_in_head
1335 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1338 if t.type is TYPE_END_TAG and t.name is 'br'
1339 ins_mode_in_head_noscript_else t
1341 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1345 ins_mode_in_head_noscript_else t
1350 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1351 ins_mode_after_head_else = (t) -> # "anything else" handling for the after-head insertion mode
1352 body_tok = new_open_tag 'body'
1353 insert_html_element body_tok # insert an element for the 'body' token made by new_open_tag
1354 ins_mode = ins_mode_in_body
1355 ins_mode t # reprocess token
1357 ins_mode_after_head = (t) ->
1361 if t.type is TYPE_COMMENT
1364 if t.type is TYPE_DOCTYPE
1367 if t.type is TYPE_START_TAG and t.name is 'html'
1370 if t.type is TYPE_START_TAG and t.name is 'body'
1371 insert_html_element t
1372 flag_frameset_ok = false
1373 ins_mode = ins_mode_in_body
1375 if t.type is TYPE_START_TAG and t.name is 'frameset'
1376 insert_html_element t
1377 ins_mode = ins_mode_in_frameset
1379 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1381 open_els.unshift head_element_pointer
1383 for el, i in open_els # "in" walks array values (el = element, i = index); "of" would bind el to the key and never match
1384 if el is head_element_pointer
1385 open_els.splice i, 1
1387 console.log "warning: 23904 couldn't find head element in open_els"
1389 if t.type is TYPE_END_TAG and t.name is 'template'
1392 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1393 ins_mode_after_head_else t
1395 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1399 ins_mode_after_head_else t
1401 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1402 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1403 for el, i in open_els
1404 if el.namespace is NS_HTML and el.name is name
1405 generate_implied_end_tags name # arg is exception
1406 parse_error() unless i is 0
1411 if special_elements[el.name] is el.namespace
1415 ins_mode_in_body = (t) ->
1416 if t.type is TYPE_TEXT and t.text is "\u0000"
1420 reconstruct_active_formatting_elements()
1423 if t.type is TYPE_TEXT
1424 reconstruct_active_formatting_elements()
1426 flag_frameset_ok = false
1428 if t.type is TYPE_COMMENT
1431 if t.type is TYPE_DOCTYPE
1434 if t.type is TYPE_START_TAG and t.name is 'html'
1436 return if template_tag_is_open()
1437 root_attrs = open_els[open_els.length - 1].attrs
1439 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1442 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1445 if t.type is TYPE_START_TAG and t.name is 'body'
1447 return if open_els.length < 2
1448 second = open_els[open_els.length - 2]
1449 return unless second.namespace is NS_HTML # other checks in this file read .namespace, not .ns
1450 return unless second.name is 'body'
1451 return if template_tag_is_open()
1452 flag_frameset_ok = false # spec: set the frameset-ok flag to "not ok" (was assigned to a misspelled, unused name)
1454 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1456 if t.type is TYPE_START_TAG and t.name is 'frameset'
1458 return if open_els.length < 2
1459 second_i = open_els.length - 2
1460 second = open_els[second_i]
1461 return unless second.namespace is NS_HTML # other checks in this file read .namespace, not .ns
1462 return unless second.name is 'body'
1463 return if flag_frameset_ok is false # spec: ignore the token when the frameset-ok flag is "not ok"
1465 for el, i in second.parent.children
1467 second.parent.children.splice i, 1
1469 open_els.splice second_i, 1
1470 # pop everything except the "root html element"
1471 while open_els.length > 1
1473 insert_html_element t
1474 ins_mode = ins_mode_in_frameset
1476 if t.type is TYPE_EOF
1478 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1479 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1480 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1483 unless ok_tags[t.name] is el.namespace
1486 if template_ins_modes.length > 0
1487 ins_mode_in_template t
1491 if t.type is TYPE_END_TAG and t.name is 'body'
1492 unless is_in_scope 'body'
1496 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1497 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1498 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1499 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1503 unless ok_tags[t.name] is el.namespace
1506 ins_mode = ins_mode_after_body
1508 if t.type is TYPE_END_TAG and t.name is 'html'
1509 unless is_in_scope 'body'
1513 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1514 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1515 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1516 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1520 unless ok_tags[t.name] is el.namespace
1523 ins_mode = ins_mode_after_body
1526 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1527 close_p_if_in_button_scope()
1528 insert_html_element t
1530 if t.type is TYPE_START_TAG and h_tags[t.name]?
1531 close_p_if_in_button_scope()
1532 if h_tags[open_els[0].name] is open_els[0].namespace # look up the current node's tag name (not the node object) and require the matching namespace
1535 insert_html_element t
1537 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1538 close_p_if_in_button_scope()
1539 insert_html_element t
1540 # spec: If the next token is a "LF" (U+000A) character token, then
1541 # ignore that token and move on to the next one. (Newlines at the
1542 # start of pre blocks are ignored as an authoring convenience.)
1543 if txt.charAt(cur) is "\u000a"
1545 flag_frameset_ok = false
1547 if t.type is TYPE_START_TAG and t.name is 'form'
1548 unless form_element_pointer is null or template_tag_is_open()
1551 close_p_if_in_button_scope()
1552 el = insert_html_element t
1553 unless template_tag_is_open()
1554 form_element_pointer = el
1556 if t.type is TYPE_START_TAG and t.name is 'li'
1557 flag_frameset_ok = false
1558 for node in open_els
1559 if node.name is 'li' and node.namespace is NS_HTML
1560 generate_implied_end_tags 'li' # arg is exception
1561 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1564 el = open_els.shift()
1565 if el.name is 'li' and el.namespace is NS_HTML
1568 if el_is_special_not_adp node
1570 close_p_if_in_button_scope()
1571 insert_html_element t
1573 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1574 flag_frameset_ok = false
1575 for node in open_els
1576 if node.name is 'dd' and node.namespace is NS_HTML
1577 generate_implied_end_tags 'dd' # arg is exception
1578 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1581 el = open_els.shift()
1582 if el.name is 'dd' and el.namespace is NS_HTML
1585 if node.name is 'dt' and node.namespace is NS_HTML
1586 generate_implied_end_tags 'dt' # arg is exception
1587 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1590 el = open_els.shift()
1591 if el.name is 'dt' and el.namespace is NS_HTML
1594 if el_is_special_not_adp node
1596 close_p_if_in_button_scope()
1597 insert_html_element t
1601 if t.type is TYPE_START_TAG and t.name is 'a'
1602 # If the list of active formatting elements contains an a element
1603 # between the end of the list and the last marker on the list (or
1604 # the start of the list if there is no marker on the list), then
1605 # this is a parse error; run the adoption agency algorithm for the
1606 # tag name "a", then remove that element from the list of active
1607 # formatting elements and the stack of open elements if the
1608 # adoption agency algorithm didn't already remove it (it might not
1609 # have if the element is not in table scope).
1612 if el.type is TYPE_AFE_MARKER
1622 for el, i in open_els
1624 open_els.splice i, 1
1625 reconstruct_active_formatting_elements()
1626 el = insert_html_element t
1629 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1630 reconstruct_active_formatting_elements()
1631 el = insert_html_element t
1634 if t.type is TYPE_START_TAG and t.name is 'table'
1635 # fixfull quirksmode thing
1636 close_p_if_in_button_scope()
1637 insert_html_element t
1638 ins_mode = ins_mode_in_table
1640 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1641 unless is_in_scope t.name, NS_HTML
1644 generate_implied_end_tags()
1645 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1648 el = open_els.shift()
1649 if el.name is t.name and el.namespace is NS_HTML
1652 if t.type is TYPE_END_TAG and t.name is 'p'
1653 unless is_in_button_scope 'p'
1655 insert_html_element new_open_tag 'p'
1658 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1659 adoption_agency t.name
1661 if t.type is TYPE_START_TAG # any other start tag
1662 reconstruct_active_formatting_elements()
1663 insert_html_element t
1665 if t.type is TYPE_END_TAG # any other end tag
1666 in_body_any_other_end_tag t.name
1669 ins_mode_in_table_else = (t) ->
1671 flag_foster_parenting = true # FIXME
1673 flag_foster_parenting = false
1674 can_in_table = { # FIXME do this inline like everywhere else
1682 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1683 ins_mode_text = (t) ->
1684 if t.type is TYPE_TEXT
1687 if t.type is TYPE_EOF
1689 if open_els[0].name is 'script'
1690 open_els[0].flag 'already started', true
1692 ins_mode = original_ins_mode
1695 if t.type is TYPE_END_TAG and t.name is 'script'
1697 ins_mode = original_ins_mode
1698 # fixfull the spec seems to assume that I'm going to run the script
1699 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1701 if t.type is TYPE_END_TAG
1703 ins_mode = original_ins_mode
1705 console.log 'warning: end of ins_mode_text reached'
1707 # the functions below implement the tokenizer states described here:
1708 # http://www.w3.org/TR/html5/syntax.html#tokenization
1710 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1711 ins_mode_in_table = (t) ->
1714 if can_in_table[t.name]
1715 original_ins_mode = ins_mode
1716 ins_mode = ins_mode_in_table_text
1719 ins_mode_in_table_else t
1727 clear_stack_to_table_context()
1729 insert_html_element t
1730 ins_mode = ins_mode_in_caption
1732 clear_stack_to_table_context()
1733 insert_html_element t
1734 ins_mode = ins_mode_in_column_group
1736 clear_stack_to_table_context()
1737 insert_html_element new_open_tag 'colgroup'
1738 ins_mode = ins_mode_in_column_group
1740 when 'tbody', 'tfoot', 'thead'
1741 clear_stack_to_table_context()
1742 insert_html_element t
1743 ins_mode = ins_mode_in_table_body
1744 when 'td', 'th', 'tr'
1745 clear_stack_to_table_context()
1746 insert_html_element new_open_tag 'tbody'
1747 ins_mode = ins_mode_in_table_body
1751 if is_in_table_scope 'table'
1753 el = open_els.shift()
1754 if el.name is 'table'
1758 when 'style', 'script', 'template'
1761 if is_input_hidden_tok t
1762 ins_mode_in_table_else t
1765 el = insert_html_element t
1767 t.acknowledge_self_closing()
1770 if form_element_pointer?
1772 if template_tag_is_open()
1774 form_element_pointer = insert_html_element t
1777 ins_mode_in_table_else t
1781 if is_in_table_scope 'table'
1783 el = open_els.shift()
1784 if el.name is 'table'
1789 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1794 ins_mode_in_table_else t
1798 ins_mode_in_table_else t
1801 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1802 ins_mode_in_table_text = (t) ->
1803 if t.type is TYPE_TEXT and t.text is "\u0000"
1804 # huh? I thought the tokenizer didn't emit these
1807 if t.type is TYPE_TEXT
1808 pending_table_character_tokens.push t
1812 for old in pending_table_character_tokens
1813 unless is_space_tok old
1817 for old in pending_table_character_tokens
1818 insert_character old
1820 for old in pending_table_character_tokens
1821 ins_mode_in_table_else old # the "anything else" handler for in-table is named ins_mode_in_table_else
1822 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1823 ins_mode = original_ins_mode
1826 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1827 ins_mode_in_caption = (t) ->
1828 if t.type is TYPE_END_TAG and t.name is 'caption'
1829 if is_in_table_scope 'caption'
1830 generate_implied_end_tags()
1831 if open_els[0].name isnt 'caption'
1834 el = open_els.shift()
1835 if el.name is 'caption'
1837 clear_afe_to_marker()
1838 ins_mode = ins_mode_in_table
1843 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1845 if is_in_table_scope 'caption'
1847 el = open_els.shift()
1848 if el.name is 'caption'
1850 clear_afe_to_marker()
1851 ins_mode = ins_mode_in_table
1853 # else fragment case
1855 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1861 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1862 ins_mode_in_column_group = (t) ->
1866 if t.type is TYPE_COMMENT
1869 if t.type is TYPE_DOCTYPE
1872 if t.type is TYPE_START_TAG and t.name is 'html'
1875 if t.type is TYPE_START_TAG and t.name is 'col'
1876 el = insert_html_element t
1878 t.acknowledge_self_closing()
1880 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1881 if open_els[0].name is 'colgroup'
1883 ins_mode = ins_mode_in_table
1887 if t.type is TYPE_END_TAG and t.name is 'col'
1890 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1893 if t.type is TYPE_EOF
1897 if open_els[0].name isnt 'colgroup'
1901 ins_mode = ins_mode_in_table
1905 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1906 ins_mode_in_table_body = (t) ->
1907 if t.type is TYPE_START_TAG and t.name is 'tr'
1908 clear_stack_to_table_body_context()
1909 insert_html_element t
1910 ins_mode = ins_mode_in_row
1912 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1914 clear_stack_to_table_body_context()
1915 insert_html_element new_open_tag 'tr'
1916 ins_mode = ins_mode_in_row
1919 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1920 unless is_in_table_scope t.name # fixfull check namespace
1923 clear_stack_to_table_body_context()
1925 ins_mode = ins_mode_in_table
1927 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1930 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1933 if table_scopers[el.name]
1938 clear_stack_to_table_body_context()
1940 ins_mode = ins_mode_in_table
1943 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1949 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1950 ins_mode_in_row = (t) ->
1951 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1952 clear_stack_to_table_row_context()
1953 insert_html_element t
1954 ins_mode = ins_mode_in_cell
1957 if t.type is TYPE_END_TAG and t.name is 'tr'
1958 if is_in_table_scope 'tr'
1959 clear_stack_to_table_row_context()
1961 ins_mode = ins_mode_in_table_body
1965 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1966 if is_in_table_scope 'tr'
1967 clear_stack_to_table_row_context()
1969 ins_mode = ins_mode_in_table_body
1974 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1975 if is_in_table_scope t.name # fixfull namespace
1976 if is_in_table_scope 'tr'
1977 clear_stack_to_table_row_context()
1979 ins_mode = ins_mode_in_table_body
1984 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1990 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1992 generate_implied_end_tags()
1993 unless open_els[0].name is 'td' or open_els[0].name is 'th' # compare the tag name; a node object never equals the string 'th'
1996 el = open_els.shift()
1997 if el.name is 'td' or el.name is 'th'
1999 clear_afe_to_marker()
2000 ins_mode = ins_mode_in_row
2002 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2003 ins_mode_in_cell = (t) ->
2004 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2005 if is_in_table_scope t.name
2006 generate_implied_end_tags()
2007 if open_els[0].name isnt t.name
2010 el = open_els.shift()
2011 if el.name is t.name
2013 clear_afe_to_marker()
2014 ins_mode = ins_mode_in_row
2018 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2021 if el.name is 'td' or el.name is 'th'
2024 if table_scopers[el.name]
2032 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2035 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2036 if is_in_table_scope t.name # fixfull namespace
2045 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2046 ins_mode_in_select = (t) ->
2047 if t.type is TYPE_TEXT and t.text is "\u0000"
2050 if t.type is TYPE_TEXT
2053 if t.type is TYPE_COMMENT
2056 if t.type is TYPE_DOCTYPE
2059 if t.type is TYPE_START_TAG and t.name is 'html'
2062 if t.type is TYPE_START_TAG and t.name is 'option'
2063 if open_els[0].name is 'option'
2065 insert_html_element t
2067 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2068 if open_els[0].name is 'option'
2070 if open_els[0].name is 'optgroup'
2072 insert_html_element t
2074 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2075 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2077 if open_els[0].name is 'optgroup'
2082 if t.type is TYPE_END_TAG and t.name is 'option'
2083 if open_els[0].name is 'option'
2088 if t.type is TYPE_END_TAG and t.name is 'select'
2089 if is_in_select_scope 'select'
2091 el = open_els.shift()
2092 if el.name is 'select'
2098 if t.type is TYPE_START_TAG and t.name is 'select'
2101 el = open_els.shift()
2102 if el.name is 'select'
2105 # spec says that this is the same as </select> but it doesn't say
2106 # to check scope first
2108 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2110 if is_in_select_scope 'select'
2113 el = open_els.shift()
2114 if el.name is 'select'
2119 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2122 if t.type is TYPE_EOF
2129 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2130 ins_mode_in_select_in_table = (t) ->
2131 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2134 el = open_els.shift()
2135 if el.name is 'select'
2140 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2142 unless is_in_table_scope t.name, NS_HTML
2145 el = open_els.shift()
2146 if el.name is 'select'
2152 ins_mode_in_select t
2155 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Insertion mode "in template". For table-related start tags (and finally for
# any other start tag) it replaces the head of template_ins_modes -- shift()
# followed by unshift() -- with the insertion mode that suits the tag and
# makes that mode current via ins_mode. On EOF it checks
# template_tag_is_open(), shifts open_els while looking for 'template', then
# calls clear_afe_to_marker() and shifts template_ins_modes.
2156 ins_mode_in_template = (t) ->
2157 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2160 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2163 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2164 template_ins_modes.shift()
2165 template_ins_modes.unshift ins_mode_in_table
2166 ins_mode = ins_mode_in_table
2169 if t.type is TYPE_START_TAG and t.name is 'col'
2170 template_ins_modes.shift()
2171 template_ins_modes.unshift ins_mode_in_column_group
2172 ins_mode = ins_mode_in_column_group
2175 if t.type is TYPE_START_TAG and t.name is 'tr'
2176 template_ins_modes.shift()
2177 template_ins_modes.unshift ins_mode_in_table_body
2178 ins_mode = ins_mode_in_table_body
2181 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2182 template_ins_modes.shift()
2183 template_ins_modes.unshift ins_mode_in_row
2184 ins_mode = ins_mode_in_row
# any start tag not matched above is handled with the "in body" mode
2187 if t.type is TYPE_START_TAG
2188 template_ins_modes.shift()
2189 template_ins_modes.unshift ins_mode_in_body
2190 ins_mode = ins_mode_in_body
2193 if t.type is TYPE_END_TAG
2196 if t.type is TYPE_EOF
2197 unless template_tag_is_open()
2202 el = open_els.shift()
2203 if el.name is 'template' # fixfull check namespace
2205 clear_afe_to_marker()
2206 template_ins_modes.shift()
2210 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Insertion mode "after body". Comments are attached to the root of the stack
# of open elements, </html> moves to ins_mode_after_after_body, and the last
# visible statement switches back to ins_mode_in_body.
2211 ins_mode_after_body = (t) ->
2215 if t.type is TYPE_COMMENT
# The spec inserts this comment as the last child of the FIRST element in the
# stack of open elements (the html element). open_els keeps the current node
# at index 0, so the first element is the one at the highest index.
first_el = open_els[open_els.length - 1]
2216 insert_comment t, [first_el, first_el.children.length]
2218 if t.type is TYPE_DOCTYPE
2221 if t.type is TYPE_START_TAG and t.name is 'html'
2224 if t.type is TYPE_END_TAG and t.name is 'html'
2225 # fixfull fragment case
2226 ins_mode = ins_mode_after_after_body
2228 if t.type is TYPE_EOF
2233 ins_mode = ins_mode_in_body
2236 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Insertion mode "in frameset". <frameset> and <frame> start tags are inserted
# with insert_html_element (<frame> also acknowledges its self-closing flag).
# </frameset> returns early when open_els holds a single element, and may
# switch to ins_mode_after_frameset.
2237 ins_mode_in_frameset = (t) ->
2241 if t.type is TYPE_COMMENT
2244 if t.type is TYPE_DOCTYPE
2247 if t.type is TYPE_START_TAG and t.name is 'html'
2250 if t.type is TYPE_START_TAG and t.name is 'frameset'
2251 insert_html_element t
2253 if t.type is TYPE_END_TAG and t.name is 'frameset'
2254 # TODO ?correct for: "if the current node is the root html element"
2255 if open_els.length is 1
2257 return # fragment case
# open_els[0] is the current node
2259 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2260 ins_mode = ins_mode_after_frameset
2262 if t.type is TYPE_START_TAG and t.name is 'frame'
2263 insert_html_element t
2265 t.acknowledge_self_closing()
2267 if t.type is TYPE_START_TAG and t.name is 'noframes'
2270 if t.type is TYPE_EOF
2271 # TODO ?correct for: "if the current node is not the root html element"
2272 if open_els.length isnt 1
2280 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion mode "after frameset". Dispatches on comment, doctype, <html>,
# </html>, <noframes> and EOF tokens; </html> moves the tree builder to
# ins_mode_after_after_frameset.
2281 ins_mode_after_frameset = (t) ->
2285 if t.type is TYPE_COMMENT
2288 if t.type is TYPE_DOCTYPE
2291 if t.type is TYPE_START_TAG and t.name is 'html'
2294 if t.type is TYPE_END_TAG and t.name is 'html'
# Must assign the shared `ins_mode` variable (as every other insertion mode
# does); the former `insert_mode` only created an unused local, so the mode
# never changed.
2295 ins_mode = ins_mode_after_after_frameset
2297 if t.type is TYPE_START_TAG and t.name is 'noframes'
2300 if t.type is TYPE_EOF
2307 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Insertion mode "after after body". Comments are appended to `doc` (inserted
# at index doc.children.length). Doctype, whitespace and <html> start tags
# share one branch, EOF another; the last visible statement switches back to
# ins_mode_in_body.
2308 ins_mode_after_after_body = (t) ->
2309 if t.type is TYPE_COMMENT
2310 insert_comment t, [doc, doc.children.length]
2312 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2315 if t.type is TYPE_EOF
2320 ins_mode = ins_mode_in_body
2323 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Insertion mode "after after frameset". Comments are appended to `doc`
# (inserted at index doc.children.length). Doctype, whitespace and <html>
# start tags share one branch; EOF and <noframes> each have their own.
2324 ins_mode_after_after_frameset = (t) ->
2325 if t.type is TYPE_COMMENT
2326 insert_comment t, [doc, doc.children.length]
2328 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2331 if t.type is TYPE_EOF
2334 if t.type is TYPE_START_TAG and t.name is 'noframes'
2345 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Body of the tokenizer state that the header above labels the data state.
# Consumes one character from txt (advancing cur) and returns a text node
# (from parse_character_reference() or the character itself) or an EOF
# token, or hands over to tok_state_tag_open.
2347 switch c = txt.charAt(cur++)
2349 return new_text_node parse_character_reference()
2351 tok_state = tok_state_tag_open
2354 return new_text_node c
2356 return new_eof_token()
2358 return new_text_node c
2361 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2362 # not needed: tok_state_character_reference_in_data = ->
2363 # just call parse_character_reference()
2365 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# RCDATA state. Consumes one character and returns a token for it: the result
# of parse_character_reference(), a U+FFFD replacement, an EOF token, or the
# character itself. It can also move to tok_state_rcdata_less_than_sign.
2366 tok_state_rcdata = ->
2367 switch c = txt.charAt(cur++)
2369 return new_text_node parse_character_reference()
2371 tok_state = tok_state_rcdata_less_than_sign
2374 return new_character_token "\ufffd"
2376 return new_eof_token()
2378 return new_character_token c
2381 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2382 # not needed: tok_state_character_reference_in_rcdata = ->
2383 # just call parse_character_reference()
2385 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# RAWTEXT state. Consumes one character and returns a U+FFFD replacement, an
# EOF token or the character itself, or moves to
# tok_state_rawtext_less_than_sign. No character references are parsed here.
2386 tok_state_rawtext = ->
2387 switch c = txt.charAt(cur++)
2389 tok_state = tok_state_rawtext_less_than_sign
2392 return new_character_token "\ufffd"
2394 return new_eof_token()
2396 return new_character_token c
2399 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Script data state. Consumes one character and returns a U+FFFD replacement,
# an EOF token or the character itself, or moves to
# tok_state_script_data_less_than_sign.
2400 tok_state_script_data = ->
2401 switch c = txt.charAt(cur++)
2403 tok_state = tok_state_script_data_less_than_sign
2406 return new_character_token "\ufffd"
2408 return new_eof_token()
2410 return new_character_token c
2413 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# PLAINTEXT state. Consumes one character and returns a U+FFFD replacement,
# an EOF token or the character itself. No visible branch changes tok_state.
2414 tok_state_plaintext = ->
2415 switch c = txt.charAt(cur++)
2418 return new_character_token "\ufffd"
2420 return new_eof_token()
2422 return new_character_token c
2426 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tag open state (the character after '<'). Depending on that character it
# moves to the markup-declaration, end-tag-open or bogus-comment states, or
# starts a new open tag (name lowercased) and moves to tok_state_tag_name.
# Otherwise the '<' is emitted as text and the character is left to be read
# again in the data state.
2427 tok_state_tag_open = ->
2428 switch c = txt.charAt(cur++)
2430 tok_state = tok_state_markup_declaration_open
2432 tok_state = tok_state_end_tag_open
# the bogus comment state appends to the comment token stored in tok_cur_tag
2435 tok_cur_tag = new_comment_token '?'
2436 tok_state = tok_state_bogus_comment
2439 tok_cur_tag = new_open_tag c
2440 tok_state = tok_state_tag_name
2441 else if is_uc_alpha(c)
2442 tok_cur_tag = new_open_tag c.toLowerCase()
2443 tok_state = tok_state_tag_name
2446 tok_state = tok_state_data
2447 cur -= 1 # we didn't parse/handle the char after <
2448 return new_text_node '<'
2451 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# End tag open state (the character after '</'). A letter starts a new end
# tag (name lowercased) and moves to tok_state_tag_name. Two branches return
# to the data state, one of them emitting '</' as text. Anything else starts
# a bogus comment.
2452 tok_state_end_tag_open = ->
2453 switch c = txt.charAt(cur++)
2456 tok_state = tok_state_data
2459 tok_state = tok_state_data
2460 return new_text_node '</'
2463 tok_cur_tag = new_end_tag c.toLowerCase()
2464 tok_state = tok_state_tag_name
2465 else if is_lc_alpha(c)
2466 tok_cur_tag = new_end_tag c
2467 tok_state = tok_state_tag_name
# A bogus comment's data starts with the character that triggered it, not
# with '/'. Un-consume that character and start from an empty comment so
# tok_state_bogus_comment picks it up (and applies its NUL replacement).
cur -= 1
2470 tok_cur_tag = new_comment_token ''
2471 tok_state = tok_state_bogus_comment
2474 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tag name state. Whitespace ends the name and moves to
# tok_state_before_attribute_name. Other branches move to the self-closing
# or data states, append U+FFFD to the name, or append the character
# (lowercased in one branch) to tok_cur_tag.name.
2475 tok_state_tag_name = ->
2476 switch c = txt.charAt(cur++)
2477 when "\t", "\n", "\u000c", ' '
2478 tok_state = tok_state_before_attribute_name
2480 tok_state = tok_state_self_closing_start_tag
2482 tok_state = tok_state_data
2488 tok_cur_tag.name += "\ufffd"
2491 tok_state = tok_state_data
2494 tok_cur_tag.name += c.toLowerCase()
2496 tok_cur_tag.name += c
2499 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# RCDATA less-than sign state. Either clears temporary_buffer and moves to
# tok_state_rcdata_end_tag_open, or goes back to RCDATA, un-consumes the
# character and emits '<' as a character token.
2500 tok_state_rcdata_less_than_sign = ->
2501 c = txt.charAt(cur++)
2503 temporary_buffer = ''
2504 tok_state = tok_state_rcdata_end_tag_open
2507 tok_state = tok_state_rcdata
2508 cur -= 1 # reconsume the input character
2509 return new_character_token '<'
2511 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# RCDATA end tag open state. A letter starts an end tag token (name
# lowercased), records the raw character in temporary_buffer and moves to
# tok_state_rcdata_end_tag_name. Otherwise "</" is emitted as text and the
# character is reconsumed in RCDATA.
2512 tok_state_rcdata_end_tag_open = ->
2513 c = txt.charAt(cur++)
2515 tok_cur_tag = new_end_tag c.toLowerCase()
2516 temporary_buffer += c
2517 tok_state = tok_state_rcdata_end_tag_name
2520 tok_cur_tag = new_end_tag c
2521 temporary_buffer += c
2522 tok_state = tok_state_rcdata_end_tag_name
2525 tok_state = tok_state_rcdata
2526 cur -= 1 # reconsume the input character
2527 return new_character_token "</" # fixfull separate these
2529 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of
# open_els[0] (the current node). Also writes the token and the open-element
# stack to debug_log on every call.
2530 is_appropriate_end_tag = (t) ->
2531 # spec says to check against "the tag name of the last start tag to
2532 # have been emitted from this tokenizer", but this is only called from
2533 # the various "raw" states, which I'm pretty sure all push the start
2534 # token onto open_els. TODO: verify this after the script data states
2536 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2537 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2539 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# RCDATA end tag name state. While letters arrive, the lowercased name
# accumulates in tok_cur_tag.name and the raw text in temporary_buffer. A
# terminating character completes the tag only if is_appropriate_end_tag
# accepts it; otherwise '</' plus the buffered text is emitted as characters
# and tokenizing resumes in RCDATA with the character reconsumed.
2540 tok_state_rcdata_end_tag_name = ->
2541 c = txt.charAt(cur++)
2542 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2543 if is_appropriate_end_tag tok_cur_tag
2544 tok_state = tok_state_before_attribute_name
2546 # else fall through to "Anything else"
2548 if is_appropriate_end_tag tok_cur_tag
2549 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2551 # else fall through to "Anything else"
2553 if is_appropriate_end_tag tok_cur_tag
2554 tok_state = tok_state_data
2556 # else fall through to "Anything else"
2558 tok_cur_tag.name += c.toLowerCase()
2559 temporary_buffer += c
2562 tok_cur_tag.name += c
2563 temporary_buffer += c
2566 tok_state = tok_state_rcdata
2567 cur -= 1 # reconsume the input character
2568 return new_character_token '</' + temporary_buffer # fixfull separate these
2570 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# RAWTEXT less-than sign state. Either clears temporary_buffer and moves to
# tok_state_rawtext_end_tag_open, or goes back to RAWTEXT, un-consumes the
# character and emits '<' as a character token.
2571 tok_state_rawtext_less_than_sign = ->
2572 c = txt.charAt(cur++)
2574 temporary_buffer = ''
2575 tok_state = tok_state_rawtext_end_tag_open
2578 tok_state = tok_state_rawtext
2579 cur -= 1 # reconsume the input character
2580 return new_character_token '<'
2582 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# RAWTEXT end tag open state. A letter starts an end tag token (name
# lowercased), records the raw character in temporary_buffer and moves to
# tok_state_rawtext_end_tag_name. Otherwise "</" is emitted as text and the
# character is reconsumed in RAWTEXT.
2583 tok_state_rawtext_end_tag_open = ->
2584 c = txt.charAt(cur++)
2586 tok_cur_tag = new_end_tag c.toLowerCase()
2587 temporary_buffer += c
2588 tok_state = tok_state_rawtext_end_tag_name
2591 tok_cur_tag = new_end_tag c
2592 temporary_buffer += c
2593 tok_state = tok_state_rawtext_end_tag_name
2596 tok_state = tok_state_rawtext
2597 cur -= 1 # reconsume the input character
2598 return new_character_token "</" # fixfull separate these
2600 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# RAWTEXT end tag name state. While letters arrive, the lowercased name
# accumulates in tok_cur_tag.name and the raw text in temporary_buffer. A
# terminating character completes the tag only if is_appropriate_end_tag
# accepts it; otherwise '</' plus the buffered text is emitted as characters
# and tokenizing resumes in RAWTEXT with the character reconsumed.
2601 tok_state_rawtext_end_tag_name = ->
2602 c = txt.charAt(cur++)
2603 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2604 if is_appropriate_end_tag tok_cur_tag
2605 tok_state = tok_state_before_attribute_name
2607 # else fall through to "Anything else"
2609 if is_appropriate_end_tag tok_cur_tag
2610 tok_state = tok_state_self_closing_start_tag
2612 # else fall through to "Anything else"
2614 if is_appropriate_end_tag tok_cur_tag
2615 tok_state = tok_state_data
2617 # else fall through to "Anything else"
2619 tok_cur_tag.name += c.toLowerCase()
2620 temporary_buffer += c
2623 tok_cur_tag.name += c
2624 temporary_buffer += c
2627 tok_state = tok_state_rawtext
2628 cur -= 1 # reconsume the input character
2629 return new_character_token '</' + temporary_buffer # fixfull separate these
2631 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Script data less-than sign state. One branch clears temporary_buffer and
# moves to tok_state_script_data_end_tag_open. Another emits '<!' and moves
# to tok_state_script_data_escape_start. Otherwise '<' is emitted and the
# character is reconsumed in the script data state.
2632 tok_state_script_data_less_than_sign = ->
2633 c = txt.charAt(cur++)
2635 temporary_buffer = ''
2636 tok_state = tok_state_script_data_end_tag_open
2639 tok_state = tok_state_script_data_escape_start
2640 return new_character_token '<!' # fixfull split
2642 tok_state = tok_state_script_data
2643 cur -= 1 # Reconsume
2644 return new_character_token '<'
2646 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Script data end tag open state. A letter starts an end tag token (name
# lowercased), records the raw character in temporary_buffer and moves to
# tok_state_script_data_end_tag_name. Otherwise '</' is emitted as text and
# the character is reconsumed in the script data state.
2647 tok_state_script_data_end_tag_open = ->
2648 c = txt.charAt(cur++)
2650 tok_cur_tag = new_end_tag c.toLowerCase()
2651 temporary_buffer += c
2652 tok_state = tok_state_script_data_end_tag_name
2655 tok_cur_tag = new_end_tag c
2656 temporary_buffer += c
2657 tok_state = tok_state_script_data_end_tag_name
2660 tok_state = tok_state_script_data
2661 cur -= 1 # Reconsume
2662 return new_character_token '</'
2664 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Script data end tag name state. While letters arrive, the lowercased name
# accumulates in tok_cur_tag.name and the raw text in temporary_buffer.
# Whitespace, '/' and '>' complete the tag only if is_appropriate_end_tag
# accepts it; otherwise '</' plus the buffered text is emitted as characters
# and tokenizing resumes in the script data state with the character
# reconsumed.
2665 tok_state_script_data_end_tag_name = ->
2666 c = txt.charAt(cur++)
2667 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2668 if is_appropriate_end_tag tok_cur_tag
2669 tok_state = tok_state_before_attribute_name
2673 if is_appropriate_end_tag tok_cur_tag
2674 tok_state = tok_state_self_closing_start_tag
# '>' after an appropriate end tag name: emit the end tag and return to the
# data state, as the RCDATA and RAWTEXT end tag name states do. Without this
# branch "</script>" falls into "anything else" and the script never ends.
if c is '>'
if is_appropriate_end_tag tok_cur_tag
tok_state = tok_state_data
return tok_cur_tag
# else fall through to "Anything else"
2678 tok_cur_tag.name += c.toLowerCase()
2679 temporary_buffer += c
2682 tok_cur_tag.name += c
2683 temporary_buffer += c
2686 tok_state = tok_state_script_data
2687 cur -= 1 # Reconsume
2688 return new_character_token "</#{temporary_buffer}" # fixfull split
2690 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Script data escape start state. One branch emits '-' and moves to
# tok_state_script_data_escape_start_dash; otherwise the character is
# reconsumed in the script data state.
2691 tok_state_script_data_escape_start = ->
2692 c = txt.charAt(cur++)
2694 tok_state = tok_state_script_data_escape_start_dash
2695 return new_character_token '-'
2697 tok_state = tok_state_script_data
2698 cur -= 1 # Reconsume
2701 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Script data escape start dash state. One branch emits '-' and moves to
# tok_state_script_data_escaped_dash_dash; otherwise the character is
# reconsumed in the script data state.
2702 tok_state_script_data_escape_start_dash = ->
2703 c = txt.charAt(cur++)
2705 tok_state = tok_state_script_data_escaped_dash_dash
2706 return new_character_token '-'
2708 tok_state = tok_state_script_data
2709 cur -= 1 # Reconsume
2712 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Script data escaped state. Emits '-' (moving to the escaped-dash state), a
# U+FFFD replacement, or the character itself. It can also move to
# tok_state_script_data_escaped_less_than_sign, or fall back to the data
# state with the character reconsumed.
2713 tok_state_script_data_escaped = ->
2714 c = txt.charAt(cur++)
2716 tok_state = tok_state_script_data_escaped_dash
2717 return new_character_token '-'
2719 tok_state = tok_state_script_data_escaped_less_than_sign
2723 return new_character_token "\ufffd"
2725 tok_state = tok_state_data
2727 cur -= 1 # Reconsume
2730 return new_character_token c
2732 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Script data escaped dash state (one '-' seen). A further '-' is emitted and
# moves to the dash-dash state. Other branches go to the escaped
# less-than-sign state, emit U+FFFD or the character and return to the
# escaped state, or fall back to the data state with the character reconsumed.
2733 tok_state_script_data_escaped_dash = ->
2734 c = txt.charAt(cur++)
2736 tok_state = tok_state_script_data_escaped_dash_dash
2737 return new_character_token '-'
2739 tok_state = tok_state_script_data_escaped_less_than_sign
2743 tok_state = tok_state_script_data_escaped
2744 return new_character_token "\ufffd"
2746 tok_state = tok_state_data
2748 cur -= 1 # Reconsume
2751 tok_state = tok_state_script_data_escaped
2752 return new_character_token c
2754 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Script data escaped dash dash state. Emits '-' without changing state, or
# emits '>' and returns to the script data state. Other branches go to the
# escaped less-than-sign state, emit U+FFFD or the character and return to
# the escaped state, or fall back to the data state with the character
# reconsumed.
2755 tok_state_script_data_escaped_dash_dash = ->
2756 c = txt.charAt(cur++)
2758 return new_character_token '-'
2760 tok_state = tok_state_script_data_escaped_less_than_sign
2763 tok_state = tok_state_script_data
2764 return new_character_token '>'
2767 tok_state = tok_state_script_data_escaped
2768 return new_character_token "\ufffd"
2771 tok_state = tok_state_data
2772 cur -= 1 # Reconsume
2775 tok_state = tok_state_script_data_escaped
2776 return new_character_token c
2778 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Script data escaped less-than sign state (a '<' was seen but not yet
# emitted). One branch clears temporary_buffer and moves to the escaped end
# tag open state. A letter seeds temporary_buffer with its lowercase form,
# emits '<' plus the letter and moves to the double escape start state.
# Anything else emits the pending '<' and reconsumes the character in the
# escaped state.
2779 tok_state_script_data_escaped_less_than_sign = ->
2780 c = txt.charAt(cur++)
2782 temporary_buffer = ''
2783 tok_state = tok_state_script_data_escaped_end_tag_open
2786 temporary_buffer = c.toLowerCase() # yes, really
2787 tok_state = tok_state_script_data_double_escape_start
2788 return new_character_token "<#{c}" # fixfull split
2790 temporary_buffer = c
2791 tok_state = tok_state_script_data_double_escape_start
2792 return new_character_token "<#{c}" # fixfull split
2794 tok_state = tok_state_script_data_escaped
2795 cur -= 1 # Reconsume
# Emit the pending '<'. The character itself is reconsumed above, so
# returning it here would output it twice and lose the '<'.
2796 return new_character_token '<'
2798 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Script data escaped end tag open state. A letter starts an end tag token
# (name lowercased), records the raw character in temporary_buffer and moves
# to tok_state_script_data_escaped_end_tag_name. Otherwise '</' is emitted
# and the character is reconsumed in the escaped state.
2799 tok_state_script_data_escaped_end_tag_open = ->
2800 c = txt.charAt(cur++)
2802 tok_cur_tag = new_end_tag c.toLowerCase()
2803 temporary_buffer += c
2804 tok_state = tok_state_script_data_escaped_end_tag_name
2807 tok_cur_tag = new_end_tag c
2808 temporary_buffer += c
2809 tok_state = tok_state_script_data_escaped_end_tag_name
2812 tok_state = tok_state_script_data_escaped
2813 cur -= 1 # Reconsume
2814 return new_character_token '</' # fixfull split
2816 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Script data escaped end tag name state. While letters arrive, the
# lowercased name accumulates in tok_cur_tag.name and the raw text in
# temporary_buffer. Whitespace, '/' and '>' complete the tag only if
# is_appropriate_end_tag accepts it; otherwise '</' plus the buffered text is
# emitted as characters and tokenizing resumes in the escaped state with the
# character reconsumed.
2817 tok_state_script_data_escaped_end_tag_name = ->
2818 c = txt.charAt(cur++)
2819 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2820 if is_appropriate_end_tag tok_cur_tag
2821 tok_state = tok_state_before_attribute_name
2825 if is_appropriate_end_tag tok_cur_tag
2826 tok_state = tok_state_self_closing_start_tag
# '>' after an appropriate end tag name: emit the end tag and return to the
# data state, as the RCDATA and RAWTEXT end tag name states do.
if c is '>'
if is_appropriate_end_tag tok_cur_tag
tok_state = tok_state_data
return tok_cur_tag
# else fall through to "Anything else"
2830 tok_cur_tag.name += c.toLowerCase()
# temporary_buffer is echoed back as text when this turns out not to be an
# end tag, so it must keep the original case; only the tag name is lowercased.
2831 temporary_buffer += c
2834 tok_cur_tag.name += c
2835 temporary_buffer += c
2838 tok_state = tok_state_script_data_escaped
2839 cur -= 1 # Reconsume
2840 return new_character_token "</#{temporary_buffer}" # fixfull split
2842 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Script data double escape start state. Letters are emitted and collected
# (lowercased) in temporary_buffer. On whitespace, '/' or '>' the buffer is
# compared with 'script' to choose between the double-escaped and escaped
# states, and the character is emitted. Anything else is reconsumed in the
# escaped state.
2843 tok_state_script_data_double_escape_start = ->
2844 c = txt.charAt(cur++)
2845 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2846 if temporary_buffer is 'script'
2847 tok_state = tok_state_script_data_double_escaped
2849 tok_state = tok_state_script_data_escaped
2850 return new_character_token c
2852 temporary_buffer += c.toLowerCase() # yes, really lowercase
2853 return new_character_token c
2855 temporary_buffer += c
2856 return new_character_token c
2858 tok_state = tok_state_script_data_escaped
2859 cur -= 1 # Reconsume
2862 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Script data double escaped state. Emits '-' (moving to the double-escaped
# dash state), '<' (moving to the double-escaped less-than-sign state), a
# U+FFFD replacement or the character itself. One branch falls back to the
# data state with the character reconsumed.
2863 tok_state_script_data_double_escaped = ->
2864 c = txt.charAt(cur++)
2866 tok_state = tok_state_script_data_double_escaped_dash
2867 return new_character_token '-'
2869 tok_state = tok_state_script_data_double_escaped_less_than_sign
2870 return new_character_token '<'
2873 return new_character_token "\ufffd"
2876 tok_state = tok_state_data
2877 cur -= 1 # Reconsume
2880 return new_character_token c
2882 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Script data double escaped dash state. Emits '-' (moving to the dash-dash
# state) or '<' (moving to the double-escaped less-than-sign state), or emits
# U+FFFD or the character and returns to the double-escaped state. One branch
# falls back to the data state with the character reconsumed.
2883 tok_state_script_data_double_escaped_dash = ->
2884 c = txt.charAt(cur++)
2886 tok_state = tok_state_script_data_double_escaped_dash_dash
2887 return new_character_token '-'
2889 tok_state = tok_state_script_data_double_escaped_less_than_sign
2890 return new_character_token '<'
2893 tok_state = tok_state_script_data_double_escaped
2894 return new_character_token "\ufffd"
2897 tok_state = tok_state_data
2898 cur -= 1 # Reconsume
2901 tok_state = tok_state_script_data_double_escaped
2902 return new_character_token c
2904 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Script data double escaped dash dash state. Emits '-' without changing
# state, '<' (moving to the double-escaped less-than-sign state) or '>'
# (returning to the script data state). U+FFFD or any other character is
# emitted and returns to the double-escaped state. One branch falls back to
# the data state with the character reconsumed.
2905 tok_state_script_data_double_escaped_dash_dash = ->
2906 c = txt.charAt(cur++)
2908 return new_character_token '-'
2910 tok_state = tok_state_script_data_double_escaped_less_than_sign
2911 return new_character_token '<'
2913 tok_state = tok_state_script_data
2914 return new_character_token '>'
2917 tok_state = tok_state_script_data_double_escaped
2918 return new_character_token "\ufffd"
2921 tok_state = tok_state_data
2922 cur -= 1 # Reconsume
2925 tok_state = tok_state_script_data_double_escaped
2926 return new_character_token c
2928 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Script data double escaped less-than sign state. One branch clears
# temporary_buffer, emits '/' and moves to the double escape end state;
# otherwise the character is reconsumed in the double-escaped state.
2929 tok_state_script_data_double_escaped_less_than_sign = ->
2930 c = txt.charAt(cur++)
2932 temporary_buffer = ''
2933 tok_state = tok_state_script_data_double_escape_end
2934 return new_character_token '/'
2936 tok_state = tok_state_script_data_double_escaped
2937 cur -= 1 # Reconsume
2940 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Script data double escape end state. Letters are emitted and collected
# (lowercased) in temporary_buffer. On whitespace, '/' or '>' the buffer is
# compared with 'script' to choose between the escaped and double-escaped
# states, and the character is emitted. Anything else is reconsumed in the
# double-escaped state.
2941 tok_state_script_data_double_escape_end = ->
2942 c = txt.charAt(cur++)
2943 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2944 if temporary_buffer is 'script'
2945 tok_state = tok_state_script_data_escaped
2947 tok_state = tok_state_script_data_double_escaped
2948 return new_character_token c
2950 temporary_buffer += c.toLowerCase() # yes, really lowercase
2951 return new_character_token c
2953 temporary_buffer += c
2954 return new_character_token c
2956 tok_state = tok_state_script_data_double_escaped
2957 cur -= 1 # Reconsume
2960 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Before attribute name state. Some branches move to the self-closing or data
# states. Otherwise an attribute name is started from the character
# (lowercased in one branch, U+FFFD in another), a new [name, ''] pair is
# pushed onto the front of tok_cur_tag.attrs_a, and the state becomes
# tok_state_attribute_name.
2961 tok_state_before_attribute_name = ->
2963 switch c = txt.charAt(cur++)
2964 when "\t", "\n", "\u000c", ' '
2967 tok_state = tok_state_self_closing_start_tag
2970 tok_state = tok_state_data
2976 attr_name = "\ufffd"
2977 when '"', "'", '<', '='
2982 tok_state = tok_state_data
2985 attr_name = c.toLowerCase()
# attrs_a[0] is always the attribute currently being built
2989 tok_cur_tag.attrs_a.unshift [attr_name, '']
2990 tok_state = tok_state_attribute_name
2993 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Attribute name state. Extends the name of the attribute being built
# (tok_cur_tag.attrs_a[0][0]) one character at a time. Whitespace moves to
# tok_state_after_attribute_name; other branches move to the self-closing,
# before-attribute-value or data states.
2994 tok_state_attribute_name = ->
2995 switch c = txt.charAt(cur++)
2996 when "\t", "\n", "\u000c", ' '
2997 tok_state = tok_state_after_attribute_name
2999 tok_state = tok_state_self_closing_start_tag
3001 tok_state = tok_state_before_attribute_value
3003 tok_state = tok_state_data
# Every character branch below must APPEND to the name. Plain assignment
# discarded what had been read so far (e.g. "onClick" ended up as "click").
3009 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3012 tok_cur_tag.attrs_a[0][0] += c
3015 tok_state = tok_state_data
3018 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3020 tok_cur_tag.attrs_a[0][0] += c
3023 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# After attribute name state. Some branches move to the self-closing,
# before-attribute-value or data states. Otherwise a new attribute is started
# by pushing a [name, ''] pair onto the front of tok_cur_tag.attrs_a (name
# from the character: lowercased, U+FFFD, or as-is) and the state becomes
# tok_state_attribute_name.
3024 tok_state_after_attribute_name = ->
3025 c = txt.charAt(cur++)
3026 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3029 tok_state = tok_state_self_closing_start_tag
3032 tok_state = tok_state_before_attribute_value
3035 tok_state = tok_state_data
3038 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3039 tok_state = tok_state_attribute_name
3043 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3044 tok_state = tok_state_attribute_name
3048 tok_state = tok_state_data
3049 cur -= 1 # reconsume
3051 if c is '"' or c is "'" or c is '<'
3053 # fall through to Anything else
3055 tok_cur_tag.attrs_a.unshift [c, '']
3056 tok_state = tok_state_attribute_name
3058 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Before attribute value state. Picks the double-quoted, single-quoted or
# unquoted value state. Two branches first append to the current attribute's
# value (tok_cur_tag.attrs_a[0][1]): U+FFFD in one, the character in the
# other. Two branches return to the data state.
3059 tok_state_before_attribute_value = ->
3060 switch c = txt.charAt(cur++)
3061 when "\t", "\n", "\u000c", ' '
3064 tok_state = tok_state_attribute_value_double_quoted
3066 tok_state = tok_state_attribute_value_unquoted
3069 tok_state = tok_state_attribute_value_single_quoted
3072 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3073 tok_state = tok_state_attribute_value_unquoted
3076 tok_state = tok_state_data
3082 tok_state = tok_state_data
3084 tok_cur_tag.attrs_a[0][1] += c
3085 tok_state = tok_state_attribute_value_unquoted
3088 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Attribute value (double-quoted) state. Appends to the current attribute's
# value: the result of parse_character_reference('"', true), U+FFFD, or the
# character itself. One branch moves to
# tok_state_after_attribute_value_quoted and another returns to the data
# state.
3089 tok_state_attribute_value_double_quoted = ->
3090 switch c = txt.charAt(cur++)
3092 tok_state = tok_state_after_attribute_value_quoted
3094 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3097 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3100 tok_state = tok_state_data
3102 tok_cur_tag.attrs_a[0][1] += c
3105 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Attribute value (single-quoted) state. Appends to the current attribute's
# value: the result of parse_character_reference("'", true), U+FFFD, or the
# character itself. One branch moves to
# tok_state_after_attribute_value_quoted and another returns to the data
# state.
3106 tok_state_attribute_value_single_quoted = ->
3107 switch c = txt.charAt(cur++)
3109 tok_state = tok_state_after_attribute_value_quoted
3111 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3114 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3117 tok_state = tok_state_data
3119 tok_cur_tag.attrs_a[0][1] += c
3122 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Attribute value (unquoted) state. Whitespace ends the value and moves to
# tok_state_before_attribute_name. Otherwise it appends to the current
# attribute's value (the result of parse_character_reference('>', true),
# U+FFFD, or the character itself) or returns to the data state.
3123 tok_state_attribute_value_unquoted = ->
3124 switch c = txt.charAt(cur++)
3125 when "\t", "\n", "\u000c", ' '
3126 tok_state = tok_state_before_attribute_name
3128 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3130 tok_state = tok_state_data
3135 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3138 tok_state = tok_state_data
3140 # Parse Error if ', <, = or ` (backtick)
3141 tok_cur_tag.attrs_a[0][1] += c
3144 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After attribute value (quoted) state. Whitespace moves to
# tok_state_before_attribute_name. Other branches move to the self-closing or
# data states. The final branch also moves to before-attribute-name, but
# un-consumes the character so it is read again there.
3145 tok_state_after_attribute_value_quoted = ->
3146 switch c = txt.charAt(cur++)
3147 when "\t", "\n", "\u000c", ' '
3148 tok_state = tok_state_before_attribute_name
3150 tok_state = tok_state_self_closing_start_tag
3152 tok_state = tok_state_data
3158 tok_state = tok_state_data
3161 tok_state = tok_state_before_attribute_name
3162 cur -= 1 # we didn't handle that char
3165 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Self-closing start tag state. One branch sets the 'self-closing' flag on
# tok_cur_tag and returns to the data state. The other two un-consume the
# character and continue in the data state or in
# tok_state_before_attribute_name.
3166 tok_state_self_closing_start_tag = ->
3167 c = txt.charAt(cur++)
3169 tok_cur_tag.flag 'self-closing'
3170 tok_state = tok_state_data
3174 tok_state = tok_state_data
3175 cur -= 1 # Reconsume
3179 tok_state = tok_state_before_attribute_name
3180 cur -= 1 # Reconsume
3183 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3184 # WARNING: put a comment token in tok_cur_tag before setting this state
# Bogus comment state. Takes the text from cur up to the next '>' (or to the
# end of input when there is none), replaces NUL characters with U+FFFD,
# appends the result to the comment token in tok_cur_tag and returns to the
# data state. The caller must have stored a comment token in tok_cur_tag.
3185 tok_state_bogus_comment = ->
3186 next_gt = txt.indexOf '>', cur
3188 val = txt.substr cur
3191 val = txt.substr cur, (next_gt - cur)
# A string pattern replaces only the first match; the global regex replaces
# every NUL in the comment text.
3193 val = val.replace(/\u0000/g, "\ufffd")
3194 tok_cur_tag.text += val
3195 tok_state = tok_state_data
3198 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Markup declaration open state (just after '<!'). Looks ahead without
# consuming: '--' starts a comment, a case-insensitive 'doctype' starts a
# doctype, and '[CDATA[' starts a CDATA section when the adjusted current
# node is outside the HTML namespace. Anything else becomes a bogus comment.
3199 tok_state_markup_declaration_open = ->
3200 if txt.substr(cur, 2) is '--'
3202 tok_cur_tag = new_comment_token ''
3203 tok_state = tok_state_comment_start
3205 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3207 tok_state = tok_state_doctype
3209 acn = adjusted_current_node()
3210 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3212 tok_state = tok_state_cdata_section
# Nothing has been consumed at this point, so the bogus comment state reads
# the whole comment body itself from cur. The token starts empty: the '!' is
# not part of the comment data ("<!foo>" yields the comment "foo").
3216 tok_cur_tag = new_comment_token ''
3217 tok_state = tok_state_bogus_comment
3220 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Comment start state (just after '<!--'). One branch moves to
# tok_state_comment_start_dash. A NUL or any ordinary character is added to
# the comment text and moves to tok_state_comment. The remaining branches
# return to the data state (one reconsuming the character).
3221 tok_state_comment_start = ->
3222 switch c = txt.charAt(cur++)
3224 tok_state = tok_state_comment_start_dash
# NUL becomes U+FFFD inside the comment text, as in the other comment
# states. Returning a character token here would leak text out of the
# comment and leave the tokenizer in this state.
3227 tok_cur_tag.text += "\ufffd"
tok_state = tok_state_comment
3230 tok_state = tok_state_data
3234 tok_state = tok_state_data
3235 cur -= 1 # Reconsume
3238 tok_cur_tag.text += c
# After the first ordinary character we are inside the comment proper.
# Staying in this state would let a later "->" end the comment early.
tok_state = tok_state_comment
3241 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Comment start dash state (one '-' seen after '<!--'). One branch moves to
# tok_state_comment_end. NUL or any ordinary character is appended to the
# comment text after a '-' and moves to tok_state_comment. The remaining
# branches return to the data state (one reconsuming the character).
3242 tok_state_comment_start_dash = ->
3243 switch c = txt.charAt(cur++)
3245 tok_state = tok_state_comment_end
3248 tok_cur_tag.text += "-\ufffd"
3249 tok_state = tok_state_comment
3252 tok_state = tok_state_data
3256 tok_state = tok_state_data
3257 cur -= 1 # Reconsume
3260 tok_cur_tag.text += "-#{c}"
3261 tok_state = tok_state_comment
3264 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Comment state. Appends the character (or U+FFFD) to the comment text. One
# branch moves to tok_state_comment_end_dash, and one returns to the data
# state with the character reconsumed.
3265 tok_state_comment = ->
3266 switch c = txt.charAt(cur++)
3268 tok_state = tok_state_comment_end_dash
3271 tok_cur_tag.text += "\ufffd"
3274 tok_state = tok_state_data
3275 cur -= 1 # Reconsume
3278 tok_cur_tag.text += c
3281 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Comment end dash state (one '-' seen inside a comment). One branch moves to
# tok_state_comment_end. NUL or any ordinary character is appended to the
# comment text after the pending '-' and returns to tok_state_comment. One
# branch returns to the data state with the character reconsumed.
3282 tok_state_comment_end_dash = ->
3283 switch c = txt.charAt(cur++)
3285 tok_state = tok_state_comment_end
3288 tok_cur_tag.text += "-\ufffd"
3289 tok_state = tok_state_comment
3292 tok_state = tok_state_data
3293 cur -= 1 # Reconsume
3296 tok_cur_tag.text += "-#{c}"
3297 tok_state = tok_state_comment
3300 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Comment end state ('--' seen inside a comment). Two branches return to the
# data state (one reconsuming the character) and one moves to
# tok_state_comment_end_bang. One branch appends a single '-' and stays in
# this state. NUL or any ordinary character is appended after the pending
# '--' and returns to tok_state_comment.
3301 tok_state_comment_end = ->
3302 switch c = txt.charAt(cur++)
3304 tok_state = tok_state_data
3308 tok_cur_tag.text += "--\ufffd"
3309 tok_state = tok_state_comment
3312 tok_state = tok_state_comment_end_bang
3315 tok_cur_tag.text += '-'
3318 tok_state = tok_state_data
3319 cur -= 1 # Reconsume
3323 tok_cur_tag.text += "--#{c}"
3324 tok_state = tok_state_comment
3327 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Comment end bang state ('--!' seen inside a comment). The branch leading to
# tok_state_comment_end_dash appends the pending '--!' to the comment text.
# NUL or any ordinary character is appended after '--!' and returns to
# tok_state_comment. The remaining branches return to the data state (one
# reconsuming the character).
3328 tok_state_comment_end_bang = ->
3329 switch c = txt.charAt(cur++)
# Only the pending "--!" is added here. The character just read is the dash
# that the comment end dash state accounts for, so appending it as well
# would leave a stray '-' in the comment text.
3331 tok_cur_tag.text += "--!"
3332 tok_state = tok_state_comment_end_dash
3334 tok_state = tok_state_data
3338 tok_cur_tag.text += "--!\ufffd"
3339 tok_state = tok_state_comment
3342 tok_state = tok_state_data
3343 cur -= 1 # Reconsume
3346 tok_cur_tag.text += "--!#{c}"
3347 tok_state = tok_state_comment
3350 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# DOCTYPE state. Whitespace moves to tok_state_before_doctype_name. One
# branch creates an empty doctype token with the 'force-quirks' flag and
# returns to the data state with the character reconsumed. The remaining
# branch reconsumes the character in tok_state_before_doctype_name.
3351 tok_state_doctype = ->
3352 switch c = txt.charAt(cur++)
3353 when "\t", "\u000a", "\u000c", ' '
3354 tok_state = tok_state_before_doctype_name
3357 tok_state = tok_state_data
3358 el = new_doctype_token ''
3359 el.flag 'force-quirks', true
3360 cur -= 1 # Reconsume
3364 tok_state = tok_state_before_doctype_name
3365 cur -= 1 # Reconsume
3368 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Before DOCTYPE name state. Starts a doctype token in tok_cur_tag whose name
# begins with the character (lowercased in one branch, U+FFFD in another) and
# moves to tok_state_doctype_name. Two branches instead create an empty
# doctype token flagged 'force-quirks' and return to the data state (one
# reconsuming the character).
3369 tok_state_before_doctype_name = ->
3370 c = txt.charAt(cur++)
3371 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3374 tok_cur_tag = new_doctype_token c.toLowerCase()
3375 tok_state = tok_state_doctype_name
3379 tok_cur_tag = new_doctype_token "\ufffd"
3380 tok_state = tok_state_doctype_name
3384 el = new_doctype_token ''
3385 el.flag 'force-quirks', true
3386 tok_state = tok_state_data
3390 tok_state = tok_state_data
3391 el = new_doctype_token ''
3392 el.flag 'force-quirks', true
3393 cur -= 1 # Reconsume
3396 tok_cur_tag = new_doctype_token c
3397 tok_state = tok_state_doctype_name
3400 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# DOCTYPE name state. Whitespace moves to tok_state_after_doctype_name.
# Otherwise the character (lowercased, U+FFFD, or as-is) is appended to
# tok_cur_tag.name. Two branches return to the data state, one of them
# setting 'force-quirks' and reconsuming the character.
3401 tok_state_doctype_name = ->
3402 c = txt.charAt(cur++)
3403 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3404 tok_state = tok_state_after_doctype_name
3407 tok_state = tok_state_data
3410 tok_cur_tag.name += c.toLowerCase()
3414 tok_cur_tag.name += "\ufffd"
3418 tok_state = tok_state_data
3419 tok_cur_tag.flag 'force-quirks', true
3420 cur -= 1 # Reconsume
3423 tok_cur_tag.name += c
3426 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# After DOCTYPE name state. Recognises the case-insensitive keywords 'public'
# and 'system' and moves to the matching after-keyword state. Two branches
# return to the data state, one of them setting 'force-quirks' and
# reconsuming the character. Anything else sets 'force-quirks' and moves to
# tok_state_bogus_doctype.
3427 tok_state_after_doctype_name = ->
3428 c = txt.charAt(cur++)
3429 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3432 tok_state = tok_state_data
3436 tok_state = tok_state_data
3437 tok_cur_tag.flag 'force-quirks', true
3438 cur -= 1 # Reconsume
# cur - 1 is the index of the character just consumed (c)
3441 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3443 tok_state = tok_state_after_doctype_public_keyword
3445 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3447 tok_state = tok_state_after_doctype_system_keyword
3450 tok_cur_tag.flag 'force-quirks', true
3451 tok_state = tok_state_bogus_doctype
3454 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# After DOCTYPE public keyword state. Whitespace moves to
# tok_state_before_doctype_public_identifier. Two branches reset
# public_identifier to '' and move to the double- or single-quoted identifier
# state. Every other visible branch sets 'force-quirks' and goes to the data
# state (once reconsuming) or to tok_state_bogus_doctype.
3455 tok_state_after_doctype_public_keyword = ->
3456 c = txt.charAt(cur++)
3457 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3458 tok_state = tok_state_before_doctype_public_identifier
3462 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3463 tok_state = tok_state_doctype_public_identifier_double_quoted
3467 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3468 tok_state = tok_state_doctype_public_identifier_single_quoted
3472 tok_cur_tag.flag 'force-quirks', true
3473 tok_state = tok_state_data
3477 tok_state = tok_state_data
3478 tok_cur_tag.flag 'force-quirks', true
3479 cur -= 1 # Reconsume
3483 tok_cur_tag.flag 'force-quirks', true
3484 tok_state = tok_state_bogus_doctype
3487 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Before DOCTYPE public identifier state. Two branches reset
# public_identifier to '' and move to the double- or single-quoted identifier
# state. Every other visible branch sets 'force-quirks' and goes to the data
# state (once reconsuming) or to tok_state_bogus_doctype.
3488 tok_state_before_doctype_public_identifier = ->
3489 c = txt.charAt(cur++)
3490 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3494 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3495 tok_state = tok_state_doctype_public_identifier_double_quoted
3499 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3500 tok_state = tok_state_doctype_public_identifier_single_quoted
3504 tok_cur_tag.flag 'force-quirks', true
3505 tok_state = tok_state_data
3509 tok_state = tok_state_data
3510 tok_cur_tag.flag 'force-quirks', true
3511 cur -= 1 # Reconsume
3515 tok_cur_tag.flag 'force-quirks', true
3516 tok_state = tok_state_bogus_doctype
3520 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer state handler: consumes one character of txt (advancing cur) and
# accumulates it into tok_cur_tag.public_identifier until the identifier ends.
# NOTE(review): every branch guard is missing from this copy; the "presumably"
# notes follow the spec section cited above and need confirming.
3521 tok_state_doctype_public_identifier_double_quoted = ->
3522 c = txt.charAt(cur++)
# presumably the closing '"': the identifier is complete
3524 tok_state = tok_state_after_doctype_public_identifier
# presumably a NUL character: append U+FFFD instead
3528 tok_cur_tag.public_identifier += "\ufffd"
# presumably the '>' case: flag force-quirks and go back to the data state
3532 tok_cur_tag.flag 'force-quirks', true
3533 tok_state = tok_state_data
# presumably end of input: data state, force-quirks, step cur back one
3537 tok_state = tok_state_data
3538 tok_cur_tag.flag 'force-quirks', true
3539 cur -= 1 # Reconsume
# presumably the fallback: append the character as-is
3542 tok_cur_tag.public_identifier += c
3545 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer state handler: consumes one character of txt (advancing cur) and
# accumulates it into tok_cur_tag.public_identifier until the identifier ends.
# NOTE(review): every branch guard is missing from this copy; the "presumably"
# notes follow the spec section cited above and need confirming.
3546 tok_state_doctype_public_identifier_single_quoted = ->
3547 c = txt.charAt(cur++)
# presumably the closing "'": the identifier is complete
3549 tok_state = tok_state_after_doctype_public_identifier
# presumably a NUL character: append U+FFFD instead
3553 tok_cur_tag.public_identifier += "\ufffd"
# presumably the '>' case: flag force-quirks and go back to the data state
3557 tok_cur_tag.flag 'force-quirks', true
3558 tok_state = tok_state_data
# presumably end of input: data state, force-quirks, step cur back one
3562 tok_state = tok_state_data
3563 tok_cur_tag.flag 'force-quirks', true
3564 cur -= 1 # Reconsume
# presumably the fallback: append the character as-is
3567 tok_cur_tag.public_identifier += c
3570 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer state handler: consumes one character of txt (advancing cur) after
# a public identifier and decides whether a system identifier follows.
# NOTE(review): the guards for most branches are missing from this copy; the
# "presumably" notes follow the spec section cited above and need confirming.
3571 tok_state_after_doctype_public_identifier = ->
3572 c = txt.charAt(cur++)
# whitespace: continue in the "between public and system identifiers" state
3573 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3574 tok_state = tok_state_between_doctype_public_and_system_identifiers
# presumably the '>' case: back to the data state (no force-quirks here)
3577 tok_state = tok_state_data
# presumably the '"' case: start an empty system identifier (double-quoted)
3581 tok_cur_tag.system_identifier = ''
3582 tok_state = tok_state_doctype_system_identifier_double_quoted
# presumably the "'" case: start an empty system identifier (single-quoted)
3586 tok_cur_tag.system_identifier = ''
3587 tok_state = tok_state_doctype_system_identifier_single_quoted
# presumably end of input: data state, force-quirks, step cur back one
3591 tok_state = tok_state_data
3592 tok_cur_tag.flag 'force-quirks', true
3593 cur -= 1 # Reconsume
# presumably the fallback: force-quirks, then the bogus doctype state
3597 tok_cur_tag.flag 'force-quirks', true
3598 tok_state = tok_state_bogus_doctype
3601 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer state handler: consumes one character of txt (advancing cur)
# between the public and system identifiers of the token in tok_cur_tag.
# NOTE(review): the guards for most branches (and the body of the whitespace
# branch) are missing from this copy; the "presumably" notes follow the spec
# section cited above and need confirming.
3602 tok_state_between_doctype_public_and_system_identifiers = ->
3603 c = txt.charAt(cur++)
# whitespace check (its body is not present here)
3604 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# presumably the '>' case: back to the data state (no force-quirks here)
3607 tok_state = tok_state_data
# presumably the '"' case: start an empty system identifier (double-quoted)
3611 tok_cur_tag.system_identifier = ''
3612 tok_state = tok_state_doctype_system_identifier_double_quoted
# presumably the "'" case: start an empty system identifier (single-quoted)
3616 tok_cur_tag.system_identifier = ''
3617 tok_state = tok_state_doctype_system_identifier_single_quoted
# presumably end of input: data state, force-quirks, step cur back one
3621 tok_state = tok_state_data
3622 tok_cur_tag.flag 'force-quirks', true
3623 cur -= 1 # Reconsume
# presumably the fallback: force-quirks, then the bogus doctype state
3627 tok_cur_tag.flag 'force-quirks', true
3628 tok_state = tok_state_bogus_doctype
3631 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer state handler: consumes one character of txt (advancing cur) and
# chooses the next tok_state for the token held in tok_cur_tag.
# NOTE(review): the guards for most branches are missing from this copy; the
# "presumably" notes follow the spec section cited above and need confirming.
3632 tok_state_after_doctype_system_keyword = ->
3633 c = txt.charAt(cur++)
# whitespace: continue in the "before DOCTYPE system identifier" state
3634 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3635 tok_state = tok_state_before_doctype_system_identifier
# presumably the '"' case: start an empty system identifier (double-quoted)
3639 tok_cur_tag.system_identifier = ''
3640 tok_state = tok_state_doctype_system_identifier_double_quoted
# presumably the "'" case: start an empty system identifier (single-quoted)
3644 tok_cur_tag.system_identifier = ''
3645 tok_state = tok_state_doctype_system_identifier_single_quoted
# presumably the '>' case: flag force-quirks and go back to the data state
3649 tok_cur_tag.flag 'force-quirks', true
3650 tok_state = tok_state_data
# presumably end of input: data state, force-quirks, step cur back one
3654 tok_state = tok_state_data
3655 tok_cur_tag.flag 'force-quirks', true
3656 cur -= 1 # Reconsume
# presumably the fallback: force-quirks, then the bogus doctype state
3660 tok_cur_tag.flag 'force-quirks', true
3661 tok_state = tok_state_bogus_doctype
3664 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer state handler: consumes one character of txt (advancing cur) and
# either starts a system identifier on tok_cur_tag or leaves the DOCTYPE.
# NOTE(review): the guards for most branches (and the body of the whitespace
# branch) are missing from this copy; the "presumably" notes follow the spec
# section cited above and need confirming.
3665 tok_state_before_doctype_system_identifier = ->
3666 c = txt.charAt(cur++)
# whitespace check (its body is not present here)
3667 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# presumably the '"' case: start an empty system identifier (double-quoted)
3670 tok_cur_tag.system_identifier = ''
3671 tok_state = tok_state_doctype_system_identifier_double_quoted
# presumably the "'" case: start an empty system identifier (single-quoted)
3674 tok_cur_tag.system_identifier = ''
3675 tok_state = tok_state_doctype_system_identifier_single_quoted
# presumably the '>' case: flag force-quirks and go back to the data state
3679 tok_cur_tag.flag 'force-quirks', true
3680 tok_state = tok_state_data
# presumably end of input: data state, force-quirks, step cur back one
3684 tok_state = tok_state_data
3685 tok_cur_tag.flag 'force-quirks', true
3686 cur -= 1 # Reconsume
# presumably the fallback: force-quirks, then the bogus doctype state
3690 tok_cur_tag.flag 'force-quirks', true
3691 tok_state = tok_state_bogus_doctype
3694 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state handler: consumes one character of txt (advancing cur) and
# accumulates it into tok_cur_tag.system_identifier until the identifier ends.
# NOTE(review): every branch guard is missing from this copy; the "presumably"
# notes follow the spec section cited above and need confirming.
3695 tok_state_doctype_system_identifier_double_quoted = ->
3696 c = txt.charAt(cur++)
# presumably the closing '"': the identifier is complete
3698 tok_state = tok_state_after_doctype_system_identifier
# presumably a NUL character: append U+FFFD instead
3702 tok_cur_tag.system_identifier += "\ufffd"
# presumably the '>' case: flag force-quirks and go back to the data state
3706 tok_cur_tag.flag 'force-quirks', true
3707 tok_state = tok_state_data
# presumably end of input: data state, force-quirks, step cur back one
3711 tok_state = tok_state_data
3712 tok_cur_tag.flag 'force-quirks', true
3713 cur -= 1 # Reconsume
# presumably the fallback: append the character as-is
3716 tok_cur_tag.system_identifier += c
3719 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state handler: consumes one character of txt (advancing cur) and
# accumulates it into tok_cur_tag.system_identifier until the identifier ends.
# NOTE(review): every branch guard is missing from this copy; the "presumably"
# notes follow the spec section cited above and need confirming.
3720 tok_state_doctype_system_identifier_single_quoted = ->
3721 c = txt.charAt(cur++)
# presumably the closing "'": the identifier is complete
3723 tok_state = tok_state_after_doctype_system_identifier
# presumably a NUL character: append U+FFFD instead
3727 tok_cur_tag.system_identifier += "\ufffd"
# presumably the '>' case: flag force-quirks and go back to the data state
3731 tok_cur_tag.flag 'force-quirks', true
3732 tok_state = tok_state_data
# presumably end of input: data state, force-quirks, step cur back one
3736 tok_state = tok_state_data
3737 tok_cur_tag.flag 'force-quirks', true
3738 cur -= 1 # Reconsume
# presumably the fallback: append the character as-is
3741 tok_cur_tag.system_identifier += c
3744 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state handler: consumes one character of txt (advancing cur) after
# a system identifier has been read.
# NOTE(review): the guards for most branches (and the body of the whitespace
# branch) are missing from this copy; the "presumably" notes follow the spec
# section cited above and need confirming.
3745 tok_state_after_doctype_system_identifier = ->
3746 c = txt.charAt(cur++)
# whitespace check (its body is not present here)
3747 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# presumably the '>' case: back to the data state (no force-quirks here)
3750 tok_state = tok_state_data
# presumably end of input: data state, force-quirks, step cur back one
3754 tok_state = tok_state_data
3755 tok_cur_tag.flag 'force-quirks', true
3756 cur -= 1 # Reconsume
# presumably the fallback: unlike the other DOCTYPE states, this one goes to
# the bogus doctype state without setting force-quirks
3760 # do _not_ tok_cur_tag.flag 'force-quirks', true
3761 tok_state = tok_state_bogus_doctype
3764 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state handler: consumes one character of txt (advancing cur) while
# skipping over a malformed DOCTYPE.
# NOTE(review): both branch guards are missing from this copy; the "presumably"
# notes follow the spec section cited above and need confirming.
3765 tok_state_bogus_doctype = ->
3766 c = txt.charAt(cur++)
# presumably the '>' case: the DOCTYPE is over, return to the data state
3768 tok_state = tok_state_data
# presumably end of input: return to the data state and step cur back one
3771 tok_state = tok_state_data
3772 cur -= 1 # Reconsume
3778 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3779 # Don't set this as a state, just call it
3780 # returns a string (NOT a text node)
# Reads a character reference from txt starting at cur.
#
# allowed_char - optional extra character; when it is the character at cur it
#                is handled by the same switch arm as whitespace, '<' and '&'
#                (the arm marked "explicitly not a parse error").
# in_attr      - defaults to false; not referenced in the lines present here.
#                Presumably it enables the attribute-only exceptions noted in
#                the legacy section below -- TODO confirm.
#
# NOTE(review): much of the control flow (branch bodies, returns, the `when`
# arms after the first) is missing from this copy of the function, so the
# section notes below are partly inferred and need confirming.
3781 parse_character_reference = (allowed_char = null, in_attr = false) ->
# nothing left to read (the handling of this case is not present here)
3782 if cur >= txt.length
3784 switch c = txt.charAt(cur)
3785 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3786 # explicitly not a parse error
3789 # there has to be "one or more" alnums between & and ; to be a parse error
# presumably the numeric-reference arm: needs at least one character after
# the one at cur, and an 'x' (either case) at cur + 1 is tested next
3792 if cur + 1 >= txt.length
3794 if txt.charAt(cur + 1).toLowerCase() is 'x'
# scan forward from start while the characters belong to charset
3803 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
# a ';' right after the scanned characters is tested here
3807 if txt.charAt(start + i) is ';'
3809 # FIXME This is supposed to generate parse errors for some chars
# the scanned characters are lower-cased and decoded together with prefix
3810 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
# presumably the named-reference arm: test for the first non-alnum character
3817 if alnum.indexOf(txt.charAt(cur + i)) is -1
3820 # exit early, because parse_error() below needs at least one alnum
3822 if txt.charAt(cur + i) is ';'
3823 i += 1 # include ';' terminator in value
3824 decoded = decode_named_char_ref txt.substr(cur, i)
3831 # no ';' terminator (only legacy char refs)
# try successively longer substrings of txt as keys of legacy_char_refs
3833 for i in [2..max] # no prefix matches, so ok to check shortest first
3834 c = legacy_char_refs[txt.substr(cur, i)]
3837 if txt.charAt(cur + i) is '='
3838 # "because some legacy user agents will
3839 # misinterpret the markup in those cases"
3842 if alnum.indexOf(txt.charAt(cur + i)) > -1
3843 # this makes attributes forgiving about url args
3845 # ok, and besides the weird exceptions for attributes...
3846 # return the matching char
3847 cur += i # consume entity chars
3848 parse_error() # because no terminating ";"
3852 return # never reached
# Initial values for the parser's shared state: first the tree-construction
# variables, then the tokenizer's starting state.
3854 # tree constructor initialization
3855 # see comments on TYPE_TAG/etc for the structure of this data
# root node: a tag node named 'html' in the HTML namespace
3856 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3858 afe = [] # active formatting elements
3859 template_ins_modes = []
# insertion mode starts at ins_mode_initial; original_ins_mode starts as a copy
3860 ins_mode = ins_mode_initial
3861 original_ins_mode = ins_mode # TODO check spec
3862 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3863 flag_frameset_ok = true
3865 flag_foster_parenting = false
3866 form_element_pointer = null
3867 temporary_buffer = null
3868 pending_table_character_tokens = []
3869 head_element_pointer = null
3870 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3871 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
3873 # tokenizer initialization
# the tokenizer starts in the data state
3874 tok_state = tok_state_data
3881 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a string in `serialized` by appending t.serialize(shallow, show_ids);
# shallow and show_ids are passed through unchanged.
# NOTE(review): the loop over els, the initialization of `serialized` and the
# return are missing from this copy; presumably t iterates over els -- confirm.
3884 serialize_els = (els, shallow, show_ids) ->
3890 serialized += t.serialize shallow, show_ids
# Public API of this module: the parser entry point, two debug-log helpers and
# four node-type constants.
# NOTE(review): four TYPE_* constants are already exported below; check whether
# the TODO still refers to other TYPE_* values that remain unexported.
3893 # TODO export TYPE_*
3894 module.exports.parse_html = parse_html
3895 module.exports.debug_log_reset = debug_log_reset
3896 module.exports.debug_log_each = debug_log_each
3897 module.exports.TYPE_TAG = TYPE_TAG
3898 module.exports.TYPE_TEXT = TYPE_TEXT
3899 module.exports.TYPE_COMMENT = TYPE_COMMENT
3900 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE