1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
98 @id = "#{++prev_node_id}"
99 shallow_clone: -> # return a new node that's the same except without the children or parent
100 # WARNING this doesn't work right on open tags that are still being parsed
102 attrs[k] = v for k, v of @attrs
103 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
104 acknowledge_self_closing: ->
106 @token.flag 'did_self_close'
108 @flag 'did_self_close', true
111 serialize: (shallow = false, show_ids = false) -> # for unit tests
116 ret += JSON.stringify @name
131 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
137 ret += c.serialize shallow, show_ids
141 ret += JSON.stringify @text
144 ret += JSON.stringify @text
150 when TYPE_AAA_BOOKMARK
151 ret += 'aaa_bookmark'
154 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
157 # helpers: (only take args that are normally known when parser creates nodes)
# Factory: build a start-tag token (a Node of type TYPE_START_TAG) that
# carries only the given tag name; every other field takes Node's default.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name: name})
# Factory: build an end-tag token (a Node of type TYPE_END_TAG) for `name`.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name: name})
# Factory: build an element node (a Node of type TYPE_TAG) named `name`.
new_element = (name) ->
	new Node(TYPE_TAG, {name: name})
# Factory: build a text node (a Node of type TYPE_TEXT) whose text is `txt`.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are produced by the very same factory.
new_character_token = new_text_node
# Factory: build a comment token (a Node of type TYPE_COMMENT) whose text
# is `txt`.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Factory: build a doctype token (a Node of type TYPE_DOCTYPE) named `name`.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name: name})
172 return new Node TYPE_EOF
174 return new Node TYPE_AFE_MARKER
# Factory: build a bookmark node (a Node of type TYPE_AAA_BOOKMARK); takes
# no arguments, so the node gets Node's defaults for every field.
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character classes, each stored as a plain string of its member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
# The same 26 ASCII letters, upper-cased.
uc_alpha = lc_alpha.toUpperCase()
digits = "0123456789"
# Letters (lower case first, then upper case) followed by the digits.
alnum = [lc_alpha, uc_alpha, digits].join ''
# Digits followed by the hex letters in both cases.
hex_chars = "#{digits}abcdefABCDEF"
# True only when `str` has length 1 and occurs in uc_alpha; false otherwise.
is_uc_alpha = (str) ->
	# the length guard stops multi-character substrings of uc_alpha
	# (e.g. "AB") from matching
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1
# True only when `str` has length 1 and occurs in lc_alpha; false otherwise.
is_lc_alpha = (str) ->
	# the length guard stops multi-character substrings of lc_alpha
	# (e.g. "ab") from matching
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
# Characters accepted in tag names: everything in alnum plus the dash,
# because some SVG element names contain dashes.
tag_name_chars = "#{alnum}-"
# The five "space characters", kept as one string so membership can be
# tested with indexOf.
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = [
	"\u0009" # tab
	"\u000a" # line feed
	"\u000c" # form feed
	"\u000d" # carriage return
	"\u0020" # space
].join ''
195 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True only when token `t` is of type TYPE_TEXT and its text is exactly one
# character that occurs in space_chars; false otherwise.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	chr = t.text
	chr.length is 1 and space_chars.indexOf(chr) isnt -1
199 is_input_hidden_tok = (t) ->
200 return unless t.type is TYPE_START_TAG
203 if a[1].toLowerCase() is 'hidden'
208 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
209 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
211 # These are the character references that don't need a terminating semicolon
212 # min length: 2, max: 6, none are a prefix of any other.
214 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
215 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
216 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
217 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
218 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
219 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
220 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
221 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
222 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
223 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
224 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
225 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
226 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
227 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
228 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
229 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
230 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists grouped by element kind (names taken from the variables).
void_elements = [
	'area', 'base', 'br', 'col', 'embed'
	'hr', 'img', 'input', 'keygen', 'link'
	'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script'
	'style'
]
escapable_raw_text_elements = [
	'textarea'
	'title'
]
237 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
239 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
240 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
241 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
242 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
243 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
244 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
245 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
246 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
247 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
248 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
249 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
250 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
251 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
252 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
256 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
258 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
259 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
260 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
261 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
262 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
263 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
264 'determinant', 'diff', 'divergence', 'divide', 'domain',
265 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
266 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
267 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
268 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
269 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
270 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
271 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
272 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
273 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
274 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
275 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
276 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
277 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
278 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
279 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
280 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
281 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
282 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
283 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
284 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
285 'vectorproduct', 'xor'
287 # foreign_elements = [svg_elements..., mathml_elements...]
288 #normal_elements = All other allowed HTML elements are normal elements.
292 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
293 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
294 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
295 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
296 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
297 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
298 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
299 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
300 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
301 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
302 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
303 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
304 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
305 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
306 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
307 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
308 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
309 wbr:NS_HTML, xmp:NS_HTML,
312 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
313 'annotation-xml':NS_MATHML,
316 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
319 formatting_elements = {
320 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
321 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
325 foster_parenting_targets = {
# True when special_elements maps the element's name to exactly the
# element's own namespace; false otherwise, including for names that are
# not keys of special_elements.
el_is_special = (e) ->
	listed_namespace = special_elements[e.name]
	listed_namespace is e.namespace
350 # decode_named_char_ref()
352 # The list of named character references is _huge_ so ask the browser to decode
353 # for us instead of wasting bandwidth/space on including the table here.
355 # Pass without the "&" but with the ";" examples:
356 # for "&" pass "amp;"
357 # for "′" pass "x2032;"
360 textarea: document.createElement('textarea')
362 # TODO test this in IE8
363 decode_named_char_ref = (txt) ->
365 decoded = g_dncr.cache[txt]
366 return decoded if decoded?
367 g_dncr.textarea.innerHTML = txt
368 decoded = g_dncr.textarea.value
369 return null if decoded is txt
370 return g_dncr.cache[txt] = decoded
372 parse_html = (txt, parse_error_cb = null) ->
373 cur = 0 # index of next char in txt to be parsed
374 # declare doc and tokenizer variables so they're in scope below
376 open_els = null # stack of open elements
377 afe = null # active formatting elements
378 template_insertion_modes = null
379 insertion_mode = null
380 original_insertion_mode = null
382 tok_cur_tag = null # partially parsed tag
383 flag_scripting = null
384 flag_frameset_ok = null
386 flag_foster_parenting = null
387 form_element_pointer = null
388 temporary_buffer = null
389 pending_table_character_tokens = null
390 head_element_pointer = null
391 flag_fragment_parsing = null
392 context_element = null
401 console.log "Parse error at character #{cur} of #{txt.length}"
403 afe_push = (new_el) ->
406 if el.name is new_el.name and el.namespace is new_el.namespace
408 continue unless new_el.attrs[k] is v
409 for k, v of new_el.attrs
410 continue unless el.attrs[k] is v
417 afe.unshift new_afe_marker()
419 # the functions below implement the Tree Construction algorithm
420 # http://www.w3.org/TR/html5/syntax.html#tree-construction
422 # But first... the helpers
423 template_tag_is_open = ->
425 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
428 is_in_scope_x = (tag_name, scope, namespace) ->
430 if t.name is tag_name and (namespace is null or namespace is t.namespace)
432 if scope[t.name] is t.namespace
435 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
437 if t.name is tag_name and (namespace is null or namespace is t.namespace)
439 if scope[t.name] is t.namespace
441 if scope2[t.name] is t.namespace
444 standard_scopers = { # FIXME these are supposed to be namespace specific
445 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
446 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
447 template: NS_HTML, mi: NS_MATHML,
449 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
450 'annotation-xml': NS_MATHML,
452 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-boundary tables, each mapping a tag name to its namespace.
# button_scopers is passed to is_in_scope_x_y alongside standard_scopers.
button_scopers =
	button: NS_HTML
li_scopers =
	ol: NS_HTML
	ul: NS_HTML
# table_scopers is passed to is_in_scope_x on its own, without
# standard_scopers.
table_scopers =
	html: NS_HTML
	table: NS_HTML
	template: NS_HTML
# Scope test using the standard scoper table: delegates to is_in_scope_x
# with standard_scopers and returns its result.
# namespace defaults to null, which is_in_scope_x treats as "any namespace".
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Button-scope test: delegates to is_in_scope_x_y with both standard_scopers
# and button_scopers and returns its result.
# namespace defaults to null, which is_in_scope_x_y treats as "any namespace".
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Table-scope test: delegates to is_in_scope_x with table_scopers only
# (standard_scopers is not passed here) and returns its result.
# namespace defaults to null, which is_in_scope_x treats as "any namespace".
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
463 is_in_select_scope = (tag_name, namespace = null) ->
465 if t.name is tag_name and (namespace is null or namespace is t.namespace)
467 if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
470 # this checks for a particular element, not by name
471 el_is_in_scope = (el) ->
475 if standard_scopers[t.name] is t.namespace
479 clear_to_table_stopers = {
484 clear_stack_to_table_context = ->
486 if clear_to_table_stopers[open_els[0].name]?
490 clear_to_table_body_stopers = {
497 clear_stack_to_table_body_context = ->
499 if clear_to_table_body_stopers[open_els[0].name]?
503 clear_to_table_row_stopers = {
508 clear_stack_to_table_row_context = ->
510 if clear_to_table_row_stopers[open_els[0].name]?
514 clear_afe_to_marker = ->
516 return unless afe.length > 0 # this happens in fragment case, ?spec error
518 if el.type is TYPE_AFE_MARKER
523 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
524 reset_insertion_mode = ->
525 # 1. Let last be false.
527 # 2. Let node be the last node in the stack of open elements.
529 node = open_els[node_i]
530 # 3. Loop: If node is the first node in the stack of open elements,
531 # then set last to true, and, if the parser was originally created as
532 # part of the HTML fragment parsing algorithm (fragment case) set node
533 # to the context element.
535 if node_i is open_els.length - 1
537 # fixfull (fragment case)
539 # 4. If node is a select element, run these substeps:
540 if node.name is 'select'
541 # 1. If last is true, jump to the step below labeled done.
543 # 2. Let ancestor be node.
546 # 3. Loop: If ancestor is the first node in the stack of
547 # open elements, jump to the step below labeled done.
549 if ancestor_i is open_els.length - 1
551 # 4. Let ancestor be the node before ancestor in the stack
554 ancestor = open_els[ancestor_i]
555 # 5. If ancestor is a template node, jump to the step below
557 if ancestor.name is 'template'
559 # 6. If ancestor is a table node, switch the insertion mode
560 # to "in select in table" and abort these steps.
561 if ancestor.name is 'table'
562 insertion_mode = ins_mode_in_select_in_table
564 # 7. Jump back to the step labeled loop.
565 # 8. Done: Switch the insertion mode to "in select" and abort
567 insertion_mode = ins_mode_in_select
569 # 5. If node is a td or th element and last is false, then switch
570 # the insertion mode to "in cell" and abort these steps.
571 if (node.name is 'td' or node.name is 'th') and last is false
572 insertion_mode = ins_mode_in_cell
574 # 6. If node is a tr element, then switch the insertion mode to "in
575 # row" and abort these steps.
577 insertion_mode = ins_mode_in_row
579 # 7. If node is a tbody, thead, or tfoot element, then switch the
580 # insertion mode to "in table body" and abort these steps.
581 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
582 insertion_mode = ins_mode_in_table_body
584 # 8. If node is a caption element, then switch the insertion mode
585 # to "in caption" and abort these steps.
586 if node.name is 'caption'
587 insertion_mode = ins_mode_in_caption
589 # 9. If node is a colgroup element, then switch the insertion mode
590 # to "in column group" and abort these steps.
591 if node.name is 'colgroup'
592 insertion_mode = ins_mode_in_column_group
594 # 10. If node is a table element, then switch the insertion mode to
595 # "in table" and abort these steps.
596 if node.name is 'table'
597 insertion_mode = ins_mode_in_table
599 # 11. If node is a template element, then switch the insertion mode
600 # to the current template insertion mode and abort these steps.
601 # fixfull (template insertion mode stack)
603 # 12. If node is a head element and last is true, then switch the
604 # insertion mode to "in body" ("in body"! not "in head"!) and abort
605 # these steps. (fragment case)
606 if node.name is 'head' and last
607 insertion_mode = ins_mode_in_body
609 # 13. If node is a head element and last is false, then switch the
610 # insertion mode to "in head" and abort these steps.
611 if node.name is 'head' and last is false
612 insertion_mode = ins_mode_in_head
614 # 14. If node is a body element, then switch the insertion mode to
615 # "in body" and abort these steps.
616 if node.name is 'body'
617 insertion_mode = ins_mode_in_body
619 # 15. If node is a frameset element, then switch the insertion mode
620 # to "in frameset" and abort these steps. (fragment case)
621 if node.name is 'frameset'
622 insertion_mode = ins_mode_in_frameset
624 # 16. If node is an html element, run these substeps:
625 if node.name is 'html'
626 # 1. If the head element pointer is null, switch the insertion
627 # mode to "before head" and abort these steps. (fragment case)
628 # fixfull (fragment case)
630 # 2. Otherwise, the head element pointer is not null, switch
631 # the insertion mode to "after head" and abort these steps.
632 insertion_mode = ins_mode_in_body # FIXME fixfull
634 # 17. If last is true, then switch the insertion mode to "in body"
635 # and abort these steps. (fragment case)
637 insertion_mode = ins_mode_in_body
639 # 18. Let node now be the node before node in the stack of open
642 node = open_els[node_i]
643 # 19. Return to the step labeled loop.
647 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
648 adjusted_current_node = ->
649 if open_els.length is 1 and flag_fragment_parsing
650 return context_element
653 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
654 # this implementation is structured (mostly) as described at the link above.
655 # capitalized comments are the "labels" described at the link above.
656 reconstruct_active_formatting_elements = ->
657 return if afe.length is 0
658 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
663 if i is afe.length - 1
666 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
671 el = afe[i].shallow_clone()
672 tree_insert_element el
677 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
678 # adoption agency algorithm
680 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
681 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
682 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
683 adoption_agency = (subject) ->
684 debug_log "adoption_agency()"
685 debug_log "tree: #{serialize_els doc.children, false, true}"
686 debug_log "open_els: #{serialize_els open_els, true, true}"
687 debug_log "afe: #{serialize_els afe, true, true}"
688 if open_els[0].name is subject
691 # remove it from the list of active formatting elements (if found)
696 debug_log "aaa: starting off with subject on top of stack, exiting"
703 # 5. Let formatting element be the last element in the list of
704 # active formatting elements that: is between the end of the list
705 # and the last scope marker in the list, if any, or the start of
706 # the list otherwise, and has the tag name subject.
708 for t, fe_of_afe in afe
709 if t.type is TYPE_AFE_MARKER
714 # If there is no such element, then abort these steps and instead
715 # act as described in the "any other end tag" entry above.
717 debug_log "aaa: fe not found in afe"
718 in_body_any_other_end_tag subject
720 # 6. If formatting element is not in the stack of open elements,
721 # then this is a parse error; remove the element from the list, and
724 for t, fe_of_open_els in open_els
729 debug_log "aaa: fe not found in open_els"
731 # "remove it from the list" must mean afe, since it's not in open_els
732 afe.splice fe_of_afe, 1
734 # 7. If formatting element is in the stack of open elements, but
735 # the element is not in scope, then this is a parse error; abort
737 unless el_is_in_scope fe
738 debug_log "aaa: fe not in scope"
741 # 8. If formatting element is not the current node, this is a parse
742 # error. (But do not abort these steps.)
743 unless open_els[0] is fe
746 # 9. Let furthest block be the topmost node in the stack of open
747 # elements that is lower in the stack than formatting element, and
748 # is an element in the special category. There might not be one.
750 fb_of_open_els = null
757 # and continue, to see if there's one that's more "topmost"
758 # 10. If there is no furthest block, then the UA must first pop all
759 # the nodes from the bottom of the stack of open elements, from the
760 # current node up to and including formatting element, then remove
761 # formatting element from the list of active formatting elements,
762 # and finally abort these steps.
764 debug_log "aaa: no fb"
768 afe.splice fe_of_afe, 1
770 # 11. Let common ancestor be the element immediately above
771 # formatting element in the stack of open elements.
772 ca = open_els[fe_of_open_els + 1] # common ancestor
774 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
775 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
776 bookmark = new_aaa_bookmark()
779 afe.splice i, 0, bookmark
781 node = last_node = fb
785 # 3. Let node be the element immediately above node in the
786 # stack of open elements, or if node is no longer in the stack
787 # of open elements (e.g. because it got removed by this
788 # algorithm), the element that was immediately above node in
789 # the stack of open elements before node was removed.
793 node_next = open_els[i + 1]
795 node = node_next ? node_above
796 debug_log "inner loop #{inner}"
797 debug_log "tree: #{serialize_els doc.children, false, true}"
798 debug_log "open_els: #{serialize_els open_els, true, true}"
799 debug_log "afe: #{serialize_els afe, true, true}"
800 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
801 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
802 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
803 debug_log "node: #{node.serialize true, true}"
804 # TODO make sure node_above gets re-set if/when node is removed from open_els
806 # 4. If node is formatting element, then go to the next step in
807 # the overall algorithm.
811 # 5. If inner loop counter is greater than three and node is in
812 # the list of active formatting elements, then remove node from
813 # the list of active formatting elements.
819 debug_log "max out inner"
824 # 6. If node is not in the list of active formatting elements,
825 # then remove node from the stack of open elements and then go
826 # back to the step labeled inner loop.
828 debug_log "not in afe"
831 node_above = open_els[i + 1]
835 debug_log "the bones"
836 # 7. create an element for the token for which the element node
837 # was created, in the HTML namespace, with common ancestor as
838 # the intended parent; replace the entry for node in the list
839 # of active formatting elements with an entry for the new
840 # element, replace the entry for node in the stack of open
841 # elements with an entry for the new element, and let node be
843 new_node = node.shallow_clone()
847 debug_log "replaced in afe"
851 node_above = open_els[i + 1]
852 open_els[i] = new_node
853 debug_log "replaced in open_els"
856 # 8. If last node is furthest block, then move the
857 # aforementioned bookmark to be immediately after the new node
858 # in the list of active formatting elements.
863 debug_log "removed bookmark"
867 # "after" means lower
868 afe.splice i, 0, bookmark # "after as <-
869 debug_log "placed bookmark after node"
870 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
872 # 9. Insert last node into node, first removing it from its
873 # previous parent node if any.
875 debug_log "last_node has parent"
876 for c, i in last_node.parent.children
878 debug_log "removing last_node from parent"
879 last_node.parent.children.splice i, 1
881 node.children.push last_node
882 last_node.parent = node
883 # 10. Let last node be node.
886 # 11. Return to the step labeled inner loop.
887 # 14. Insert whatever last node ended up being in the previous step
888 # at the appropriate place for inserting a node, but using common
889 # ancestor as the override target.
891 # In the case where fe is immediately followed by fb:
892 # * inner loop exits out early (node==fe)
894 # * last_node is still in the tree (not a duplicate)
896 debug_log "FEFIRST? last_node has parent"
897 for c, i in last_node.parent.children
899 debug_log "removing last_node from parent"
900 last_node.parent.children.splice i, 1
903 debug_log "after aaa inner loop"
904 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
905 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
906 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
907 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
908 debug_log "tree: #{serialize_els doc.children, false, true}"
913 # can't use standard insert token thing, because it's already in
914 # open_els and must stay at its current position in open_els
915 dest = adjusted_insertion_location ca
916 dest[0].children.splice dest[1], 0, last_node
917 last_node.parent = dest[0]
920 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
921 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
922 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
923 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
924 debug_log "tree: #{serialize_els doc.children, false, true}"
926 # 15. Create an element for the token for which formatting element
927 # was created, in the HTML namespace, with furthest block as the
929 new_element = fe.shallow_clone() # FIXME intended parent thing
930 # 16. Take all of the child nodes of furthest block and append them
931 # to the element created in the last step.
932 while fb.children.length
933 t = fb.children.shift()
934 t.parent = new_element
935 new_element.children.push t
936 # 17. Append that new element to furthest block.
937 new_element.parent = fb
938 fb.children.push new_element
939 # 18. Remove formatting element from the list of active formatting
940 # elements, and insert the new element into the list of active
941 # formatting elements at the position of the aforementioned
951 # 19. Remove formatting element from the stack of open elements,
952 # and insert the new element into the stack of open elements
953 # immediately below the position of furthest block in that stack.
960 open_els.splice i, 0, new_element
962 # 20. Jump back to the step labeled outer loop.
963 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
964 debug_log "tree: #{serialize_els doc.children, false, true}"
965 debug_log "open_els: #{serialize_els open_els, true, true}"
966 debug_log "afe: #{serialize_els afe, true, true}"
969 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
971 generate_implied_end_tags 'p' # arg is exception
972 if open_els[0].name isnt 'p'
974 while open_els.length > 1 # just in case
975 el = open_els.shift()
978 close_p_if_in_button_scope = ->
979 if is_in_button_scope 'p'
982 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
983 # aka insert_a_character = (t) ->
984 insert_character = (t) ->
985 dest = adjusted_insertion_location()
986 # fixfull check for Document node
988 prev = dest[0].children[dest[1] - 1]
989 if prev.type is TYPE_TEXT
992 dest[0].children.splice dest[1], 0, t
995 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
996 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
997 adjusted_insertion_location = (override_target = null) ->
998 # 1. If there was an override target specified, then let target be the
1001 target = override_target
1002 else # Otherwise, let target be the current node.
1003 target = open_els[0]
1004 # 2. Determine the adjusted insertion location using the first matching
1005 # steps from the following list:
1007 # If foster parenting is enabled and target is a table, tbody, tfoot,
1008 # thead, or tr element: (foster parenting happens when content is
1009 # misnested in tables)
1010 if flag_foster_parenting and foster_parenting_targets[target.name]
1011 loop # once. this is here so we can ``break`` to "abort these substeps"
1012 # 1. Let last template be the last template element in the
1013 # stack of open elements, if any.
1014 last_template = null
1015 last_template_i = null
1016 for el, i in open_els
1017 if el.name is 'template'
1021 # 2. Let last table be the last table element in the stack of
1022 # open elements, if any.
1025 for el, i in open_els
1026 if el.name is 'table'
1030 # 3. If there is a last template and either there is no last
1031 # table, or there is one, but last template is lower (more
1032 # recently added) than last table in the stack of open
1033 # elements, then: let adjusted insertion location be inside
1034 # last template's template contents, after its last child (if
1035 # any), and abort these substeps.
1036 if last_template and (last_table is null or last_template_i < last_table_i)
1037 target = last_template # fixfull should be it's contents
1038 target_i = target.children.length
1040 # 4. If there is no last table, then let adjusted insertion
1041 # location be inside the first element in the stack of open
1042 # elements (the html element), after its last child (if any),
1043 # and abort these substeps. (fragment case)
1044 if last_table is null
1046 target = open_els[open_els.length - 1]
1047 target_i = target.children.length
1048 # 5. If last table has a parent element, then let adjusted
1049 # insertion location be inside last table's parent element,
1050 # immediately before last table, and abort these substeps.
1051 if last_table.parent?
1052 for c, i in last_table.parent.children
1054 target = last_table.parent
1058 # 6. Let previous element be the element immediately above last
1059 # table in the stack of open elements.
1061 # huh? how could it not have a parent?
1062 previous_element = open_els[last_table_i + 1]
1063 # 7. Let adjusted insertion location be inside previous
1064 # element, after its last child (if any).
1065 target = previous_element
1066 target_i = target.children.length
1067 # Note: These steps are involved in part because it's possible
1068 # for elements, the table element in this case in particular,
1069 # to have been moved by a script around in the DOM, or indeed
1070 # removed from the DOM entirely, after the element was inserted
1072 break # don't really loop
1074 # Otherwise Let adjusted insertion location be inside target, after
1075 # its last child (if any).
1076 target_i = target.children.length
1078 # 3. If the adjusted insertion location is inside a template element,
1079 # let it instead be inside the template element's template contents,
1080 # after its last child (if any).
1081 # fixfull (template)
1083 # 4. Return the adjusted insertion location.
1084 return [target, target_i]
1086 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1087 # aka create_an_element_for_token
1088 token_to_element = (t, namespace, intended_parent) ->
1089 t.type = TYPE_TAG # not TYPE_START_TAG
1090 # convert attributes into a hash
1092 while t.attrs_a.length
1094 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1095 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1097 # TODO 2. If the newly created element has an xmlns attribute in the
1098 # XMLNS namespace whose value is not exactly the same as the element's
1099 # namespace, that is a parse error. Similarly, if the newly created
1100 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1101 # value is not the XLink Namespace, that is a parse error.
1103 # fixfull: the spec says stuff about form pointers and ownerDocument
1107 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1108 insert_foreign_element = (token, namespace) ->
1109 ail = adjusted_insertion_location()
1112 el = token_to_element token, namespace, ail_el
1113 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1115 ail_el.children.splice ail_i, 0, el
1118 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1119 insert_html_element = insert_foreign_element # (token, namespace) ->
1121 # FIXME implement the "foster parenting" part
1122 # FIXME read spec, do this right
1123 # FIXME implement the override target thing
1124 # note: this assumes it's an open tag
1125 # FIXME what part of the spec is this?
1126 # TODO look through all callers of this, and see what they should really be doing.
1127 # eg probably insert_html_element for tokens
1128 tree_insert_element = (el, override_target = null, namespace = null) ->
1130 el.namespace = namespace
1131 dest = adjusted_insertion_location override_target
1132 if el.type is TYPE_START_TAG # means it's a "token"
1133 el = token_to_element el, namespace, dest[0]
1134 unless el.namespace?
1135 namespace = dest.namespace
1136 # fixfull: Document nodes sometimes can't accept more children
1137 dest[0].children.splice dest[1], 0, el
1142 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1143 # position should be [node, index_within_children]
1144 insert_comment = (t, position = null) ->
1145 position ?= adjusted_insertion_location()
1146 position[0].children.splice position[1], 0, t
1149 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: insert an HTML element for start tag
# token ``t``, put the tokenizer into the RAWTEXT state, and hand tree
# construction over to the "text" insertion mode. The insertion mode that was
# current on entry is saved in original_insertion_mode.
# Returns ins_mode_text (the value of the final assignment).
parse_generic_raw_text = (t) ->
	insert_html_element(t)
	# save the current mode before overwriting it below
	original_insertion_mode = insertion_mode
	tok_state = tok_state_rawtext
	return insertion_mode = ins_mode_text
# Generic RCDATA element parsing: same steps as parse_generic_raw_text, except
# that the tokenizer is switched to the RCDATA state.
# Returns ins_mode_text (the value of the final assignment).
parse_generic_rcdata_text = (t) ->
	insert_html_element(t)
	# save the current mode before overwriting it below
	original_insertion_mode = insertion_mode
	tok_state = tok_state_rcdata
	return insertion_mode = ins_mode_text
1161 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1162 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1163 generate_implied_end_tags = (except = null) ->
1164 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1167 # 8.2.5.4 The rules for parsing tokens in HTML content
1168 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1170 # 8.2.5.4.1 The "initial" insertion mode
1171 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1172 ins_mode_initial = (t) ->
1175 if t.type is TYPE_COMMENT
1179 if t.type is TYPE_DOCTYPE
1180 # FIXME check identifiers, set quirks, etc
1183 insertion_mode = ins_mode_before_html
1186 #fixfull (iframe, quirks)
1187 insertion_mode = ins_mode_before_html
1188 insertion_mode t # reprocess the token
1191 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1192 ins_mode_before_html = (t) ->
1193 if t.type is TYPE_DOCTYPE
1196 if t.type is TYPE_COMMENT
1201 if t.type is TYPE_START_TAG and t.name is 'html'
1202 el = token_to_element t, NS_HTML, doc
1203 doc.children.push el
1204 open_els.unshift(el)
1205 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1206 insertion_mode = ins_mode_before_head
1208 if t.type is TYPE_END_TAG
1209 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1210 # fall through to "anything else"
1215 html_tok = new_open_tag 'html'
1216 el = token_to_element html_tok, NS_HTML, doc
1217 doc.children.push el
1219 # ?fixfull browsing context
1220 insertion_mode = ins_mode_before_head
1224 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1225 ins_mode_before_head = (t) ->
1228 if t.type is TYPE_COMMENT
1231 if t.type is TYPE_DOCTYPE
1234 if t.type is TYPE_START_TAG and t.name is 'html'
1237 if t.type is TYPE_START_TAG and t.name is 'head'
1238 el = insert_html_element t
1239 head_element_pointer = el
1240 insertion_mode = ins_mode_in_head
1241 if t.type is TYPE_END_TAG
1242 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1243 # fall through to Anything else below
1248 head_tok = new_open_tag 'head'
1249 el = insert_html_element head_tok
1250 head_element_pointer = el
1251 insertion_mode = ins_mode_in_head
1252 insertion_mode t # reprocess current token
1254 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1255 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1256 open_els.shift() # spec says this will be a 'head' node
1257 insertion_mode = ins_mode_after_head
1259 ins_mode_in_head = (t) ->
1260 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1263 if t.type is TYPE_COMMENT
1266 if t.type is TYPE_DOCTYPE
1269 if t.type is TYPE_START_TAG and t.name is 'html'
1272 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1273 el = insert_html_element t
1275 t.acknowledge_self_closing()
1277 if t.type is TYPE_START_TAG and t.name is 'meta'
1278 el = insert_html_element t
1280 t.acknowledge_self_closing()
1281 # fixfull encoding stuff
1283 if t.type is TYPE_START_TAG and t.name is 'title'
1284 parse_generic_rcdata_text t
1286 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1287 parse_generic_raw_text t
1289 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1290 insert_html_element t
1291 insertion_mode = ins_mode_in_head_noscript # FIXME implement
1293 if t.type is TYPE_START_TAG and t.name is 'script'
1294 ail = adjusted_insertion_location()
1295 el = token_to_element t, NS_HTML, ail
1296 el.flag 'parser-inserted', true # FIXME implement
1297 # fixfull fragment case
1298 ail[0].children.splice ail[1], 0, el
1300 tok_state = tok_state_script_data
1301 original_insertion_mode = insertion_mode # make sure orig... is defined
1302 insertion_mode = ins_mode_text # FIXME implement
1304 if t.type is TYPE_END_TAG and t.name is 'head'
1305 open_els.shift() # will be a head element... spec says so
1306 insertion_mode = ins_mode_after_head
1308 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1309 ins_mode_in_head_else t
1311 if t.type is TYPE_START_TAG and t.name is 'template'
1312 insert_html_element t
1314 flag_frameset_ok = false
1315 insertion_mode = ins_mode_in_template
1316 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
# An end tag whose tag name is "template"
if t.type is TYPE_END_TAG and t.name is 'template'
	if template_tag_is_open()
		# Parentheses are required: a bare ``generate_implied_end_tags`` is
		# only a reference to the function and never runs it.
		generate_implied_end_tags()
		if open_els[0].name isnt 'template'
			parse_error()
		# pop elements off the stack until a template element has been popped
		loop
			el = open_els.shift()
			if el.name is 'template'
				break
		clear_afe_to_marker()
		template_insertion_modes.shift()
		reset_insertion_mode()
	else
		# no template element on the stack: parse error, ignore the token
		parse_error()
	return
1333 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1336 ins_mode_in_head_else t
1338 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1339 ins_mode_in_head_noscript = (t) ->
1341 console.log "ins_mode_in_head_noscript unimplemented"
1343 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1344 ins_mode_after_head_else = (t) ->
1345 body_tok = new_open_tag 'body'
1346 insert_html_element body_tok
1347 insertion_mode = ins_mode_in_body
1348 insertion_mode t # reprocess token
# "after head" insertion mode: handles one token ``t`` seen after </head> and
# before <body>/<frameset>. Each branch either consumes the token, delegates
# to another insertion mode, or falls through to ins_mode_after_head_else,
# which inserts an implied <body> and reprocesses the token.
ins_mode_after_head = (t) ->
	if is_space_tok t
		insert_character t
		return
	if t.type is TYPE_COMMENT
		insert_comment t
		return
	if t.type is TYPE_DOCTYPE
		parse_error()
		return
	if t.type is TYPE_START_TAG and t.name is 'html'
		ins_mode_in_body t
		return
	if t.type is TYPE_START_TAG and t.name is 'body'
		insert_html_element t
		flag_frameset_ok = false
		insertion_mode = ins_mode_in_body
		return
	if t.type is TYPE_START_TAG and t.name is 'frameset'
		insert_html_element t
		insertion_mode = ins_mode_in_frameset
		return
	if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
		parse_error()
		# temporarily re-open the head element so the "in head" rules insert
		# into it
		open_els.unshift head_element_pointer
		ins_mode_in_head t
		# Remove the head element again; it might not be the current node any
		# more. This must be ``in`` (array iteration): with ``of`` the first
		# loop variable is the index key, so the comparison below never
		# matched and the head element was left on the stack.
		for el, i in open_els
			if el is head_element_pointer
				open_els.splice i, 1
				return
		console.log "warning: 23904 couldn't find head element in open_els"
		return
	if t.type is TYPE_END_TAG and t.name is 'template'
		ins_mode_in_head t
		return
	if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
		ins_mode_after_head_else t
		return
	if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
		# stray <head> or any other end tag: parse error, ignore the token
		parse_error()
		return
	# Anything else
	ins_mode_after_head_else t
1394 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1395 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1396 for node, i in open_els
1397 if node.name is name # FIXME check namespace too
1398 generate_implied_end_tags name # arg is exception
1399 parse_error() unless i is 0
1404 if special_elements[node.name]? # FIXME check namespac too
1407 ins_mode_in_body = (t) ->
1413 when "\t", "\u000a", "\u000c", "\u000d", ' '
1414 reconstruct_active_formatting_elements()
1417 reconstruct_active_formatting_elements()
1419 flag_frameset_ok = false
1428 return if template_tag_is_open()
1429 root_attrs = open_els[open_els.length - 1].attrs
1431 root_attrs[k] = v unless root_attrs[k]?
1432 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1433 # FIXME also do this for </template> (end tag)
1434 return ins_mode_in_head t
1441 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1442 close_p_if_in_button_scope()
1443 insert_html_element t
1444 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1445 close_p_if_in_button_scope()
1446 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1449 insert_html_element t
1450 # TODO lots more to implement here
1452 # If the list of active formatting elements
1453 # contains an a element between the end of the list and
1454 # the last marker on the list (or the start of the list
1455 # if there is no marker on the list), then this is a
1456 # parse error; run the adoption agency algorithm for
1457 # the tag name "a", then remove that element from the
1458 # list of active formatting elements and the stack of
1459 # open elements if the adoption agency algorithm didn't
1460 # already remove it (it might not have if the element
1461 # is not in table scope).
1464 if el.type is TYPE_AFE_MARKER
1474 for el, i in open_els
1476 open_els.splice i, 1
1477 reconstruct_active_formatting_elements()
1478 el = insert_html_element t
1480 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1481 reconstruct_active_formatting_elements()
1482 el = insert_html_element t
1485 # fixfull quirksmode thing
1486 close_p_if_in_button_scope()
1487 insert_html_element t
1488 insertion_mode = ins_mode_in_table
1489 # TODO lots more to implement here
1490 else # any other start tag
1491 reconstruct_active_formatting_elements()
1492 insert_html_element t
1495 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1496 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1499 unless ok_tags[t.name]?
1502 # TODO stack of template insertion modes thing
1507 unless is_in_scope 'body'
1510 # TODO implement parse error and move to tree_after_body
1512 unless is_in_scope 'body' # weird, but it's what the spec says
1515 # TODO implement parse error and move to tree_after_body, reprocess
1516 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1517 unless is_in_scope t.name, NS_HTML
1520 generate_implied_end_tags()
1521 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1524 el = open_els.shift()
1525 if el.name is t.name and el.namespace is NS_HTML
1527 # TODO lots more close tags to implement here
1529 unless is_in_button_scope 'p'
1531 insert_html_element new_open_tag 'p'
1533 # TODO lots more close tags to implement here
1534 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1535 adoption_agency t.name
1536 # TODO lots more close tags to implement here
1538 in_body_any_other_end_tag t.name
1541 ins_mode_in_table_else = (t) ->
1543 flag_foster_parenting = true # FIXME
1545 flag_foster_parenting = false
1546 can_in_table = { # FIXME do this inline like everywhere else
1554 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1555 ins_mode_text = (t) ->
1556 if t.type is TYPE_TEXT
1559 if t.type is TYPE_EOF
1561 if open_els[0].name is 'script'
1562 open_els[0].flag 'already started', true
1564 insertion_mode = original_insertion_mode
1567 if t.type is TYPE_END_TAG and t.name is 'script'
1569 insertion_mode = original_insertion_mode
1570 # fixfull the spec seems to assume that I'm going to run the script
1571 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1573 if t.type is TYPE_END_TAG
1575 insertion_mode = original_insertion_mode
1577 console.log 'warning: end of ins_mode_text reached'
1579 # the functions below implement the tokenizer states described here:
1580 # http://www.w3.org/TR/html5/syntax.html#tokenization
1582 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1583 ins_mode_in_table = (t) ->
1586 if can_in_table[t.name]
1587 original_insertion_mode = insertion_mode
1588 insertion_mode = ins_mode_in_table_text
1591 ins_mode_in_table_else t
1599 clear_stack_to_table_context()
1601 insert_html_element t
1602 insertion_mode = ins_mode_in_caption
1604 clear_stack_to_table_context()
1605 insert_html_element t
1606 insertion_mode = ins_mode_in_column_group
1608 clear_stack_to_table_context()
1609 insert_html_element new_open_tag 'colgroup'
1610 insertion_mode = ins_mode_in_column_group
1612 when 'tbody', 'tfoot', 'thead'
1613 clear_stack_to_table_context()
1614 insert_html_element t
1615 insertion_mode = ins_mode_in_table_body
1616 when 'td', 'th', 'tr'
1617 clear_stack_to_table_context()
1618 insert_html_element new_open_tag 'tbody'
1619 insertion_mode = ins_mode_in_table_body
1623 if is_in_table_scope 'table'
1625 el = open_els.shift()
1626 if el.name is 'table'
1628 reset_insertion_mode()
1630 when 'style', 'script', 'template'
1633 if is_input_hidden_tok t
1634 ins_mode_in_table_else t
1637 el = insert_html_element t
1639 t.acknowledge_self_closing()
1642 if form_element_pointer?
1644 if template_tag_is_open()
1646 form_element_pointer = insert_html_element t
1649 ins_mode_in_table_else t
1653 if is_in_table_scope 'table'
1655 el = open_els.shift()
1656 if el.name is 'table'
1658 reset_insertion_mode()
1661 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1666 ins_mode_in_table_else t
1670 ins_mode_in_table_else t
1673 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# "in table text" insertion mode: buffers character tokens seen directly
# inside a table. When the first non-character token ``t`` arrives, the
# buffered characters are flushed, the previous insertion mode is restored,
# and ``t`` is reprocessed in that mode.
ins_mode_in_table_text = (t) ->
	if t.type is TYPE_TEXT and t.text is "\u0000"
		# huh? I thought the tokenizer didn't emit these
		parse_error()
		return
	if t.type is TYPE_TEXT
		pending_table_character_tokens.push t
		return
	# Anything else
	all_space = true
	for old in pending_table_character_tokens
		unless is_space_tok old
			all_space = false
			break
	if all_space
		# whitespace only: insert the characters normally
		for old in pending_table_character_tokens
			insert_character old
	else
		# At least one non-space character: reprocess every pending token
		# with the "anything else" rules of the "in table" mode. The function
		# is named ins_mode_in_table_else; the previous spelling
		# (ins_mode_table_else) referred to a name that does not exist.
		for old in pending_table_character_tokens
			ins_mode_in_table_else old
	pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
	insertion_mode = original_insertion_mode
	insertion_mode t
1698 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1699 ins_mode_in_caption = (t) ->
1700 if t.type is TYPE_END_TAG and t.name is 'caption'
1701 if is_in_table_scope 'caption'
1702 generate_implied_end_tags()
1703 if open_els[0].name isnt 'caption'
1706 el = open_els.shift()
1707 if el.name is 'caption'
1709 clear_afe_to_marker()
1710 insertion_mode = ins_mode_in_table
1715 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1717 if is_in_table_scope 'caption'
1719 el = open_els.shift()
1720 if el.name is 'caption'
1722 clear_afe_to_marker()
1723 insertion_mode = ins_mode_in_table
1725 # else fragment case
1727 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1733 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1734 ins_mode_in_column_group = (t) ->
1738 if t.type is TYPE_COMMENT
1741 if t.type is TYPE_DOCTYPE
1744 if t.type is TYPE_START_TAG and t.name is 'html'
1747 if t.type is TYPE_START_TAG and t.name is 'col'
1748 el = insert_html_element t
1750 t.acknowledge_self_closing()
1752 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1753 if open_els[0].name is 'colgroup'
1755 insertion_mode = ins_mode_in_table
1759 if t.type is TYPE_END_TAG and t.name is 'col'
1762 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1765 if t.type is TYPE_EOF
1769 if open_els[0].name isnt 'colgroup'
1773 insertion_mode = ins_mode_in_table
1777 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1778 ins_mode_in_table_body = (t) ->
1779 if t.type is TYPE_START_TAG and t.name is 'tr'
1780 clear_stack_to_table_body_context()
1781 insert_html_element t
1782 insertion_mode = ins_mode_in_row
1784 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1786 clear_stack_to_table_body_context()
1787 insert_html_element new_open_tag 'tr'
1788 insertion_mode = ins_mode_in_row
1791 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1792 unless is_in_table_scope t.name # fixfull check namespace
1795 clear_stack_to_table_body_context()
1797 insertion_mode = ins_mode_in_table
1799 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1802 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1805 if table_scopers[el.name]
1810 clear_stack_to_table_body_context()
1812 insertion_mode = ins_mode_in_table
1815 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1821 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1822 ins_mode_in_row = (t) ->
1823 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1824 clear_stack_to_table_row_context()
1825 insert_html_element t
1826 insertion_mode = ins_mode_in_cell
1829 if t.type is TYPE_END_TAG and t.name is 'tr'
1830 if is_in_table_scope 'tr'
1831 clear_stack_to_table_row_context()
1833 insertion_mode = ins_mode_in_table_body
1837 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1838 if is_in_table_scope 'tr'
1839 clear_stack_to_table_row_context()
1841 insertion_mode = ins_mode_in_table_body
1846 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1847 if is_in_table_scope t.name # fixfull namespace
1848 if is_in_table_scope 'tr'
1849 clear_stack_to_table_row_context()
1851 insertion_mode = ins_mode_in_table_body
1856 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1862 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Close the cell: generate implied end tags, pop the stack of open elements
# up to and including the td/th, clear the active formatting elements back to
# the last marker, and switch to the "in row" insertion mode.
generate_implied_end_tags()
# Compare the current node's *name* in both halves of the test. The old code
# compared the node object itself with 'th', which is never equal, so a
# current <th> was always reported as a parse error.
unless open_els[0].name is 'td' or open_els[0].name is 'th'
	parse_error()
loop
	el = open_els.shift()
	if el.name is 'td' or el.name is 'th'
		break
clear_afe_to_marker()
insertion_mode = ins_mode_in_row
1874 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1875 ins_mode_in_cell = (t) ->
1876 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1877 if is_in_table_scope t.name
1878 generate_implied_end_tags()
1879 if open_els[0].name isnt t.name
1882 el = open_els.shift()
1883 if el.name is t.name
1885 clear_afe_to_marker()
1886 insertion_mode = ins_mode_in_row
1890 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1893 if el.name is 'td' or el.name is 'th'
1896 if table_scopers[el.name]
1904 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1907 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1908 if is_in_table_scope t.name # fixfull namespace
1917 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1918 ins_mode_in_select = (t) ->
1919 if t.type is TYPE_TEXT and t.text is "\u0000"
1922 if t.type is TYPE_TEXT
1925 if t.type is TYPE_COMMENT
1928 if t.type is TYPE_DOCTYPE
1931 if t.type is TYPE_START_TAG and t.name is 'html'
1934 if t.type is TYPE_START_TAG and t.name is 'option'
1935 if open_els[0].name is 'option'
1937 insert_html_element t
1939 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1940 if open_els[0].name is 'option'
1942 if open_els[0].name is 'optgroup'
1944 insert_html_element t
1946 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1947 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1949 if open_els[0].name is 'optgroup'
1954 if t.type is TYPE_END_TAG and t.name is 'option'
1955 if open_els[0].name is 'option'
1960 if t.type is TYPE_END_TAG and t.name is 'select'
1961 if is_in_select_scope 'select'
1963 el = open_els.shift()
1964 if el.name is 'select'
1966 reset_insertion_mode()
1970 if t.type is TYPE_START_TAG and t.name is 'select'
1973 el = open_els.shift()
1974 if el.name is 'select'
1976 reset_insertion_mode()
1977 # spec says that this is the same as </select> but it doesn't say
1978 # to check scope first
1980 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1982 if is_in_select_scope 'select'
1985 el = open_els.shift()
1986 if el.name is 'select'
1988 reset_insertion_mode()
1991 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1994 if t.type is TYPE_EOF
2001 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2002 ins_mode_in_select_in_table = (t) ->
2003 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2006 el = open_els.shift()
2007 if el.name is 'select'
2009 reset_insertion_mode()
2012 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2014 unless is_in_table_scope t.name, NS_HTML
2017 el = open_els.shift()
2018 if el.name is 'select'
2020 reset_insertion_mode()
2024 ins_mode_in_select t
2027 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2028 ins_mode_in_template = (t) ->
2029 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2032 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2035 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2036 template_insertion_modes.shift()
2037 template_insertion_modes.unshift ins_mode_in_table
2038 insertion_mode = ins_mode_in_table
2041 if t.type is TYPE_START_TAG and t.name is 'col'
2042 template_insertion_modes.shift()
2043 template_insertion_modes.unshift ins_mode_in_column_group
2044 insertion_mode = ins_mode_in_column_group
2047 if t.type is TYPE_START_TAG and t.name is 'tr'
2048 template_insertion_modes.shift()
2049 template_insertion_modes.unshift ins_mode_in_table_body
2050 insertion_mode = ins_mode_in_table_body
2053 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2054 template_insertion_modes.shift()
2055 template_insertion_modes.unshift ins_mode_in_row
2056 insertion_mode = ins_mode_in_row
2059 if t.type is TYPE_START_TAG
2060 template_insertion_modes.shift()
2061 template_insertion_modes.unshift ins_mode_in_body
2062 insertion_mode = ins_mode_in_body
2065 if t.type is TYPE_END_TAG
2068 if t.type is TYPE_EOF
2069 unless template_tag_is_open()
2074 el = open_els.shift()
2075 if el.name is 'template' # fixfull check namespace
2077 clear_afe_to_marker()
2078 template_insertion_modes.shift()
2079 reset_insertion_mode()
2082 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2083 ins_mode_after_body = (t) ->
2087 if t.type is TYPE_COMMENT
2088 insert_comment t, [open_els[0], open_els[0].children.length]
2090 if t.type is TYPE_DOCTYPE
2093 if t.type is TYPE_START_TAG and t.name is 'html'
2096 if t.type is TYPE_END_TAG and t.name is 'html'
2097 # fixfull fragment case
2098 insertion_mode = ins_mode_after_after_body
2100 if t.type is TYPE_EOF
2105 insertion_mode = ins_mode_in_body
2108 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2109 ins_mode_in_frameset = (t) ->
2113 if t.type is TYPE_COMMENT
2116 if t.type is TYPE_DOCTYPE
2119 if t.type is TYPE_START_TAG and t.name is 'html'
2122 if t.type is TYPE_START_TAG and t.name is 'frameset'
2123 insert_html_element t
2125 if t.type is TYPE_END_TAG and t.name is 'frameset'
2126 # TODO ?correct for: "if the current node is the root html element"
2127 if open_els.length is 1
2129 return # fragment case
2131 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2132 insertion_mode = ins_mode_after_frameset
2134 if t.type is TYPE_START_TAG and t.name is 'frame'
2135 insert_html_element t
2137 t.acknowledge_self_closing()
2139 if t.type is TYPE_START_TAG and t.name is 'noframes'
2142 if t.type is TYPE_EOF
2143 # TODO ?correct for: "if the current node is not the root html element"
2144 if open_els.length isnt 1
2152 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2153 ins_mode_after_frameset = (t) ->
2157 if t.type is TYPE_COMMENT
2160 if t.type is TYPE_DOCTYPE
2163 if t.type is TYPE_START_TAG and t.name is 'html'
# An end tag whose tag name is "html": switch to "after after frameset"
if t.type is TYPE_END_TAG and t.name is 'html'
	# Assign the parser's insertion_mode variable. The previous spelling
	# (insert_mode) only created an unused local, so the mode never changed.
	insertion_mode = ins_mode_after_after_frameset
	return
2169 if t.type is TYPE_START_TAG and t.name is 'noframes'
2172 if t.type is TYPE_EOF
2179 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2180 ins_mode_after_after_body = (t) ->
2181 if t.type is TYPE_COMMENT
2182 insert_comment t, [doc, doc.children.length]
2184 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2187 if t.type is TYPE_EOF
2192 insertion_mode = ins_mode_in_body
2195 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Tree-construction handler for the "after after frameset" insertion mode.
# t - the token to process.
# Comments are inserted as the last child of `doc`; DOCTYPE, whitespace,
# <html>, EOF and <noframes> tokens each have their own branch.
2196 ins_mode_after_after_frameset = (t) ->
2197 if t.type is TYPE_COMMENT
2198 insert_comment t, [doc, doc.children.length]
2200 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2203 if t.type is TYPE_EOF
2206 if t.type is TYPE_START_TAG and t.name is 'noframes'
2217 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Data state body: reads one character from `txt` at `cur` (advancing `cur`)
# and, depending on it, returns a text node built from a character reference,
# switches to the tag open state, returns the character as a text node, or
# returns an EOF token.
# NOTE(review): presumably this is the body of tok_state_data (other states
# assign that name to `tok_state`) -- confirm.
2219 switch c = txt.charAt(cur++)
2221 return new_text_node parse_character_reference()
2223 tok_state = tok_state_tag_open
2226 return new_text_node c
2228 return new_eof_token()
2230 return new_text_node c
2233 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2234 # not needed: tok_state_character_reference_in_data = ->
2235 # just call parse_character_reference()
2237 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# RCDATA tokenizer state. Reads one character (advancing `cur`) and,
# depending on it, returns a parsed character reference as a text node,
# switches to the RCDATA less-than-sign state, or returns a character token
# (U+FFFD or the character itself) or an EOF token.
2238 tok_state_rcdata = ->
2239 switch c = txt.charAt(cur++)
2241 return new_text_node parse_character_reference()
2243 tok_state = tok_state_rcdata_less_than_sign
2246 return new_character_token "\ufffd"
2248 return new_eof_token()
2250 return new_character_token c
2253 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2254 # not needed: tok_state_character_reference_in_rcdata = ->
2255 # just call parse_character_reference()
2257 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# RAWTEXT tokenizer state. Reads one character (advancing `cur`) and,
# depending on it, switches to the RAWTEXT less-than-sign state or returns a
# character token (U+FFFD or the character itself) or an EOF token.
2258 tok_state_rawtext = ->
2259 switch c = txt.charAt(cur++)
2261 tok_state = tok_state_rawtext_less_than_sign
2264 return new_character_token "\ufffd"
2266 return new_eof_token()
2268 return new_character_token c
2271 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Script data tokenizer state. Reads one character (advancing `cur`) and,
# depending on it, switches to the script data less-than-sign state or
# returns a character token (U+FFFD or the character itself) or an EOF token.
2272 tok_state_script_data = ->
2273 switch c = txt.charAt(cur++)
2275 tok_state = tok_state_script_data_less_than_sign
2278 return new_character_token "\ufffd"
2280 return new_eof_token()
2282 return new_character_token c
2285 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# PLAINTEXT tokenizer state. Reads one character (advancing `cur`) and
# returns a character token (U+FFFD or the character itself) or an EOF token.
# No branch here changes `tok_state`.
2286 tok_state_plaintext = ->
2287 switch c = txt.charAt(cur++)
2290 return new_character_token "\ufffd"
2292 return new_eof_token()
2294 return new_character_token c
2298 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tag open tokenizer state (the character after '<'). Depending on the
# character read it moves to the markup declaration open or end tag open
# state, starts a bogus comment seeded with '?', starts a new open tag
# (name lowercased), or falls back to the data state and emits a literal '<'.
2299 tok_state_tag_open = ->
2300 switch c = txt.charAt(cur++)
2302 tok_state = tok_state_markup_declaration_open
2304 tok_state = tok_state_end_tag_open
2307 tok_cur_tag = new_comment_token '?'
2308 tok_state = tok_state_bogus_comment
2311 tok_cur_tag = new_open_tag c
2312 tok_state = tok_state_tag_name
2313 else if is_uc_alpha(c)
2314 tok_cur_tag = new_open_tag c.toLowerCase()
2315 tok_state = tok_state_tag_name
2318 tok_state = tok_state_data
2319 cur -= 1 # we didn't parse/handle the char after <
2320 return new_text_node '<'
2323 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# End tag open tokenizer state (HTML5 8.2.4.9): the input so far is "</".
#
# Reads one character (advancing `cur`):
#   '>'      parse error; back to the data state, nothing emitted
#   EOF      parse error; back to the data state, emit "</" as text
#   letter   start an end tag token (name lowercased) and switch to the
#            tag name state
#   other    parse error; start a bogus comment
#
# Returns a text node on EOF, otherwise null.
#
# NOTE(review): the `when` clauses and parse_error() calls were not visible
# in the reviewed excerpt and are restored from the spec -- confirm.
tok_state_end_tag_open = ->
  switch c = txt.charAt(cur++)
    when '>'
      parse_error()
      tok_state = tok_state_data
    when '' # EOF
      parse_error()
      tok_state = tok_state_data
      return new_text_node '</'
    else
      if is_uc_alpha(c)
        tok_cur_tag = new_end_tag c.toLowerCase()
        tok_state = tok_state_tag_name
      else if is_lc_alpha(c)
        tok_cur_tag = new_end_tag c
        tok_state = tok_state_tag_name
      else
        parse_error()
        # The bogus comment's data starts with the character that triggered
        # it (not with '/'), so start with an empty comment and reconsume
        # that character; the bogus comment state then also applies its NUL
        # replacement to it.
        tok_cur_tag = new_comment_token ''
        tok_state = tok_state_bogus_comment
        cur -= 1 # Reconsume
  return null
2346 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tag name tokenizer state. Reads one character (advancing `cur`):
# whitespace moves to the before attribute name state; other branches move to
# the self-closing start tag state or the data state, or append to
# `tok_cur_tag.name` (U+FFFD, the lowercased character, or the character).
2347 tok_state_tag_name = ->
2348 switch c = txt.charAt(cur++)
2349 when "\t", "\n", "\u000c", ' '
2350 tok_state = tok_state_before_attribute_name
2352 tok_state = tok_state_self_closing_start_tag
2354 tok_state = tok_state_data
2360 tok_cur_tag.name += "\ufffd"
2363 tok_state = tok_state_data
2366 tok_cur_tag.name += c.toLowerCase()
2368 tok_cur_tag.name += c
2371 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# RCDATA less-than-sign tokenizer state (a '<' was just seen in RCDATA).
# Reads one character: either clears `temporary_buffer` and moves to the
# RCDATA end tag open state, or returns to the RCDATA state, reconsumes the
# character and emits a literal '<'.
2372 tok_state_rcdata_less_than_sign = ->
2373 c = txt.charAt(cur++)
2375 temporary_buffer = ''
2376 tok_state = tok_state_rcdata_end_tag_open
2379 tok_state = tok_state_rcdata
2380 cur -= 1 # reconsume the input character
2381 return new_character_token '<'
2383 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# RCDATA end tag open tokenizer state ("</" was just seen in RCDATA).
# Reads one character: a letter starts an end tag token (name lowercased),
# is appended verbatim to `temporary_buffer`, and moves to the RCDATA end tag
# name state; otherwise the state returns to RCDATA, reconsumes the
# character and emits "</" as a single character token.
2384 tok_state_rcdata_end_tag_open = ->
2385 c = txt.charAt(cur++)
2387 tok_cur_tag = new_end_tag c.toLowerCase()
2388 temporary_buffer += c
2389 tok_state = tok_state_rcdata_end_tag_name
2392 tok_cur_tag = new_end_tag c
2393 temporary_buffer += c
2394 tok_state = tok_state_rcdata_end_tag_name
2397 tok_state = tok_state_rcdata
2398 cur -= 1 # reconsume the input character
2399 return new_character_token "</" # fixfull separate these
2401 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Reports whether token `t` is an "appropriate end tag token".
# t - the token to test (its `type` and `name` are read).
# Returns true when `t` is an end tag whose name equals the name of
# open_els[0] (the current node), false otherwise. Also writes a debug log
# line on every call.
# NOTE(review): open_els[0] is dereferenced unconditionally, so this assumes
# the stack of open elements is non-empty -- confirm callers guarantee that.
2402 is_appropriate_end_tag = (t) ->
2403 # spec says to check against "the tag name of the last start tag to
2404 # have been emitted from this tokenizer", but this is only called from
2405 # the various "raw" states, which I'm pretty sure all push the start
2406 # token onto open_els. TODO: verify this after the script data states
2408 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2409 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2411 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# RCDATA end tag name tokenizer state. Reads one character. Three
# terminator branches take effect only when `tok_cur_tag` is an appropriate
# end tag, leading to the before attribute name, self-closing start tag or
# data state. Letters extend the tag name (lowercased) and
# `temporary_buffer` (verbatim). Anything else returns to RCDATA, reconsumes
# the character and emits "</" plus the buffer as one character token.
2412 tok_state_rcdata_end_tag_name = ->
2413 c = txt.charAt(cur++)
2414 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2415 if is_appropriate_end_tag tok_cur_tag
2416 tok_state = tok_state_before_attribute_name
2418 # else fall through to "Anything else"
2420 if is_appropriate_end_tag tok_cur_tag
2421 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2423 # else fall through to "Anything else"
2425 if is_appropriate_end_tag tok_cur_tag
2426 tok_state = tok_state_data
2428 # else fall through to "Anything else"
2430 tok_cur_tag.name += c.toLowerCase()
2431 temporary_buffer += c
2434 tok_cur_tag.name += c
2435 temporary_buffer += c
2438 tok_state = tok_state_rcdata
2439 cur -= 1 # reconsume the input character
2440 return new_character_token '</' + temporary_buffer # fixfull separate these
2442 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# RAWTEXT less-than-sign tokenizer state (a '<' was just seen in RAWTEXT).
# Reads one character: either clears `temporary_buffer` and moves to the
# RAWTEXT end tag open state, or returns to RAWTEXT, reconsumes the
# character and emits a literal '<'.
2443 tok_state_rawtext_less_than_sign = ->
2444 c = txt.charAt(cur++)
2446 temporary_buffer = ''
2447 tok_state = tok_state_rawtext_end_tag_open
2450 tok_state = tok_state_rawtext
2451 cur -= 1 # reconsume the input character
2452 return new_character_token '<'
2454 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# RAWTEXT end tag open tokenizer state ("</" was just seen in RAWTEXT).
# Reads one character: a letter starts an end tag token (name lowercased),
# is appended verbatim to `temporary_buffer`, and moves to the RAWTEXT end
# tag name state; otherwise the state returns to RAWTEXT, reconsumes the
# character and emits "</" as a single character token.
2455 tok_state_rawtext_end_tag_open = ->
2456 c = txt.charAt(cur++)
2458 tok_cur_tag = new_end_tag c.toLowerCase()
2459 temporary_buffer += c
2460 tok_state = tok_state_rawtext_end_tag_name
2463 tok_cur_tag = new_end_tag c
2464 temporary_buffer += c
2465 tok_state = tok_state_rawtext_end_tag_name
2468 tok_state = tok_state_rawtext
2469 cur -= 1 # reconsume the input character
2470 return new_character_token "</" # fixfull separate these
2472 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# RAWTEXT end tag name tokenizer state. Reads one character. Three
# terminator branches take effect only when `tok_cur_tag` is an appropriate
# end tag, leading to the before attribute name, self-closing start tag or
# data state. Letters extend the tag name (lowercased) and
# `temporary_buffer` (verbatim). Anything else returns to RAWTEXT,
# reconsumes the character and emits "</" plus the buffer as one token.
2473 tok_state_rawtext_end_tag_name = ->
2474 c = txt.charAt(cur++)
2475 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2476 if is_appropriate_end_tag tok_cur_tag
2477 tok_state = tok_state_before_attribute_name
2479 # else fall through to "Anything else"
2481 if is_appropriate_end_tag tok_cur_tag
2482 tok_state = tok_state_self_closing_start_tag
2484 # else fall through to "Anything else"
2486 if is_appropriate_end_tag tok_cur_tag
2487 tok_state = tok_state_data
2489 # else fall through to "Anything else"
2491 tok_cur_tag.name += c.toLowerCase()
2492 temporary_buffer += c
2495 tok_cur_tag.name += c
2496 temporary_buffer += c
2499 tok_state = tok_state_rawtext
2500 cur -= 1 # reconsume the input character
2501 return new_character_token '</' + temporary_buffer # fixfull separate these
2503 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Script data less-than-sign tokenizer state (a '<' was just seen in script
# data). Reads one character: one branch clears `temporary_buffer` and moves
# to the script data end tag open state; one moves to the escape start state
# and emits "<!" as a single token; otherwise the state returns to script
# data, reconsumes the character and emits '<'.
2504 tok_state_script_data_less_than_sign = ->
2505 c = txt.charAt(cur++)
2507 temporary_buffer = ''
2508 tok_state = tok_state_script_data_end_tag_open
2511 tok_state = tok_state_script_data_escape_start
2512 return new_character_token '<!' # fixfull split
2514 tok_state = tok_state_script_data
2515 cur -= 1 # Reconsume
2516 return new_character_token '<'
2518 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Script data end tag open tokenizer state ("</" was just seen in script
# data). Reads one character: a letter starts an end tag token (name
# lowercased), is appended verbatim to `temporary_buffer`, and moves to the
# script data end tag name state; otherwise the state returns to script
# data, reconsumes the character and emits "</".
2519 tok_state_script_data_end_tag_open = ->
2520 c = txt.charAt(cur++)
2522 tok_cur_tag = new_end_tag c.toLowerCase()
2523 temporary_buffer += c
2524 tok_state = tok_state_script_data_end_tag_name
2527 tok_cur_tag = new_end_tag c
2528 temporary_buffer += c
2529 tok_state = tok_state_script_data_end_tag_name
2532 tok_state = tok_state_script_data
2533 cur -= 1 # Reconsume
2534 return new_character_token '</'
2536 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Script data end tag name tokenizer state (HTML5 8.2.4.19).
#
# Reads one character (advancing `cur`). Letters extend the pending end
# tag's name (lowercased) and `temporary_buffer` (verbatim). Whitespace, '/'
# and '>' finish the name, but only when the pending tag is an appropriate
# end tag; in every other case the buffered text is flushed as character data
# and tokenizing resumes in the script data state.
#
# Returns the finished end tag token on '>', a character token when the
# buffer is flushed, otherwise null.
#
# NOTE(review): the '>' branch mirrors tok_state_rcdata_end_tag_name. Handing
# the finished tag back with `return tok_cur_tag` is an assumption about the
# emit convention -- confirm against that sibling.
tok_state_script_data_end_tag_name = ->
  c = txt.charAt(cur++)
  if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
    if is_appropriate_end_tag tok_cur_tag
      tok_state = tok_state_before_attribute_name
      return null
    # else fall through to "Anything else"
  if c is '/'
    if is_appropriate_end_tag tok_cur_tag
      tok_state = tok_state_self_closing_start_tag
      return null
    # else fall through to "Anything else"
  if c is '>'
    # Without this branch "</script>" can never close a script element.
    if is_appropriate_end_tag tok_cur_tag
      tok_state = tok_state_data
      return tok_cur_tag
    # else fall through to "Anything else"
  if is_uc_alpha(c)
    tok_cur_tag.name += c.toLowerCase()
    temporary_buffer += c
    return null
  if is_lc_alpha(c)
    tok_cur_tag.name += c
    temporary_buffer += c
    return null
  # Anything else
  tok_state = tok_state_script_data
  cur -= 1 # Reconsume
  return new_character_token "</#{temporary_buffer}" # fixfull split
2562 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Script data escape start tokenizer state. Reads one character: one branch
# moves to the escape start dash state and emits '-'; otherwise the state
# returns to script data and reconsumes the character.
2563 tok_state_script_data_escape_start = ->
2564 c = txt.charAt(cur++)
2566 tok_state = tok_state_script_data_escape_start_dash
2567 return new_character_token '-'
2569 tok_state = tok_state_script_data
2570 cur -= 1 # Reconsume
2573 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Script data escape start dash tokenizer state. Reads one character: one
# branch moves to the escaped dash dash state and emits '-'; otherwise the
# state returns to script data and reconsumes the character.
2574 tok_state_script_data_escape_start_dash = ->
2575 c = txt.charAt(cur++)
2577 tok_state = tok_state_script_data_escaped_dash_dash
2578 return new_character_token '-'
2580 tok_state = tok_state_script_data
2581 cur -= 1 # Reconsume
2584 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Script data escaped tokenizer state. Reads one character and, depending on
# it, moves to the escaped dash state (emitting '-'), moves to the escaped
# less-than-sign state, emits U+FFFD, returns to the data state with the
# character reconsumed, or emits the character unchanged.
2585 tok_state_script_data_escaped = ->
2586 c = txt.charAt(cur++)
2588 tok_state = tok_state_script_data_escaped_dash
2589 return new_character_token '-'
2591 tok_state = tok_state_script_data_escaped_less_than_sign
2595 return new_character_token "\ufffd"
2597 tok_state = tok_state_data
2599 cur -= 1 # Reconsume
2602 return new_character_token c
2604 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Script data escaped dash tokenizer state. Reads one character and,
# depending on it, moves to the escaped dash dash state (emitting '-'), moves
# to the escaped less-than-sign state, returns to the data state with the
# character reconsumed, or goes back to the escaped state while emitting
# U+FFFD or the character.
2605 tok_state_script_data_escaped_dash = ->
2606 c = txt.charAt(cur++)
2608 tok_state = tok_state_script_data_escaped_dash_dash
2609 return new_character_token '-'
2611 tok_state = tok_state_script_data_escaped_less_than_sign
2615 tok_state = tok_state_script_data_escaped
2616 return new_character_token "\ufffd"
2618 tok_state = tok_state_data
2620 cur -= 1 # Reconsume
2623 tok_state = tok_state_script_data_escaped
2624 return new_character_token c
2626 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Script data escaped dash dash tokenizer state. Reads one character and,
# depending on it, emits '-' (no state change), moves to the escaped
# less-than-sign state, returns to script data while emitting '>', returns to
# the data state with the character reconsumed, or goes back to the escaped
# state while emitting U+FFFD or the character.
2627 tok_state_script_data_escaped_dash_dash = ->
2628 c = txt.charAt(cur++)
2630 return new_character_token '-'
2632 tok_state = tok_state_script_data_escaped_less_than_sign
2635 tok_state = tok_state_script_data
2636 return new_character_token '>'
2639 tok_state = tok_state_script_data_escaped
2640 return new_character_token "\ufffd"
2643 tok_state = tok_state_data
2644 cur -= 1 # Reconsume
2647 tok_state = tok_state_script_data_escaped
2648 return new_character_token c
2650 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Script data escaped less-than-sign tokenizer state (HTML5 8.2.4.25):
# a '<' was just seen inside escaped script data.
#
# Reads one character (advancing `cur`):
#   '/'      clear `temporary_buffer`, go to the escaped end tag open state
#   letter   seed `temporary_buffer` with the lowercased letter, go to the
#            double escape start state, emit '<' plus the letter as typed
#   other    back to the escaped state, reconsume the character, emit '<'
#
# Returns a character token, or null for the '/' branch.
#
# NOTE(review): the `if` lines and the '/' branch's return were not visible
# in the reviewed excerpt and are restored from the spec -- confirm.
tok_state_script_data_escaped_less_than_sign = ->
  c = txt.charAt(cur++)
  if c is '/'
    temporary_buffer = ''
    tok_state = tok_state_script_data_escaped_end_tag_open
    return null
  if is_uc_alpha(c)
    temporary_buffer = c.toLowerCase() # yes, really
    tok_state = tok_state_script_data_double_escape_start
    return new_character_token "<#{c}" # fixfull split
  if is_lc_alpha(c)
    temporary_buffer = c
    tok_state = tok_state_script_data_double_escape_start
    return new_character_token "<#{c}" # fixfull split
  # Anything else
  tok_state = tok_state_script_data_escaped
  cur -= 1 # Reconsume
  # The pending '<' is what must be emitted here; `c` has just been handed
  # back to the input, so emitting it would duplicate it and lose the '<'.
  return new_character_token '<'
2670 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Script data escaped end tag open tokenizer state ("</" was just seen in
# escaped script data). Reads one character: a letter starts an end tag
# token (name lowercased), is appended verbatim to `temporary_buffer`, and
# moves to the escaped end tag name state; otherwise the state returns to the
# escaped state, reconsumes the character and emits "</".
2671 tok_state_script_data_escaped_end_tag_open = ->
2672 c = txt.charAt(cur++)
2674 tok_cur_tag = new_end_tag c.toLowerCase()
2675 temporary_buffer += c
2676 tok_state = tok_state_script_data_escaped_end_tag_name
2679 tok_cur_tag = new_end_tag c
2680 temporary_buffer += c
2681 tok_state = tok_state_script_data_escaped_end_tag_name
2684 tok_state = tok_state_script_data_escaped
2685 cur -= 1 # Reconsume
2686 return new_character_token '</' # fixfull split
2688 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Script data escaped end tag name tokenizer state (HTML5 8.2.4.27).
#
# Reads one character (advancing `cur`). Letters extend the pending end
# tag's name (lowercased) and `temporary_buffer` (verbatim, because the
# buffer is re-emitted as text when the tag is abandoned). Whitespace, '/'
# and '>' finish the name, but only when the pending tag is an appropriate
# end tag; in every other case the buffered text is flushed as character data
# and tokenizing resumes in the script data escaped state.
#
# Returns the finished end tag token on '>', a character token when the
# buffer is flushed, otherwise null.
#
# NOTE(review): the '>' branch mirrors tok_state_rcdata_end_tag_name. Handing
# the finished tag back with `return tok_cur_tag` is an assumption about the
# emit convention -- confirm against that sibling.
tok_state_script_data_escaped_end_tag_name = ->
  c = txt.charAt(cur++)
  if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
    if is_appropriate_end_tag tok_cur_tag
      tok_state = tok_state_before_attribute_name
      return null
    # else fall through to "Anything else"
  if c is '/'
    if is_appropriate_end_tag tok_cur_tag
      tok_state = tok_state_self_closing_start_tag
      return null
    # else fall through to "Anything else"
  if c is '>'
    # Without this branch "</script>" cannot close an escaped script.
    if is_appropriate_end_tag tok_cur_tag
      tok_state = tok_state_data
      return tok_cur_tag
    # else fall through to "Anything else"
  if is_uc_alpha(c)
    tok_cur_tag.name += c.toLowerCase()
    temporary_buffer += c # keep original case: this text may be re-emitted
    return null
  if is_lc_alpha(c)
    tok_cur_tag.name += c
    temporary_buffer += c
    return null
  # Anything else
  tok_state = tok_state_script_data_escaped
  cur -= 1 # Reconsume
  return new_character_token "</#{temporary_buffer}" # fixfull split
2714 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Script data double escape start tokenizer state. Reads one character.
# On whitespace, '/' or '>' it moves to the double escaped state when
# `temporary_buffer` is exactly 'script', otherwise to the escaped state,
# and emits the character. Letters are appended to `temporary_buffer`
# (lowercased) and emitted as typed. Anything else returns to the escaped
# state and reconsumes the character.
2715 tok_state_script_data_double_escape_start = ->
2716 c = txt.charAt(cur++)
2717 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2718 if temporary_buffer is 'script'
2719 tok_state = tok_state_script_data_double_escaped
2721 tok_state = tok_state_script_data_escaped
2722 return new_character_token c
2724 temporary_buffer += c.toLowerCase() # yes, really lowercase
2725 return new_character_token c
2727 temporary_buffer += c
2728 return new_character_token c
2730 tok_state = tok_state_script_data_escaped
2731 cur -= 1 # Reconsume
2734 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Script data double escaped tokenizer state. Reads one character and,
# depending on it, moves to the double escaped dash state (emitting '-'),
# moves to the double escaped less-than-sign state (emitting '<'), emits
# U+FFFD, returns to the data state with the character reconsumed, or emits
# the character unchanged.
2735 tok_state_script_data_double_escaped = ->
2736 c = txt.charAt(cur++)
2738 tok_state = tok_state_script_data_double_escaped_dash
2739 return new_character_token '-'
2741 tok_state = tok_state_script_data_double_escaped_less_than_sign
2742 return new_character_token '<'
2745 return new_character_token "\ufffd"
2748 tok_state = tok_state_data
2749 cur -= 1 # Reconsume
2752 return new_character_token c
2754 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Script data double escaped dash tokenizer state. Reads one character and,
# depending on it, moves to the double escaped dash dash state (emitting
# '-'), moves to the double escaped less-than-sign state (emitting '<'),
# returns to the data state with the character reconsumed, or goes back to
# the double escaped state while emitting U+FFFD or the character.
2755 tok_state_script_data_double_escaped_dash = ->
2756 c = txt.charAt(cur++)
2758 tok_state = tok_state_script_data_double_escaped_dash_dash
2759 return new_character_token '-'
2761 tok_state = tok_state_script_data_double_escaped_less_than_sign
2762 return new_character_token '<'
2765 tok_state = tok_state_script_data_double_escaped
2766 return new_character_token "\ufffd"
2769 tok_state = tok_state_data
2770 cur -= 1 # Reconsume
2773 tok_state = tok_state_script_data_double_escaped
2774 return new_character_token c
2776 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Script data double escaped dash dash tokenizer state. Reads one character
# and, depending on it, emits '-' (no state change), moves to the double
# escaped less-than-sign state (emitting '<'), returns to script data while
# emitting '>', returns to the data state with the character reconsumed, or
# goes back to the double escaped state while emitting U+FFFD or the
# character.
2777 tok_state_script_data_double_escaped_dash_dash = ->
2778 c = txt.charAt(cur++)
2780 return new_character_token '-'
2782 tok_state = tok_state_script_data_double_escaped_less_than_sign
2783 return new_character_token '<'
2785 tok_state = tok_state_script_data
2786 return new_character_token '>'
2789 tok_state = tok_state_script_data_double_escaped
2790 return new_character_token "\ufffd"
2793 tok_state = tok_state_data
2794 cur -= 1 # Reconsume
2797 tok_state = tok_state_script_data_double_escaped
2798 return new_character_token c
2800 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Script data double escaped less-than-sign tokenizer state. Reads one
# character: one branch clears `temporary_buffer`, moves to the double escape
# end state and emits '/'; otherwise the state returns to the double escaped
# state and reconsumes the character.
2801 tok_state_script_data_double_escaped_less_than_sign = ->
2802 c = txt.charAt(cur++)
2804 temporary_buffer = ''
2805 tok_state = tok_state_script_data_double_escape_end
2806 return new_character_token '/'
2808 tok_state = tok_state_script_data_double_escaped
2809 cur -= 1 # Reconsume
2812 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Script data double escape end tokenizer state. Reads one character.
# On whitespace, '/' or '>' it moves to the escaped state when
# `temporary_buffer` is exactly 'script', otherwise to the double escaped
# state, and emits the character. Letters are appended to `temporary_buffer`
# (lowercased) and emitted as typed. Anything else returns to the double
# escaped state and reconsumes the character.
2813 tok_state_script_data_double_escape_end = ->
2814 c = txt.charAt(cur++)
2815 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2816 if temporary_buffer is 'script'
2817 tok_state = tok_state_script_data_escaped
2819 tok_state = tok_state_script_data_double_escaped
2820 return new_character_token c
2822 temporary_buffer += c.toLowerCase() # yes, really lowercase
2823 return new_character_token c
2825 temporary_buffer += c
2826 return new_character_token c
2828 tok_state = tok_state_script_data_double_escaped
2829 cur -= 1 # Reconsume
2832 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Before attribute name tokenizer state. Reads one character. Some branches
# move to the self-closing start tag state or the data state; the others
# pick the first character of a new attribute name (U+FFFD, or the
# lowercased character).
2833 tok_state_before_attribute_name = ->
2835 switch c = txt.charAt(cur++)
2836 when "\t", "\n", "\u000c", ' '
2839 tok_state = tok_state_self_closing_start_tag
2842 tok_state = tok_state_data
2848 attr_name = "\ufffd"
2849 when '"', "'", '<', '='
2854 tok_state = tok_state_data
2857 attr_name = c.toLowerCase()
# The new [name, value] pair goes to the front of attrs_a, so attrs_a[0] is
# always the attribute currently being tokenized.
2861 tok_cur_tag.attrs_a.unshift [attr_name, '']
2862 tok_state = tok_state_attribute_name
2865 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Attribute name tokenizer state (HTML5 8.2.4.35).
#
# Reads one character (advancing `cur`) and extends the name of the
# attribute currently being built, which is tok_cur_tag.attrs_a[0][0]:
#   whitespace    go to the after attribute name state
#   '/'           go to the self-closing start tag state
#   '='           go to the before attribute value state
#   '>'           go to the data state and emit the tag token
#   NUL           parse error; append U+FFFD to the name
#   '"', "'", '<' parse error; append the character to the name
#   EOF           parse error; go to the data state
#   other         append the character (uppercase ASCII lowercased)
#
# Returns the finished tag token on '>', otherwise null.
#
# NOTE(review): the `when` clauses, parse_error() calls and the emit sequence
# for '>' were not visible in the reviewed excerpt and are restored from the
# spec -- confirm.
tok_state_attribute_name = ->
  switch c = txt.charAt(cur++)
    when "\t", "\n", "\u000c", ' '
      tok_state = tok_state_after_attribute_name
    when '/'
      tok_state = tok_state_self_closing_start_tag
    when '='
      tok_state = tok_state_before_attribute_value
    when '>'
      tok_state = tok_state_data
      tmp = tok_cur_tag
      tok_cur_tag = null
      return tmp
    when "\u0000"
      parse_error()
      # Append: plain assignment would discard the name collected so far.
      tok_cur_tag.attrs_a[0][0] += "\ufffd"
    when '"', "'", '<'
      parse_error()
      tok_cur_tag.attrs_a[0][0] += c
    when '' # EOF
      parse_error()
      tok_state = tok_state_data
    else
      if is_uc_alpha(c)
        tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
      else
        tok_cur_tag.attrs_a[0][0] += c
  return null
2895 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# After attribute name tokenizer state. Reads one character. Branches move
# to the self-closing start tag, before attribute value or data state, or
# start a new attribute at the front of `tok_cur_tag.attrs_a` (named with
# the lowercased character, U+FFFD, or the character itself) and move to the
# attribute name state.
2896 tok_state_after_attribute_name = ->
2897 c = txt.charAt(cur++)
2898 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2901 tok_state = tok_state_self_closing_start_tag
2904 tok_state = tok_state_before_attribute_value
2907 tok_state = tok_state_data
2910 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2911 tok_state = tok_state_attribute_name
2915 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2916 tok_state = tok_state_attribute_name
2920 tok_state = tok_state_data
2921 cur -= 1 # reconsume
2923 if c is '"' or c is "'" or c is '<'
2925 # fall through to Anything else
2927 tok_cur_tag.attrs_a.unshift [c, '']
2928 tok_state = tok_state_attribute_name
2930 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Before attribute value tokenizer state. Reads one character and selects
# how the value will be read: the double-quoted, single-quoted or unquoted
# value state. Two branches seed the current attribute's value
# (attrs_a[0][1]) with U+FFFD or with the character before entering the
# unquoted state; others return to the data state.
2931 tok_state_before_attribute_value = ->
2932 switch c = txt.charAt(cur++)
2933 when "\t", "\n", "\u000c", ' '
2936 tok_state = tok_state_attribute_value_double_quoted
2938 tok_state = tok_state_attribute_value_unquoted
2941 tok_state = tok_state_attribute_value_single_quoted
2944 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2945 tok_state = tok_state_attribute_value_unquoted
2948 tok_state = tok_state_data
2954 tok_state = tok_state_data
2956 tok_cur_tag.attrs_a[0][1] += c
2957 tok_state = tok_state_attribute_value_unquoted
2960 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Attribute value (double-quoted) tokenizer state. Reads one character and
# either moves to the after attribute value (quoted) state, returns to the
# data state, or appends to the current attribute's value (attrs_a[0][1]):
# a parsed character reference, U+FFFD, or the character.
2961 tok_state_attribute_value_double_quoted = ->
2962 switch c = txt.charAt(cur++)
2964 tok_state = tok_state_after_attribute_value_quoted
# '"' is passed to parse_character_reference as the additional allowed
# character; `true` presumably marks "inside an attribute" -- confirm against
# that function's signature.
2966 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2969 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2972 tok_state = tok_state_data
2974 tok_cur_tag.attrs_a[0][1] += c
2977 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Attribute value (single-quoted) tokenizer state. Reads one character and
# either moves to the after attribute value (quoted) state, returns to the
# data state, or appends to the current attribute's value (attrs_a[0][1]):
# a parsed character reference, U+FFFD, or the character.
2978 tok_state_attribute_value_single_quoted = ->
2979 switch c = txt.charAt(cur++)
2981 tok_state = tok_state_after_attribute_value_quoted
2983 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2986 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2989 tok_state = tok_state_data
2991 tok_cur_tag.attrs_a[0][1] += c
2994 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Attribute value (unquoted) tokenizer state. Reads one character:
# whitespace ends the value and moves to the before attribute name state;
# other branches return to the data state or append to the current
# attribute's value (attrs_a[0][1]): a parsed character reference, U+FFFD,
# or the character.
2995 tok_state_attribute_value_unquoted = ->
2996 switch c = txt.charAt(cur++)
2997 when "\t", "\n", "\u000c", ' '
2998 tok_state = tok_state_before_attribute_name
3000 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3002 tok_state = tok_state_data
3007 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3010 tok_state = tok_state_data
3012 # Parse Error if ', <, = or ` (backtick)
3013 tok_cur_tag.attrs_a[0][1] += c
3016 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After attribute value (quoted) tokenizer state. Reads one character:
# whitespace moves to the before attribute name state; other branches move to
# the self-closing start tag state or the data state; the last branch
# moves to the before attribute name state and reconsumes the character.
3017 tok_state_after_attribute_value_quoted = ->
3018 switch c = txt.charAt(cur++)
3019 when "\t", "\n", "\u000c", ' '
3020 tok_state = tok_state_before_attribute_name
3022 tok_state = tok_state_self_closing_start_tag
3024 tok_state = tok_state_data
3030 tok_state = tok_state_data
3033 tok_state = tok_state_before_attribute_name
3034 cur -= 1 # we didn't handle that char
3037 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Self-closing start tag tokenizer state (a '/' was seen inside a tag).
# Reads one character: one branch sets the 'self-closing' flag on
# `tok_cur_tag` and returns to the data state; another returns to the data
# state with the character reconsumed; otherwise the character is reconsumed
# in the before attribute name state.
3038 tok_state_self_closing_start_tag = ->
3039 c = txt.charAt(cur++)
3041 tok_cur_tag.flag 'self-closing'
3042 tok_state = tok_state_data
3046 tok_state = tok_state_data
3047 cur -= 1 # Reconsume
3051 tok_state = tok_state_before_attribute_name
3052 cur -= 1 # Reconsume
3055 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3056 # WARNING: put a comment token in tok_cur_tag before setting this state
# Bogus comment tokenizer state (HTML5 8.2.4.44).
#
# Precondition: `tok_cur_tag` already holds a comment token.
#
# Consumes everything from `cur` up to and including the next '>' (or to the
# end of input when there is none), appends it -- without the '>' and with
# every NUL replaced by U+FFFD -- to the comment's text, and switches back to
# the data state.
#
# Returns the finished comment token.
#
# NOTE(review): the `cur` updates and the final return were not visible in
# the reviewed excerpt and are restored from the spec -- confirm.
tok_state_bogus_comment = ->
  next_gt = txt.indexOf '>', cur
  if next_gt is -1
    val = txt.substr cur
    cur = txt.length
  else
    val = txt.substr cur, (next_gt - cur)
    cur = next_gt + 1
  # A string pattern would only replace the first NUL; the global regex
  # replaces all of them.
  val = val.replace /\u0000/g, "\ufffd"
  tok_cur_tag.text += val
  tok_state = tok_state_data
  return tok_cur_tag
3070 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Markup declaration open tokenizer state (HTML5 8.2.4.45): the input so far
# is "<!".
#
# Looks ahead from `cur` without a single-character switch:
#   "--"                    start an empty comment token, go to the comment
#                           start state
#   "doctype" (any case)    go to the DOCTYPE state
#   "[CDATA["               only when the adjusted current node exists and is
#                           not in the HTML namespace: go to the CDATA
#                           section state
#   anything else           parse error; start a bogus comment
#
# Returns null; it only changes tokenizer state.
#
# NOTE(review): the `cur += n` advances past each matched keyword, the
# returns and the parse_error() call were not visible in the reviewed excerpt
# and are restored from the spec -- confirm.
tok_state_markup_declaration_open = ->
  if txt.substr(cur, 2) is '--'
    cur += 2
    tok_cur_tag = new_comment_token ''
    tok_state = tok_state_comment_start
    return null
  if txt.substr(cur, 7).toLowerCase() is 'doctype'
    cur += 7
    tok_state = tok_state_doctype
    return null
  acn = adjusted_current_node()
  if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
    cur += 7
    tok_state = tok_state_cdata_section
    return null
  # Otherwise
  parse_error()
  # Per the spec the next character consumed is the first character of the
  # comment, so the comment starts empty: "<!foo>" yields "foo", not "!foo".
  tok_cur_tag = new_comment_token ''
  tok_state = tok_state_bogus_comment
  return null
3092 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Comment start tokenizer state (HTML5 8.2.4.46): the input so far is "<!--".
#
# Reads one character (advancing `cur`):
#   '-'    go to the comment start dash state
#   NUL    parse error; append U+FFFD to the comment, go to the comment state
#   '>'    parse error; go to the data state and emit the comment
#   EOF    parse error; go to the data state, reconsume, emit the comment
#   other  append the character to the comment, go to the comment state
#
# Returns the comment token when it is emitted, otherwise null.
#
# NOTE(review): the `when` clauses, parse_error() calls and the
# `return tok_cur_tag` emits were not visible in the reviewed excerpt and are
# restored from the spec -- confirm the emit convention.
tok_state_comment_start = ->
  switch c = txt.charAt(cur++)
    when '-'
      tok_state = tok_state_comment_start_dash
    when "\u0000"
      parse_error()
      # NUL belongs inside the comment; it must not be emitted as a
      # character token.
      tok_cur_tag.text += "\ufffd"
      tok_state = tok_state_comment
    when '>'
      parse_error()
      tok_state = tok_state_data
      return tok_cur_tag
    when '' # EOF
      parse_error()
      tok_state = tok_state_data
      cur -= 1 # Reconsume
      return tok_cur_tag
    else
      tok_cur_tag.text += c
      # Leave the start state once real content has been seen; otherwise a
      # later "->" or ">" would wrongly be treated as closing an empty comment.
      tok_state = tok_state_comment
  return null
3113 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Comment start dash tokenizer state. Reads one character and, depending on
# it, moves to the comment end state, returns to the data state (one branch
# reconsuming the character), or appends '-' followed by U+FFFD or by the
# character to the comment text and moves to the comment state.
3114 tok_state_comment_start_dash = ->
3115 switch c = txt.charAt(cur++)
3117 tok_state = tok_state_comment_end
3120 tok_cur_tag.text += "-\ufffd"
3121 tok_state = tok_state_comment
3124 tok_state = tok_state_data
3128 tok_state = tok_state_data
3129 cur -= 1 # Reconsume
3132 tok_cur_tag.text += "-#{c}"
3133 tok_state = tok_state_comment
3136 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Comment tokenizer state. Reads one character and, depending on it, moves
# to the comment end dash state, returns to the data state with the
# character reconsumed, or appends U+FFFD or the character to the comment
# text.
3137 tok_state_comment = ->
3138 switch c = txt.charAt(cur++)
3140 tok_state = tok_state_comment_end_dash
3143 tok_cur_tag.text += "\ufffd"
3146 tok_state = tok_state_data
3147 cur -= 1 # Reconsume
3150 tok_cur_tag.text += c
3153 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Comment end dash tokenizer state (one '-' seen inside a comment). Reads
# one character and, depending on it, moves to the comment end state, returns
# to the data state with the character reconsumed, or appends '-' followed by
# U+FFFD or by the character to the comment text and returns to the comment
# state.
3154 tok_state_comment_end_dash = ->
3155 switch c = txt.charAt(cur++)
3157 tok_state = tok_state_comment_end
3160 tok_cur_tag.text += "-\ufffd"
3161 tok_state = tok_state_comment
3164 tok_state = tok_state_data
3165 cur -= 1 # Reconsume
3168 tok_cur_tag.text += "-#{c}"
3169 tok_state = tok_state_comment
3172 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Comment end tokenizer state ("--" seen inside a comment). Reads one
# character and, depending on it, returns to the data state (one branch
# reconsuming the character), moves to the comment end bang state, appends a
# single '-' to the comment text, or appends "--" followed by U+FFFD or by
# the character and returns to the comment state.
3173 tok_state_comment_end = ->
3174 switch c = txt.charAt(cur++)
3176 tok_state = tok_state_data
3180 tok_cur_tag.text += "--\ufffd"
3181 tok_state = tok_state_comment
3184 tok_state = tok_state_comment_end_bang
3187 tok_cur_tag.text += '-'
3190 tok_state = tok_state_data
3191 cur -= 1 # Reconsume
3195 tok_cur_tag.text += "--#{c}"
3196 tok_state = tok_state_comment
3199 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Comment end bang tokenizer state (HTML5 8.2.4.51): "--!" was just seen
# inside a comment.
#
# Reads one character (advancing `cur`):
#   '-'    append "--!" to the comment, go to the comment end dash state
#   '>'    go to the data state and emit the comment
#   NUL    parse error; append "--!" plus U+FFFD, go to the comment state
#   EOF    parse error; go to the data state, reconsume, emit the comment
#   other  append "--!" plus the character, go to the comment state
#
# Returns the comment token when it is emitted, otherwise null.
#
# NOTE(review): the `when` clauses, parse_error() calls and the
# `return tok_cur_tag` emits were not visible in the reviewed excerpt and are
# restored from the spec -- confirm the emit convention.
tok_state_comment_end_bang = ->
  switch c = txt.charAt(cur++)
    when '-'
      # Only "--!" is appended here: the '-' just read is accounted for by
      # the comment end dash state, which appends it if the comment continues.
      tok_cur_tag.text += "--!"
      tok_state = tok_state_comment_end_dash
    when '>'
      tok_state = tok_state_data
      return tok_cur_tag
    when "\u0000"
      parse_error()
      tok_cur_tag.text += "--!\ufffd"
      tok_state = tok_state_comment
    when '' # EOF
      parse_error()
      tok_state = tok_state_data
      cur -= 1 # Reconsume
      return tok_cur_tag
    else
      tok_cur_tag.text += "--!#{c}"
      tok_state = tok_state_comment
  return null
3222 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# DOCTYPE tokenizer state. Reads one character: whitespace moves to the
# before DOCTYPE name state; one branch returns to the data state after
# creating an empty-named doctype token flagged 'force-quirks' and
# reconsuming the character; otherwise the character is reconsumed in the
# before DOCTYPE name state.
3223 tok_state_doctype = ->
3224 switch c = txt.charAt(cur++)
3225 when "\t", "\u000a", "\u000c", ' '
3226 tok_state = tok_state_before_doctype_name
3229 tok_state = tok_state_data
3230 el = new_doctype_token ''
3231 el.flag 'force-quirks', true
3232 cur -= 1 # Reconsume
3236 tok_state = tok_state_before_doctype_name
3237 cur -= 1 # Reconsume
3240 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Before DOCTYPE name tokenizer state. Reads one character. Depending on it,
# a new doctype token is started in `tok_cur_tag` (named with the lowercased
# character, U+FFFD, or the character) and the state moves to DOCTYPE name;
# or an empty-named doctype token flagged 'force-quirks' is created and the
# state returns to data (one such branch reconsumes the character).
3241 tok_state_before_doctype_name = ->
3242 c = txt.charAt(cur++)
3243 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3246 tok_cur_tag = new_doctype_token c.toLowerCase()
3247 tok_state = tok_state_doctype_name
3251 tok_cur_tag = new_doctype_token "\ufffd"
3252 tok_state = tok_state_doctype_name
3256 el = new_doctype_token ''
3257 el.flag 'force-quirks', true
3258 tok_state = tok_state_data
3262 tok_state = tok_state_data
3263 el = new_doctype_token ''
3264 el.flag 'force-quirks', true
3265 cur -= 1 # Reconsume
3268 tok_cur_tag = new_doctype_token c
3269 tok_state = tok_state_doctype_name
3272 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# DOCTYPE name tokenizer state. Reads one character: whitespace moves to the
# after DOCTYPE name state; other branches return to the data state (one of
# them flags the token 'force-quirks' and reconsumes the character) or append
# to `tok_cur_tag.name` (the lowercased character, U+FFFD, or the character).
3273 tok_state_doctype_name = ->
3274 c = txt.charAt(cur++)
3275 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3276 tok_state = tok_state_after_doctype_name
3279 tok_state = tok_state_data
3282 tok_cur_tag.name += c.toLowerCase()
3286 tok_cur_tag.name += "\ufffd"
3290 tok_state = tok_state_data
3291 tok_cur_tag.flag 'force-quirks', true
3292 cur -= 1 # Reconsume
3295 tok_cur_tag.name += c
3298 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# After DOCTYPE name tokenizer state. Reads one character. Besides the
# branches that return to the data state, it looks for the case-insensitive
# keywords "public" and "system" and moves to the matching after-keyword
# state; anything else flags the token 'force-quirks' and moves to the bogus
# DOCTYPE state.
3299 tok_state_after_doctype_name = ->
3300 c = txt.charAt(cur++)
3301 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3304 tok_state = tok_state_data
3308 tok_state = tok_state_data
3309 tok_cur_tag.flag 'force-quirks', true
3310 cur -= 1 # Reconsume
# `cur` was already advanced past `c`, so the keyword test starts at cur - 1
# to include the character just read.
3313 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3315 tok_state = tok_state_after_doctype_public_keyword
3317 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3319 tok_state = tok_state_after_doctype_system_keyword
3322 tok_cur_tag.flag 'force-quirks', true
3323 tok_state = tok_state_bogus_doctype
3326 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# After DOCTYPE public keyword tokenizer state. Reads one character:
# whitespace moves to the before DOCTYPE public identifier state; two
# branches reset `public_identifier` to '' and enter the double- or
# single-quoted public identifier state; the remaining branches flag the
# token 'force-quirks' and go to the data state (one reconsumes the
# character) or to the bogus DOCTYPE state.
3327 tok_state_after_doctype_public_keyword = ->
3328 c = txt.charAt(cur++)
3329 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3330 tok_state = tok_state_before_doctype_public_identifier
3334 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3335 tok_state = tok_state_doctype_public_identifier_double_quoted
3339 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3340 tok_state = tok_state_doctype_public_identifier_single_quoted
3344 tok_cur_tag.flag 'force-quirks', true
3345 tok_state = tok_state_data
3349 tok_state = tok_state_data
3350 tok_cur_tag.flag 'force-quirks', true
3351 cur -= 1 # Reconsume
3355 tok_cur_tag.flag 'force-quirks', true
3356 tok_state = tok_state_bogus_doctype
3359 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Before DOCTYPE public identifier tokenizer state. Reads one character:
# two branches reset `public_identifier` to '' and enter the double- or
# single-quoted public identifier state; the remaining branches flag the
# token 'force-quirks' and go to the data state (one reconsumes the
# character) or to the bogus DOCTYPE state.
3360 tok_state_before_doctype_public_identifier = ->
3361 c = txt.charAt(cur++)
3362 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3366 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3367 tok_state = tok_state_doctype_public_identifier_double_quoted
3371 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3372 tok_state = tok_state_doctype_public_identifier_single_quoted
3376 tok_cur_tag.flag 'force-quirks', true
3377 tok_state = tok_state_data
3381 tok_state = tok_state_data
3382 tok_cur_tag.flag 'force-quirks', true
3383 cur -= 1 # Reconsume
3387 tok_cur_tag.flag 'force-quirks', true
3388 tok_state = tok_state_bogus_doctype
3392 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# DOCTYPE public identifier (double-quoted) tokenizer state. Reads one
# character and either moves to the after DOCTYPE public identifier state,
# appends U+FFFD or the character to `public_identifier`, or flags the token
# 'force-quirks' and returns to the data state (one branch reconsumes the
# character).
3393 tok_state_doctype_public_identifier_double_quoted = ->
3394 c = txt.charAt(cur++)
3396 tok_state = tok_state_after_doctype_public_identifier
3400 tok_cur_tag.public_identifier += "\ufffd"
3404 tok_cur_tag.flag 'force-quirks', true
3405 tok_state = tok_state_data
3409 tok_state = tok_state_data
3410 tok_cur_tag.flag 'force-quirks', true
3411 cur -= 1 # Reconsume
3414 tok_cur_tag.public_identifier += c
3417 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# DOCTYPE public identifier (single-quoted) tokenizer state. Reads one
# character and either moves to the after DOCTYPE public identifier state,
# appends U+FFFD or the character to `public_identifier`, or flags the token
# 'force-quirks' and returns to the data state (one branch reconsumes the
# character).
3418 tok_state_doctype_public_identifier_single_quoted = ->
3419 c = txt.charAt(cur++)
3421 tok_state = tok_state_after_doctype_public_identifier
3425 tok_cur_tag.public_identifier += "\ufffd"
3429 tok_cur_tag.flag 'force-quirks', true
3430 tok_state = tok_state_data
3434 tok_state = tok_state_data
3435 tok_cur_tag.flag 'force-quirks', true
3436 cur -= 1 # Reconsume
3439 tok_cur_tag.public_identifier += c
3442 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# After DOCTYPE public identifier tokenizer state. Reads one character:
# whitespace moves to the between-public-and-system-identifiers state; two
# branches reset `system_identifier` to '' and enter the double- or
# single-quoted system identifier state; others return to the data state
# (one flags 'force-quirks' and reconsumes); anything else flags
# 'force-quirks' and moves to the bogus DOCTYPE state.
3443 tok_state_after_doctype_public_identifier = ->
3444 c = txt.charAt(cur++)
3445 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3446 tok_state = tok_state_between_doctype_public_and_system_identifiers
3449 tok_state = tok_state_data
3453 tok_cur_tag.system_identifier = ''
3454 tok_state = tok_state_doctype_system_identifier_double_quoted
3458 tok_cur_tag.system_identifier = ''
3459 tok_state = tok_state_doctype_system_identifier_single_quoted
3463 tok_state = tok_state_data
3464 tok_cur_tag.flag 'force-quirks', true
3465 cur -= 1 # Reconsume
3469 tok_cur_tag.flag 'force-quirks', true
3470 tok_state = tok_state_bogus_doctype
3473 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Between DOCTYPE public and system identifiers tokenizer state. Reads one
# character: two branches reset `system_identifier` to '' and enter the
# double- or single-quoted system identifier state; others return to the data
# state (one flags 'force-quirks' and reconsumes); anything else flags
# 'force-quirks' and moves to the bogus DOCTYPE state.
3474 tok_state_between_doctype_public_and_system_identifiers = ->
3475 c = txt.charAt(cur++)
3476 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3479 tok_state = tok_state_data
3483 tok_cur_tag.system_identifier = ''
3484 tok_state = tok_state_doctype_system_identifier_double_quoted
3488 tok_cur_tag.system_identifier = ''
3489 tok_state = tok_state_doctype_system_identifier_single_quoted
3493 tok_state = tok_state_data
3494 tok_cur_tag.flag 'force-quirks', true
3495 cur -= 1 # Reconsume
3499 tok_cur_tag.flag 'force-quirks', true
3500 tok_state = tok_state_bogus_doctype
3503 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# After DOCTYPE system keyword tokenizer state. Reads one character:
# whitespace moves to the before DOCTYPE system identifier state; two
# branches reset `system_identifier` to '' and enter the double- or
# single-quoted system identifier state; the remaining branches flag the
# token 'force-quirks' and go to the data state (one reconsumes the
# character) or to the bogus DOCTYPE state.
3504 tok_state_after_doctype_system_keyword = ->
3505 c = txt.charAt(cur++)
3506 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3507 tok_state = tok_state_before_doctype_system_identifier
3511 tok_cur_tag.system_identifier = ''
3512 tok_state = tok_state_doctype_system_identifier_double_quoted
3516 tok_cur_tag.system_identifier = ''
3517 tok_state = tok_state_doctype_system_identifier_single_quoted
3521 tok_cur_tag.flag 'force-quirks', true
3522 tok_state = tok_state_data
3526 tok_state = tok_state_data
3527 tok_cur_tag.flag 'force-quirks', true
3528 cur -= 1 # Reconsume
3532 tok_cur_tag.flag 'force-quirks', true
3533 tok_state = tok_state_bogus_doctype
3536 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Before DOCTYPE system identifier tokenizer state. Reads one character:
# two branches reset `system_identifier` to '' and enter the double- or
# single-quoted system identifier state; the remaining branches flag the
# token 'force-quirks' and go to the data state (one reconsumes the
# character) or to the bogus DOCTYPE state.
3537 tok_state_before_doctype_system_identifier = ->
3538 c = txt.charAt(cur++)
3539 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3542 tok_cur_tag.system_identifier = ''
3543 tok_state = tok_state_doctype_system_identifier_double_quoted
3546 tok_cur_tag.system_identifier = ''
3547 tok_state = tok_state_doctype_system_identifier_single_quoted
3551 tok_cur_tag.flag 'force-quirks', true
3552 tok_state = tok_state_data
3556 tok_state = tok_state_data
3557 tok_cur_tag.flag 'force-quirks', true
3558 cur -= 1 # Reconsume
3562 tok_cur_tag.flag 'force-quirks', true
3563 tok_state = tok_state_bogus_doctype
3566 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state handler: reads one character from `txt` (advancing `cur`)
# and either extends `tok_cur_tag.system_identifier` or leaves this state.
# NOTE(review): the branch conditions are not visible in this listing; the
# per-branch notes are inferred from the spec section linked above and should
# be confirmed against the full source.
3567 tok_state_doctype_system_identifier_double_quoted = ->
3568 c = txt.charAt(cur++)
# presumably the closing '"': the identifier is complete
3570 tok_state = tok_state_after_doctype_system_identifier
# presumably the U+0000 case: a replacement character is stored instead
3574 tok_cur_tag.system_identifier += "\ufffd"
# presumably the '>' case: DOCTYPE ended inside the quoted identifier, so it is
# marked force-quirks before returning to the data state
3578 tok_cur_tag.flag 'force-quirks', true
3579 tok_state = tok_state_data
# presumably the end-of-input case: mark force-quirks and step `cur` back so
# the same position is read again by the data state
3583 tok_state = tok_state_data
3584 tok_cur_tag.flag 'force-quirks', true
3585 cur -= 1 # Reconsume
# presumably the "anything else" case: the character is appended verbatim
3588 tok_cur_tag.system_identifier += c
3591 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state handler: reads one character from `txt` (advancing `cur`)
# and either extends `tok_cur_tag.system_identifier` or leaves this state.
# NOTE(review): the branch conditions are not visible in this listing; the
# per-branch notes are inferred from the spec section linked above and should
# be confirmed against the full source.
3592 tok_state_doctype_system_identifier_single_quoted = ->
3593 c = txt.charAt(cur++)
# presumably the closing "'": the identifier is complete
3595 tok_state = tok_state_after_doctype_system_identifier
# presumably the U+0000 case: a replacement character is stored instead
3599 tok_cur_tag.system_identifier += "\ufffd"
# presumably the '>' case: DOCTYPE ended inside the quoted identifier, so it is
# marked force-quirks before returning to the data state
3603 tok_cur_tag.flag 'force-quirks', true
3604 tok_state = tok_state_data
# presumably the end-of-input case: mark force-quirks and step `cur` back so
# the same position is read again by the data state
3608 tok_state = tok_state_data
3609 tok_cur_tag.flag 'force-quirks', true
3610 cur -= 1 # Reconsume
# presumably the "anything else" case: the character is appended verbatim
3613 tok_cur_tag.system_identifier += c
3616 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state handler: reads one character from `txt` (advancing `cur`)
# after the system identifier has been closed, and picks the next state.
# NOTE(review): apart from the whitespace test, the branch conditions are not
# visible in this listing; the per-branch notes are inferred from the spec
# section linked above and should be confirmed against the full source.
3617 tok_state_after_doctype_system_identifier = ->
3618 c = txt.charAt(cur++)
# tab, LF, FF or space
3619 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# presumably the '>' case: DOCTYPE is finished, go back to the data state
3622 tok_state = tok_state_data
# presumably the end-of-input case: mark force-quirks and step `cur` back so
# the same position is read again by the data state
3626 tok_state = tok_state_data
3627 tok_cur_tag.flag 'force-quirks', true
3628 cur -= 1 # Reconsume
# presumably the "anything else" case: unlike the other DOCTYPE states, the
# force-quirks flag is deliberately left untouched here
3632 # do _not_ tok_cur_tag.flag 'force-quirks', true
3633 tok_state = tok_state_bogus_doctype
3636 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state handler used to skip over the remainder of a malformed
# DOCTYPE: reads one character from `txt` (advancing `cur`) and returns to the
# data state in the two visible branches.
# NOTE(review): the branch conditions are not visible in this listing; the
# per-branch notes are inferred from the spec section linked above and should
# be confirmed against the full source.
3637 tok_state_bogus_doctype = ->
3638 c = txt.charAt(cur++)
# presumably the '>' case: end of the DOCTYPE
3640 tok_state = tok_state_data
# presumably the end-of-input case: step `cur` back so the same position is
# read again by the data state
3643 tok_state = tok_state_data
3644 cur -= 1 # Reconsume
3650 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3651 # Don't set this as a state, just call it
3652 # returns a string (NOT a text node)
#
# Tries to read a character reference from `txt` at position `cur` and, when
# one is recognised, advances `cur` past it.
#
# Parameters:
#   allowed_char - an extra character that, like whitespace, '<' and '&', is
#                  "explicitly not a parse error" when it is the first
#                  character examined (see the first `when` below).
#                  Defaults to null.
#   in_attr      - defaults to false.
#                  NOTE(review): its use is not visible in this listing;
#                  presumably it enables the attribute-only exceptions near
#                  the end (a following '=' or alphanumeric) - confirm.
#
# NOTE(review): many lines of this function (branch headers, returns) are not
# visible in this listing; the notes below describe only the visible
# statements, and anything marked "presumably" must be confirmed.
3653 parse_character_reference = (allowed_char = null, in_attr = false) ->
# nothing left to read
3654 if cur >= txt.length
3656 switch c = txt.charAt(cur)
3657 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3658 # explicitly not a parse error
3661 # there has to be "one or more" alnums between & and ; to be a parse error
# presumably the numeric ('#') branch: make sure a character follows, then
# look for an 'x'/'X' marker at cur + 1
3664 if cur + 1 >= txt.length
3666 if txt.charAt(cur + 1).toLowerCase() is 'x'
# advance `i` over every character that belongs to `charset`
3675 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
# check for the ';' terminator directly after the scanned characters
3679 if txt.charAt(start + i) is ';'
3681 # FIXME This is supposed to generate parse errors for some chars
# the scanned text is lower-cased, prepended with `prefix` and handed to
# decode_named_char_ref for decoding
3682 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
# presumably the named-reference branch: scan forward over alphanumerics
3689 if alnum.indexOf(txt.charAt(cur + i)) is -1
3692 # exit early, because parse_error() below needs at least one alnum
3694 if txt.charAt(cur + i) is ';'
3695 i += 1 # include ';' terminator in value
3696 decoded = decode_named_char_ref txt.substr(cur, i)
3703 # no ';' terminator (only legacy char refs)
# try each candidate length from 2 up to `max` against legacy_char_refs
3705 for i in [2..max] # no prefix matches, so ok to check shortest first
3706 c = legacy_char_refs[txt.substr(cur, i)]
3709 if txt.charAt(cur + i) is '='
3710 # "because some legacy user agents will
3711 # misinterpret the markup in those cases"
3714 if alnum.indexOf(txt.charAt(cur + i)) > -1
3715 # this makes attributes forgiving about url args
3717 # ok, and besides the weird exceptions for attributes...
3718 # return the matching char
3719 cur += i # consume entity chars
3720 parse_error() # because no terminating ";"
3724 return # never reached
3726 # tree constructor initialization
3727 # see comments on TYPE_TAG/etc for the structure of this data
# Initial values for the tree-construction state, followed by the tokenizer's
# starting state. `doc` is a synthetic <html> element node in the HTML
# namespace.
3728 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3730 afe = [] # active formatting elements
3731 template_insertion_modes = []
# parsing starts in the "initial" insertion mode
3732 insertion_mode = ins_mode_initial
3733 original_insertion_mode = insertion_mode # TODO check spec
3734 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3735 flag_frameset_ok = true
3737 flag_foster_parenting = false
# element pointers and buffers start out empty
3738 form_element_pointer = null
3739 temporary_buffer = null
3740 pending_table_character_tokens = []
3741 head_element_pointer = null
3742 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3743 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
3745 # tokenizer initialization
# the tokenizer starts in the data state
3746 tok_state = tok_state_data
3753 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Serializes `els` into a single string by appending the result of each
# item's `serialize shallow, show_ids` call to `serialized`.
# `shallow` and `show_ids` are passed through unchanged to `serialize`.
# NOTE(review): the loop header, the initialisation of `serialized` and the
# return statement are not visible in this listing; presumably `t` iterates
# over `els` and `serialized` is returned - confirm against the full source.
3756 serialize_els = (els, shallow, show_ids) ->
3762 serialized += t.serialize shallow, show_ids
# Public API: the parser entry point, the debug-log helpers and node-type
# constants are attached to `module.exports`.
3765 # TODO export TYPE_*
# NOTE(review): four TYPE_* constants are already exported below; the TODO
# above presumably refers to any remaining ones - confirm or remove it.
3766 module.exports.parse_html = parse_html
3767 module.exports.debug_log_reset = debug_log_reset
3768 module.exports.debug_log_each = debug_log_each
3769 module.exports.TYPE_TAG = TYPE_TAG
3770 module.exports.TYPE_TEXT = TYPE_TEXT
3771 module.exports.TYPE_COMMENT = TYPE_COMMENT
3772 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE