1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
98 @id = "#{++prev_node_id}"
99 acknowledge_self_closing: ->
101 @token.flag 'did_self_close'
103 @flag 'did_self_close', true
106 serialize: (shallow = false, show_ids = false) -> # for unit tests
111 ret += JSON.stringify @name
126 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
132 ret += c.serialize shallow, show_ids
136 ret += JSON.stringify @text
139 ret += JSON.stringify @text
141 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
144 when TYPE_AAA_BOOKMARK
145 ret += 'aaa_bookmark'
148 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
151 # helpers: (only take args that are normally known when parser creates nodes)
# Factories for the tokens/nodes the parser creates. Anything not passed here
# falls back to the defaults applied by the Node constructor.
# NOTE(review): adoption_agency assigns to a variable that is also called
# new_element. CoffeeScript assignments reuse a name that already exists in an
# enclosing scope, so that assignment may overwrite the new_element helper
# defined below -- confirm.
new_open_tag = (name) -> new Node(TYPE_START_TAG, {name: name})
new_end_tag = (name) -> new Node(TYPE_END_TAG, {name: name})
new_element = (name) -> new Node(TYPE_TAG, {name: name})
new_text_node = (txt) -> new Node(TYPE_TEXT, {text: txt})
# character tokens are represented exactly like text nodes
new_character_token = new_text_node
new_comment_token = (txt) -> new Node(TYPE_COMMENT, {text: txt})
new_doctype_token = (name) -> new Node(TYPE_DOCTYPE, {name: name})
166 return new Node TYPE_EOF
168 return new Node TYPE_AFE_MARKER
# Build the placeholder node that the adoption agency algorithm splices into
# the list of active formatting elements to remember a position.
new_aaa_bookmark = -> new Node(TYPE_AAA_BOOKMARK)
# Character classes, kept as plain strings so membership is an indexOf() test.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = lc_alpha + uc_alpha + digits
hex_chars = digits + "abcdefABCDEF"

# true only for a one-character string holding an ASCII uppercase letter
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1

# true only for a one-character string holding an ASCII lowercase letter
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
183 # some SVG elements have dashes in them
184 tag_name_chars = alnum + "-"
186 # http://www.w3.org/TR/html5/infrastructure.html#space-character
187 space_chars = "\u0009\u000a\u000c\u000d\u0020"
189 return txt.length is 1 and space_chars.indexOf(txt) > -1
# true when token t is a text token whose text is exactly one character from
# space_chars
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) > -1
193 is_input_hidden_tok = (t) ->
194 return unless t.type is TYPE_START_TAG
197 if a[1].toLowerCase() is 'hidden'
202 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
203 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
205 # These are the character references that don't need a terminating semicolon
206 # min length: 2, max: 6, none are a prefix of any other.
208 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
209 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
210 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
211 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
212 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
213 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
214 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
215 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
216 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
217 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
218 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
219 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
220 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
221 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
222 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
223 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
224 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists for three element categories: void, raw text and escapable
# raw text.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
	'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script'
	'style'
]
escapable_raw_text_elements = [
	'textarea'
	'title'
]
231 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
233 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
234 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
235 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
236 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
237 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
238 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
239 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
240 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
241 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
242 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
243 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
244 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
245 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
246 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
250 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
252 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
253 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
254 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
255 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
256 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
257 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
258 'determinant', 'diff', 'divergence', 'divide', 'domain',
259 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
260 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
261 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
262 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
263 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
264 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
265 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
266 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
267 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
268 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
269 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
270 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
271 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
272 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
273 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
274 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
275 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
276 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
277 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
278 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
279 'vectorproduct', 'xor'
281 # foreign_elements = [svg_elements..., mathml_elements...]
282 #normal_elements = All other allowed HTML elements are normal elements.
286 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
287 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
288 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
289 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
290 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
291 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
292 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
293 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
294 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
295 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
296 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
297 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
298 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
299 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
300 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
301 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
302 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
303 wbr:NS_HTML, xmp:NS_HTML,
306 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
307 'annotation-xml':NS_MATHML,
310 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
313 formatting_elements = {
314 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
315 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
319 foster_parenting_targets = {
# An element counts as "special" when special_elements maps its tag name to
# the namespace the element is actually in.
# NOTE(review): the table lists `title` under both NS_HTML and NS_SVG; if both
# entries are keys of a single object literal only the last one survives, so
# an HTML <title> would not be reported as special -- confirm.
el_is_special = (e) ->
	expected_namespace = special_elements[e.name]
	expected_namespace is e.namespace
344 # decode_named_char_ref()
346 # The list of named character references is _huge_ so ask the browser to decode
347 # for us instead of wasting bandwidth/space on including the table here.
349 # Pass without the "&" but with the ";" examples:
350 # for "&" pass "amp;"
351 # for "′" pass "x2032;"
354 textarea: document.createElement('textarea')
356 # TODO test this in IE8
# Decode a single character reference by asking the browser to do it.
#
# txt: the reference WITHOUT its leading "&" but WITH the trailing ";"
#      (eg "amp;")
# returns: the decoded text, or null when the browser handed the reference
#          back unchanged (ie it did not recognise it)
#
# Successful lookups are memoised in g_dncr.cache, keyed by the full "&...;"
# form.
decode_named_char_ref = (txt) ->
	# callers leave the "&" off, so restore it; without it the textarea would
	# just echo the input and every lookup would come back as null
	txt = "&#{txt}"
	decoded = g_dncr.cache[txt]
	return decoded if decoded?
	# assigning innerHTML makes the browser expand the reference; .value then
	# reads the expanded text back
	g_dncr.textarea.innerHTML = txt
	decoded = g_dncr.textarea.value
	return null if decoded is txt
	return g_dncr.cache[txt] = decoded
366 parse_html = (txt, parse_error_cb = null) ->
367 cur = 0 # index of next char in txt to be parsed
368 # declare doc and tokenizer variables so they're in scope below
370 open_els = null # stack of open elements
371 afe = null # active formatting elements
372 template_insertion_modes = null
373 insertion_mode = null
374 original_insertion_mode = null
376 tok_cur_tag = null # partially parsed tag
377 flag_scripting = null
378 flag_frameset_ok = null
380 flag_foster_parenting = null
381 form_element_pointer = null
382 temporary_buffer = null
383 pending_table_character_tokens = null
384 head_element_pointer = null
385 flag_fragment_parsing = null
386 context_element = null
395 console.log "Parse error at character #{cur} of #{txt.length}"
397 afe_push = (new_el) ->
400 if el.name is new_el.name and el.namespace is new_el.namespace
402 continue unless new_el.attrs[k] is v
403 for k, v of new_el.attrs
404 continue unless el.attrs[k] is v
411 afe.unshift new_afe_marker()
413 # the functions below implement the Tree Construction algorithm
414 # http://www.w3.org/TR/html5/syntax.html#tree-construction
416 # But first... the helpers
417 template_tag_is_open = ->
419 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
422 is_in_scope_x = (tag_name, scope, namespace) ->
424 if t.name is tag_name and (namespace is null or namespace is t.namespace)
426 if scope[t.name] is t.namespace
429 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
431 if t.name is tag_name and (namespace is null or namespace is t.namespace)
433 if scope[t.name] is t.namespace
435 if scope2[t.name] is t.namespace
439 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
440 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
441 template: NS_HTML, mi: NS_MATHML,
443 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
444 'annotation-xml': NS_MATHML,
446 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope-boundary tables (tag name -> namespace) for the narrower scope tests
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}

# Scope test against the standard boundary set. When namespace is null any
# namespace is accepted for the matching element.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)

# Scope test against the standard boundaries plus button_scopers
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)

# Scope test against table_scopers only
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# http://www.w3.org/TR/html5/syntax.html#has-an-element-in-select-scope
#
# Walk open_els starting at the current node (index 0) looking for an element
# called tag_name.
#
# tag_name: element name to look for
# namespace: when not null, the element must also be in this namespace
# returns: true if the element is found before a scope boundary, else false
#
# In select scope every element is a boundary EXCEPT optgroup and option in
# the HTML namespace.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		# Nodes store their namespace in .namespace (there is no .ns), and the
		# exemption applies only to HTML optgroup/option; anything else,
		# including foreign elements with those names, ends the search.
		passes_through = t.namespace is NS_HTML and (t.name is 'optgroup' or t.name is 'option')
		return false unless passes_through
	return false
464 # this checks for a particular element, not by name
465 el_is_in_scope = (el) ->
469 if standard_scopers[t.name] is t.namespace
473 clear_to_table_stopers = {
478 clear_stack_to_table_context = ->
480 if clear_to_table_stopers[open_els[0].name]?
484 clear_to_table_body_stopers = {
491 clear_stack_to_table_body_context = ->
493 if clear_to_table_body_stopers[open_els[0].name]?
497 clear_to_table_row_stopers = {
502 clear_stack_to_table_row_context = ->
504 if clear_to_table_row_stopers[open_els[0].name]?
508 clear_afe_to_marker = ->
510 return unless afe.length > 0 # this happens in fragment case, ?spec error
512 if el.type is TYPE_AFE_MARKER
517 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
# Pick the insertion mode that matches the current stack of open elements.
# Walks open_els from the current node (index 0) towards the topmost element
# (highest index) and sets insertion_mode according to the first element that
# determines it. Takes no arguments and returns nothing useful; its effect is
# the assignment to insertion_mode.
# Numbered comments quote the steps of the spec algorithm.
reset_insertion_mode = ->
	# 1. Let last be false.
	last = false
	# 2. Let node be the last node in the stack of open elements.
	node_i = 0
	node = open_els[node_i]
	# 3. Loop: If node is the first node in the stack of open elements,
	# then set last to true, and, if the parser was originally created as
	# part of the HTML fragment parsing algorithm (fragment case) set node
	# to the context element.
	loop
		if node_i is open_els.length - 1
			last = true
			# fixfull (fragment case)

		# 4. If node is a select element, run these substeps:
		if node.name is 'select'
			# 1. If last is true, jump to the step below labeled done.
			unless last
				# 2. Let ancestor be node.
				ancestor_i = node_i
				ancestor = node
				# 3. Loop: If ancestor is the first node in the stack of
				# open elements, jump to the step below labeled done.
				loop
					if ancestor_i is open_els.length - 1
						break
					# 4. Let ancestor be the node before ancestor in the stack
					# of open elements.
					ancestor_i += 1
					ancestor = open_els[ancestor_i]
					# 5. If ancestor is a template node, jump to the step below
					# labeled done.
					if ancestor.name is 'template'
						break
					# 6. If ancestor is a table node, switch the insertion mode
					# to "in select in table" and abort these steps.
					if ancestor.name is 'table'
						insertion_mode = ins_mode_in_select_in_table
						return
					# 7. Jump back to the step labeled loop.
			# 8. Done: Switch the insertion mode to "in select" and abort
			# these steps.
			insertion_mode = ins_mode_in_select
			return
		# 5. If node is a td or th element and last is false, then switch
		# the insertion mode to "in cell" and abort these steps.
		if (node.name is 'td' or node.name is 'th') and last is false
			insertion_mode = ins_mode_in_cell
			return
		# 6. If node is a tr element, then switch the insertion mode to "in
		# row" and abort these steps.
		if node.name is 'tr'
			insertion_mode = ins_mode_in_row
			return
		# 7. If node is a tbody, thead, or tfoot element, then switch the
		# insertion mode to "in table body" and abort these steps.
		if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
			insertion_mode = ins_mode_in_table_body
			return
		# 8. If node is a caption element, then switch the insertion mode
		# to "in caption" and abort these steps.
		if node.name is 'caption'
			insertion_mode = ins_mode_in_caption
			return
		# 9. If node is a colgroup element, then switch the insertion mode
		# to "in column group" and abort these steps.
		if node.name is 'colgroup'
			insertion_mode = ins_mode_in_column_group
			return
		# 10. If node is a table element, then switch the insertion mode to
		# "in table" and abort these steps.
		if node.name is 'table'
			insertion_mode = ins_mode_in_table
			return
		# 11. If node is a template element, then switch the insertion mode
		# to the current template insertion mode and abort these steps.
		# fixfull (template insertion mode stack)

		# 12. If node is a head element and last is true, then switch the
		# insertion mode to "in body" ("in body"! not "in head"!) and abort
		# these steps. (fragment case)
		if node.name is 'head' and last
			insertion_mode = ins_mode_in_body
			return
		# 13. If node is a head element and last is false, then switch the
		# insertion mode to "in head" and abort these steps.
		if node.name is 'head' and last is false
			insertion_mode = ins_mode_in_head
			return
		# 14. If node is a body element, then switch the insertion mode to
		# "in body" and abort these steps.
		if node.name is 'body'
			insertion_mode = ins_mode_in_body
			return
		# 15. If node is a frameset element, then switch the insertion mode
		# to "in frameset" and abort these steps. (fragment case)
		if node.name is 'frameset'
			insertion_mode = ins_mode_in_frameset
			return
		# 16. If node is an html element, run these substeps:
		if node.name is 'html'
			# 1. If the head element pointer is null, switch the insertion
			# mode to "before head" and abort these steps. (fragment case)
			if head_element_pointer is null
				# must assign insertion_mode itself; assigning to any other
				# name leaves the parser in whatever mode it was in before
				insertion_mode = ins_mode_before_head
			# 2. Otherwise, the head element pointer is not null,
			# switch the insertion mode to "after head" and abort these
			# steps.
			else
				insertion_mode = ins_mode_after_head
			return
		# 17. If last is true, then switch the insertion mode to "in body"
		# and abort these steps. (fragment case)
		if last
			insertion_mode = ins_mode_in_body
			return
		# 18. Let node now be the node before node in the stack of open
		# elements.
		node_i += 1
		node = open_els[node_i]
		# 19. Return to the step labeled loop.
643 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
# Returns the context element when parsing a fragment and only one element is
# on the stack of open elements; otherwise returns the current node, which is
# open_els[0] (the stack grows downward, index 0 is the current node).
adjusted_current_node = ->
	if open_els.length is 1 and flag_fragment_parsing
		return context_element
	# every other case must still produce a node rather than falling off the
	# end of the function
	return open_els[0]
649 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
650 # this implementation is structured (mostly) as described at the link above.
651 # capitalized comments are the "labels" described at the link above.
652 reconstruct_active_formatting_elements = ->
653 return if afe.length is 0
654 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
659 if i is afe.length - 1
662 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
667 el = insert_html_element afe[i].token
672 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
673 # adoption agency algorithm
675 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
676 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
677 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
678 adoption_agency = (subject) ->
679 debug_log "adoption_agency()"
680 debug_log "tree: #{serialize_els doc.children, false, true}"
681 debug_log "open_els: #{serialize_els open_els, true, true}"
682 debug_log "afe: #{serialize_els afe, true, true}"
683 if open_els[0].name is subject
686 # remove it from the list of active formatting elements (if found)
691 debug_log "aaa: starting off with subject on top of stack, exiting"
698 # 5. Let formatting element be the last element in the list of
699 # active formatting elements that: is between the end of the list
700 # and the last scope marker in the list, if any, or the start of
701 # the list otherwise, and has the tag name subject.
703 for t, fe_of_afe in afe
704 if t.type is TYPE_AFE_MARKER
709 # If there is no such element, then abort these steps and instead
710 # act as described in the "any other end tag" entry above.
712 debug_log "aaa: fe not found in afe"
713 in_body_any_other_end_tag subject
715 # 6. If formatting element is not in the stack of open elements,
716 # then this is a parse error; remove the element from the list, and
719 for t, fe_of_open_els in open_els
724 debug_log "aaa: fe not found in open_els"
726 # "remove it from the list" must mean afe, since it's not in open_els
727 afe.splice fe_of_afe, 1
729 # 7. If formatting element is in the stack of open elements, but
730 # the element is not in scope, then this is a parse error; abort
732 unless el_is_in_scope fe
733 debug_log "aaa: fe not in scope"
736 # 8. If formatting element is not the current node, this is a parse
737 # error. (But do not abort these steps.)
738 unless open_els[0] is fe
741 # 9. Let furthest block be the topmost node in the stack of open
742 # elements that is lower in the stack than formatting element, and
743 # is an element in the special category. There might not be one.
745 fb_of_open_els = null
752 # and continue, to see if there's one that's more "topmost"
753 # 10. If there is no furthest block, then the UA must first pop all
754 # the nodes from the bottom of the stack of open elements, from the
755 # current node up to and including formatting element, then remove
756 # formatting element from the list of active formatting elements,
757 # and finally abort these steps.
759 debug_log "aaa: no fb"
763 afe.splice fe_of_afe, 1
765 # 11. Let common ancestor be the element immediately above
766 # formatting element in the stack of open elements.
767 ca = open_els[fe_of_open_els + 1] # common ancestor
769 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
770 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
771 bookmark = new_aaa_bookmark()
774 afe.splice i, 0, bookmark
776 node = last_node = fb
780 # 3. Let node be the element immediately above node in the
781 # stack of open elements, or if node is no longer in the stack
782 # of open elements (e.g. because it got removed by this
783 # algorithm), the element that was immediately above node in
784 # the stack of open elements before node was removed.
788 node_next = open_els[i + 1]
790 node = node_next ? node_above
791 debug_log "inner loop #{inner}"
792 debug_log "tree: #{serialize_els doc.children, false, true}"
793 debug_log "open_els: #{serialize_els open_els, true, true}"
794 debug_log "afe: #{serialize_els afe, true, true}"
795 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
796 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
797 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
798 debug_log "node: #{node.serialize true, true}"
799 # TODO make sure node_above gets re-set if/when node is removed from open_els
801 # 4. If node is formatting element, then go to the next step in
802 # the overall algorithm.
806 # 5. If inner loop counter is greater than three and node is in
807 # the list of active formatting elements, then remove node from
808 # the list of active formatting elements.
814 debug_log "max out inner"
819 # 6. If node is not in the list of active formatting elements,
820 # then remove node from the stack of open elements and then go
821 # back to the step labeled inner loop.
823 debug_log "not in afe"
826 node_above = open_els[i + 1]
830 debug_log "the bones"
831 # 7. create an element for the token for which the element node
832 # was created, in the HTML namespace, with common ancestor as
833 # the intended parent; replace the entry for node in the list
834 # of active formatting elements with an entry for the new
835 # element, replace the entry for node in the stack of open
836 # elements with an entry for the new element, and let node be
838 new_node = token_to_element node.token, NS_HTML, ca
842 debug_log "replaced in afe"
846 node_above = open_els[i + 1]
847 open_els[i] = new_node
848 debug_log "replaced in open_els"
851 # 8. If last node is furthest block, then move the
852 # aforementioned bookmark to be immediately after the new node
853 # in the list of active formatting elements.
858 debug_log "removed bookmark"
862 # "after" means lower
863 afe.splice i, 0, bookmark # "after as <-
864 debug_log "placed bookmark after node"
865 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
867 # 9. Insert last node into node, first removing it from its
868 # previous parent node if any.
870 debug_log "last_node has parent"
871 for c, i in last_node.parent.children
873 debug_log "removing last_node from parent"
874 last_node.parent.children.splice i, 1
876 node.children.push last_node
877 last_node.parent = node
878 # 10. Let last node be node.
881 # 11. Return to the step labeled inner loop.
882 # 14. Insert whatever last node ended up being in the previous step
883 # at the appropriate place for inserting a node, but using common
884 # ancestor as the override target.
886 # In the case where fe is immediately followed by fb:
887 # * inner loop exits out early (node==fe)
889 # * last_node is still in the tree (not a duplicate)
891 debug_log "FEFIRST? last_node has parent"
892 for c, i in last_node.parent.children
894 debug_log "removing last_node from parent"
895 last_node.parent.children.splice i, 1
898 debug_log "after aaa inner loop"
899 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
900 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
901 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
902 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
903 debug_log "tree: #{serialize_els doc.children, false, true}"
908 # can't use standard insert token thing, because it's already in
909 # open_els and must stay at its current position in open_els
910 dest = adjusted_insertion_location ca
911 dest[0].children.splice dest[1], 0, last_node
912 last_node.parent = dest[0]
915 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
916 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
917 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
918 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
919 debug_log "tree: #{serialize_els doc.children, false, true}"
921 # 15. Create an element for the token for which formatting element
922 # was created, in the HTML namespace, with furthest block as the
924 new_element = token_to_element fe.token, NS_HTML, fb
925 # 16. Take all of the child nodes of furthest block and append them
926 # to the element created in the last step.
927 while fb.children.length
928 t = fb.children.shift()
929 t.parent = new_element
930 new_element.children.push t
931 # 17. Append that new element to furthest block.
932 new_element.parent = fb
933 fb.children.push new_element
934 # 18. Remove formatting element from the list of active formatting
935 # elements, and insert the new element into the list of active
936 # formatting elements at the position of the aforementioned
946 # 19. Remove formatting element from the stack of open elements,
947 # and insert the new element into the stack of open elements
948 # immediately below the position of furthest block in that stack.
955 open_els.splice i, 0, new_element
957 # 20. Jump back to the step labeled outer loop.
958 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
959 debug_log "tree: #{serialize_els doc.children, false, true}"
960 debug_log "open_els: #{serialize_els open_els, true, true}"
961 debug_log "afe: #{serialize_els afe, true, true}"
964 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
966 generate_implied_end_tags 'p' # arg is exception
967 if open_els[0].name isnt 'p'
969 while open_els.length > 1 # just in case
970 el = open_els.shift()
973 close_p_if_in_button_scope = ->
974 if is_in_button_scope 'p'
977 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
978 # aka insert_a_character = (t) ->
979 insert_character = (t) ->
980 dest = adjusted_insertion_location()
981 # fixfull check for Document node
983 prev = dest[0].children[dest[1] - 1]
984 if prev.type is TYPE_TEXT
987 dest[0].children.splice dest[1], 0, t
990 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
991 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
992 adjusted_insertion_location = (override_target = null) ->
993 # 1. If there was an override target specified, then let target be the
996 target = override_target
997 else # Otherwise, let target be the current node.
999 # 2. Determine the adjusted insertion location using the first matching
1000 # steps from the following list:
1002 # If foster parenting is enabled and target is a table, tbody, tfoot,
1003 # thead, or tr element Foster parenting happens when content is
1004 # misnested in tables.
1005 if flag_foster_parenting and foster_parenting_targets[target.name]
1006 loop # once. this is here so we can ``break`` to "abort these substeps"
1007 # 1. Let last template be the last template element in the
1008 # stack of open elements, if any.
1009 last_template = null
1010 last_template_i = null
1011 for el, i in open_els
1012 if el.name is 'template'
1016 # 2. Let last table be the last table element in the stack of
1017 # open elements, if any.
1020 for el, i in open_els
1021 if el.name is 'table'
1025 # 3. If there is a last template and either there is no last
1026 # table, or there is one, but last template is lower (more
1027 # recently added) than last table in the stack of open
1028 # elements, then: let adjusted insertion location be inside
1029 # last template's template contents, after its last child (if
1030 # any), and abort these substeps.
1031 if last_template and (last_table is null or last_template_i < last_table_i)
1032 target = last_template # fixfull should be it's contents
1033 target_i = target.children.length
1035 # 4. If there is no last table, then let adjusted insertion
1036 # location be inside the first element in the stack of open
1037 # elements (the html element), after its last child (if any),
1038 # and abort these substeps. (fragment case)
1039 if last_table is null
1041 target = open_els[open_els.length - 1]
1042 target_i = target.children.length
1043 # 5. If last table has a parent element, then let adjusted
1044 # insertion location be inside last table's parent element,
1045 # immediately before last table, and abort these substeps.
1046 if last_table.parent?
1047 for c, i in last_table.parent.children
1049 target = last_table.parent
1053 # 6. Let previous element be the element immediately above last
1054 # table in the stack of open elements.
1056 # huh? how could it not have a parent?
1057 previous_element = open_els[last_table_i + 1]
1058 # 7. Let adjusted insertion location be inside previous
1059 # element, after its last child (if any).
1060 target = previous_element
1061 target_i = target.children.length
1062 # Note: These steps are involved in part because it's possible
1063 # for elements, the table element in this case in particular,
1064 # to have been moved by a script around in the DOM, or indeed
1065 # removed from the DOM entirely, after the element was inserted
1067 break # don't really loop
1069 # Otherwise Let adjusted insertion location be inside target, after
1070 # its last child (if any).
1071 target_i = target.children.length
1073 # 3. If the adjusted insertion location is inside a template element,
1074 # let it instead be inside the template element's template contents,
1075 # after its last child (if any).
1076 # fixfull (template)
1078 # 4. Return the adjusted insertion location.
1079 return [target, target_i]
1081 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1082 # aka create_an_element_for_token
1083 token_to_element = (t, namespace, intended_parent) ->
1084 # convert attributes into a hash
1087 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1088 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1090 # TODO 2. If the newly created element has an xmlns attribute in the
1091 # XMLNS namespace whose value is not exactly the same as the element's
1092 # namespace, that is a parse error. Similarly, if the newly created
1093 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1094 # value is not the XLink Namespace, that is a parse error.
1096 # fixfull: the spec says stuff about form pointers and ownerDocument
1100 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1101 insert_foreign_element = (token, namespace) ->
1102 ail = adjusted_insertion_location()
1105 el = token_to_element token, namespace, ail_el
1106 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1108 ail_el.children.splice ail_i, 0, el
1111 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Insert an HTML element for a token: the same steps as inserting a foreign
# element, but in the HTML namespace unless the caller names another one.
# (A bare alias would forward an undefined namespace for one-argument calls.)
# Returns whatever insert_foreign_element returns.
insert_html_element = (token, namespace = NS_HTML) ->
	insert_foreign_element token, namespace
1114 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1115 # position should be [node, index_within_children]
1116 insert_comment = (t, position = null) ->
1117 position ?= adjusted_insertion_location()
1118 position[0].children.splice position[1], 0, t
1121 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Shared tail of the two generic text-element algorithms: insert the element
# for start tag t, point the tokenizer at text_tok_state, remember the
# current insertion mode, then switch to the text insertion mode.
switch_to_text_parsing = (t, text_tok_state) ->
	insert_html_element(t)
	tok_state = text_tok_state
	original_insertion_mode = insertion_mode
	insertion_mode = ins_mode_text

# raw text flavour: tokenizer continues in tok_state_rawtext
parse_generic_raw_text = (t) ->
	switch_to_text_parsing t, tok_state_rawtext

# RCDATA flavour: tokenizer continues in tok_state_rcdata
parse_generic_rcdata_text = (t) ->
	switch_to_text_parsing t, tok_state_rcdata
1133 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1134 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1135 generate_implied_end_tags = (except = null) ->
1136 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1139 # 8.2.5.4 The rules for parsing tokens in HTML content
1140 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1142 # 8.2.5.4.1 The "initial" insertion mode
1143 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1144 ins_mode_initial = (t) ->
1147 if t.type is TYPE_COMMENT
1151 if t.type is TYPE_DOCTYPE
1152 # FIXME check identifiers, set quirks, etc
1155 insertion_mode = ins_mode_before_html
1158 #fixfull (iframe, quirks)
1159 insertion_mode = ins_mode_before_html
1160 insertion_mode t # reprocess the token
1163 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1164 ins_mode_before_html = (t) ->
1165 if t.type is TYPE_DOCTYPE
1168 if t.type is TYPE_COMMENT
1173 if t.type is TYPE_START_TAG and t.name is 'html'
1174 el = token_to_element t, NS_HTML, doc
1175 doc.children.push el
1176 open_els.unshift(el)
1177 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1178 insertion_mode = ins_mode_before_head
1180 if t.type is TYPE_END_TAG
1181 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1182 # fall through to "anything else"
1187 html_tok = new_open_tag 'html'
1188 el = token_to_element html_tok, NS_HTML, doc
1189 doc.children.push el
1191 # ?fixfull browsing context
1192 insertion_mode = ins_mode_before_head
1196 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1197 ins_mode_before_head = (t) ->
1200 if t.type is TYPE_COMMENT
1203 if t.type is TYPE_DOCTYPE
1206 if t.type is TYPE_START_TAG and t.name is 'html'
1209 if t.type is TYPE_START_TAG and t.name is 'head'
1210 el = insert_html_element t
1211 head_element_pointer = el
1212 insertion_mode = ins_mode_in_head
1213 if t.type is TYPE_END_TAG
1214 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1215 # fall through to Anything else below
1220 head_tok = new_open_tag 'head'
1221 el = insert_html_element head_tok
1222 head_element_pointer = el
1223 insertion_mode = ins_mode_in_head
1224 insertion_mode t # reprocess current token
1226 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1227 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1228 open_els.shift() # spec says this will be a 'head' node
1229 insertion_mode = ins_mode_after_head
1231 ins_mode_in_head = (t) ->
# whitespace character tokens: tab, LF, FF, CR or space (CR was missing here
# although the "in head noscript" mode below already accepts it)
if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')
1235 if t.type is TYPE_COMMENT
1238 if t.type is TYPE_DOCTYPE
1241 if t.type is TYPE_START_TAG and t.name is 'html'
1244 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1245 el = insert_html_element t
1247 t.acknowledge_self_closing()
1249 if t.type is TYPE_START_TAG and t.name is 'meta'
1250 el = insert_html_element t
1252 t.acknowledge_self_closing()
1253 # fixfull encoding stuff
1255 if t.type is TYPE_START_TAG and t.name is 'title'
1256 parse_generic_rcdata_text t
1258 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1259 parse_generic_raw_text t
1261 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1262 insert_html_element t
1263 insertion_mode = ins_mode_in_head_noscript
1265 if t.type is TYPE_START_TAG and t.name is 'script'
# The script element is built by hand here; its 'parser-inserted' flag is set
# before the node is spliced into the tree.
ail = adjusted_insertion_location()
# ail is a [node, index] pair; the intended parent is the node half
el = token_to_element t, NS_HTML, ail[0]
el.flag 'parser-inserted', true
# fixfull fragment case
ail[0].children.splice ail[1], 0, el
1272 tok_state = tok_state_script_data
1273 original_insertion_mode = insertion_mode # make sure orig... is defined
1274 insertion_mode = ins_mode_text
1276 if t.type is TYPE_END_TAG and t.name is 'head'
1277 open_els.shift() # will be a head element... spec says so
1278 insertion_mode = ins_mode_after_head
1280 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1281 ins_mode_in_head_else t
1283 if t.type is TYPE_START_TAG and t.name is 'template'
1284 insert_html_element t
1286 flag_frameset_ok = false
1287 insertion_mode = ins_mode_in_template
1288 template_insertion_modes.unshift ins_mode_in_template
1290 if t.type is TYPE_END_TAG and t.name is 'template'
1291 if template_tag_is_open()
# must be invoked: a bare name is only a reference and closes nothing
generate_implied_end_tags()
1293 if open_els[0].name isnt 'template'
1296 el = open_els.shift()
1297 if el.name is 'template'
1299 clear_afe_to_marker()
1300 template_insertion_modes.shift()
1301 reset_insertion_mode()
1305 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1308 ins_mode_in_head_else t
1310 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1311 ins_mode_in_head_noscript_else = (t) ->
1314 insertion_mode = ins_mode_in_head
1316 ins_mode_in_head_noscript = (t) ->
1317 if t.type is TYPE_DOCTYPE
1320 if t.type is TYPE_START_TAG
1323 if t.type is TYPE_END_TAG and t.name is 'noscript'
1325 insertion_mode = ins_mode_in_head
1327 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1330 if t.type is TYPE_END_TAG and t.name is 'br'
1331 ins_mode_in_head_noscript_else t
1333 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1337 ins_mode_in_head_noscript_else t
1342 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
ins_mode_after_head_else = (t) ->
	# Fallback used by the after-head mode: open an implied <body>, move to
	# the in-body mode and hand the same token to it for reprocessing.
	insert_html_element new_open_tag('body')
	insertion_mode = ins_mode_in_body
	insertion_mode(t)
1349 ins_mode_after_head = (t) ->
1353 if t.type is TYPE_COMMENT
1356 if t.type is TYPE_DOCTYPE
1359 if t.type is TYPE_START_TAG and t.name is 'html'
1362 if t.type is TYPE_START_TAG and t.name is 'body'
1363 insert_html_element t
1364 flag_frameset_ok = false
1365 insertion_mode = ins_mode_in_body
1367 if t.type is TYPE_START_TAG and t.name is 'frameset'
1368 insert_html_element t
1369 insertion_mode = ins_mode_in_frameset
1371 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1373 open_els.unshift head_element_pointer
# `in` walks the array's elements (el) with their index (i); `of` iterates
# keys/values, so el was an index string and the head was never found
for el, i in open_els
1376 if el is head_element_pointer
1377 open_els.splice i, 1
1379 console.log "warning: 23904 couldn't find head element in open_els"
1381 if t.type is TYPE_END_TAG and t.name is 'template'
1384 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1385 ins_mode_after_head_else t
1387 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1391 ins_mode_after_head_else t
1393 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1394 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1395 for el, i in open_els
1396 if el.namespace is NS_HTML and el.name is name
1397 generate_implied_end_tags name # arg is exception
1398 parse_error() unless i is 0
1403 if special_elements[el.name] is el.namespace
1407 ins_mode_in_body = (t) ->
1408 if t.type is TYPE_TEXT and t.text is "\u0000"
1412 reconstruct_active_formatting_elements()
1415 if t.type is TYPE_TEXT
1416 reconstruct_active_formatting_elements()
1418 flag_frameset_ok = false
1420 if t.type is TYPE_COMMENT
1423 if t.type is TYPE_DOCTYPE
1426 if t.type is TYPE_START_TAG and t.name is 'html'
1428 return if template_tag_is_open()
1429 root_attrs = open_els[open_els.length - 1].attrs
1431 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1434 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1437 if t.type is TYPE_START_TAG and t.name is 'body'
# Ignore the token unless the second element from the top of the stack
# (index length - 2; the topmost element sits at length - 1) is an HTML
# body element and no template is open.
return if open_els.length < 2
second = open_els[open_els.length - 2]
# nodes carry their namespace in `.namespace` (there is no `.ns` property)
return unless second.namespace is NS_HTML
return unless second.name is 'body'
return if template_tag_is_open()
# the shared flag is named flag_frameset_ok everywhere else in this file
flag_frameset_ok = false
1446 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1448 if t.type is TYPE_START_TAG and t.name is 'frameset'
1453 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1454 close_p_if_in_button_scope()
1455 insert_html_element t
1457 if t.type is TYPE_START_TAG and (t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6')
1458 close_p_if_in_button_scope()
1459 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1462 insert_html_element t
1465 if t.type is TYPE_START_TAG and t.name is 'a'
1466 # If the list of active formatting elements contains an a element
1467 # between the end of the list and the last marker on the list (or
1468 # the start of the list if there is no marker on the list), then
1469 # this is a parse error; run the adoption agency algorithm for the
1470 # tag name "a", then remove that element from the list of active
1471 # formatting elements and the stack of open elements if the
1472 # adoption agency algorithm didn't already remove it (it might not
1473 # have if the element is not in table scope).
1476 if el.type is TYPE_AFE_MARKER
1486 for el, i in open_els
1488 open_els.splice i, 1
1489 reconstruct_active_formatting_elements()
1490 el = insert_html_element t
1493 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1494 reconstruct_active_formatting_elements()
1495 el = insert_html_element t
1498 if t.type is TYPE_START_TAG and t.name is 'table'
1499 # fixfull quirksmode thing
1500 close_p_if_in_button_scope()
1501 insert_html_element t
1502 insertion_mode = ins_mode_in_table
1505 if t.type is TYPE_EOF
1507 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1508 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1511 unless ok_tags[t.name]?
1514 # FIXME stack of template insertion modes thing
1517 # FIXME CONTINUE some of these next ones are out of order I think
1518 if t.type is TYPE_END_TAG and t.name is 'body'
1519 unless is_in_scope 'body'
1522 # fixme implement parse error and move to tree_after_body
1524 if t.type is TYPE_END_TAG and t.name is 'html'
1525 unless is_in_scope 'body' # weird, but it's what the spec says
1528 # TODO implement parse error and move to tree_after_body, reprocess
1530 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1531 unless is_in_scope t.name, NS_HTML
1534 generate_implied_end_tags()
1535 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1538 el = open_els.shift()
1539 if el.name is t.name and el.namespace is NS_HTML
1542 if t.type is TYPE_END_TAG and t.name is 'p'
1543 unless is_in_button_scope 'p'
1545 insert_html_element new_open_tag 'p'
1548 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1549 adoption_agency t.name
1551 if t.type is TYPE_START_TAG # any other start tag
1552 reconstruct_active_formatting_elements()
1553 insert_html_element t
1555 if t.type is TYPE_END_TAG # any other end tag
1556 in_body_any_other_end_tag t.name
1559 ins_mode_in_table_else = (t) ->
1561 flag_foster_parenting = true # FIXME
1563 flag_foster_parenting = false
1564 can_in_table = { # FIXME do this inline like everywhere else
1572 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1573 ins_mode_text = (t) ->
1574 if t.type is TYPE_TEXT
1577 if t.type is TYPE_EOF
1579 if open_els[0].name is 'script'
1580 open_els[0].flag 'already started', true
1582 insertion_mode = original_insertion_mode
1585 if t.type is TYPE_END_TAG and t.name is 'script'
1587 insertion_mode = original_insertion_mode
1588 # fixfull the spec seems to assume that I'm going to run the script
1589 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1591 if t.type is TYPE_END_TAG
1593 insertion_mode = original_insertion_mode
1595 console.log 'warning: end of ins_mode_text reached'
# the functions below implement the tokenizer states described here:
1598 # http://www.w3.org/TR/html5/syntax.html#tokenization
1600 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1601 ins_mode_in_table = (t) ->
1604 if can_in_table[t.name]
1605 original_insertion_mode = insertion_mode
1606 insertion_mode = ins_mode_in_table_text
1609 ins_mode_in_table_else t
1617 clear_stack_to_table_context()
1619 insert_html_element t
1620 insertion_mode = ins_mode_in_caption
1622 clear_stack_to_table_context()
1623 insert_html_element t
1624 insertion_mode = ins_mode_in_column_group
1626 clear_stack_to_table_context()
1627 insert_html_element new_open_tag 'colgroup'
1628 insertion_mode = ins_mode_in_column_group
1630 when 'tbody', 'tfoot', 'thead'
1631 clear_stack_to_table_context()
1632 insert_html_element t
1633 insertion_mode = ins_mode_in_table_body
1634 when 'td', 'th', 'tr'
1635 clear_stack_to_table_context()
1636 insert_html_element new_open_tag 'tbody'
1637 insertion_mode = ins_mode_in_table_body
1641 if is_in_table_scope 'table'
1643 el = open_els.shift()
1644 if el.name is 'table'
1646 reset_insertion_mode()
1648 when 'style', 'script', 'template'
1651 if is_input_hidden_tok t
1652 ins_mode_in_table_else t
1655 el = insert_html_element t
1657 t.acknowledge_self_closing()
1660 if form_element_pointer?
1662 if template_tag_is_open()
1664 form_element_pointer = insert_html_element t
1667 ins_mode_in_table_else t
1671 if is_in_table_scope 'table'
1673 el = open_els.shift()
1674 if el.name is 'table'
1676 reset_insertion_mode()
1679 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1684 ins_mode_in_table_else t
1688 ins_mode_in_table_else t
1691 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1692 ins_mode_in_table_text = (t) ->
1693 if t.type is TYPE_TEXT and t.text is "\u0000"
1694 # huh? I thought the tokenizer didn't emit these
1697 if t.type is TYPE_TEXT
1698 pending_table_character_tokens.push t
1702 for old in pending_table_character_tokens
1703 unless is_space_tok old
1707 for old in pending_table_character_tokens
1708 insert_character old
# replay every pending character token through the in-table "anything else"
# handler (the function is named ins_mode_in_table_else)
for old in pending_table_character_tokens
	ins_mode_in_table_else old
1712 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1713 insertion_mode = original_insertion_mode
1716 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1717 ins_mode_in_caption = (t) ->
1718 if t.type is TYPE_END_TAG and t.name is 'caption'
1719 if is_in_table_scope 'caption'
1720 generate_implied_end_tags()
1721 if open_els[0].name isnt 'caption'
1724 el = open_els.shift()
1725 if el.name is 'caption'
1727 clear_afe_to_marker()
1728 insertion_mode = ins_mode_in_table
1733 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1735 if is_in_table_scope 'caption'
1737 el = open_els.shift()
1738 if el.name is 'caption'
1740 clear_afe_to_marker()
1741 insertion_mode = ins_mode_in_table
1743 # else fragment case
1745 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1751 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1752 ins_mode_in_column_group = (t) ->
1756 if t.type is TYPE_COMMENT
1759 if t.type is TYPE_DOCTYPE
1762 if t.type is TYPE_START_TAG and t.name is 'html'
1765 if t.type is TYPE_START_TAG and t.name is 'col'
1766 el = insert_html_element t
1768 t.acknowledge_self_closing()
1770 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1771 if open_els[0].name is 'colgroup'
1773 insertion_mode = ins_mode_in_table
1777 if t.type is TYPE_END_TAG and t.name is 'col'
1780 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1783 if t.type is TYPE_EOF
1787 if open_els[0].name isnt 'colgroup'
1791 insertion_mode = ins_mode_in_table
1795 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1796 ins_mode_in_table_body = (t) ->
1797 if t.type is TYPE_START_TAG and t.name is 'tr'
1798 clear_stack_to_table_body_context()
1799 insert_html_element t
1800 insertion_mode = ins_mode_in_row
1802 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1804 clear_stack_to_table_body_context()
1805 insert_html_element new_open_tag 'tr'
1806 insertion_mode = ins_mode_in_row
1809 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1810 unless is_in_table_scope t.name # fixfull check namespace
1813 clear_stack_to_table_body_context()
1815 insertion_mode = ins_mode_in_table
1817 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1820 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1823 if table_scopers[el.name]
1828 clear_stack_to_table_body_context()
1830 insertion_mode = ins_mode_in_table
1833 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1839 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1840 ins_mode_in_row = (t) ->
1841 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1842 clear_stack_to_table_row_context()
1843 insert_html_element t
1844 insertion_mode = ins_mode_in_cell
1847 if t.type is TYPE_END_TAG and t.name is 'tr'
1848 if is_in_table_scope 'tr'
1849 clear_stack_to_table_row_context()
1851 insertion_mode = ins_mode_in_table_body
1855 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1856 if is_in_table_scope 'tr'
1857 clear_stack_to_table_row_context()
1859 insertion_mode = ins_mode_in_table_body
1864 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1865 if is_in_table_scope t.name # fixfull namespace
1866 if is_in_table_scope 'tr'
1867 clear_stack_to_table_row_context()
1869 insertion_mode = ins_mode_in_table_body
1874 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1880 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1882 generate_implied_end_tags()
# compare the current node's name (not the node object itself) with 'th'
unless open_els[0].name is 'td' or open_els[0].name is 'th'
1886 el = open_els.shift()
1887 if el.name is 'td' or el.name is 'th'
1889 clear_afe_to_marker()
1890 insertion_mode = ins_mode_in_row
1892 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1893 ins_mode_in_cell = (t) ->
1894 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1895 if is_in_table_scope t.name
1896 generate_implied_end_tags()
1897 if open_els[0].name isnt t.name
1900 el = open_els.shift()
1901 if el.name is t.name
1903 clear_afe_to_marker()
1904 insertion_mode = ins_mode_in_row
1908 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1911 if el.name is 'td' or el.name is 'th'
1914 if table_scopers[el.name]
1922 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1925 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1926 if is_in_table_scope t.name # fixfull namespace
1935 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1936 ins_mode_in_select = (t) ->
1937 if t.type is TYPE_TEXT and t.text is "\u0000"
1940 if t.type is TYPE_TEXT
1943 if t.type is TYPE_COMMENT
1946 if t.type is TYPE_DOCTYPE
1949 if t.type is TYPE_START_TAG and t.name is 'html'
1952 if t.type is TYPE_START_TAG and t.name is 'option'
1953 if open_els[0].name is 'option'
1955 insert_html_element t
1957 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1958 if open_els[0].name is 'option'
1960 if open_els[0].name is 'optgroup'
1962 insert_html_element t
1964 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1965 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1967 if open_els[0].name is 'optgroup'
1972 if t.type is TYPE_END_TAG and t.name is 'option'
1973 if open_els[0].name is 'option'
1978 if t.type is TYPE_END_TAG and t.name is 'select'
1979 if is_in_select_scope 'select'
1981 el = open_els.shift()
1982 if el.name is 'select'
1984 reset_insertion_mode()
1988 if t.type is TYPE_START_TAG and t.name is 'select'
1991 el = open_els.shift()
1992 if el.name is 'select'
1994 reset_insertion_mode()
1995 # spec says that this is the same as </select> but it doesn't say
1996 # to check scope first
1998 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2000 if is_in_select_scope 'select'
2003 el = open_els.shift()
2004 if el.name is 'select'
2006 reset_insertion_mode()
2009 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2012 if t.type is TYPE_EOF
2019 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2020 ins_mode_in_select_in_table = (t) ->
2021 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2024 el = open_els.shift()
2025 if el.name is 'select'
2027 reset_insertion_mode()
2030 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2032 unless is_in_table_scope t.name, NS_HTML
2035 el = open_els.shift()
2036 if el.name is 'select'
2038 reset_insertion_mode()
2042 ins_mode_in_select t
2045 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2046 ins_mode_in_template = (t) ->
2047 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2050 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2053 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2054 template_insertion_modes.shift()
2055 template_insertion_modes.unshift ins_mode_in_table
2056 insertion_mode = ins_mode_in_table
2059 if t.type is TYPE_START_TAG and t.name is 'col'
2060 template_insertion_modes.shift()
2061 template_insertion_modes.unshift ins_mode_in_column_group
2062 insertion_mode = ins_mode_in_column_group
2065 if t.type is TYPE_START_TAG and t.name is 'tr'
2066 template_insertion_modes.shift()
2067 template_insertion_modes.unshift ins_mode_in_table_body
2068 insertion_mode = ins_mode_in_table_body
2071 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2072 template_insertion_modes.shift()
2073 template_insertion_modes.unshift ins_mode_in_row
2074 insertion_mode = ins_mode_in_row
2077 if t.type is TYPE_START_TAG
2078 template_insertion_modes.shift()
2079 template_insertion_modes.unshift ins_mode_in_body
2080 insertion_mode = ins_mode_in_body
2083 if t.type is TYPE_END_TAG
2086 if t.type is TYPE_EOF
2087 unless template_tag_is_open()
2092 el = open_els.shift()
2093 if el.name is 'template' # fixfull check namespace
2095 clear_afe_to_marker()
2096 template_insertion_modes.shift()
2097 reset_insertion_mode()
2100 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2101 ins_mode_after_body = (t) ->
2105 if t.type is TYPE_COMMENT
# The comment becomes the last child of the FIRST element in the stack of open
# elements (the html element). The stack's first/topmost entry is at index
# length - 1; index 0 is the current node.
insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2108 if t.type is TYPE_DOCTYPE
2111 if t.type is TYPE_START_TAG and t.name is 'html'
2114 if t.type is TYPE_END_TAG and t.name is 'html'
2115 # fixfull fragment case
2116 insertion_mode = ins_mode_after_after_body
2118 if t.type is TYPE_EOF
2123 insertion_mode = ins_mode_in_body
2126 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2127 ins_mode_in_frameset = (t) ->
2131 if t.type is TYPE_COMMENT
2134 if t.type is TYPE_DOCTYPE
2137 if t.type is TYPE_START_TAG and t.name is 'html'
2140 if t.type is TYPE_START_TAG and t.name is 'frameset'
2141 insert_html_element t
2143 if t.type is TYPE_END_TAG and t.name is 'frameset'
2144 # TODO ?correct for: "if the current node is the root html element"
2145 if open_els.length is 1
2147 return # fragment case
2149 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2150 insertion_mode = ins_mode_after_frameset
2152 if t.type is TYPE_START_TAG and t.name is 'frame'
2153 insert_html_element t
2155 t.acknowledge_self_closing()
2157 if t.type is TYPE_START_TAG and t.name is 'noframes'
2160 if t.type is TYPE_EOF
2161 # TODO ?correct for: "if the current node is not the root html element"
2162 if open_els.length isnt 1
# 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
#
# Tree-construction handler for the "after frameset" insertion mode.
#
# t: the token to process. Nothing meaningful is returned; the handler works
# purely through side effects on the parser state.
ins_mode_after_frameset = (t) ->
	if is_space_tok t
		insert_character t
		return
	if t.type is TYPE_COMMENT
		insert_comment t
		return
	if t.type is TYPE_DOCTYPE
		parse_error()
		return
	if t.type is TYPE_START_TAG and t.name is 'html'
		ins_mode_in_body t
		return
	if t.type is TYPE_END_TAG and t.name is 'html'
		# Must assign the shared "insertion_mode" variable (as every other
		# handler does). The previous code assigned a misspelled
		# "insert_mode", so the mode switch never took effect.
		insertion_mode = ins_mode_after_after_frameset
		return
	if t.type is TYPE_START_TAG and t.name is 'noframes'
		ins_mode_in_head t
		return
	if t.type is TYPE_EOF
		stop_parsing()
		return
	# Anything else
	parse_error()
	return
2197 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2198 ins_mode_after_after_body = (t) ->
2199 if t.type is TYPE_COMMENT
2200 insert_comment t, [doc, doc.children.length]
2202 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2205 if t.type is TYPE_EOF
2210 insertion_mode = ins_mode_in_body
2213 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2214 ins_mode_after_after_frameset = (t) ->
2215 if t.type is TYPE_COMMENT
2216 insert_comment t, [doc, doc.children.length]
2218 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2221 if t.type is TYPE_EOF
2224 if t.type is TYPE_START_TAG and t.name is 'noframes'
2235 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2237 switch c = txt.charAt(cur++)
2239 return new_text_node parse_character_reference()
2241 tok_state = tok_state_tag_open
2244 return new_text_node c
2246 return new_eof_token()
2248 return new_text_node c
2251 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2252 # not needed: tok_state_character_reference_in_data = ->
2253 # just call parse_character_reference()
2255 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2256 tok_state_rcdata = ->
2257 switch c = txt.charAt(cur++)
2259 return new_text_node parse_character_reference()
2261 tok_state = tok_state_rcdata_less_than_sign
2264 return new_character_token "\ufffd"
2266 return new_eof_token()
2268 return new_character_token c
2271 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2272 # not needed: tok_state_character_reference_in_rcdata = ->
2273 # just call parse_character_reference()
2275 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2276 tok_state_rawtext = ->
2277 switch c = txt.charAt(cur++)
2279 tok_state = tok_state_rawtext_less_than_sign
2282 return new_character_token "\ufffd"
2284 return new_eof_token()
2286 return new_character_token c
2289 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2290 tok_state_script_data = ->
2291 switch c = txt.charAt(cur++)
2293 tok_state = tok_state_script_data_less_than_sign
2296 return new_character_token "\ufffd"
2298 return new_eof_token()
2300 return new_character_token c
2303 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2304 tok_state_plaintext = ->
2305 switch c = txt.charAt(cur++)
2308 return new_character_token "\ufffd"
2310 return new_eof_token()
2312 return new_character_token c
2316 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2317 tok_state_tag_open = ->
2318 switch c = txt.charAt(cur++)
2320 tok_state = tok_state_markup_declaration_open
2322 tok_state = tok_state_end_tag_open
2325 tok_cur_tag = new_comment_token '?'
2326 tok_state = tok_state_bogus_comment
2329 tok_cur_tag = new_open_tag c
2330 tok_state = tok_state_tag_name
2331 else if is_uc_alpha(c)
2332 tok_cur_tag = new_open_tag c.toLowerCase()
2333 tok_state = tok_state_tag_name
2336 tok_state = tok_state_data
2337 cur -= 1 # we didn't parse/handle the char after <
2338 return new_text_node '<'
# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
#
# Tokenizer state entered after "</". Returns a token to emit, or null when
# the tokenizer should simply be called again.
tok_state_end_tag_open = ->
	switch c = txt.charAt(cur++)
		when '>'
			# "</>" is dropped entirely
			parse_error()
			tok_state = tok_state_data
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
			return new_text_node '</'
		else
			if is_uc_alpha(c)
				tok_cur_tag = new_end_tag c.toLowerCase()
				tok_state = tok_state_tag_name
			else if is_lc_alpha(c)
				tok_cur_tag = new_end_tag c
				tok_state = tok_state_tag_name
			else
				parse_error()
				# The bogus comment's data starts with the character that
				# caused the switch into the bogus comment state (i.e. c, not
				# the "/" before it). c has already been consumed, so seed
				# the comment with it here. tok_state_bogus_comment only
				# sanitizes the characters it consumes itself, so replace a
				# NUL here.
				tok_cur_tag = new_comment_token(if c is "\u0000" then "\ufffd" else c)
				tok_state = tok_state_bogus_comment
	return null
2364 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2365 tok_state_tag_name = ->
2366 switch c = txt.charAt(cur++)
2367 when "\t", "\n", "\u000c", ' '
2368 tok_state = tok_state_before_attribute_name
2370 tok_state = tok_state_self_closing_start_tag
2372 tok_state = tok_state_data
2378 tok_cur_tag.name += "\ufffd"
2381 tok_state = tok_state_data
2384 tok_cur_tag.name += c.toLowerCase()
2386 tok_cur_tag.name += c
2389 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2390 tok_state_rcdata_less_than_sign = ->
2391 c = txt.charAt(cur++)
2393 temporary_buffer = ''
2394 tok_state = tok_state_rcdata_end_tag_open
2397 tok_state = tok_state_rcdata
2398 cur -= 1 # reconsume the input character
2399 return new_character_token '<'
2401 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2402 tok_state_rcdata_end_tag_open = ->
2403 c = txt.charAt(cur++)
2405 tok_cur_tag = new_end_tag c.toLowerCase()
2406 temporary_buffer += c
2407 tok_state = tok_state_rcdata_end_tag_name
2410 tok_cur_tag = new_end_tag c
2411 temporary_buffer += c
2412 tok_state = tok_state_rcdata_end_tag_name
2415 tok_state = tok_state_rcdata
2416 cur -= 1 # reconsume the input character
2417 return new_character_token "</" # fixfull separate these
2419 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2420 is_appropriate_end_tag = (t) ->
2421 # spec says to check against "the tag name of the last start tag to
2422 # have been emitted from this tokenizer", but this is only called from
2423 # the various "raw" states, which I'm pretty sure all push the start
2424 # token onto open_els. TODO: verify this after the script data states
2426 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2427 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2429 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2430 tok_state_rcdata_end_tag_name = ->
2431 c = txt.charAt(cur++)
2432 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2433 if is_appropriate_end_tag tok_cur_tag
2434 tok_state = tok_state_before_attribute_name
2436 # else fall through to "Anything else"
2438 if is_appropriate_end_tag tok_cur_tag
2439 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2441 # else fall through to "Anything else"
2443 if is_appropriate_end_tag tok_cur_tag
2444 tok_state = tok_state_data
2446 # else fall through to "Anything else"
2448 tok_cur_tag.name += c.toLowerCase()
2449 temporary_buffer += c
2452 tok_cur_tag.name += c
2453 temporary_buffer += c
2456 tok_state = tok_state_rcdata
2457 cur -= 1 # reconsume the input character
2458 return new_character_token '</' + temporary_buffer # fixfull separate these
2460 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2461 tok_state_rawtext_less_than_sign = ->
2462 c = txt.charAt(cur++)
2464 temporary_buffer = ''
2465 tok_state = tok_state_rawtext_end_tag_open
2468 tok_state = tok_state_rawtext
2469 cur -= 1 # reconsume the input character
2470 return new_character_token '<'
2472 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2473 tok_state_rawtext_end_tag_open = ->
2474 c = txt.charAt(cur++)
2476 tok_cur_tag = new_end_tag c.toLowerCase()
2477 temporary_buffer += c
2478 tok_state = tok_state_rawtext_end_tag_name
2481 tok_cur_tag = new_end_tag c
2482 temporary_buffer += c
2483 tok_state = tok_state_rawtext_end_tag_name
2486 tok_state = tok_state_rawtext
2487 cur -= 1 # reconsume the input character
2488 return new_character_token "</" # fixfull separate these
2490 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2491 tok_state_rawtext_end_tag_name = ->
2492 c = txt.charAt(cur++)
2493 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2494 if is_appropriate_end_tag tok_cur_tag
2495 tok_state = tok_state_before_attribute_name
2497 # else fall through to "Anything else"
2499 if is_appropriate_end_tag tok_cur_tag
2500 tok_state = tok_state_self_closing_start_tag
2502 # else fall through to "Anything else"
2504 if is_appropriate_end_tag tok_cur_tag
2505 tok_state = tok_state_data
2507 # else fall through to "Anything else"
2509 tok_cur_tag.name += c.toLowerCase()
2510 temporary_buffer += c
2513 tok_cur_tag.name += c
2514 temporary_buffer += c
2517 tok_state = tok_state_rawtext
2518 cur -= 1 # reconsume the input character
2519 return new_character_token '</' + temporary_buffer # fixfull separate these
2521 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
2522 tok_state_script_data_less_than_sign = ->
2523 c = txt.charAt(cur++)
2525 temporary_buffer = ''
2526 tok_state = tok_state_script_data_end_tag_open
2529 tok_state = tok_state_script_data_escape_start
2530 return new_character_token '<!' # fixfull split
2532 tok_state = tok_state_script_data
2533 cur -= 1 # Reconsume
2534 return new_character_token '<'
2536 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2537 tok_state_script_data_end_tag_open = ->
2538 c = txt.charAt(cur++)
2540 tok_cur_tag = new_end_tag c.toLowerCase()
2541 temporary_buffer += c
2542 tok_state = tok_state_script_data_end_tag_name
2545 tok_cur_tag = new_end_tag c
2546 temporary_buffer += c
2547 tok_state = tok_state_script_data_end_tag_name
2550 tok_state = tok_state_script_data
2551 cur -= 1 # Reconsume
2552 return new_character_token '</'
# 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
#
# Tokenizer state for the name part of a possible "</script" end tag inside
# script data. Returns a token to emit, or null/undefined when there is
# nothing to emit yet.
tok_state_script_data_end_tag_name = ->
	c = txt.charAt(cur++)
	if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
		if is_appropriate_end_tag tok_cur_tag
			tok_state = tok_state_before_attribute_name
			return null
		# else fall through to "Anything else"
	if c is '/'
		if is_appropriate_end_tag tok_cur_tag
			tok_state = tok_state_self_closing_start_tag
			return null
		# else fall through to "Anything else"
	if c is '>'
		# This branch was missing: without it "</script>" was never
		# recognised and was emitted as text instead.
		if is_appropriate_end_tag tok_cur_tag
			tok_state = tok_state_data
			return tok_cur_tag
		# else fall through to "Anything else"
	if is_uc_alpha(c)
		tok_cur_tag.name += c.toLowerCase()
		temporary_buffer += c
		return null
	if is_lc_alpha(c)
		tok_cur_tag.name += c
		temporary_buffer += c
		return null
	# Anything else: not an end tag after all; emit what was consumed as text
	tok_state = tok_state_script_data
	cur -= 1 # Reconsume
	return new_character_token "</#{temporary_buffer}" # fixfull split
2580 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
2581 tok_state_script_data_escape_start = ->
2582 c = txt.charAt(cur++)
2584 tok_state = tok_state_script_data_escape_start_dash
2585 return new_character_token '-'
2587 tok_state = tok_state_script_data
2588 cur -= 1 # Reconsume
2591 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
2592 tok_state_script_data_escape_start_dash = ->
2593 c = txt.charAt(cur++)
2595 tok_state = tok_state_script_data_escaped_dash_dash
2596 return new_character_token '-'
2598 tok_state = tok_state_script_data
2599 cur -= 1 # Reconsume
2602 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
2603 tok_state_script_data_escaped = ->
2604 c = txt.charAt(cur++)
2606 tok_state = tok_state_script_data_escaped_dash
2607 return new_character_token '-'
2609 tok_state = tok_state_script_data_escaped_less_than_sign
2613 return new_character_token "\ufffd"
2615 tok_state = tok_state_data
2617 cur -= 1 # Reconsume
2620 return new_character_token c
2622 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
2623 tok_state_script_data_escaped_dash = ->
2624 c = txt.charAt(cur++)
2626 tok_state = tok_state_script_data_escaped_dash_dash
2627 return new_character_token '-'
2629 tok_state = tok_state_script_data_escaped_less_than_sign
2633 tok_state = tok_state_script_data_escaped
2634 return new_character_token "\ufffd"
2636 tok_state = tok_state_data
2638 cur -= 1 # Reconsume
2641 tok_state = tok_state_script_data_escaped
2642 return new_character_token c
2644 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
2645 tok_state_script_data_escaped_dash_dash = ->
2646 c = txt.charAt(cur++)
2648 return new_character_token '-'
2650 tok_state = tok_state_script_data_escaped_less_than_sign
2653 tok_state = tok_state_script_data
2654 return new_character_token '>'
2657 tok_state = tok_state_script_data_escaped
2658 return new_character_token "\ufffd"
2661 tok_state = tok_state_data
2662 cur -= 1 # Reconsume
2665 tok_state = tok_state_script_data_escaped
2666 return new_character_token c
# 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
#
# Tokenizer state entered after a "<" inside escaped script data (i.e. after
# "<!--" within a script). Returns a token to emit, or null.
tok_state_script_data_escaped_less_than_sign = ->
	c = txt.charAt(cur++)
	if c is '/'
		temporary_buffer = ''
		tok_state = tok_state_script_data_escaped_end_tag_open
		return null
	if is_uc_alpha(c)
		temporary_buffer = c.toLowerCase() # yes, really
		tok_state = tok_state_script_data_double_escape_start
		return new_character_token "<#{c}" # fixfull split
	if is_lc_alpha(c)
		temporary_buffer = c
		tok_state = tok_state_script_data_double_escape_start
		return new_character_token "<#{c}" # fixfull split
	# Anything else: emit the pending "<" and reconsume c in the escaped
	# state. (Emitting c here, as the code previously did, dropped the "<"
	# and duplicated c, because c is emitted again when it is reconsumed.)
	tok_state = tok_state_script_data_escaped
	cur -= 1 # Reconsume
	return new_character_token '<'
2688 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
2689 tok_state_script_data_escaped_end_tag_open = ->
2690 c = txt.charAt(cur++)
2692 tok_cur_tag = new_end_tag c.toLowerCase()
2693 temporary_buffer += c
2694 tok_state = tok_state_script_data_escaped_end_tag_name
2697 tok_cur_tag = new_end_tag c
2698 temporary_buffer += c
2699 tok_state = tok_state_script_data_escaped_end_tag_name
2702 tok_state = tok_state_script_data_escaped
2703 cur -= 1 # Reconsume
2704 return new_character_token '</' # fixfull split
# 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
#
# Tokenizer state for the name part of a possible end tag inside escaped
# script data. Returns a token to emit, or null/undefined when there is
# nothing to emit yet.
tok_state_script_data_escaped_end_tag_name = ->
	c = txt.charAt(cur++)
	if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
		if is_appropriate_end_tag tok_cur_tag
			tok_state = tok_state_before_attribute_name
			return null
		# else fall through to "Anything else"
	if c is '/'
		if is_appropriate_end_tag tok_cur_tag
			tok_state = tok_state_self_closing_start_tag
			return null
		# else fall through to "Anything else"
	if c is '>'
		# This branch was missing: without it an end tag terminated by ">"
		# was never emitted from this state.
		if is_appropriate_end_tag tok_cur_tag
			tok_state = tok_state_data
			return tok_cur_tag
		# else fall through to "Anything else"
	if is_uc_alpha(c)
		tok_cur_tag.name += c.toLowerCase()
		# The temporary buffer keeps the original case: it is re-emitted as
		# text below when this turns out not to be an end tag.
		temporary_buffer += c
		return null
	if is_lc_alpha(c)
		tok_cur_tag.name += c
		temporary_buffer += c
		return null
	# Anything else: not an end tag after all; emit what was consumed as text
	tok_state = tok_state_script_data_escaped
	cur -= 1 # Reconsume
	return new_character_token "</#{temporary_buffer}" # fixfull split
2732 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
2733 tok_state_script_data_double_escape_start = ->
2734 c = txt.charAt(cur++)
2735 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2736 if temporary_buffer is 'script'
2737 tok_state = tok_state_script_data_double_escaped
2739 tok_state = tok_state_script_data_escaped
2740 return new_character_token c
2742 temporary_buffer += c.toLowerCase() # yes, really lowercase
2743 return new_character_token c
2745 temporary_buffer += c
2746 return new_character_token c
2748 tok_state = tok_state_script_data_escaped
2749 cur -= 1 # Reconsume
2752 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
2753 tok_state_script_data_double_escaped = ->
2754 c = txt.charAt(cur++)
2756 tok_state = tok_state_script_data_double_escaped_dash
2757 return new_character_token '-'
2759 tok_state = tok_state_script_data_double_escaped_less_than_sign
2760 return new_character_token '<'
2763 return new_character_token "\ufffd"
2766 tok_state = tok_state_data
2767 cur -= 1 # Reconsume
2770 return new_character_token c
2772 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
2773 tok_state_script_data_double_escaped_dash = ->
2774 c = txt.charAt(cur++)
2776 tok_state = tok_state_script_data_double_escaped_dash_dash
2777 return new_character_token '-'
2779 tok_state = tok_state_script_data_double_escaped_less_than_sign
2780 return new_character_token '<'
2783 tok_state = tok_state_script_data_double_escaped
2784 return new_character_token "\ufffd"
2787 tok_state = tok_state_data
2788 cur -= 1 # Reconsume
2791 tok_state = tok_state_script_data_double_escaped
2792 return new_character_token c
2794 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
2795 tok_state_script_data_double_escaped_dash_dash = ->
2796 c = txt.charAt(cur++)
2798 return new_character_token '-'
2800 tok_state = tok_state_script_data_double_escaped_less_than_sign
2801 return new_character_token '<'
2803 tok_state = tok_state_script_data
2804 return new_character_token '>'
2807 tok_state = tok_state_script_data_double_escaped
2808 return new_character_token "\ufffd"
2811 tok_state = tok_state_data
2812 cur -= 1 # Reconsume
2815 tok_state = tok_state_script_data_double_escaped
2816 return new_character_token c
2818 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
2819 tok_state_script_data_double_escaped_less_than_sign = ->
2820 c = txt.charAt(cur++)
2822 temporary_buffer = ''
2823 tok_state = tok_state_script_data_double_escape_end
2824 return new_character_token '/'
2826 tok_state = tok_state_script_data_double_escaped
2827 cur -= 1 # Reconsume
2830 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
2831 tok_state_script_data_double_escape_end = ->
2832 c = txt.charAt(cur++)
2833 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2834 if temporary_buffer is 'script'
2835 tok_state = tok_state_script_data_escaped
2837 tok_state = tok_state_script_data_double_escaped
2838 return new_character_token c
2840 temporary_buffer += c.toLowerCase() # yes, really lowercase
2841 return new_character_token c
2843 temporary_buffer += c
2844 return new_character_token c
2846 tok_state = tok_state_script_data_double_escaped
2847 cur -= 1 # Reconsume
2850 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2851 tok_state_before_attribute_name = ->
2853 switch c = txt.charAt(cur++)
2854 when "\t", "\n", "\u000c", ' '
2857 tok_state = tok_state_self_closing_start_tag
2860 tok_state = tok_state_data
2866 attr_name = "\ufffd"
2867 when '"', "'", '<', '='
2872 tok_state = tok_state_data
2875 attr_name = c.toLowerCase()
2879 tok_cur_tag.attrs_a.unshift [attr_name, '']
2880 tok_state = tok_state_attribute_name
# 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
#
# Tokenizer state that accumulates the name of the attribute currently being
# parsed. The current attribute is tok_cur_tag.attrs_a[0], a [name, value]
# pair. Returns the finished tag token on ">", otherwise null.
tok_state_attribute_name = ->
	switch c = txt.charAt(cur++)
		when "\t", "\n", "\u000c", ' '
			tok_state = tok_state_after_attribute_name
		when '/'
			tok_state = tok_state_self_closing_start_tag
		when '='
			tok_state = tok_state_before_attribute_value
		when '>'
			tok_state = tok_state_data
			tmp = tok_cur_tag
			tok_cur_tag = null
			return tmp
		when "\u0000"
			parse_error()
			# append (not assign): keep the part of the name read so far
			tok_cur_tag.attrs_a[0][0] += "\ufffd"
		when '"', "'", '<'
			parse_error()
			# append (not assign): keep the part of the name read so far
			tok_cur_tag.attrs_a[0][0] += c
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
		else
			if is_uc_alpha(c)
				# append (not assign): "aBc" must become "abc", not "c"
				tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
			else
				tok_cur_tag.attrs_a[0][0] += c
	return null
2913 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2914 tok_state_after_attribute_name = ->
2915 c = txt.charAt(cur++)
2916 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2919 tok_state = tok_state_self_closing_start_tag
2922 tok_state = tok_state_before_attribute_value
2925 tok_state = tok_state_data
2928 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2929 tok_state = tok_state_attribute_name
2933 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2934 tok_state = tok_state_attribute_name
2938 tok_state = tok_state_data
2939 cur -= 1 # reconsume
2941 if c is '"' or c is "'" or c is '<'
2943 # fall through to Anything else
2945 tok_cur_tag.attrs_a.unshift [c, '']
2946 tok_state = tok_state_attribute_name
2948 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2949 tok_state_before_attribute_value = ->
2950 switch c = txt.charAt(cur++)
2951 when "\t", "\n", "\u000c", ' '
2954 tok_state = tok_state_attribute_value_double_quoted
2956 tok_state = tok_state_attribute_value_unquoted
2959 tok_state = tok_state_attribute_value_single_quoted
2962 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2963 tok_state = tok_state_attribute_value_unquoted
2966 tok_state = tok_state_data
2972 tok_state = tok_state_data
2974 tok_cur_tag.attrs_a[0][1] += c
2975 tok_state = tok_state_attribute_value_unquoted
2978 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2979 tok_state_attribute_value_double_quoted = ->
2980 switch c = txt.charAt(cur++)
2982 tok_state = tok_state_after_attribute_value_quoted
2984 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2987 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2990 tok_state = tok_state_data
2992 tok_cur_tag.attrs_a[0][1] += c
2995 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2996 tok_state_attribute_value_single_quoted = ->
2997 switch c = txt.charAt(cur++)
2999 tok_state = tok_state_after_attribute_value_quoted
3001 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3004 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3007 tok_state = tok_state_data
3009 tok_cur_tag.attrs_a[0][1] += c
3012 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3013 tok_state_attribute_value_unquoted = ->
3014 switch c = txt.charAt(cur++)
3015 when "\t", "\n", "\u000c", ' '
3016 tok_state = tok_state_before_attribute_name
3018 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3020 tok_state = tok_state_data
3025 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3028 tok_state = tok_state_data
3030 # Parse Error if ', <, = or ` (backtick)
3031 tok_cur_tag.attrs_a[0][1] += c
3034 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3035 tok_state_after_attribute_value_quoted = ->
3036 switch c = txt.charAt(cur++)
3037 when "\t", "\n", "\u000c", ' '
3038 tok_state = tok_state_before_attribute_name
3040 tok_state = tok_state_self_closing_start_tag
3042 tok_state = tok_state_data
3048 tok_state = tok_state_data
3051 tok_state = tok_state_before_attribute_name
3052 cur -= 1 # we didn't handle that char
3055 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3056 tok_state_self_closing_start_tag = ->
3057 c = txt.charAt(cur++)
3059 tok_cur_tag.flag 'self-closing'
3060 tok_state = tok_state_data
3064 tok_state = tok_state_data
3065 cur -= 1 # Reconsume
3069 tok_state = tok_state_before_attribute_name
3070 cur -= 1 # Reconsume
# 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
# WARNING: put a comment token in tok_cur_tag before setting this state
#
# Consumes everything up to and including the next ">" (or to the end of the
# input if there is none), appends it to the pending comment token and returns
# that token.
tok_state_bogus_comment = ->
	next_gt = txt.indexOf '>', cur
	if next_gt is -1
		val = txt.substr cur
		cur = txt.length
	else
		val = txt.substr cur, (next_gt - cur)
		cur = next_gt + 1
	# Global regex: a plain string pattern would only replace the first NUL.
	val = val.replace /\u0000/g, "\ufffd"
	tok_cur_tag.text += val
	tok_state = tok_state_data
	return tok_cur_tag
# 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
#
# Tokenizer state entered after "<!". Looks ahead (without consuming on
# failure) for "--", "doctype" (case-insensitive) or, in foreign content,
# "[CDATA[". Anything else becomes a bogus comment. Always returns null.
tok_state_markup_declaration_open = ->
	if txt.substr(cur, 2) is '--'
		cur += 2
		tok_cur_tag = new_comment_token ''
		tok_state = tok_state_comment_start
		return null
	if txt.substr(cur, 7).toLowerCase() is 'doctype'
		cur += 7
		tok_state = tok_state_doctype
		return null
	acn = adjusted_current_node()
	if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
		cur += 7
		tok_state = tok_state_cdata_section
		return null
	# Otherwise
	parse_error()
	# Nothing after "<!" has been consumed, so the bogus comment starts out
	# empty; the "!" is not part of its data (the spec notes that "<!>"
	# produces an empty comment).
	tok_cur_tag = new_comment_token ''
	tok_state = tok_state_bogus_comment
	return null
# 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
#
# Tokenizer state entered right after "<!--". Returns the comment token when
# the comment ends here (on ">" or EOF), otherwise null.
tok_state_comment_start = ->
	switch c = txt.charAt(cur++)
		when '-'
			tok_state = tok_state_comment_start_dash
		when "\u0000"
			parse_error()
			# The replacement character belongs to the comment's data; it
			# must not be emitted as a separate character token.
			tok_cur_tag.text += "\ufffd"
			tok_state = tok_state_comment
		when '>'
			parse_error()
			tok_state = tok_state_data
			return tok_cur_tag
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
			cur -= 1 # Reconsume
			return tok_cur_tag
		else
			tok_cur_tag.text += c
			# Leave the "start" state, otherwise a later ">" (e.g. in
			# "<!--a>b-->") would wrongly terminate the comment.
			tok_state = tok_state_comment
	return null
3131 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3132 tok_state_comment_start_dash = ->
3133 switch c = txt.charAt(cur++)
3135 tok_state = tok_state_comment_end
3138 tok_cur_tag.text += "-\ufffd"
3139 tok_state = tok_state_comment
3142 tok_state = tok_state_data
3146 tok_state = tok_state_data
3147 cur -= 1 # Reconsume
3150 tok_cur_tag.text += "-#{c}"
3151 tok_state = tok_state_comment
3154 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3155 tok_state_comment = ->
3156 switch c = txt.charAt(cur++)
3158 tok_state = tok_state_comment_end_dash
3161 tok_cur_tag.text += "\ufffd"
3164 tok_state = tok_state_data
3165 cur -= 1 # Reconsume
3168 tok_cur_tag.text += c
3171 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3172 tok_state_comment_end_dash = ->
3173 switch c = txt.charAt(cur++)
3175 tok_state = tok_state_comment_end
3178 tok_cur_tag.text += "-\ufffd"
3179 tok_state = tok_state_comment
3182 tok_state = tok_state_data
3183 cur -= 1 # Reconsume
3186 tok_cur_tag.text += "-#{c}"
3187 tok_state = tok_state_comment
3190 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3191 tok_state_comment_end = ->
3192 switch c = txt.charAt(cur++)
3194 tok_state = tok_state_data
3198 tok_cur_tag.text += "--\ufffd"
3199 tok_state = tok_state_comment
3202 tok_state = tok_state_comment_end_bang
3205 tok_cur_tag.text += '-'
3208 tok_state = tok_state_data
3209 cur -= 1 # Reconsume
3213 tok_cur_tag.text += "--#{c}"
3214 tok_state = tok_state_comment
# 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
#
# Tokenizer state entered after "--!" inside a comment. Returns the comment
# token when the comment ends here (on ">" or EOF), otherwise null.
tok_state_comment_end_bang = ->
	switch c = txt.charAt(cur++)
		when '-'
			# Only "--!" is appended; the "-" just read is accounted for by
			# the comment end dash state (which appends it itself if the
			# comment does not end there).
			tok_cur_tag.text += "--!"
			tok_state = tok_state_comment_end_dash
		when '>'
			tok_state = tok_state_data
			return tok_cur_tag
		when "\u0000"
			parse_error()
			tok_cur_tag.text += "--!\ufffd"
			tok_state = tok_state_comment
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
			cur -= 1 # Reconsume
			return tok_cur_tag
		else
			tok_cur_tag.text += "--!#{c}"
			tok_state = tok_state_comment
	return null
3240 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3241 tok_state_doctype = ->
3242 switch c = txt.charAt(cur++)
3243 when "\t", "\u000a", "\u000c", ' '
3244 tok_state = tok_state_before_doctype_name
3247 tok_state = tok_state_data
3248 el = new_doctype_token ''
3249 el.flag 'force-quirks', true
3250 cur -= 1 # Reconsume
3254 tok_state = tok_state_before_doctype_name
3255 cur -= 1 # Reconsume
# 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
3259 tok_state_before_doctype_name = ->
3260 c = txt.charAt(cur++)
3261 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3264 tok_cur_tag = new_doctype_token c.toLowerCase()
3265 tok_state = tok_state_doctype_name
3269 tok_cur_tag = new_doctype_token "\ufffd"
3270 tok_state = tok_state_doctype_name
3274 el = new_doctype_token ''
3275 el.flag 'force-quirks', true
3276 tok_state = tok_state_data
3280 tok_state = tok_state_data
3281 el = new_doctype_token ''
3282 el.flag 'force-quirks', true
3283 cur -= 1 # Reconsume
3286 tok_cur_tag = new_doctype_token c
3287 tok_state = tok_state_doctype_name
3290 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3291 tok_state_doctype_name = ->
3292 c = txt.charAt(cur++)
3293 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3294 tok_state = tok_state_after_doctype_name
3297 tok_state = tok_state_data
3300 tok_cur_tag.name += c.toLowerCase()
3304 tok_cur_tag.name += "\ufffd"
3308 tok_state = tok_state_data
3309 tok_cur_tag.flag 'force-quirks', true
3310 cur -= 1 # Reconsume
3313 tok_cur_tag.name += c
3316 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3317 tok_state_after_doctype_name = ->
3318 c = txt.charAt(cur++)
3319 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3322 tok_state = tok_state_data
3326 tok_state = tok_state_data
3327 tok_cur_tag.flag 'force-quirks', true
3328 cur -= 1 # Reconsume
3331 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3333 tok_state = tok_state_after_doctype_public_keyword
3335 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3337 tok_state = tok_state_after_doctype_system_keyword
3340 tok_cur_tag.flag 'force-quirks', true
3341 tok_state = tok_state_bogus_doctype
3344 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3345 tok_state_after_doctype_public_keyword = ->
3346 c = txt.charAt(cur++)
3347 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3348 tok_state = tok_state_before_doctype_public_identifier
3352 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3353 tok_state = tok_state_doctype_public_identifier_double_quoted
3357 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3358 tok_state = tok_state_doctype_public_identifier_single_quoted
3362 tok_cur_tag.flag 'force-quirks', true
3363 tok_state = tok_state_data
3367 tok_state = tok_state_data
3368 tok_cur_tag.flag 'force-quirks', true
3369 cur -= 1 # Reconsume
3373 tok_cur_tag.flag 'force-quirks', true
3374 tok_state = tok_state_bogus_doctype
3377 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
3378 tok_state_before_doctype_public_identifier = ->
3379 c = txt.charAt(cur++)
3380 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3384 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3385 tok_state = tok_state_doctype_public_identifier_double_quoted
3389 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3390 tok_state = tok_state_doctype_public_identifier_single_quoted
3394 tok_cur_tag.flag 'force-quirks', true
3395 tok_state = tok_state_data
3399 tok_state = tok_state_data
3400 tok_cur_tag.flag 'force-quirks', true
3401 cur -= 1 # Reconsume
3405 tok_cur_tag.flag 'force-quirks', true
3406 tok_state = tok_state_bogus_doctype
3410 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3411 tok_state_doctype_public_identifier_double_quoted = ->
3412 c = txt.charAt(cur++)
3414 tok_state = tok_state_after_doctype_public_identifier
3418 tok_cur_tag.public_identifier += "\ufffd"
3422 tok_cur_tag.flag 'force-quirks', true
3423 tok_state = tok_state_data
3427 tok_state = tok_state_data
3428 tok_cur_tag.flag 'force-quirks', true
3429 cur -= 1 # Reconsume
3432 tok_cur_tag.public_identifier += c
3435 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3436 tok_state_doctype_public_identifier_single_quoted = ->
3437 c = txt.charAt(cur++)
3439 tok_state = tok_state_after_doctype_public_identifier
3443 tok_cur_tag.public_identifier += "\ufffd"
3447 tok_cur_tag.flag 'force-quirks', true
3448 tok_state = tok_state_data
3452 tok_state = tok_state_data
3453 tok_cur_tag.flag 'force-quirks', true
3454 cur -= 1 # Reconsume
3457 tok_cur_tag.public_identifier += c
3460 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3461 tok_state_after_doctype_public_identifier = ->
3462 c = txt.charAt(cur++)
3463 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3464 tok_state = tok_state_between_doctype_public_and_system_identifiers
3467 tok_state = tok_state_data
3471 tok_cur_tag.system_identifier = ''
3472 tok_state = tok_state_doctype_system_identifier_double_quoted
3476 tok_cur_tag.system_identifier = ''
3477 tok_state = tok_state_doctype_system_identifier_single_quoted
3481 tok_state = tok_state_data
3482 tok_cur_tag.flag 'force-quirks', true
3483 cur -= 1 # Reconsume
3487 tok_cur_tag.flag 'force-quirks', true
3488 tok_state = tok_state_bogus_doctype
3491 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer state: between the public and system identifiers of a DOCTYPE.
# Reads one character and picks the next tokenizer state; two of the branches
# start a fresh (empty) tok_cur_tag.system_identifier.
# NOTE(review): apart from the whitespace test, the conditions selecting the
# branches are not present in this text -- confirm against spec 8.2.4.61.
3492 tok_state_between_doctype_public_and_system_identifiers = ->
3493 c = txt.charAt(cur++)
# tab, LF, FF or space; no statement for this case is visible here, so the
# character is presumably just skipped -- confirm
3494 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# back to the data state without touching force-quirks
3497 tok_state = tok_state_data
# start an empty system identifier, to be read in double-quoted form
3501 tok_cur_tag.system_identifier = ''
3502 tok_state = tok_state_doctype_system_identifier_double_quoted
# start an empty system identifier, to be read in single-quoted form
3506 tok_cur_tag.system_identifier = ''
3507 tok_state = tok_state_doctype_system_identifier_single_quoted
# force-quirks, back to the data state, and step cur back so the data state
# reads this character again
3511 tok_state = tok_state_data
3512 tok_cur_tag.flag 'force-quirks', true
3513 cur -= 1 # Reconsume
# fallback: force-quirks and skip the rest of the doctype as bogus
3517 tok_cur_tag.flag 'force-quirks', true
3518 tok_state = tok_state_bogus_doctype
3521 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer state: just after the SYSTEM keyword of a DOCTYPE.
# Reads one character and picks the next tokenizer state; two of the branches
# start a fresh (empty) tok_cur_tag.system_identifier.
# NOTE(review): apart from the whitespace test, the conditions selecting the
# branches are not present in this text -- confirm against spec 8.2.4.62.
3522 tok_state_after_doctype_system_keyword = ->
3523 c = txt.charAt(cur++)
# tab, LF, FF or space
3524 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3525 tok_state = tok_state_before_doctype_system_identifier
# start an empty system identifier, to be read in double-quoted form
3529 tok_cur_tag.system_identifier = ''
3530 tok_state = tok_state_doctype_system_identifier_double_quoted
# start an empty system identifier, to be read in single-quoted form
3534 tok_cur_tag.system_identifier = ''
3535 tok_state = tok_state_doctype_system_identifier_single_quoted
# force-quirks and back to the data state
3539 tok_cur_tag.flag 'force-quirks', true
3540 tok_state = tok_state_data
# same, but also step cur back so the data state reads this character again
3544 tok_state = tok_state_data
3545 tok_cur_tag.flag 'force-quirks', true
3546 cur -= 1 # Reconsume
# fallback: force-quirks and skip the rest of the doctype as bogus
3550 tok_cur_tag.flag 'force-quirks', true
3551 tok_state = tok_state_bogus_doctype
3554 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer state: before the opening quote of a DOCTYPE system identifier.
# Reads one character and picks the next tokenizer state; two of the branches
# start a fresh (empty) tok_cur_tag.system_identifier.
# NOTE(review): apart from the whitespace test, the conditions selecting the
# branches are not present in this text -- confirm against spec 8.2.4.63.
3555 tok_state_before_doctype_system_identifier = ->
3556 c = txt.charAt(cur++)
# tab, LF, FF or space; no statement for this case is visible here, so the
# character is presumably just skipped -- confirm
3557 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# start an empty system identifier, to be read in double-quoted form
3560 tok_cur_tag.system_identifier = ''
3561 tok_state = tok_state_doctype_system_identifier_double_quoted
# start an empty system identifier, to be read in single-quoted form
3564 tok_cur_tag.system_identifier = ''
3565 tok_state = tok_state_doctype_system_identifier_single_quoted
# force-quirks and back to the data state
3569 tok_cur_tag.flag 'force-quirks', true
3570 tok_state = tok_state_data
# same, but also step cur back so the data state reads this character again
3574 tok_state = tok_state_data
3575 tok_cur_tag.flag 'force-quirks', true
3576 cur -= 1 # Reconsume
# fallback: force-quirks and skip the rest of the doctype as bogus
3580 tok_cur_tag.flag 'force-quirks', true
3581 tok_state = tok_state_bogus_doctype
3584 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state: inside a double-quoted DOCTYPE system identifier.
# Reads one character from txt (advancing cur) and then either changes
# tok_state, flags tok_cur_tag as 'force-quirks', or appends to
# tok_cur_tag.system_identifier.
# NOTE(review): the tests on c that select between the branches below are not
# present in this text -- confirm each branch against spec section 8.2.4.64.
3585 tok_state_doctype_system_identifier_double_quoted = ->
3586 c = txt.charAt(cur++)
# presumably the closing quote: continue in the after-system-identifier state
3588 tok_state = tok_state_after_doctype_system_identifier
# append U+FFFD REPLACEMENT CHARACTER (presumably in place of a NUL -- confirm)
3592 tok_cur_tag.system_identifier += "\ufffd"
# flag the doctype as force-quirks and return to the data state
3596 tok_cur_tag.flag 'force-quirks', true
3597 tok_state = tok_state_data
# same as above, but also step cur back so the data state reads this
# character again (presumably the end-of-input case -- confirm)
3601 tok_state = tok_state_data
3602 tok_cur_tag.flag 'force-quirks', true
3603 cur -= 1 # Reconsume
# any other character becomes part of the identifier
3606 tok_cur_tag.system_identifier += c
3609 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state: inside a single-quoted DOCTYPE system identifier.
# The visible statements are the same as in the double-quoted variant: read one
# character, then change tok_state, flag 'force-quirks', or append to
# tok_cur_tag.system_identifier.
# NOTE(review): the tests on c that select between the branches below are not
# present in this text -- confirm each branch against spec section 8.2.4.65.
3610 tok_state_doctype_system_identifier_single_quoted = ->
3611 c = txt.charAt(cur++)
# presumably the closing quote: continue in the after-system-identifier state
3613 tok_state = tok_state_after_doctype_system_identifier
# append U+FFFD REPLACEMENT CHARACTER (presumably in place of a NUL -- confirm)
3617 tok_cur_tag.system_identifier += "\ufffd"
# flag the doctype as force-quirks and return to the data state
3621 tok_cur_tag.flag 'force-quirks', true
3622 tok_state = tok_state_data
# same as above, but also step cur back so the data state reads this
# character again (presumably the end-of-input case -- confirm)
3626 tok_state = tok_state_data
3627 tok_cur_tag.flag 'force-quirks', true
3628 cur -= 1 # Reconsume
# any other character becomes part of the identifier
3631 tok_cur_tag.system_identifier += c
3634 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state: just after the closing quote of a DOCTYPE system identifier.
# Reads one character and picks the next tokenizer state. Unlike the earlier
# DOCTYPE states, the fallback to the bogus-doctype state deliberately does not
# set 'force-quirks'.
# NOTE(review): apart from the whitespace test, the conditions selecting the
# branches are not present in this text -- confirm against spec 8.2.4.66.
3635 tok_state_after_doctype_system_identifier = ->
3636 c = txt.charAt(cur++)
# tab, LF, FF or space; no statement for this case is visible here, so the
# character is presumably just skipped -- confirm
3637 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# back to the data state without touching force-quirks
3640 tok_state = tok_state_data
# force-quirks, back to the data state, and step cur back so the data state
# reads this character again
3644 tok_state = tok_state_data
3645 tok_cur_tag.flag 'force-quirks', true
3646 cur -= 1 # Reconsume
3650 # do _not_ tok_cur_tag.flag 'force-quirks', true
3651 tok_state = tok_state_bogus_doctype
3654 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state: skipping over the remainder of a malformed DOCTYPE.
# Reads one character; both visible branches return to the data state and
# neither touches tok_cur_tag.
# NOTE(review): the tests on c that select the branches are not present in
# this text -- confirm against spec section 8.2.4.67.
3655 tok_state_bogus_doctype = ->
3656 c = txt.charAt(cur++)
# back to the data state, keeping the character consumed
3658 tok_state = tok_state_data
# back to the data state, stepping cur back so the data state reads this
# character again
3661 tok_state = tok_state_data
3662 cur -= 1 # Reconsume
3668 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3669 # Don't set this as a state, just call it
3670 # returns a string (NOT a text node)
# Attempts to decode the character reference starting at txt[cur].
#
# allowed_char (default null): an extra character that is listed next to
#   whitespace, '<' and '&' in the first `when` clause below.
# in_attr (default false): not used in the lines visible here; presumably
#   enables the attribute-only exceptions further down -- confirm.
#
# Reads txt and cur; on the legacy (no ';') path cur is advanced past the
# matched name and parse_error() is called.
# NOTE(review): large parts of this function (returns, the setup of start, i,
# prefix, charset and max) are not present in this text; the notes below cover
# only the visible statements.
3671 parse_character_reference = (allowed_char = null, in_attr = false) ->
# nothing left to read
3672 if cur >= txt.length
# look at the next character without consuming it
3674 switch c = txt.charAt(cur)
3675 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3676 # explicitly not a parse error
3679 # there has to be "one or more" alnums between & and ; to be a parse error
# need at least one more character after this one
3682 if cur + 1 >= txt.length
# case-insensitive test for 'x' in the following position (presumably the
# marker for a hexadecimal numeric reference -- confirm)
3684 if txt.charAt(cur + 1).toLowerCase() is 'x'
# advance i over the run of characters that belong to charset
3693 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3697 if txt.charAt(start + i) is ';'
3699 # FIXME This is supposed to generate parse errors for some chars
# the numeric reference is decoded through the same helper as named ones,
# using prefix plus the lower-cased run starting at start
3700 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
# first character at cur + i that is not in alnum
3707 if alnum.indexOf(txt.charAt(cur + i)) is -1
3710 # exit early, because parse_error() below needs at least one alnum
3712 if txt.charAt(cur + i) is ';'
3713 i += 1 # include ';' terminator in value
3714 decoded = decode_named_char_ref txt.substr(cur, i)
3721 # no ';' terminator (only legacy char refs)
# try every candidate length from 2 up to max against the legacy table
3723 for i in [2..max] # no prefix matches, so ok to check shortest first
3724 c = legacy_char_refs[txt.substr(cur, i)]
3727 if txt.charAt(cur + i) is '='
3728 # "because some legacy user agents will
3729 # misinterpret the markup in those cases"
3732 if alnum.indexOf(txt.charAt(cur + i)) > -1
3733 # this makes attributes forgiving about url args
3735 # ok, and besides the weird exceptions for attributes...
3736 # return the matching char
3737 cur += i # consume entity chars
3738 parse_error() # because no terminating ";"
3742 return # never reached
# Initial values for the state variables shared by the tree constructor and
# the tokenizer.
# NOTE(review): open_els, which the file header describes, is not initialized
# in the lines present here -- confirm it is set up elsewhere.
3744 # tree constructor initialization
3745 # see comments on TYPE_TAG/etc for the structure of this data
# root node: a TYPE_TAG Node named 'html' in the HTML namespace
3746 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3748 afe = [] # active formatting elements
3749 template_insertion_modes = []
# tree construction starts in the "initial" insertion mode
3750 insertion_mode = ins_mode_initial
3751 original_insertion_mode = insertion_mode # TODO check spec
3752 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3753 flag_frameset_ok = true
3755 flag_foster_parenting = false
# no form or head element recorded yet, and no temporary buffer
3756 form_element_pointer = null
3757 temporary_buffer = null
3758 pending_table_character_tokens = []
3759 head_element_pointer = null
3760 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3761 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
3763 # tokenizer initialization
# the tokenizer starts in the data state
3764 tok_state = tok_state_data
3771 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a string by appending the result of t.serialize(shallow, show_ids)
# to serialized; shallow and show_ids are passed through unchanged.
# NOTE(review): the loop that binds t (presumably to each entry of els), the
# initialization of serialized and the return are not present in this text --
# confirm.
3774 serialize_els = (els, shallow, show_ids) ->
3780 serialized += t.serialize shallow, show_ids
3783 # TODO export TYPE_*
# Public API of this module: the parser entry point, two debug-log helpers,
# and four of the TYPE_* node type constants.
3784 module.exports.parse_html = parse_html
3785 module.exports.debug_log_reset = debug_log_reset
3786 module.exports.debug_log_each = debug_log_each
3787 module.exports.TYPE_TAG = TYPE_TAG
3788 module.exports.TYPE_TEXT = TYPE_TEXT
3789 module.exports.TYPE_COMMENT = TYPE_COMMENT
3790 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE