1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
# Initialise a node.
#   type - one of the TYPE_* constants
#   args - optional bag of initial values; every field below falls back to the
#          default on the right of its `?` when the key is absent (null or
#          undefined)
# NOTE(review): @attrs_a is populated from args.attr_k rather than
# args.attrs_a. No visible caller passes either key, so this may be a typo --
# confirm before relying on it.
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
# id is the stringified value of the prev_node_id counter, which is
# incremented here on every construction
98 @id = "#{++prev_node_id}"
99 acknowledge_self_closing: ->
101 @token.flag 'did_self_close'
103 @flag 'did_self_close', true
106 serialize: (shallow = false, show_ids = false) -> # for unit tests
111 ret += JSON.stringify @name
126 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
132 ret += c.serialize shallow, show_ids
136 ret += JSON.stringify @text
139 ret += JSON.stringify @text
141 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
144 when TYPE_AAA_BOOKMARK
145 ret += 'aaa_bookmark'
148 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
151 # helpers: (only take args that are normally known when parser creates nodes)
# Build a TYPE_START_TAG token that carries only a tag name; every other field
# takes the Node constructor's default.
new_open_tag = (name) ->
  new Node TYPE_START_TAG, {name}
# Build a TYPE_END_TAG token for the given tag name.
new_end_tag = (name) ->
  new Node TYPE_END_TAG, {name}
# Build a TYPE_TAG node (a tree element) with the given tag name.
# NOTE(review): adoption_agency() assigns to a variable that is also called
# new_element. CoffeeScript has no shadowing on assignment, so that write
# appears to target this same binding and would replace this helper after the
# first run of that code path -- confirm.
new_element = (name) ->
  opts = name: name
  new Node TYPE_TAG, opts
# Wrap a string in a TYPE_TEXT node.
new_text_node = (txt) ->
  opts = text: txt
  new Node TYPE_TEXT, opts
# Character tokens are built exactly like text nodes, so the name is an alias.
new_character_token = new_text_node
# Build a TYPE_COMMENT token whose text is the comment body.
new_comment_token = (txt) ->
  opts = text: txt
  new Node TYPE_COMMENT, opts
# Build a TYPE_DOCTYPE token; only the name is set here.
new_doctype_token = (name) ->
  new Node TYPE_DOCTYPE, {name}
166 return new Node TYPE_EOF
168 return new Node TYPE_AFE_MARKER
# Build a TYPE_AAA_BOOKMARK placeholder node (all other fields default).
new_aaa_bookmark = ->
  new Node TYPE_AAA_BOOKMARK
# Character classes, each kept as a plain string so that membership can be
# tested with indexOf.
digits = "0123456789"
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# letters first, then digits
alnum = [lc_alpha, uc_alpha, digits].join ''
hex_chars = digits + "abcdefABCDEF"
# True only for a one-character string that occurs in uc_alpha.
is_uc_alpha = (str) ->
  return false unless str.length is 1
  return uc_alpha.indexOf(str) isnt -1
# True only for a one-character string that occurs in lc_alpha.
is_lc_alpha = (str) ->
  return false unless str.length is 1
  return lc_alpha.indexOf(str) isnt -1
183 # some SVG elements have dashes in them
184 tag_name_chars = alnum + "-"
186 # http://www.w3.org/TR/html5/infrastructure.html#space-character
187 space_chars = "\u0009\u000a\u000c\u000d\u0020"
189 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a TYPE_TEXT token whose text is exactly one character taken
# from space_chars.
is_space_tok = (t) ->
  return false unless t.type is TYPE_TEXT
  return false unless t.text.length is 1
  return space_chars.indexOf(t.text) > -1
193 is_input_hidden_tok = (t) ->
194 return unless t.type is TYPE_START_TAG
197 if a[1].toLowerCase() is 'hidden'
202 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
203 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
205 # These are the character references that don't need a terminating semicolon
206 # min length: 2, max: 6, none are a prefix of any other.
208 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
209 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
210 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
211 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
212 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
213 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
214 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
215 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
216 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
217 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
218 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
219 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
220 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
221 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
222 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
223 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
224 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists for three element categories: void, raw text and escapable
# raw text.
void_elements = [
  'area', 'base', 'br', 'col', 'embed'
  'hr', 'img', 'input', 'keygen', 'link'
  'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
  'script'
  'style'
]
escapable_raw_text_elements = [
  'textarea'
  'title'
]
231 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
233 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
234 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
235 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
236 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
237 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
238 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
239 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
240 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
241 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
242 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
243 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
244 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
245 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
246 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
250 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
252 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
253 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
254 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
255 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
256 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
257 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
258 'determinant', 'diff', 'divergence', 'divide', 'domain',
259 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
260 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
261 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
262 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
263 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
264 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
265 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
266 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
267 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
268 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
269 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
270 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
271 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
272 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
273 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
274 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
275 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
276 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
277 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
278 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
279 'vectorproduct', 'xor'
281 # foreign_elements = [svg_elements..., mathml_elements...]
282 #normal_elements = All other allowed HTML elements are normal elements.
286 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
287 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
288 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
289 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
290 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
291 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
292 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
293 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
294 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
295 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
296 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
297 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
298 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
299 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
300 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
301 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
302 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
303 wbr:NS_HTML, xmp:NS_HTML,
306 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
307 'annotation-xml':NS_MATHML,
310 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
313 formatting_elements = {
314 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
315 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
320 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
323 foster_parenting_targets = {
# An element counts as "special" when special_elements maps its tag name to
# the very namespace the element is in.
el_is_special = (e) ->
  listed_ns = special_elements[e.name]
  listed_ns is e.namespace
# address, div and p (HTML namespace): the entries el_is_special_not_adp
# excludes.
adp_els =
  address: NS_HTML
  div: NS_HTML
  p: NS_HTML
# Same test as special_elements membership, except that an element matching
# an adp_els entry (name and namespace) is rejected.
el_is_special_not_adp = (el) ->
  return false unless special_elements[el.name] is el.namespace
  return adp_els[el.name] isnt el.namespace
352 # decode_named_char_ref()
354 # The list of named character references is _huge_ so ask the browser to decode
355 # for us instead of wasting bandwidth/space on including the table here.
357 # Pass without the "&" but with the ";" examples:
358 # for "&" pass "amp;"
359 # for "′" pass "x2032;"
362 textarea: document.createElement('textarea')
364 # TODO test this in IE8
# Decode a character reference by handing it to the browser: the text is
# assigned to the innerHTML of the <textarea> held in g_dncr and read back
# through .value. Successful results are memoised in g_dncr.cache.
# Returns the decoded string, or null when the text comes back unchanged
# (presumably meaning the browser did not recognise it). null results are not
# cached.
# NOTE(review): as shown here txt reaches innerHTML verbatim, while the
# comment above says callers pass the reference without its leading "&" --
# presumably it is re-added on a line not visible in this excerpt; confirm.
365 decode_named_char_ref = (txt) ->
367 decoded = g_dncr.cache[txt]
# cache hit: skip the DOM round trip
368 return decoded if decoded?
369 g_dncr.textarea.innerHTML = txt
370 decoded = g_dncr.textarea.value
371 return null if decoded is txt
# store and return in one expression
372 return g_dncr.cache[txt] = decoded
374 parse_html = (txt, parse_error_cb = null) ->
375 cur = 0 # index of next char in txt to be parsed
376 # declare doc and tokenizer variables so they're in scope below
378 open_els = null # stack of open elements
379 afe = null # active formatting elements
380 template_ins_modes = null
382 original_ins_mode = null
384 tok_cur_tag = null # partially parsed tag
385 flag_scripting = null
386 flag_frameset_ok = null
388 flag_foster_parenting = null
389 form_element_pointer = null
390 temporary_buffer = null
391 pending_table_character_tokens = null
392 head_element_pointer = null
393 flag_fragment_parsing = null
394 context_element = null
403 console.log "Parse error at character #{cur} of #{txt.length}"
405 afe_push = (new_el) ->
408 if el.name is new_el.name and el.namespace is new_el.namespace
410 continue unless new_el.attrs[k] is v
411 for k, v of new_el.attrs
412 continue unless el.attrs[k] is v
419 afe.unshift new_afe_marker()
421 # the functions below implement the Tree Construction algorithm
422 # http://www.w3.org/TR/html5/syntax.html#tree-construction
424 # But first... the helpers
425 template_tag_is_open = ->
427 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
430 is_in_scope_x = (tag_name, scope, namespace) ->
432 if t.name is tag_name and (namespace is null or namespace is t.namespace)
434 if scope[t.name] is t.namespace
437 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
439 if t.name is tag_name and (namespace is null or namespace is t.namespace)
441 if scope[t.name] is t.namespace
443 if scope2[t.name] is t.namespace
447 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
448 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
449 template: NS_HTML, mi: NS_MATHML,
451 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
452 'annotation-xml': NS_MATHML,
454 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope-boundary tables (tag name -> namespace). button_scopers and li_scopers
# are passed alongside standard_scopers by is_in_button_scope and
# is_in_li_scope; table_scopers is used on its own by is_in_table_scope.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check against standard_scopers only. namespace defaults to null and is
# passed through unchanged to is_in_scope_x.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check against standard_scopers plus button_scopers.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check against table_scopers only (standard_scopers is not consulted).
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope check against standard_scopers plus li_scopers.
is_in_li_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
# Select-scope counterpart of the is_in_*_scope helpers. The loop header and
# the return statements are not visible in this excerpt, so only the two
# conditions below are documented.
# NOTE(review): the second condition reads t.ns, but Node objects store their
# namespace in `namespace` (set in the constructor, and read as t.namespace in
# the first condition). If nothing assigns `ns`, `t.ns isnt NS_HTML` is always
# true and the namespace is effectively ignored. Also confirm and-vs-or
# against the spec's "has an element in select scope" definition.
468 is_in_select_scope = (tag_name, namespace = null) ->
# match on tag name; namespace is only compared when the caller supplied one
470 if t.name is tag_name and (namespace is null or namespace is t.namespace)
472 if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
475 # this checks for a particular element, not by name
476 el_is_in_scope = (el) ->
480 if standard_scopers[t.name] is t.namespace
484 clear_to_table_stopers = {
489 clear_stack_to_table_context = ->
491 if clear_to_table_stopers[open_els[0].name]?
495 clear_to_table_body_stopers = {
502 clear_stack_to_table_body_context = ->
504 if clear_to_table_body_stopers[open_els[0].name]?
508 clear_to_table_row_stopers = {
513 clear_stack_to_table_row_context = ->
515 if clear_to_table_row_stopers[open_els[0].name]?
519 clear_afe_to_marker = ->
521 return unless afe.length > 0 # this happens in fragment case, ?spec error
523 if el.type is TYPE_AFE_MARKER
528 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
530 # 1. Let last be false.
532 # 2. Let node be the last node in the stack of open elements.
534 node = open_els[node_i]
535 # 3. Loop: If node is the first node in the stack of open elements,
536 # then set last to true, and, if the parser was originally created as
537 # part of the HTML fragment parsing algorithm (fragment case) set node
538 # to the context element.
540 if node_i is open_els.length - 1
542 # fixfull (fragment case)
544 # 4. If node is a select element, run these substeps:
545 if node.name is 'select'
546 # 1. If last is true, jump to the step below labeled done.
548 # 2. Let ancestor be node.
551 # 3. Loop: If ancestor is the first node in the stack of
552 # open elements, jump to the step below labeled done.
554 if ancestor_i is open_els.length - 1
556 # 4. Let ancestor be the node before ancestor in the stack
559 ancestor = open_els[ancestor_i]
560 # 5. If ancestor is a template node, jump to the step below
562 if ancestor.name is 'template'
564 # 6. If ancestor is a table node, switch the insertion mode
565 # to "in select in table" and abort these steps.
566 if ancestor.name is 'table'
567 ins_mode = ins_mode_in_select_in_table
569 # 7. Jump back to the step labeled loop.
570 # 8. Done: Switch the insertion mode to "in select" and abort
572 ins_mode = ins_mode_in_select
574 # 5. If node is a td or th element and last is false, then switch
575 # the insertion mode to "in cell" and abort these steps.
576 if (node.name is 'td' or node.name is 'th') and last is false
577 ins_mode = ins_mode_in_cell
579 # 6. If node is a tr element, then switch the insertion mode to "in
580 # row" and abort these steps.
582 ins_mode = ins_mode_in_row
584 # 7. If node is a tbody, thead, or tfoot element, then switch the
585 # insertion mode to "in table body" and abort these steps.
586 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
587 ins_mode = ins_mode_in_table_body
589 # 8. If node is a caption element, then switch the insertion mode
590 # to "in caption" and abort these steps.
591 if node.name is 'caption'
592 ins_mode = ins_mode_in_caption
594 # 9. If node is a colgroup element, then switch the insertion mode
595 # to "in column group" and abort these steps.
596 if node.name is 'colgroup'
597 ins_mode = ins_mode_in_column_group
599 # 10. If node is a table element, then switch the insertion mode to
600 # "in table" and abort these steps.
601 if node.name is 'table'
602 ins_mode = ins_mode_in_table
604 # 11. If node is a template element, then switch the insertion mode
605 # to the current template insertion mode and abort these steps.
606 # fixfull (template insertion mode stack)
608 # 12. If node is a head element and last is true, then switch the
609 # insertion mode to "in body" ("in body"! not "in head"!) and abort
610 # these steps. (fragment case)
611 if node.name is 'head' and last
612 ins_mode = ins_mode_in_body
614 # 13. If node is a head element and last is false, then switch the
615 # insertion mode to "in head" and abort these steps.
616 if node.name is 'head' and last is false
617 ins_mode = ins_mode_in_head
619 # 14. If node is a body element, then switch the insertion mode to
620 # "in body" and abort these steps.
621 if node.name is 'body'
622 ins_mode = ins_mode_in_body
624 # 15. If node is a frameset element, then switch the insertion mode
625 # to "in frameset" and abort these steps. (fragment case)
626 if node.name is 'frameset'
627 ins_mode = ins_mode_in_frameset
629 # 16. If node is an html element, run these substeps:
630 if node.name is 'html'
631 # 1. If the head element pointer is null, switch the insertion
632 # mode to "before head" and abort these steps. (fragment case)
633 if head_element_pointer is null
634 ins_mode = ins_mode_before_head
636 # 2. Otherwise, the head element pointer is not null,
637 # switch the insertion mode to "after head" and abort these
639 ins_mode = ins_mode_after_head
641 # 17. If last is true, then switch the insertion mode to "in body"
642 # and abort these steps. (fragment case)
644 ins_mode = ins_mode_in_body
646 # 18. Let node now be the node before node in the stack of open
649 node = open_els[node_i]
650 # 19. Return to the step labeled loop.
654 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
# Fragment case: when exactly one element is open and flag_fragment_parsing is
# set, the context element is returned. The return for every other case is not
# visible in this excerpt.
655 adjusted_current_node = ->
656 if open_els.length is 1 and flag_fragment_parsing
657 return context_element
660 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
661 # this implementation is structured (mostly) as described at the link above.
662 # capitalized comments are the "labels" described at the link above.
664 return if afe.length is 0
665 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
670 if i is afe.length - 1
673 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
678 el = insert_html_element afe[i].token
683 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
684 # adoption agency algorithm
686 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
687 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
688 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
689 adoption_agency = (subject) ->
690 debug_log "adoption_agency()"
691 debug_log "tree: #{serialize_els doc.children, false, true}"
692 debug_log "open_els: #{serialize_els open_els, true, true}"
693 debug_log "afe: #{serialize_els afe, true, true}"
694 if open_els[0].name is subject
697 # remove it from the list of active formatting elements (if found)
702 debug_log "aaa: starting off with subject on top of stack, exiting"
709 # 5. Let formatting element be the last element in the list of
710 # active formatting elements that: is between the end of the list
711 # and the last scope marker in the list, if any, or the start of
712 # the list otherwise, and has the tag name subject.
714 for t, fe_of_afe in afe
715 if t.type is TYPE_AFE_MARKER
720 # If there is no such element, then abort these steps and instead
721 # act as described in the "any other end tag" entry above.
723 debug_log "aaa: fe not found in afe"
724 in_body_any_other_end_tag subject
726 # 6. If formatting element is not in the stack of open elements,
727 # then this is a parse error; remove the element from the list, and
730 for t, fe_of_open_els in open_els
735 debug_log "aaa: fe not found in open_els"
737 # "remove it from the list" must mean afe, since it's not in open_els
738 afe.splice fe_of_afe, 1
740 # 7. If formatting element is in the stack of open elements, but
741 # the element is not in scope, then this is a parse error; abort
743 unless el_is_in_scope fe
744 debug_log "aaa: fe not in scope"
747 # 8. If formatting element is not the current node, this is a parse
748 # error. (But do not abort these steps.)
749 unless open_els[0] is fe
752 # 9. Let furthest block be the topmost node in the stack of open
753 # elements that is lower in the stack than formatting element, and
754 # is an element in the special category. There might not be one.
756 fb_of_open_els = null
763 # and continue, to see if there's one that's more "topmost"
764 # 10. If there is no furthest block, then the UA must first pop all
765 # the nodes from the bottom of the stack of open elements, from the
766 # current node up to and including formatting element, then remove
767 # formatting element from the list of active formatting elements,
768 # and finally abort these steps.
770 debug_log "aaa: no fb"
774 afe.splice fe_of_afe, 1
776 # 11. Let common ancestor be the element immediately above
777 # formatting element in the stack of open elements.
778 ca = open_els[fe_of_open_els + 1] # common ancestor
780 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
781 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
782 bookmark = new_aaa_bookmark()
785 afe.splice i, 0, bookmark
787 node = last_node = fb
791 # 3. Let node be the element immediately above node in the
792 # stack of open elements, or if node is no longer in the stack
793 # of open elements (e.g. because it got removed by this
794 # algorithm), the element that was immediately above node in
795 # the stack of open elements before node was removed.
799 node_next = open_els[i + 1]
801 node = node_next ? node_above
802 debug_log "inner loop #{inner}"
803 debug_log "tree: #{serialize_els doc.children, false, true}"
804 debug_log "open_els: #{serialize_els open_els, true, true}"
805 debug_log "afe: #{serialize_els afe, true, true}"
806 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
807 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
808 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
809 debug_log "node: #{node.serialize true, true}"
810 # TODO make sure node_above gets re-set if/when node is removed from open_els
812 # 4. If node is formatting element, then go to the next step in
813 # the overall algorithm.
817 # 5. If inner loop counter is greater than three and node is in
818 # the list of active formatting elements, then remove node from
819 # the list of active formatting elements.
825 debug_log "max out inner"
830 # 6. If node is not in the list of active formatting elements,
831 # then remove node from the stack of open elements and then go
832 # back to the step labeled inner loop.
834 debug_log "not in afe"
837 node_above = open_els[i + 1]
841 debug_log "the bones"
842 # 7. create an element for the token for which the element node
843 # was created, in the HTML namespace, with common ancestor as
844 # the intended parent; replace the entry for node in the list
845 # of active formatting elements with an entry for the new
846 # element, replace the entry for node in the stack of open
847 # elements with an entry for the new element, and let node be
849 new_node = token_to_element node.token, NS_HTML, ca
853 debug_log "replaced in afe"
857 node_above = open_els[i + 1]
858 open_els[i] = new_node
859 debug_log "replaced in open_els"
862 # 8. If last node is furthest block, then move the
863 # aforementioned bookmark to be immediately after the new node
864 # in the list of active formatting elements.
869 debug_log "removed bookmark"
873 # "after" means lower
874 afe.splice i, 0, bookmark # "after as <-
875 debug_log "placed bookmark after node"
876 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
878 # 9. Insert last node into node, first removing it from its
879 # previous parent node if any.
881 debug_log "last_node has parent"
882 for c, i in last_node.parent.children
884 debug_log "removing last_node from parent"
885 last_node.parent.children.splice i, 1
887 node.children.push last_node
888 last_node.parent = node
889 # 10. Let last node be node.
892 # 11. Return to the step labeled inner loop.
893 # 14. Insert whatever last node ended up being in the previous step
894 # at the appropriate place for inserting a node, but using common
895 # ancestor as the override target.
897 # In the case where fe is immediately followed by fb:
898 # * inner loop exits out early (node==fe)
900 # * last_node is still in the tree (not a duplicate)
902 debug_log "FEFIRST? last_node has parent"
903 for c, i in last_node.parent.children
905 debug_log "removing last_node from parent"
906 last_node.parent.children.splice i, 1
909 debug_log "after aaa inner loop"
910 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
911 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
912 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
913 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
914 debug_log "tree: #{serialize_els doc.children, false, true}"
919 # can't use standard insert token thing, because it's already in
920 # open_els and must stay at its current position in open_els
921 dest = adjusted_insertion_location ca
922 dest[0].children.splice dest[1], 0, last_node
923 last_node.parent = dest[0]
926 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
927 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
928 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
929 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
930 debug_log "tree: #{serialize_els doc.children, false, true}"
932 # 15. Create an element for the token for which formatting element
933 # was created, in the HTML namespace, with furthest block as the
935 new_element = token_to_element fe.token, NS_HTML, fb
936 # 16. Take all of the child nodes of furthest block and append them
937 # to the element created in the last step.
938 while fb.children.length
939 t = fb.children.shift()
940 t.parent = new_element
941 new_element.children.push t
942 # 17. Append that new element to furthest block.
943 new_element.parent = fb
944 fb.children.push new_element
945 # 18. Remove formatting element from the list of active formatting
946 # elements, and insert the new element into the list of active
947 # formatting elements at the position of the aforementioned
957 # 19. Remove formatting element from the stack of open elements,
958 # and insert the new element into the stack of open elements
959 # immediately below the position of furthest block in that stack.
966 open_els.splice i, 0, new_element
968 # 20. Jump back to the step labeled outer loop.
969 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
970 debug_log "tree: #{serialize_els doc.children, false, true}"
971 debug_log "open_els: #{serialize_els open_els, true, true}"
972 debug_log "afe: #{serialize_els afe, true, true}"
975 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
977 generate_implied_end_tags 'p' # arg is exception
978 if open_els[0].name isnt 'p'
980 while open_els.length > 1 # just in case
981 el = open_els.shift()
984 close_p_if_in_button_scope = ->
985 if is_in_button_scope 'p'
988 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
989 # aka insert_a_character = (t) ->
990 insert_character = (t) ->
991 dest = adjusted_insertion_location()
992 # fixfull check for Document node
994 prev = dest[0].children[dest[1] - 1]
995 if prev.type is TYPE_TEXT
998 dest[0].children.splice dest[1], 0, t
1001 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1002 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1003 adjusted_insertion_location = (override_target = null) ->
1004 # 1. If there was an override target specified, then let target be the
1007 target = override_target
1008 else # Otherwise, let target be the current node.
1009 target = open_els[0]
1010 # 2. Determine the adjusted insertion location using the first matching
1011 # steps from the following list:
1013 # If foster parenting is enabled and target is a table, tbody, tfoot,
1014 # thead, or tr element: (Note: foster parenting happens when content is
1015 # misnested in tables.)
1016 if flag_foster_parenting and foster_parenting_targets[target.name]
1017 loop # once. this is here so we can ``break`` to "abort these substeps"
1018 # 1. Let last template be the last template element in the
1019 # stack of open elements, if any.
1020 last_template = null
1021 last_template_i = null
1022 for el, i in open_els
1023 if el.name is 'template'
1027 # 2. Let last table be the last table element in the stack of
1028 # open elements, if any.
1031 for el, i in open_els
1032 if el.name is 'table'
1036 # 3. If there is a last template and either there is no last
1037 # table, or there is one, but last template is lower (more
1038 # recently added) than last table in the stack of open
1039 # elements, then: let adjusted insertion location be inside
1040 # last template's template contents, after its last child (if
1041 # any), and abort these substeps.
1042 if last_template and (last_table is null or last_template_i < last_table_i)
1043 target = last_template # fixfull should be it's contents
1044 target_i = target.children.length
1046 # 4. If there is no last table, then let adjusted insertion
1047 # location be inside the first element in the stack of open
1048 # elements (the html element), after its last child (if any),
1049 # and abort these substeps. (fragment case)
1050 if last_table is null
1052 target = open_els[open_els.length - 1]
1053 target_i = target.children.length
1054 # 5. If last table has a parent element, then let adjusted
1055 # insertion location be inside last table's parent element,
1056 # immediately before last table, and abort these substeps.
1057 if last_table.parent?
1058 for c, i in last_table.parent.children
1060 target = last_table.parent
1064 # 6. Let previous element be the element immediately above last
1065 # table in the stack of open elements.
1067 # huh? how could it not have a parent?
1068 previous_element = open_els[last_table_i + 1]
1069 # 7. Let adjusted insertion location be inside previous
1070 # element, after its last child (if any).
1071 target = previous_element
1072 target_i = target.children.length
1073 # Note: These steps are involved in part because it's possible
1074 # for elements, the table element in this case in particular,
1075 # to have been moved by a script around in the DOM, or indeed
1076 # removed from the DOM entirely, after the element was inserted
1078 break # don't really loop
1080 # Otherwise Let adjusted insertion location be inside target, after
1081 # its last child (if any).
1082 target_i = target.children.length
1084 # 3. If the adjusted insertion location is inside a template element,
1085 # let it instead be inside the template element's template contents,
1086 # after its last child (if any).
1087 # fixfull (template)
1089 # 4. Return the adjusted insertion location.
1090 return [target, target_i]
1092 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1093 # aka create_an_element_for_token
1094 token_to_element = (t, namespace, intended_parent) ->
1095 # convert attributes into a hash
1098 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1099 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1101 # TODO 2. If the newly created element has an xmlns attribute in the
1102 # XMLNS namespace whose value is not exactly the same as the element's
1103 # namespace, that is a parse error. Similarly, if the newly created
1104 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1105 # value is not the XLink Namespace, that is a parse error.
1107 # fixfull: the spec says stuff about form pointers and ownerDocument
1111 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1112 insert_foreign_element = (token, namespace) ->
1113 ail = adjusted_insertion_location()
1116 el = token_to_element token, namespace, ail_el
1117 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1119 ail_el.children.splice ail_i, 0, el
1122 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1123 insert_html_element = insert_foreign_element # (token, namespace) ->
1125 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1126 # position should be [node, index_within_children]
1127 insert_comment = (t, position = null) -> # inserts node t at position ([node, index]); defaults to the adjusted insertion location
1128 position ?= adjusted_insertion_location()
1129 position[0].children.splice position[1], 0, t # splice in place so later siblings keep their order
1132 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1133 parse_generic_raw_text = (t) -> # t: start tag token; inserts its element, then switches the tokenizer to rawtext and the parser to ins_mode_text
1134 insert_html_element t
1135 tok_state = tok_state_rawtext
1136 original_ins_mode = ins_mode # saved so ins_mode_text can switch back to it
1137 ins_mode = ins_mode_text
1138 parse_generic_rcdata_text = (t) -> # t: start tag token; inserts its element, then switches the tokenizer to rcdata and the parser to ins_mode_text
1139 insert_html_element t
1140 tok_state = tok_state_rcdata
1141 original_ins_mode = ins_mode # saved so ins_mode_text can switch back to it
1142 ins_mode = ins_mode_text
1144 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1145 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1146 generate_implied_end_tags = (except = null) ->
1147 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1150 # 8.2.5.4 The rules for parsing tokens in HTML content
1151 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1153 # 8.2.5.4.1 The "initial" insertion mode
1154 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1155 ins_mode_initial = (t) ->
1158 if t.type is TYPE_COMMENT
1162 if t.type is TYPE_DOCTYPE
1163 # FIXME check identifiers, set quirks, etc
1166 ins_mode = ins_mode_before_html
1169 #fixfull (iframe, quirks)
1170 ins_mode = ins_mode_before_html
1171 ins_mode t # reprocess the token
1174 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1175 ins_mode_before_html = (t) ->
1176 if t.type is TYPE_DOCTYPE
1179 if t.type is TYPE_COMMENT
1184 if t.type is TYPE_START_TAG and t.name is 'html'
1185 el = token_to_element t, NS_HTML, doc
1186 doc.children.push el
1187 open_els.unshift(el)
1188 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1189 ins_mode = ins_mode_before_head
1191 if t.type is TYPE_END_TAG
1192 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1193 # fall through to "anything else"
1198 html_tok = new_open_tag 'html'
1199 el = token_to_element html_tok, NS_HTML, doc
1200 doc.children.push el
1202 # ?fixfull browsing context
1203 ins_mode = ins_mode_before_head
1207 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1208 ins_mode_before_head = (t) ->
1211 if t.type is TYPE_COMMENT
1214 if t.type is TYPE_DOCTYPE
1217 if t.type is TYPE_START_TAG and t.name is 'html'
1220 if t.type is TYPE_START_TAG and t.name is 'head'
1221 el = insert_html_element t
1222 head_element_pointer = el
1223 ins_mode = ins_mode_in_head
1224 if t.type is TYPE_END_TAG
1225 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1226 # fall through to Anything else below
1231 head_tok = new_open_tag 'head'
1232 el = insert_html_element head_tok
1233 head_element_pointer = el
1234 ins_mode = ins_mode_in_head
1235 ins_mode t # reprocess current token
1237 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1238 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1239 open_els.shift() # spec says this will be a 'head' node
1240 ins_mode = ins_mode_after_head
1242 ins_mode_in_head = (t) ->
1243 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1246 if t.type is TYPE_COMMENT
1249 if t.type is TYPE_DOCTYPE
1252 if t.type is TYPE_START_TAG and t.name is 'html'
1255 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1256 el = insert_html_element t
1258 t.acknowledge_self_closing()
1260 if t.type is TYPE_START_TAG and t.name is 'meta'
1261 el = insert_html_element t
1263 t.acknowledge_self_closing()
1264 # fixfull encoding stuff
1266 if t.type is TYPE_START_TAG and t.name is 'title'
1267 parse_generic_rcdata_text t
1269 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1270 parse_generic_raw_text t
1272 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1273 insert_html_element t
1274 ins_mode = ins_mode_in_head_noscript
1276 if t.type is TYPE_START_TAG and t.name is 'script'
1277 ail = adjusted_insertion_location()
1278 el = token_to_element t, NS_HTML, ail
1279 el.flag 'parser-inserted', true
1280 # fixfull fragment case
1281 ail[0].children.splice ail[1], 0, el
1283 tok_state = tok_state_script_data
1284 original_ins_mode = ins_mode # make sure orig... is defined
1285 ins_mode = ins_mode_text
1287 if t.type is TYPE_END_TAG and t.name is 'head'
1288 open_els.shift() # will be a head element... spec says so
1289 ins_mode = ins_mode_after_head
1291 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1292 ins_mode_in_head_else t
1294 if t.type is TYPE_START_TAG and t.name is 'template'
1295 insert_html_element t
1297 flag_frameset_ok = false
1298 ins_mode = ins_mode_in_template
1299 template_ins_modes.unshift ins_mode_in_template
1301 if t.type is TYPE_END_TAG and t.name is 'template' # </template> seen while "in head"
1302 if template_tag_is_open()
1303 generate_implied_end_tags() # parens required: a bare identifier only references the function, it does not call it
1304 if open_els[0].name isnt 'template'
1307 el = open_els.shift() # pop open elements until the template itself has been popped
1308 if el.name is 'template'
1310 clear_afe_to_marker()
1311 template_ins_modes.shift()
1316 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1319 ins_mode_in_head_else t
1321 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1322 ins_mode_in_head_noscript_else = (t) ->
1325 ins_mode = ins_mode_in_head
1327 ins_mode_in_head_noscript = (t) ->
1328 if t.type is TYPE_DOCTYPE
1331 if t.type is TYPE_START_TAG
1334 if t.type is TYPE_END_TAG and t.name is 'noscript'
1336 ins_mode = ins_mode_in_head
1338 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1341 if t.type is TYPE_END_TAG and t.name is 'br'
1342 ins_mode_in_head_noscript_else t
1344 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1348 ins_mode_in_head_noscript_else t
1353 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1354 ins_mode_after_head_else = (t) -> # "anything else" case of ins_mode_after_head: insert an implied <body>, then let "in body" handle t
1355 body_tok = new_open_tag 'body'
1356 insert_html_element body_tok
1357 ins_mode = ins_mode_in_body
1358 ins_mode t # reprocess token
1360 ins_mode_after_head = (t) ->
1364 if t.type is TYPE_COMMENT
1367 if t.type is TYPE_DOCTYPE
1370 if t.type is TYPE_START_TAG and t.name is 'html'
1373 if t.type is TYPE_START_TAG and t.name is 'body'
1374 insert_html_element t
1375 flag_frameset_ok = false
1376 ins_mode = ins_mode_in_body
1378 if t.type is TYPE_START_TAG and t.name is 'frameset'
1379 insert_html_element t
1380 ins_mode = ins_mode_in_frameset
1382 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1384 open_els.unshift head_element_pointer # temporarily put <head> back on the stack of open elements
1386 for el, i in open_els # `in` yields (element, index); `of` would bind el to the key and never match
1387 if el is head_element_pointer
1388 open_els.splice i, 1 # remove <head> again; it may no longer be the current node
1390 console.log "warning: 23904 couldn't find head element in open_els"
1392 if t.type is TYPE_END_TAG and t.name is 'template'
1395 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1396 ins_mode_after_head_else t
1398 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1402 ins_mode_after_head_else t
1404 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1405 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1406 for el, i in open_els
1407 if el.namespace is NS_HTML and el.name is name
1408 generate_implied_end_tags name # arg is exception
1409 parse_error() unless i is 0
1414 if special_elements[el.name] is el.namespace
1418 ins_mode_in_body = (t) ->
1419 if t.type is TYPE_TEXT and t.text is "\u0000"
1426 if t.type is TYPE_TEXT
1429 flag_frameset_ok = false
1431 if t.type is TYPE_COMMENT
1434 if t.type is TYPE_DOCTYPE
1437 if t.type is TYPE_START_TAG and t.name is 'html'
1439 return if template_tag_is_open()
1440 root_attrs = open_els[open_els.length - 1].attrs
1442 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1445 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1448 if t.type is TYPE_START_TAG and t.name is 'body' # stray <body>: copy its attributes onto the existing body element
1450 return if open_els.length < 2
1451 second = open_els[open_els.length - 2] # second element from the top (root) end of the stack
1452 return unless second.namespace is NS_HTML # elements carry their namespace in .namespace (see token_to_element)
1453 return unless second.name is 'body'
1454 return if template_tag_is_open()
1455 flag_frameset_ok = false # the flag the other insertion modes read and write
1457 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]? # attributes already present win
1459 if t.type is TYPE_START_TAG and t.name is 'frameset' # <frameset> taking the place of an existing <body>
1461 return if open_els.length < 2
1462 second_i = open_els.length - 2
1463 second = open_els[second_i] # second element from the top (root) end of the stack
1464 return unless second.namespace is NS_HTML # elements carry their namespace in .namespace (see token_to_element)
1465 return unless second.name is 'body'
1466 return if flag_frameset_ok is false # spec: ignore the token when the frameset-ok flag is "not ok"
1468 for el, i in second.parent.children
1470 second.parent.children.splice i, 1 # detach the body element from its parent
1472 open_els.splice second_i, 1
1473 # pop everything except the "root html element"
1474 while open_els.length > 1
1476 insert_html_element t
1477 ins_mode = ins_mode_in_frameset
1479 if t.type is TYPE_EOF
1481 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1482 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1483 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1486 unless ok_tags[t.name] is el.namespace
1489 if template_ins_modes.length > 0
1490 ins_mode_in_template t
1494 if t.type is TYPE_END_TAG and t.name is 'body'
1495 unless is_in_scope 'body'
1499 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1500 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1501 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1502 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1506 unless ok_tags[t.name] is el.namespace
1509 ins_mode = ins_mode_after_body
1511 if t.type is TYPE_END_TAG and t.name is 'html'
1512 unless is_in_scope 'body'
1516 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1517 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1518 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1519 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1523 unless ok_tags[t.name] is el.namespace
1526 ins_mode = ins_mode_after_body
1529 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1530 close_p_if_in_button_scope()
1531 insert_html_element t
1533 if t.type is TYPE_START_TAG and h_tags[t.name]? # start tag whose name is listed in h_tags
1534 close_p_if_in_button_scope()
1535 if h_tags[open_els[0].name] is open_els[0].namespace # is the current node itself one of the h_tags elements? (look up by name, not by object)
1538 insert_html_element t
1540 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1541 close_p_if_in_button_scope()
1542 insert_html_element t
1543 # spec: If the next token is a "LF" (U+000A) character token, then
1544 # ignore that token and move on to the next one. (Newlines at the
1545 # start of pre blocks are ignored as an authoring convenience.)
1546 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1548 flag_frameset_ok = false
1550 if t.type is TYPE_START_TAG and t.name is 'form'
1551 unless form_element_pointer is null or template_tag_is_open()
1554 close_p_if_in_button_scope()
1555 el = insert_html_element t
1556 unless template_tag_is_open()
1557 form_element_pointer = el
1559 if t.type is TYPE_START_TAG and t.name is 'li'
1560 flag_frameset_ok = false
1561 for node in open_els
1562 if node.name is 'li' and node.namespace is NS_HTML
1563 generate_implied_end_tags 'li' # arg is exception
1564 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1567 el = open_els.shift()
1568 if el.name is 'li' and el.namespace is NS_HTML
1571 if el_is_special_not_adp node
1573 close_p_if_in_button_scope()
1574 insert_html_element t
1576 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1577 flag_frameset_ok = false
1578 for node in open_els
1579 if node.name is 'dd' and node.namespace is NS_HTML
1580 generate_implied_end_tags 'dd' # arg is exception
1581 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1584 el = open_els.shift()
1585 if el.name is 'dd' and el.namespace is NS_HTML
1588 if node.name is 'dt' and node.namespace is NS_HTML
1589 generate_implied_end_tags 'dt' # arg is exception
1590 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1593 el = open_els.shift()
1594 if el.name is 'dt' and el.namespace is NS_HTML
1597 if el_is_special_not_adp node
1599 close_p_if_in_button_scope()
1600 insert_html_element t
1602 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1603 close_p_if_in_button_scope()
1604 insert_html_element t
1605 tok_state = tok_state_plaintext
1607 if t.type is TYPE_START_TAG and t.name is 'button'
1608 if is_in_scope 'button', NS_HTML
1610 generate_implied_end_tags()
1612 el = open_els.shift()
1613 if el.name is 'button' and el.namespace is NS_HTML
1616 insert_html_element t
1617 flag_frameset_ok = false
1619 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1620 unless is_in_scope t.name, NS_HTML
1623 generate_implied_end_tags()
1624 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1627 el = open_els.shift()
1628 if el.name is t.name and el.namespace is NS_HTML
1631 if t.type is TYPE_END_TAG and t.name is 'form'
1632 unless template_tag_is_open()
1633 node = form_element_pointer
1634 form_element_pointer = null
1635 if node is null or not el_is_in_scope node
1638 generate_implied_end_tags()
1639 if open_els[0] isnt node
1641 for el, i in open_els
1643 open_els.splice i, 1
1646 unless is_in_scope 'form', NS_HTML
1649 generate_implied_end_tags()
1650 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1653 el = open_els.shift()
1654 if el.name is 'form' and el.namespace is NS_HTML
1657 if t.type is TYPE_END_TAG and t.name is 'p'
1658 unless is_in_button_scope 'p', NS_HTML
1660 insert_html_element new_open_tag 'p'
1663 if t.type is TYPE_END_TAG and t.name is 'li'
1664 unless is_in_li_scope 'li', NS_HTML
1667 generate_implied_end_tags 'li' # arg is exception
1668 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1671 el = open_els.shift()
1672 if el.name is 'li' and el.namespace is NS_HTML
1675 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1676 unless is_in_scope t.name, NS_HTML
1679 generate_implied_end_tags t.name # arg is exception
1680 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1683 el = open_els.shift()
1684 if el.name is t.name and el.namespace is NS_HTML
1687 if t.type is TYPE_END_TAG and h_tags[t.name]?
1690 if h_tags[el.name] is el.namespace
1693 if standard_scopers[el.name] is el.namespace
1698 generate_implied_end_tags()
1699 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1702 el = open_els.shift()
1703 if h_tags[el.name] is el.namespace
1707 if t.type is TYPE_START_TAG and t.name is 'a'
1708 # If the list of active formatting elements contains an a element
1709 # between the end of the list and the last marker on the list (or
1710 # the start of the list if there is no marker on the list), then
1711 # this is a parse error; run the adoption agency algorithm for the
1712 # tag name "a", then remove that element from the list of active
1713 # formatting elements and the stack of open elements if the
1714 # adoption agency algorithm didn't already remove it (it might not
1715 # have if the element is not in table scope).
1718 if el.type is TYPE_AFE_MARKER
1720 if el.name is 'a' and el.namespace is NS_HTML
1728 for el, i in open_els
1730 open_els.splice i, 1
1732 el = insert_html_element t
1735 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1737 el = insert_html_element t
1740 if t.type is TYPE_START_TAG and t.name is 'nobr'
1742 el = insert_html_element t
1745 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1746 adoption_agency t.name
1748 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1750 insert_html_element t
1752 flag_frameset_ok = false
1754 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1755 unless is_in_scope t.name, NS_HTML
1758 generate_implied_end_tags()
1759 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1762 el = open_els.shift()
1763 if el.name is t.name and el.namespace is NS_HTML
1765 clear_afe_to_marker()
1767 if t.type is TYPE_START_TAG and t.name is 'table'
1768 close_p_if_in_button_scope() # fixfull quirksmode thing
1769 insert_html_element t
1770 flag_frameset_ok = false
1771 ins_mode = ins_mode_in_table
1773 if t.type is TYPE_END_TAG and t.name is 'br' # </br> is treated as if it were a <br> start tag
1775 t.type = TYPE_START_TAG # assignment (not comparison) so the 'br' start-tag check that follows matches this token
1777 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1779 insert_html_element t
1781 t.acknowledge_self_closing()
1782 flag_frameset_ok = false
1784 if t.type is TYPE_START_TAG and t.name is 'input'
1786 insert_html_element t
1788 t.acknowledge_self_closing()
1789 unless is_input_hidden_tok t
1790 flag_frameset_ok = false
1792 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1793 insert_html_element t
1795 t.acknowledge_self_closing()
1797 if t.type is TYPE_START_TAG and t.name is 'hr'
1798 close_p_if_in_button_scope()
1799 insert_html_element t
1801 t.acknowledge_self_closing()
1802 flag_frameset_ok = false
1804 if t.type is TYPE_START_TAG and t.name is 'image'
1809 if t.type is TYPE_START_TAG and t.name is 'isindex'
1811 if template_tag_is_open() is false and form_element_pointer isnt null
1813 t.acknowledge_self_closing()
1814 flag_frameset_ok = false
1815 close_p_if_in_button_scope()
1816 el = insert_html_element new_open_tag 'form'
1817 unless template_tag_is_open()
1818 form_element_pointer = el
1821 el.attrs['action'] = a[1]
1823 insert_html_element new_open_tag 'hr'
1826 insert_html_element new_open_tag 'label'
1827 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
1828 input_el = new_open_tag 'input'
1833 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
1834 input_el.attrs_a.push [a[0], a[1]]
1835 input_el.attrs_a.push ['name', 'isindex']
1836 # fixfull this next bit is in english... internationalize?
1837 prompt ?= "This is a searchable index. Enter search keywords: "
1838 insert_character prompt # fixfull split
1839 # TODO submit typo "balue" in spec
1840 insert_html_element input_el
1842 # insert_character '' # you can put chars here if prompt attr missing
1844 insert_html_element new_open_tag 'hr'
1847 unless template_tag_is_open()
1848 form_element_pointer = null
1850 if t.type is TYPE_START_TAG and t.name is 'textarea'
1851 insert_html_element t
1852 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1854 tok_state = tok_state_rcdata
1855 original_ins_mode = ins_mode
1856 flag_frameset_ok = false
1857 ins_mode = ins_mode_text
1859 if t.type is TYPE_START_TAG and t.name is 'xmp'
1860 close_p_if_in_button_scope()
1862 flag_frameset_ok = false
1863 parse_generic_raw_text t
1868 if t.type is TYPE_START_TAG # any other start tag
1870 insert_html_element t
1872 if t.type is TYPE_END_TAG # any other end tag
1873 in_body_any_other_end_tag t.name
1876 ins_mode_in_table_else = (t) ->
1878 flag_foster_parenting = true # FIXME
1880 flag_foster_parenting = false
1881 can_in_table = { # FIXME do this inline like everywhere else
1889 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1890 ins_mode_text = (t) ->
1891 if t.type is TYPE_TEXT
1894 if t.type is TYPE_EOF
1896 if open_els[0].name is 'script'
1897 open_els[0].flag 'already started', true
1899 ins_mode = original_ins_mode
1902 if t.type is TYPE_END_TAG and t.name is 'script'
1904 ins_mode = original_ins_mode
1905 # fixfull the spec seems to assume that I'm going to run the script
1906 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1908 if t.type is TYPE_END_TAG
1910 ins_mode = original_ins_mode
1912 console.log 'warning: end of ins_mode_text reached'
1914 # the functions below implement the tokenizer states described here:
1915 # http://www.w3.org/TR/html5/syntax.html#tokenization
1917 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1918 ins_mode_in_table = (t) ->
1921 if can_in_table[t.name]
1922 original_ins_mode = ins_mode
1923 ins_mode = ins_mode_in_table_text
1926 ins_mode_in_table_else t
1934 clear_stack_to_table_context()
1936 insert_html_element t
1937 ins_mode = ins_mode_in_caption
1939 clear_stack_to_table_context()
1940 insert_html_element t
1941 ins_mode = ins_mode_in_column_group
1943 clear_stack_to_table_context()
1944 insert_html_element new_open_tag 'colgroup'
1945 ins_mode = ins_mode_in_column_group
1947 when 'tbody', 'tfoot', 'thead'
1948 clear_stack_to_table_context()
1949 insert_html_element t
1950 ins_mode = ins_mode_in_table_body
1951 when 'td', 'th', 'tr'
1952 clear_stack_to_table_context()
1953 insert_html_element new_open_tag 'tbody'
1954 ins_mode = ins_mode_in_table_body
1958 if is_in_table_scope 'table'
1960 el = open_els.shift()
1961 if el.name is 'table'
1965 when 'style', 'script', 'template'
1968 if is_input_hidden_tok t
1969 ins_mode_in_table_else t
1972 el = insert_html_element t
1974 t.acknowledge_self_closing()
1977 if form_element_pointer?
1979 if template_tag_is_open()
1981 form_element_pointer = insert_html_element t
1984 ins_mode_in_table_else t
1988 if is_in_table_scope 'table'
1990 el = open_els.shift()
1991 if el.name is 'table'
1996 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2001 ins_mode_in_table_else t
2005 ins_mode_in_table_else t
2008 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2009 ins_mode_in_table_text = (t) -> # collects text tokens in pending_table_character_tokens; the loops below flush that list
2010 if t.type is TYPE_TEXT and t.text is "\u0000"
2011 # huh? I thought the tokenizer didn't emit these
2014 if t.type is TYPE_TEXT
2015 pending_table_character_tokens.push t
2019 for old in pending_table_character_tokens
2020 unless is_space_tok old
2024 for old in pending_table_character_tokens
2025 insert_character old
2027 for old in pending_table_character_tokens
2028 ins_mode_in_table_else old # the "in table" anything-else handler is named ins_mode_in_table_else
2029 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2030 ins_mode = original_ins_mode # back to the mode saved when "in table text" was entered
2033 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2034 ins_mode_in_caption = (t) ->
2035 if t.type is TYPE_END_TAG and t.name is 'caption'
2036 if is_in_table_scope 'caption'
2037 generate_implied_end_tags()
2038 if open_els[0].name isnt 'caption'
2041 el = open_els.shift()
2042 if el.name is 'caption'
2044 clear_afe_to_marker()
2045 ins_mode = ins_mode_in_table
2050 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2052 if is_in_table_scope 'caption'
2054 el = open_els.shift()
2055 if el.name is 'caption'
2057 clear_afe_to_marker()
2058 ins_mode = ins_mode_in_table
2060 # else fragment case
2062 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2068 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2069 ins_mode_in_column_group = (t) ->
2073 if t.type is TYPE_COMMENT
2076 if t.type is TYPE_DOCTYPE
2079 if t.type is TYPE_START_TAG and t.name is 'html'
2082 if t.type is TYPE_START_TAG and t.name is 'col'
2083 el = insert_html_element t
2085 t.acknowledge_self_closing()
2087 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2088 if open_els[0].name is 'colgroup'
2090 ins_mode = ins_mode_in_table
2094 if t.type is TYPE_END_TAG and t.name is 'col'
2097 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2100 if t.type is TYPE_EOF
2104 if open_els[0].name isnt 'colgroup'
2108 ins_mode = ins_mode_in_table
2112 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2113 ins_mode_in_table_body = (t) ->
2114 if t.type is TYPE_START_TAG and t.name is 'tr'
2115 clear_stack_to_table_body_context()
2116 insert_html_element t
2117 ins_mode = ins_mode_in_row
2119 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2121 clear_stack_to_table_body_context()
2122 insert_html_element new_open_tag 'tr'
2123 ins_mode = ins_mode_in_row
2126 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2127 unless is_in_table_scope t.name # fixfull check namespace
2130 clear_stack_to_table_body_context()
2132 ins_mode = ins_mode_in_table
2134 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2137 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2140 if table_scopers[el.name]
2145 clear_stack_to_table_body_context()
2147 ins_mode = ins_mode_in_table
2150 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2156 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2157 ins_mode_in_row = (t) ->
2158 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2159 clear_stack_to_table_row_context()
2160 insert_html_element t
2161 ins_mode = ins_mode_in_cell
2164 if t.type is TYPE_END_TAG and t.name is 'tr'
2165 if is_in_table_scope 'tr'
2166 clear_stack_to_table_row_context()
2168 ins_mode = ins_mode_in_table_body
2172 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2173 if is_in_table_scope 'tr'
2174 clear_stack_to_table_row_context()
2176 ins_mode = ins_mode_in_table_body
2181 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2182 if is_in_table_scope t.name # fixfull namespace
2183 if is_in_table_scope 'tr'
2184 clear_stack_to_table_row_context()
2186 ins_mode = ins_mode_in_table_body
2191 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2197 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps as visible here: generate implied end tags, test that
# the current node (open_els[0]) is a td/th, shift elements off open_els while
# testing each for td/th, clear the active formatting elements up to the last
# marker, then switch to the "in row" insertion mode.
2199 generate_implied_end_tags()
# Both sides must compare the element's .name. Comparing the element object
# itself against 'th' is never true, so a current <th> was always treated as
# "not a cell" by this check.
2200 unless open_els[0].name is 'td' or open_els[0].name is 'th'
2203 el = open_els.shift()
2204 if el.name is 'td' or el.name is 'th'
2206 clear_afe_to_marker()
2207 ins_mode = ins_mode_in_row
2209 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2210 ins_mode_in_cell = (t) ->
2211 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2212 if is_in_table_scope t.name
2213 generate_implied_end_tags()
2214 if open_els[0].name isnt t.name
2217 el = open_els.shift()
2218 if el.name is t.name
2220 clear_afe_to_marker()
2221 ins_mode = ins_mode_in_row
2225 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2228 if el.name is 'td' or el.name is 'th'
2231 if table_scopers[el.name]
2239 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2242 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2243 if is_in_table_scope t.name # fixfull namespace
2252 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2253 ins_mode_in_select = (t) ->
2254 if t.type is TYPE_TEXT and t.text is "\u0000"
2257 if t.type is TYPE_TEXT
2260 if t.type is TYPE_COMMENT
2263 if t.type is TYPE_DOCTYPE
2266 if t.type is TYPE_START_TAG and t.name is 'html'
2269 if t.type is TYPE_START_TAG and t.name is 'option'
2270 if open_els[0].name is 'option'
2272 insert_html_element t
2274 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2275 if open_els[0].name is 'option'
2277 if open_els[0].name is 'optgroup'
2279 insert_html_element t
2281 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2282 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2284 if open_els[0].name is 'optgroup'
2289 if t.type is TYPE_END_TAG and t.name is 'option'
2290 if open_els[0].name is 'option'
2295 if t.type is TYPE_END_TAG and t.name is 'select'
2296 if is_in_select_scope 'select'
2298 el = open_els.shift()
2299 if el.name is 'select'
2305 if t.type is TYPE_START_TAG and t.name is 'select'
2308 el = open_els.shift()
2309 if el.name is 'select'
2312 # spec says that this is the same as </select> but it doesn't say
2313 # to check scope first
2315 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2317 if is_in_select_scope 'select'
2320 el = open_els.shift()
2321 if el.name is 'select'
2326 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2329 if t.type is TYPE_EOF
2336 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2337 ins_mode_in_select_in_table = (t) ->
2338 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2341 el = open_els.shift()
2342 if el.name is 'select'
2347 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2349 unless is_in_table_scope t.name, NS_HTML
2352 el = open_els.shift()
2353 if el.name is 'select'
2359 ins_mode_in_select t
2362 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2363 ins_mode_in_template = (t) ->
2364 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2367 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2370 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2371 template_ins_modes.shift()
2372 template_ins_modes.unshift ins_mode_in_table
2373 ins_mode = ins_mode_in_table
2376 if t.type is TYPE_START_TAG and t.name is 'col'
2377 template_ins_modes.shift()
2378 template_ins_modes.unshift ins_mode_in_column_group
2379 ins_mode = ins_mode_in_column_group
2382 if t.type is TYPE_START_TAG and t.name is 'tr'
2383 template_ins_modes.shift()
2384 template_ins_modes.unshift ins_mode_in_table_body
2385 ins_mode = ins_mode_in_table_body
2388 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2389 template_ins_modes.shift()
2390 template_ins_modes.unshift ins_mode_in_row
2391 ins_mode = ins_mode_in_row
2394 if t.type is TYPE_START_TAG
2395 template_ins_modes.shift()
2396 template_ins_modes.unshift ins_mode_in_body
2397 ins_mode = ins_mode_in_body
2400 if t.type is TYPE_END_TAG
2403 if t.type is TYPE_EOF
2404 unless template_tag_is_open()
2409 el = open_els.shift()
2410 if el.name is 'template' # fixfull check namespace
2412 clear_afe_to_marker()
2413 template_ins_modes.shift()
2417 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# "After body" insertion mode. Dispatches on the type (and for tags, the name)
# of token t; the branches below assign ins_mode for </html> and for the
# final fall-back case.
2418 ins_mode_after_body = (t) ->
2422 if t.type is TYPE_COMMENT
# The spec inserts this comment as the last child of the FIRST element in the
# stack of open elements. In this file open_els[0] is the current node and the
# first/topmost element sits at the highest index, so target that one.
2423 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2425 if t.type is TYPE_DOCTYPE
2428 if t.type is TYPE_START_TAG and t.name is 'html'
2431 if t.type is TYPE_END_TAG and t.name is 'html'
2432 # fixfull fragment case
2433 ins_mode = ins_mode_after_after_body
2435 if t.type is TYPE_EOF
2440 ins_mode = ins_mode_in_body
2443 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2444 ins_mode_in_frameset = (t) ->
2448 if t.type is TYPE_COMMENT
2451 if t.type is TYPE_DOCTYPE
2454 if t.type is TYPE_START_TAG and t.name is 'html'
2457 if t.type is TYPE_START_TAG and t.name is 'frameset'
2458 insert_html_element t
2460 if t.type is TYPE_END_TAG and t.name is 'frameset'
2461 # TODO ?correct for: "if the current node is the root html element"
2462 if open_els.length is 1
2464 return # fragment case
2466 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2467 ins_mode = ins_mode_after_frameset
2469 if t.type is TYPE_START_TAG and t.name is 'frame'
2470 insert_html_element t
2472 t.acknowledge_self_closing()
2474 if t.type is TYPE_START_TAG and t.name is 'noframes'
2477 if t.type is TYPE_EOF
2478 # TODO ?correct for: "if the current node is not the root html element"
2479 if open_els.length isnt 1
2487 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# "After frameset" insertion mode. Dispatches on the type/name of token t.
2488 ins_mode_after_frameset = (t) ->
2492 if t.type is TYPE_COMMENT
2495 if t.type is TYPE_DOCTYPE
2498 if t.type is TYPE_START_TAG and t.name is 'html'
2501 if t.type is TYPE_END_TAG and t.name is 'html'
# The insertion-mode variable is ins_mode everywhere else in this file.
# Assigning to "insert_mode" only created an unused local, so </html> never
# moved the parser to the "after after frameset" mode.
2502 ins_mode = ins_mode_after_after_frameset
2504 if t.type is TYPE_START_TAG and t.name is 'noframes'
2507 if t.type is TYPE_EOF
2514 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2515 ins_mode_after_after_body = (t) ->
2516 if t.type is TYPE_COMMENT
2517 insert_comment t, [doc, doc.children.length]
2519 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2522 if t.type is TYPE_EOF
2527 ins_mode = ins_mode_in_body
2530 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2531 ins_mode_after_after_frameset = (t) ->
2532 if t.type is TYPE_COMMENT
2533 insert_comment t, [doc, doc.children.length]
2535 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2538 if t.type is TYPE_EOF
2541 if t.type is TYPE_START_TAG and t.name is 'noframes'
2552 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2554 switch c = txt.charAt(cur++)
2556 return new_text_node parse_character_reference()
2558 tok_state = tok_state_tag_open
2561 return new_text_node c
2563 return new_eof_token()
2565 return new_text_node c
2568 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2569 # not needed: tok_state_character_reference_in_data = ->
2570 # just call parse_character_reference()
2572 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2573 tok_state_rcdata = ->
2574 switch c = txt.charAt(cur++)
2576 return new_text_node parse_character_reference()
2578 tok_state = tok_state_rcdata_less_than_sign
2581 return new_character_token "\ufffd"
2583 return new_eof_token()
2585 return new_character_token c
2588 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2589 # not needed: tok_state_character_reference_in_rcdata = ->
2590 # just call parse_character_reference()
2592 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2593 tok_state_rawtext = ->
2594 switch c = txt.charAt(cur++)
2596 tok_state = tok_state_rawtext_less_than_sign
2599 return new_character_token "\ufffd"
2601 return new_eof_token()
2603 return new_character_token c
2606 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2607 tok_state_script_data = ->
2608 switch c = txt.charAt(cur++)
2610 tok_state = tok_state_script_data_less_than_sign
2613 return new_character_token "\ufffd"
2615 return new_eof_token()
2617 return new_character_token c
2620 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2621 tok_state_plaintext = ->
2622 switch c = txt.charAt(cur++)
2625 return new_character_token "\ufffd"
2627 return new_eof_token()
2629 return new_character_token c
2633 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2634 tok_state_tag_open = ->
2635 switch c = txt.charAt(cur++)
2637 tok_state = tok_state_markup_declaration_open
2639 tok_state = tok_state_end_tag_open
2642 tok_cur_tag = new_comment_token '?'
2643 tok_state = tok_state_bogus_comment
2646 tok_cur_tag = new_open_tag c
2647 tok_state = tok_state_tag_name
2648 else if is_uc_alpha(c)
2649 tok_cur_tag = new_open_tag c.toLowerCase()
2650 tok_state = tok_state_tag_name
2653 tok_state = tok_state_data
2654 cur -= 1 # we didn't parse/handle the char after <
2655 return new_text_node '<'
2658 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2659 tok_state_end_tag_open = ->
2660 switch c = txt.charAt(cur++)
2663 tok_state = tok_state_data
2666 tok_state = tok_state_data
2667 return new_text_node '</'
2670 tok_cur_tag = new_end_tag c.toLowerCase()
2671 tok_state = tok_state_tag_name
2672 else if is_lc_alpha(c)
2673 tok_cur_tag = new_end_tag c
2674 tok_state = tok_state_tag_name
2677 tok_cur_tag = new_comment_token '/'
2678 tok_state = tok_state_bogus_comment
2681 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2682 tok_state_tag_name = ->
2683 switch c = txt.charAt(cur++)
2684 when "\t", "\n", "\u000c", ' '
2685 tok_state = tok_state_before_attribute_name
2687 tok_state = tok_state_self_closing_start_tag
2689 tok_state = tok_state_data
2695 tok_cur_tag.name += "\ufffd"
2698 tok_state = tok_state_data
2701 tok_cur_tag.name += c.toLowerCase()
2703 tok_cur_tag.name += c
2706 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2707 tok_state_rcdata_less_than_sign = ->
2708 c = txt.charAt(cur++)
2710 temporary_buffer = ''
2711 tok_state = tok_state_rcdata_end_tag_open
2714 tok_state = tok_state_rcdata
2715 cur -= 1 # reconsume the input character
2716 return new_character_token '<'
2718 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2719 tok_state_rcdata_end_tag_open = ->
2720 c = txt.charAt(cur++)
2722 tok_cur_tag = new_end_tag c.toLowerCase()
2723 temporary_buffer += c
2724 tok_state = tok_state_rcdata_end_tag_name
2727 tok_cur_tag = new_end_tag c
2728 temporary_buffer += c
2729 tok_state = tok_state_rcdata_end_tag_name
2732 tok_state = tok_state_rcdata
2733 cur -= 1 # reconsume the input character
2734 return new_character_token "</" # fixfull separate these
2736 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2737 is_appropriate_end_tag = (t) ->
2738 # spec says to check against "the tag name of the last start tag to
2739 # have been emitted from this tokenizer", but this is only called from
2740 # the various "raw" states, which I'm pretty sure all push the start
2741 # token onto open_els. TODO: verify this after the script data states
2743 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2744 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2746 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2747 tok_state_rcdata_end_tag_name = ->
2748 c = txt.charAt(cur++)
2749 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2750 if is_appropriate_end_tag tok_cur_tag
2751 tok_state = tok_state_before_attribute_name
2753 # else fall through to "Anything else"
2755 if is_appropriate_end_tag tok_cur_tag
2756 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2758 # else fall through to "Anything else"
2760 if is_appropriate_end_tag tok_cur_tag
2761 tok_state = tok_state_data
2763 # else fall through to "Anything else"
2765 tok_cur_tag.name += c.toLowerCase()
2766 temporary_buffer += c
2769 tok_cur_tag.name += c
2770 temporary_buffer += c
2773 tok_state = tok_state_rcdata
2774 cur -= 1 # reconsume the input character
2775 return new_character_token '</' + temporary_buffer # fixfull separate these
2777 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2778 tok_state_rawtext_less_than_sign = ->
2779 c = txt.charAt(cur++)
2781 temporary_buffer = ''
2782 tok_state = tok_state_rawtext_end_tag_open
2785 tok_state = tok_state_rawtext
2786 cur -= 1 # reconsume the input character
2787 return new_character_token '<'
2789 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2790 tok_state_rawtext_end_tag_open = ->
2791 c = txt.charAt(cur++)
2793 tok_cur_tag = new_end_tag c.toLowerCase()
2794 temporary_buffer += c
2795 tok_state = tok_state_rawtext_end_tag_name
2798 tok_cur_tag = new_end_tag c
2799 temporary_buffer += c
2800 tok_state = tok_state_rawtext_end_tag_name
2803 tok_state = tok_state_rawtext
2804 cur -= 1 # reconsume the input character
2805 return new_character_token "</" # fixfull separate these
2807 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2808 tok_state_rawtext_end_tag_name = ->
2809 c = txt.charAt(cur++)
2810 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2811 if is_appropriate_end_tag tok_cur_tag
2812 tok_state = tok_state_before_attribute_name
2814 # else fall through to "Anything else"
2816 if is_appropriate_end_tag tok_cur_tag
2817 tok_state = tok_state_self_closing_start_tag
2819 # else fall through to "Anything else"
2821 if is_appropriate_end_tag tok_cur_tag
2822 tok_state = tok_state_data
2824 # else fall through to "Anything else"
2826 tok_cur_tag.name += c.toLowerCase()
2827 temporary_buffer += c
2830 tok_cur_tag.name += c
2831 temporary_buffer += c
2834 tok_state = tok_state_rawtext
2835 cur -= 1 # reconsume the input character
2836 return new_character_token '</' + temporary_buffer # fixfull separate these
2838 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
2839 tok_state_script_data_less_than_sign = ->
2840 c = txt.charAt(cur++)
2842 temporary_buffer = ''
2843 tok_state = tok_state_script_data_end_tag_open
2846 tok_state = tok_state_script_data_escape_start
2847 return new_character_token '<!' # fixfull split
2849 tok_state = tok_state_script_data
2850 cur -= 1 # Reconsume
2851 return new_character_token '<'
2853 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2854 tok_state_script_data_end_tag_open = ->
2855 c = txt.charAt(cur++)
2857 tok_cur_tag = new_end_tag c.toLowerCase()
2858 temporary_buffer += c
2859 tok_state = tok_state_script_data_end_tag_name
2862 tok_cur_tag = new_end_tag c
2863 temporary_buffer += c
2864 tok_state = tok_state_script_data_end_tag_name
2867 tok_state = tok_state_script_data
2868 cur -= 1 # Reconsume
2869 return new_character_token '</'
2871 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
2872 tok_state_script_data_end_tag_name = ->
2873 c = txt.charAt(cur++)
2874 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2875 if is_appropriate_end_tag tok_cur_tag
2876 tok_state = tok_state_before_attribute_name
2880 if is_appropriate_end_tag tok_cur_tag
2881 tok_state = tok_state_self_closing_start_tag
2885 tok_cur_tag.name += c.toLowerCase()
2886 temporary_buffer += c
2889 tok_cur_tag.name += c
2890 temporary_buffer += c
2893 tok_state = tok_state_script_data
2894 cur -= 1 # Reconsume
2895 return new_character_token "</#{temporary_buffer}" # fixfull split
2897 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
2898 tok_state_script_data_escape_start = ->
2899 c = txt.charAt(cur++)
2901 tok_state = tok_state_script_data_escape_start_dash
2902 return new_character_token '-'
2904 tok_state = tok_state_script_data
2905 cur -= 1 # Reconsume
2908 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
2909 tok_state_script_data_escape_start_dash = ->
2910 c = txt.charAt(cur++)
2912 tok_state = tok_state_script_data_escaped_dash_dash
2913 return new_character_token '-'
2915 tok_state = tok_state_script_data
2916 cur -= 1 # Reconsume
2919 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
2920 tok_state_script_data_escaped = ->
2921 c = txt.charAt(cur++)
2923 tok_state = tok_state_script_data_escaped_dash
2924 return new_character_token '-'
2926 tok_state = tok_state_script_data_escaped_less_than_sign
2930 return new_character_token "\ufffd"
2932 tok_state = tok_state_data
2934 cur -= 1 # Reconsume
2937 return new_character_token c
2939 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
2940 tok_state_script_data_escaped_dash = ->
2941 c = txt.charAt(cur++)
2943 tok_state = tok_state_script_data_escaped_dash_dash
2944 return new_character_token '-'
2946 tok_state = tok_state_script_data_escaped_less_than_sign
2950 tok_state = tok_state_script_data_escaped
2951 return new_character_token "\ufffd"
2953 tok_state = tok_state_data
2955 cur -= 1 # Reconsume
2958 tok_state = tok_state_script_data_escaped
2959 return new_character_token c
2961 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
2962 tok_state_script_data_escaped_dash_dash = ->
2963 c = txt.charAt(cur++)
2965 return new_character_token '-'
2967 tok_state = tok_state_script_data_escaped_less_than_sign
2970 tok_state = tok_state_script_data
2971 return new_character_token '>'
2974 tok_state = tok_state_script_data_escaped
2975 return new_character_token "\ufffd"
2978 tok_state = tok_state_data
2979 cur -= 1 # Reconsume
2982 tok_state = tok_state_script_data_escaped
2983 return new_character_token c
2985 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer state that the script-data-escaped states switch to (they assign
# this function to tok_state). Reads one character c and either starts an end
# tag, starts a double escape (emitting "<" plus c), or goes back to the
# script data escaped state.
2986 tok_state_script_data_escaped_less_than_sign = ->
2987 c = txt.charAt(cur++)
2989 temporary_buffer = ''
2990 tok_state = tok_state_script_data_escaped_end_tag_open
2993 temporary_buffer = c.toLowerCase() # yes, really
2994 tok_state = tok_state_script_data_double_escape_start
2995 return new_character_token "<#{c}" # fixfull split
2997 temporary_buffer = c
2998 tok_state = tok_state_script_data_double_escape_start
2999 return new_character_token "<#{c}" # fixfull split
3001 tok_state = tok_state_script_data_escaped
3002 cur -= 1 # Reconsume
# c has just been pushed back for reconsumption, so the token owed here is the
# pending '<'. Emitting c instead dropped the '<' and produced c twice.
3003 return new_character_token '<'
3005 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3006 tok_state_script_data_escaped_end_tag_open = ->
3007 c = txt.charAt(cur++)
3009 tok_cur_tag = new_end_tag c.toLowerCase()
3010 temporary_buffer += c
3011 tok_state = tok_state_script_data_escaped_end_tag_name
3014 tok_cur_tag = new_end_tag c
3015 temporary_buffer += c
3016 tok_state = tok_state_script_data_escaped_end_tag_name
3019 tok_state = tok_state_script_data_escaped
3020 cur -= 1 # Reconsume
3021 return new_character_token '</' # fixfull split
3023 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer state that accumulates the name of a candidate end tag inside
# escaped script data. tok_cur_tag.name gets the lower-cased name used by
# is_appropriate_end_tag; temporary_buffer keeps the characters as typed so
# they can be re-emitted as text if this turns out not to be an end tag.
3024 tok_state_script_data_escaped_end_tag_name = ->
3025 c = txt.charAt(cur++)
3026 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3027 if is_appropriate_end_tag tok_cur_tag
3028 tok_state = tok_state_before_attribute_name
3032 if is_appropriate_end_tag tok_cur_tag
3033 tok_state = tok_state_self_closing_start_tag
3037 tok_cur_tag.name += c.toLowerCase()
# Keep the original case in the buffer: it is emitted verbatim below, and
# lower-casing it changed the script text (the spec appends c unchanged).
3038 temporary_buffer += c
3041 tok_cur_tag.name += c
3042 temporary_buffer += c
3045 tok_state = tok_state_script_data_escaped
3046 cur -= 1 # Reconsume
3047 return new_character_token "</#{temporary_buffer}" # fixfull split
3049 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3050 tok_state_script_data_double_escape_start = ->
3051 c = txt.charAt(cur++)
3052 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3053 if temporary_buffer is 'script'
3054 tok_state = tok_state_script_data_double_escaped
3056 tok_state = tok_state_script_data_escaped
3057 return new_character_token c
3059 temporary_buffer += c.toLowerCase() # yes, really lowercase
3060 return new_character_token c
3062 temporary_buffer += c
3063 return new_character_token c
3065 tok_state = tok_state_script_data_escaped
3066 cur -= 1 # Reconsume
3069 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3070 tok_state_script_data_double_escaped = ->
3071 c = txt.charAt(cur++)
3073 tok_state = tok_state_script_data_double_escaped_dash
3074 return new_character_token '-'
3076 tok_state = tok_state_script_data_double_escaped_less_than_sign
3077 return new_character_token '<'
3080 return new_character_token "\ufffd"
3083 tok_state = tok_state_data
3084 cur -= 1 # Reconsume
3087 return new_character_token c
3089 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3090 tok_state_script_data_double_escaped_dash = ->
3091 c = txt.charAt(cur++)
3093 tok_state = tok_state_script_data_double_escaped_dash_dash
3094 return new_character_token '-'
3096 tok_state = tok_state_script_data_double_escaped_less_than_sign
3097 return new_character_token '<'
3100 tok_state = tok_state_script_data_double_escaped
3101 return new_character_token "\ufffd"
3104 tok_state = tok_state_data
3105 cur -= 1 # Reconsume
3108 tok_state = tok_state_script_data_double_escaped
3109 return new_character_token c
3111 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3112 tok_state_script_data_double_escaped_dash_dash = ->
3113 c = txt.charAt(cur++)
3115 return new_character_token '-'
3117 tok_state = tok_state_script_data_double_escaped_less_than_sign
3118 return new_character_token '<'
3120 tok_state = tok_state_script_data
3121 return new_character_token '>'
3124 tok_state = tok_state_script_data_double_escaped
3125 return new_character_token "\ufffd"
3128 tok_state = tok_state_data
3129 cur -= 1 # Reconsume
3132 tok_state = tok_state_script_data_double_escaped
3133 return new_character_token c
3135 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3136 tok_state_script_data_double_escaped_less_than_sign = ->
3137 c = txt.charAt(cur++)
3139 temporary_buffer = ''
3140 tok_state = tok_state_script_data_double_escape_end
3141 return new_character_token '/'
3143 tok_state = tok_state_script_data_double_escaped
3144 cur -= 1 # Reconsume
3147 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3148 tok_state_script_data_double_escape_end = ->
3149 c = txt.charAt(cur++)
3150 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3151 if temporary_buffer is 'script'
3152 tok_state = tok_state_script_data_escaped
3154 tok_state = tok_state_script_data_double_escaped
3155 return new_character_token c
3157 temporary_buffer += c.toLowerCase() # yes, really lowercase
3158 return new_character_token c
3160 temporary_buffer += c
3161 return new_character_token c
3163 tok_state = tok_state_script_data_double_escaped
3164 cur -= 1 # Reconsume
3167 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3168 tok_state_before_attribute_name = ->
3170 switch c = txt.charAt(cur++)
3171 when "\t", "\n", "\u000c", ' '
3174 tok_state = tok_state_self_closing_start_tag
3177 tok_state = tok_state_data
3183 attr_name = "\ufffd"
3184 when '"', "'", '<', '='
3189 tok_state = tok_state_data
3192 attr_name = c.toLowerCase()
3196 tok_cur_tag.attrs_a.unshift [attr_name, '']
3197 tok_state = tok_state_attribute_name
3200 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state that extends the name of the attribute currently being
# built. That attribute is tok_cur_tag.attrs_a[0], a [name, value] pair, and
# index [0][0] is its name. Every character-consuming branch must APPEND to
# the name: assigning with "=" discarded what had been read so far (for
# example "hREF" ended up as "f").
3201 tok_state_attribute_name = ->
3202 switch c = txt.charAt(cur++)
3203 when "\t", "\n", "\u000c", ' '
3204 tok_state = tok_state_after_attribute_name
3206 tok_state = tok_state_self_closing_start_tag
3208 tok_state = tok_state_before_attribute_value
3210 tok_state = tok_state_data
3216 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3219 tok_cur_tag.attrs_a[0][0] += c
3222 tok_state = tok_state_data
3225 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3227 tok_cur_tag.attrs_a[0][0] += c
3230 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3231 tok_state_after_attribute_name = ->
3232 c = txt.charAt(cur++)
3233 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3236 tok_state = tok_state_self_closing_start_tag
3239 tok_state = tok_state_before_attribute_value
3242 tok_state = tok_state_data
3245 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3246 tok_state = tok_state_attribute_name
3250 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3251 tok_state = tok_state_attribute_name
3255 tok_state = tok_state_data
3256 cur -= 1 # reconsume
3258 if c is '"' or c is "'" or c is '<'
3260 # fall through to Anything else
3262 tok_cur_tag.attrs_a.unshift [c, '']
3263 tok_state = tok_state_attribute_name
3265 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3266 tok_state_before_attribute_value = ->
3267 switch c = txt.charAt(cur++)
3268 when "\t", "\n", "\u000c", ' '
3271 tok_state = tok_state_attribute_value_double_quoted
3273 tok_state = tok_state_attribute_value_unquoted
3276 tok_state = tok_state_attribute_value_single_quoted
3279 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3280 tok_state = tok_state_attribute_value_unquoted
3283 tok_state = tok_state_data
3289 tok_state = tok_state_data
3291 tok_cur_tag.attrs_a[0][1] += c
3292 tok_state = tok_state_attribute_value_unquoted
3295 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3296 tok_state_attribute_value_double_quoted = ->
3297 switch c = txt.charAt(cur++)
3299 tok_state = tok_state_after_attribute_value_quoted
3301 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3304 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3307 tok_state = tok_state_data
3309 tok_cur_tag.attrs_a[0][1] += c
3312 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3313 tok_state_attribute_value_single_quoted = ->
3314 switch c = txt.charAt(cur++)
3316 tok_state = tok_state_after_attribute_value_quoted
3318 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3321 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3324 tok_state = tok_state_data
3326 tok_cur_tag.attrs_a[0][1] += c
3329 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3330 tok_state_attribute_value_unquoted = ->
3331 switch c = txt.charAt(cur++)
3332 when "\t", "\n", "\u000c", ' '
3333 tok_state = tok_state_before_attribute_name
3335 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3337 tok_state = tok_state_data
3342 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3345 tok_state = tok_state_data
3347 # Parse Error if ', <, = or ` (backtick)
3348 tok_cur_tag.attrs_a[0][1] += c
3351 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3352 tok_state_after_attribute_value_quoted = ->
3353 switch c = txt.charAt(cur++)
3354 when "\t", "\n", "\u000c", ' '
3355 tok_state = tok_state_before_attribute_name
3357 tok_state = tok_state_self_closing_start_tag
3359 tok_state = tok_state_data
3365 tok_state = tok_state_data
3368 tok_state = tok_state_before_attribute_name
3369 cur -= 1 # we didn't handle that char
3372 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3373 tok_state_self_closing_start_tag = ->
3374 c = txt.charAt(cur++)
3376 tok_cur_tag.flag 'self-closing'
3377 tok_state = tok_state_data
3381 tok_state = tok_state_data
3382 cur -= 1 # Reconsume
3386 tok_state = tok_state_before_attribute_name
3387 cur -= 1 # Reconsume
3390 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3391 # WARNING: put a comment token in tok_cur_tag before setting this state
# Takes the text from cur up to the next '>' (or the rest of txt when the
# indexOf lookup does not find one), replaces NUL characters with U+FFFD,
# appends the result to tok_cur_tag.text and returns to the data state.
3392 tok_state_bogus_comment = ->
3393 next_gt = txt.indexOf '>', cur
3395 val = txt.substr cur
3398 val = txt.substr cur, (next_gt - cur)
# A string pattern makes replace() stop after the first match; use a global
# regex so every U+0000 in the comment is replaced.
3400 val = val.replace(/\u0000/g, "\ufffd")
3401 tok_cur_tag.text += val
3402 tok_state = tok_state_data
3405 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer state reached after "<!". Looks ahead in txt (without consuming a
# character first) for "--", "doctype" (case-insensitive) or, outside the HTML
# namespace, "[CDATA["; anything else becomes a bogus comment.
3406 tok_state_markup_declaration_open = ->
3407 if txt.substr(cur, 2) is '--'
3409 tok_cur_tag = new_comment_token ''
3410 tok_state = tok_state_comment_start
3412 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3414 tok_state = tok_state_doctype
3416 acn = adjusted_current_node()
3417 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3419 tok_state = tok_state_cdata_section
# Per the spec the bogus comment begins with the next character consumed, so
# the "!" itself is not part of the comment data: "<!foo>" yields "foo".
3423 tok_cur_tag = new_comment_token ''
3424 tok_state = tok_state_bogus_comment
3427 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state entered right after "<!--". Consumes one character and
# either moves to another state or appends to the comment in tok_cur_tag.
# NOTE(review): the spec also switches to the comment state after appending
# U+FFFD or any other character; no such assignment is visible in this
# function - confirm whether that is intended.
3428 tok_state_comment_start = ->
3429 switch c = txt.charAt(cur++)
3431 tok_state = tok_state_comment_start_dash
# U+0000 inside a comment belongs to the comment's data (as U+FFFD); emitting
# it as a character token put a stray text token in the middle of the comment.
3434 tok_cur_tag.text += "\ufffd"
3437 tok_state = tok_state_data
3441 tok_state = tok_state_data
3442 cur -= 1 # Reconsume
3445 tok_cur_tag.text += c
3448 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3449 tok_state_comment_start_dash = ->
3450 switch c = txt.charAt(cur++)
3452 tok_state = tok_state_comment_end
3455 tok_cur_tag.text += "-\ufffd"
3456 tok_state = tok_state_comment
3459 tok_state = tok_state_data
3463 tok_state = tok_state_data
3464 cur -= 1 # Reconsume
3467 tok_cur_tag.text += "-#{c}"
3468 tok_state = tok_state_comment
3471 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3472 tok_state_comment = ->
3473 switch c = txt.charAt(cur++)
3475 tok_state = tok_state_comment_end_dash
3478 tok_cur_tag.text += "\ufffd"
3481 tok_state = tok_state_data
3482 cur -= 1 # Reconsume
3485 tok_cur_tag.text += c
3488 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# State handler for "comment end dash" (one "-" seen inside a comment).
# Consumes one character and switches on it. Outcomes visible below: move to
# tok_state_comment_end; append "-\ufffd", or "-" followed by the character,
# to tok_cur_tag.text and return to tok_state_comment; or go back to
# tok_state_data with cur stepped back so the character is reconsumed.
# NOTE(review): confirm the character-to-branch mapping against the spec
# section referenced above this function.
3489 tok_state_comment_end_dash = ->
3490 switch c = txt.charAt(cur++)
3492 tok_state = tok_state_comment_end
3495 tok_cur_tag.text += "-\ufffd"
3496 tok_state = tok_state_comment
3499 tok_state = tok_state_data
3500 cur -= 1 # Reconsume
3503 tok_cur_tag.text += "-#{c}"
3504 tok_state = tok_state_comment
3507 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# State handler for "comment end" (two dashes seen inside a comment).
# Consumes one character and switches on it. Outcomes visible below: go back
# to tok_state_data; append "--\ufffd", or "--" followed by the character, to
# tok_cur_tag.text and return to tok_state_comment; move to
# tok_state_comment_end_bang; append a single '-' to the text; or go back to
# tok_state_data with cur stepped back so the character is reconsumed.
# NOTE(review): confirm the character-to-branch mapping against the spec
# section referenced above this function.
3508 tok_state_comment_end = ->
3509 switch c = txt.charAt(cur++)
3511 tok_state = tok_state_data
3515 tok_cur_tag.text += "--\ufffd"
3516 tok_state = tok_state_comment
3519 tok_state = tok_state_comment_end_bang
3522 tok_cur_tag.text += '-'
3525 tok_state = tok_state_data
3526 cur -= 1 # Reconsume
3530 tok_cur_tag.text += "--#{c}"
3531 tok_state = tok_state_comment
3534 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Comment end bang state: reached after "--!" inside a comment.
# Returns the comment token when the comment ends here ('>' or EOF),
# otherwise null.
tok_state_comment_end_bang = ->
	switch c = txt.charAt(cur++)
		when '-'
			# only "--!" goes into the data: the dash just read is accounted
			# for by the comment end dash state, so it must not be appended
			tok_cur_tag.text += "--!"
			tok_state = tok_state_comment_end_dash
		when '>'
			tok_state = tok_state_data
			return tok_cur_tag
		when "\u0000"
			parse_error()
			tok_cur_tag.text += "--!\ufffd"
			tok_state = tok_state_comment
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
			cur -= 1 # Reconsume
			return tok_cur_tag
		else
			tok_cur_tag.text += "--!#{c}"
			tok_state = tok_state_comment
	return null
3557 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# State handler entered after the "doctype" keyword. Consumes one character.
# Whitespace (tab, LF, FF, space) moves to tok_state_before_doctype_name.
# One branch returns to tok_state_data, creates an empty-named doctype token
# with its 'force-quirks' flag set and steps cur back; the remaining branch
# moves to tok_state_before_doctype_name and reconsumes the character.
# NOTE(review): confirm which characters select the last two branches against
# the spec section referenced above this function.
3558 tok_state_doctype = ->
3559 switch c = txt.charAt(cur++)
3560 when "\t", "\u000a", "\u000c", ' '
3561 tok_state = tok_state_before_doctype_name
3564 tok_state = tok_state_data
3565 el = new_doctype_token ''
3566 el.flag 'force-quirks', true
3567 cur -= 1 # Reconsume
3571 tok_state = tok_state_before_doctype_name
3572 cur -= 1 # Reconsume
3575 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# State handler that creates the doctype token from the first character of its
# name. Consumes one character; whitespace (tab, LF, FF, space) is tested
# first. Token-creating outcomes visible below: name = lower-cased character,
# name = U+FFFD, or name = the character unchanged, each followed by a move to
# tok_state_doctype_name. Two further branches create an empty-named token
# with 'force-quirks' set and return to tok_state_data; the second of them
# also steps cur back so the character is reconsumed.
# NOTE(review): confirm which characters select each branch against the spec
# section for this state.
3576 tok_state_before_doctype_name = ->
3577 c = txt.charAt(cur++)
3578 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3581 tok_cur_tag = new_doctype_token c.toLowerCase()
3582 tok_state = tok_state_doctype_name
3586 tok_cur_tag = new_doctype_token "\ufffd"
3587 tok_state = tok_state_doctype_name
3591 el = new_doctype_token ''
3592 el.flag 'force-quirks', true
3593 tok_state = tok_state_data
3597 tok_state = tok_state_data
3598 el = new_doctype_token ''
3599 el.flag 'force-quirks', true
3600 cur -= 1 # Reconsume
3603 tok_cur_tag = new_doctype_token c
3604 tok_state = tok_state_doctype_name
3607 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# State handler that accumulates the doctype name in tok_cur_tag.name.
# Consumes one character. Whitespace (tab, LF, FF, space) moves to
# tok_state_after_doctype_name. Other outcomes visible below: return to
# tok_state_data; append the lower-cased character, U+FFFD, or the character
# unchanged to the name; or return to tok_state_data with 'force-quirks' set
# and cur stepped back so the character is reconsumed.
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3608 tok_state_doctype_name = ->
3609 c = txt.charAt(cur++)
3610 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3611 tok_state = tok_state_after_doctype_name
3614 tok_state = tok_state_data
3617 tok_cur_tag.name += c.toLowerCase()
3621 tok_cur_tag.name += "\ufffd"
3625 tok_state = tok_state_data
3626 tok_cur_tag.flag 'force-quirks', true
3627 cur -= 1 # Reconsume
3630 tok_cur_tag.name += c
3633 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# State handler that runs after the doctype name. Consumes one character;
# whitespace (tab, LF, FF, space) is tested first. Visible outcomes: return to
# tok_state_data (once plainly, once with 'force-quirks' set and the character
# reconsumed); recognise the keywords "public" / "system" case-insensitively
# and move to the matching after-keyword state; otherwise set 'force-quirks'
# and move to tok_state_bogus_doctype.
3634 tok_state_after_doctype_name = ->
3635 c = txt.charAt(cur++)
3636 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3639 tok_state = tok_state_data
3643 tok_state = tok_state_data
3644 tok_cur_tag.flag 'force-quirks', true
3645 cur -= 1 # Reconsume
# cur already points past c, so cur - 1 is where a 6-letter keyword would start
3648 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3650 tok_state = tok_state_after_doctype_public_keyword
3652 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3654 tok_state = tok_state_after_doctype_system_keyword
3657 tok_cur_tag.flag 'force-quirks', true
3658 tok_state = tok_state_bogus_doctype
3661 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# State handler that runs right after the PUBLIC keyword. Consumes one
# character. Whitespace (tab, LF, FF, space) moves to
# tok_state_before_doctype_public_identifier. Other visible outcomes: reset
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# public identifier state; set 'force-quirks' and return to tok_state_data
# (one such branch reconsumes the character); or set 'force-quirks' and move
# to tok_state_bogus_doctype.
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3662 tok_state_after_doctype_public_keyword = ->
3663 c = txt.charAt(cur++)
3664 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3665 tok_state = tok_state_before_doctype_public_identifier
3669 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3670 tok_state = tok_state_doctype_public_identifier_double_quoted
3674 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3675 tok_state = tok_state_doctype_public_identifier_single_quoted
3679 tok_cur_tag.flag 'force-quirks', true
3680 tok_state = tok_state_data
3684 tok_state = tok_state_data
3685 tok_cur_tag.flag 'force-quirks', true
3686 cur -= 1 # Reconsume
3690 tok_cur_tag.flag 'force-quirks', true
3691 tok_state = tok_state_bogus_doctype
3694 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# State handler that waits for the opening quote of the public identifier.
# Consumes one character; whitespace (tab, LF, FF, space) is tested first.
# Visible outcomes: reset tok_cur_tag.public_identifier to '' and enter the
# double- or single-quoted public identifier state; set 'force-quirks' and
# return to tok_state_data (one such branch reconsumes the character); or set
# 'force-quirks' and move to tok_state_bogus_doctype.
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3695 tok_state_before_doctype_public_identifier = ->
3696 c = txt.charAt(cur++)
3697 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3701 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3702 tok_state = tok_state_doctype_public_identifier_double_quoted
3706 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3707 tok_state = tok_state_doctype_public_identifier_single_quoted
3711 tok_cur_tag.flag 'force-quirks', true
3712 tok_state = tok_state_data
3716 tok_state = tok_state_data
3717 tok_cur_tag.flag 'force-quirks', true
3718 cur -= 1 # Reconsume
3722 tok_cur_tag.flag 'force-quirks', true
3723 tok_state = tok_state_bogus_doctype
3727 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# State handler that accumulates a double-quoted public identifier in
# tok_cur_tag.public_identifier. Consumes one character. Visible outcomes:
# move to tok_state_after_doctype_public_identifier; append U+FFFD or the
# character itself to the identifier; or set 'force-quirks' and return to
# tok_state_data (one such branch reconsumes the character).
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3728 tok_state_doctype_public_identifier_double_quoted = ->
3729 c = txt.charAt(cur++)
3731 tok_state = tok_state_after_doctype_public_identifier
3735 tok_cur_tag.public_identifier += "\ufffd"
3739 tok_cur_tag.flag 'force-quirks', true
3740 tok_state = tok_state_data
3744 tok_state = tok_state_data
3745 tok_cur_tag.flag 'force-quirks', true
3746 cur -= 1 # Reconsume
3749 tok_cur_tag.public_identifier += c
3752 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# State handler that accumulates a single-quoted public identifier in
# tok_cur_tag.public_identifier. Its statements mirror the double-quoted
# handler: move to tok_state_after_doctype_public_identifier; append U+FFFD or
# the character itself to the identifier; or set 'force-quirks' and return to
# tok_state_data (one such branch reconsumes the character).
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3753 tok_state_doctype_public_identifier_single_quoted = ->
3754 c = txt.charAt(cur++)
3756 tok_state = tok_state_after_doctype_public_identifier
3760 tok_cur_tag.public_identifier += "\ufffd"
3764 tok_cur_tag.flag 'force-quirks', true
3765 tok_state = tok_state_data
3769 tok_state = tok_state_data
3770 tok_cur_tag.flag 'force-quirks', true
3771 cur -= 1 # Reconsume
3774 tok_cur_tag.public_identifier += c
3777 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# State handler that runs after the closing quote of the public identifier.
# Consumes one character. Whitespace (tab, LF, FF, space) moves to
# tok_state_between_doctype_public_and_system_identifiers. Other visible
# outcomes: return to tok_state_data; reset tok_cur_tag.system_identifier to
# '' and enter the double- or single-quoted system identifier state; return to
# tok_state_data with 'force-quirks' set and the character reconsumed; or set
# 'force-quirks' and move to tok_state_bogus_doctype.
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3778 tok_state_after_doctype_public_identifier = ->
3779 c = txt.charAt(cur++)
3780 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3781 tok_state = tok_state_between_doctype_public_and_system_identifiers
3784 tok_state = tok_state_data
3788 tok_cur_tag.system_identifier = ''
3789 tok_state = tok_state_doctype_system_identifier_double_quoted
3793 tok_cur_tag.system_identifier = ''
3794 tok_state = tok_state_doctype_system_identifier_single_quoted
3798 tok_state = tok_state_data
3799 tok_cur_tag.flag 'force-quirks', true
3800 cur -= 1 # Reconsume
3804 tok_cur_tag.flag 'force-quirks', true
3805 tok_state = tok_state_bogus_doctype
3808 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# State handler for the gap between the public and system identifiers.
# Consumes one character; whitespace (tab, LF, FF, space) is tested first.
# Visible outcomes: return to tok_state_data; reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state; return to tok_state_data with 'force-quirks' set
# and the character reconsumed; or set 'force-quirks' and move to
# tok_state_bogus_doctype.
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3809 tok_state_between_doctype_public_and_system_identifiers = ->
3810 c = txt.charAt(cur++)
3811 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3814 tok_state = tok_state_data
3818 tok_cur_tag.system_identifier = ''
3819 tok_state = tok_state_doctype_system_identifier_double_quoted
3823 tok_cur_tag.system_identifier = ''
3824 tok_state = tok_state_doctype_system_identifier_single_quoted
3828 tok_state = tok_state_data
3829 tok_cur_tag.flag 'force-quirks', true
3830 cur -= 1 # Reconsume
3834 tok_cur_tag.flag 'force-quirks', true
3835 tok_state = tok_state_bogus_doctype
3838 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# State handler that runs right after the SYSTEM keyword. Consumes one
# character. Whitespace (tab, LF, FF, space) moves to
# tok_state_before_doctype_system_identifier. Other visible outcomes: reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state; set 'force-quirks' and return to tok_state_data
# (one such branch reconsumes the character); or set 'force-quirks' and move
# to tok_state_bogus_doctype.
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3839 tok_state_after_doctype_system_keyword = ->
3840 c = txt.charAt(cur++)
3841 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3842 tok_state = tok_state_before_doctype_system_identifier
3846 tok_cur_tag.system_identifier = ''
3847 tok_state = tok_state_doctype_system_identifier_double_quoted
3851 tok_cur_tag.system_identifier = ''
3852 tok_state = tok_state_doctype_system_identifier_single_quoted
3856 tok_cur_tag.flag 'force-quirks', true
3857 tok_state = tok_state_data
3861 tok_state = tok_state_data
3862 tok_cur_tag.flag 'force-quirks', true
3863 cur -= 1 # Reconsume
3867 tok_cur_tag.flag 'force-quirks', true
3868 tok_state = tok_state_bogus_doctype
3871 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# State handler that waits for the opening quote of the system identifier.
# Consumes one character; whitespace (tab, LF, FF, space) is tested first.
# Visible outcomes: reset tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state; set 'force-quirks' and
# return to tok_state_data (one such branch reconsumes the character); or set
# 'force-quirks' and move to tok_state_bogus_doctype.
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3872 tok_state_before_doctype_system_identifier = ->
3873 c = txt.charAt(cur++)
3874 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3877 tok_cur_tag.system_identifier = ''
3878 tok_state = tok_state_doctype_system_identifier_double_quoted
3881 tok_cur_tag.system_identifier = ''
3882 tok_state = tok_state_doctype_system_identifier_single_quoted
3886 tok_cur_tag.flag 'force-quirks', true
3887 tok_state = tok_state_data
3891 tok_state = tok_state_data
3892 tok_cur_tag.flag 'force-quirks', true
3893 cur -= 1 # Reconsume
3897 tok_cur_tag.flag 'force-quirks', true
3898 tok_state = tok_state_bogus_doctype
3901 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# State handler that accumulates a double-quoted system identifier in
# tok_cur_tag.system_identifier. Consumes one character. Visible outcomes:
# move to tok_state_after_doctype_system_identifier; append U+FFFD or the
# character itself to the identifier; or set 'force-quirks' and return to
# tok_state_data (one such branch reconsumes the character).
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3902 tok_state_doctype_system_identifier_double_quoted = ->
3903 c = txt.charAt(cur++)
3905 tok_state = tok_state_after_doctype_system_identifier
3909 tok_cur_tag.system_identifier += "\ufffd"
3913 tok_cur_tag.flag 'force-quirks', true
3914 tok_state = tok_state_data
3918 tok_state = tok_state_data
3919 tok_cur_tag.flag 'force-quirks', true
3920 cur -= 1 # Reconsume
3923 tok_cur_tag.system_identifier += c
3926 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# State handler that accumulates a single-quoted system identifier in
# tok_cur_tag.system_identifier. Its statements mirror the double-quoted
# handler: move to tok_state_after_doctype_system_identifier; append U+FFFD or
# the character itself to the identifier; or set 'force-quirks' and return to
# tok_state_data (one such branch reconsumes the character).
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3927 tok_state_doctype_system_identifier_single_quoted = ->
3928 c = txt.charAt(cur++)
3930 tok_state = tok_state_after_doctype_system_identifier
3934 tok_cur_tag.system_identifier += "\ufffd"
3938 tok_cur_tag.flag 'force-quirks', true
3939 tok_state = tok_state_data
3943 tok_state = tok_state_data
3944 tok_cur_tag.flag 'force-quirks', true
3945 cur -= 1 # Reconsume
3948 tok_cur_tag.system_identifier += c
3951 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# State handler that runs after the closing quote of the system identifier.
# Consumes one character; whitespace (tab, LF, FF, space) is tested first.
# Visible outcomes: return to tok_state_data; return to tok_state_data with
# 'force-quirks' set and the character reconsumed; or move to
# tok_state_bogus_doctype, deliberately without setting 'force-quirks' (see
# the comment on that branch).
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3952 tok_state_after_doctype_system_identifier = ->
3953 c = txt.charAt(cur++)
3954 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3957 tok_state = tok_state_data
3961 tok_state = tok_state_data
3962 tok_cur_tag.flag 'force-quirks', true
3963 cur -= 1 # Reconsume
3967 # do _not_ tok_cur_tag.flag 'force-quirks', true
3968 tok_state = tok_state_bogus_doctype
3971 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# State handler for a malformed doctype. Consumes one character. Both visible
# branches return to tok_state_data; the second also steps cur back so the
# character is reconsumed. Nothing here modifies the doctype token.
# NOTE(review): confirm which characters select each branch against the spec
# section referenced above this function.
3972 tok_state_bogus_doctype = ->
3973 c = txt.charAt(cur++)
3975 tok_state = tok_state_data
3978 tok_state = tok_state_data
3979 cur -= 1 # Reconsume
3985 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3986 # Don't set this as a state, just call it
3987 # returns a string (NOT a text node)
# Tries to decode a character reference located at txt[cur].
# NOTE(review): presumably the caller has already consumed the "&" — confirm.
#   allowed_char: optional extra character (default null) that the first switch
#                 treats like whitespace, '<', '&' and end of input
#   in_attr:      default false. NOTE(review): presumably enables the
#                 attribute-only exceptions in the legacy lookup — confirm
# The statements below cover three forms: a numeric form (an 'x' test followed
# by a scan over `charset`), names terminated by ';', and unterminated names
# looked up in legacy_char_refs. In the first two the collected text is handed
# to decode_named_char_ref.
3988 parse_character_reference = (allowed_char = null, in_attr = false) ->
3989 if cur >= txt.length
# peek only: cur is not advanced by this switch expression
3991 switch c = txt.charAt(cur)
3992 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3993 # explicitly not a parse error
3996 # there has to be "one or more" alnums between & and ; to be a parse error
3999 if cur + 1 >= txt.length
4001 if txt.charAt(cur + 1).toLowerCase() is 'x'
# advance i while the characters belong to the accepted set
4010 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4014 if txt.charAt(start + i) is ';'
4016 # FIXME This is supposed to generate parse errors for some chars
4017 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4024 if alnum.indexOf(txt.charAt(cur + i)) is -1
4027 # exit early, because parse_error() below needs at least one alnum
4029 if txt.charAt(cur + i) is ';'
4030 i += 1 # include ';' terminator in value
4031 decoded = decode_named_char_ref txt.substr(cur, i)
4038 # no ';' terminator (only legacy char refs)
4040 for i in [2..max] # no prefix matches, so ok to check shortest first
4041 c = legacy_char_refs[txt.substr(cur, i)]
4044 if txt.charAt(cur + i) is '='
4045 # "because some legacy user agents will
4046 # misinterpret the markup in those cases"
4049 if alnum.indexOf(txt.charAt(cur + i)) > -1
4050 # this makes attributes forgiving about url args
4052 # ok, and besides the weird exceptions for attributes...
4053 # return the matching char
4054 cur += i # consume entity chars
4055 parse_error() # because no terminating ";"
4059 return # never reached
# Initial parser state, set before tokenizing starts.
# NOTE(review): these names are presumably shared with the tokenizer and
# insertion-mode handlers through the enclosing scope — confirm.
4061 # tree constructor initialization
4062 # see comments on TYPE_TAG/etc for the structure of this data
# root container: a TYPE_TAG node named 'html' in the HTML namespace
4063 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4065 afe = [] # active formatting elements
4066 template_ins_modes = []
4067 ins_mode = ins_mode_initial
4068 original_ins_mode = ins_mode # TODO check spec
4069 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
4070 flag_frameset_ok = true
4072 flag_foster_parenting = false
4073 form_element_pointer = null
4074 temporary_buffer = null
4075 pending_table_character_tokens = []
4076 head_element_pointer = null
4077 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4078 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4080 # tokenizer initialization
# the tokenizer starts in the data state
4081 tok_state = tok_state_data
4088 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a serialized string: the result of t.serialize(shallow, show_ids) is
# appended to the `serialized` accumulator.
# NOTE(review): presumably `t` ranges over `els` and the accumulated string is
# the return value — confirm.
4091 serialize_els = (els, shallow, show_ids) ->
4097 serialized += t.serialize shallow, show_ids
# TODO export TYPE_*
# Public surface of this module: the parser entry point, the two debug-log
# helpers and the node type constants. Entries are copied onto module.exports
# in the order listed.
exported_api =
	parse_html: parse_html
	debug_log_reset: debug_log_reset
	debug_log_each: debug_log_each
	TYPE_TAG: TYPE_TAG
	TYPE_TEXT: TYPE_TEXT
	TYPE_COMMENT: TYPE_COMMENT
	TYPE_DOCTYPE: TYPE_DOCTYPE
for own export_name, export_value of exported_api
	module.exports[export_name] = export_value