1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # The spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
98 @id = "#{++prev_node_id}"
99 acknowledge_self_closing: ->
101 @token.flag 'did_self_close'
103 @flag 'did_self_close', true
106 serialize: (shallow = false, show_ids = false) -> # for unit tests
111 ret += JSON.stringify @name
126 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
132 ret += c.serialize shallow, show_ids
136 ret += JSON.stringify @text
139 ret += JSON.stringify @text
141 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
144 when TYPE_AAA_BOOKMARK
145 ret += 'aaa_bookmark'
148 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
151 # helpers: (only take args that are normally known when parser creates nodes)
# Build a TYPE_START_TAG node carrying the given tag name.
new_open_tag = (name) ->
	new Node TYPE_START_TAG, {name}
# Build a TYPE_END_TAG node carrying the given tag name.
new_end_tag = (name) ->
	new Node TYPE_END_TAG, {name}
# Build a TYPE_TAG (tree element) node carrying the given tag name.
new_element = (name) ->
	new Node TYPE_TAG, {name}
# Build a TYPE_TEXT node whose text is `txt`.
new_text_node = (txt) ->
	new Node TYPE_TEXT, text: txt
# Character tokens and text nodes share one representation, so this is an alias.
new_character_token = new_text_node
# Build a TYPE_COMMENT node whose text is `txt`.
new_comment_token = (txt) ->
	new Node TYPE_COMMENT, text: txt
# Build a TYPE_DOCTYPE node carrying the given doctype name.
new_doctype_token = (name) ->
	new Node TYPE_DOCTYPE, {name}
166 return new Node TYPE_EOF
168 return new Node TYPE_AFE_MARKER
# Build a TYPE_AAA_BOOKMARK placeholder node (no name, no text).
new_aaa_bookmark = ->
	new Node TYPE_AAA_BOOKMARK
# Character classes, each stored as a string of its member characters
# (membership is tested elsewhere with indexOf).
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# letters (both cases) followed by digits
alnum = [lc_alpha, uc_alpha, digits].join ''
# decimal digits followed by a-f in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True only when `str` is exactly one character and that character is in uc_alpha.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) > -1
# True only when `str` is exactly one character and that character is in lc_alpha.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) > -1
183 # some SVG elements have dashes in them
184 tag_name_chars = alnum + "-"
186 # http://www.w3.org/TR/html5/infrastructure.html#space-character
187 space_chars = "\u0009\u000a\u000c\u000d\u0020"
189 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is exactly one
# character from space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) > -1
193 is_input_hidden_tok = (t) ->
194 return unless t.type is TYPE_START_TAG
197 if a[1].toLowerCase() is 'hidden'
202 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
203 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
205 # These are the character references that don't need a terminating semicolon
206 # min length: 2, max: 6, none are a prefix of any other.
208 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
209 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
210 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
211 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
212 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
213 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
214 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
215 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
216 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
217 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
218 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
219 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
220 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
221 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
222 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
223 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
224 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
228 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
229 raw_text_elements = ['script', 'style']
230 escapable_raw_text_elements = ['textarea', 'title']
231 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
233 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
234 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
235 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
236 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
237 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
238 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
239 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
240 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
241 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
242 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
243 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
244 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
245 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
246 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
250 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
252 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
253 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
254 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
255 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
256 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
257 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
258 'determinant', 'diff', 'divergence', 'divide', 'domain',
259 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
260 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
261 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
262 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
263 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
264 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
265 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
266 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
267 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
268 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
269 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
270 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
271 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
272 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
273 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
274 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
275 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
276 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
277 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
278 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
279 'vectorproduct', 'xor'
281 # foreign_elements = [svg_elements..., mathml_elements...]
282 #normal_elements = All other allowed HTML elements are normal elements.
286 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
287 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
288 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
289 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
290 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
291 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
292 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
293 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
294 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
295 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
296 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
297 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
298 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
299 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
300 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
301 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
302 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
303 wbr:NS_HTML, xmp:NS_HTML,
306 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
307 'annotation-xml':NS_MATHML,
310 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
313 formatting_elements = {
314 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
315 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
319 foster_parenting_targets = {
# True when special_elements maps this element's name to this element's
# namespace (i.e. the name is listed AND the namespace matches).
el_is_special = (e) ->
	expected_ns = special_elements[e.name]
	expected_ns is e.namespace
344 # decode_named_char_ref()
346 # The list of named character references is _huge_ so ask the browser to decode
347 # for us instead of wasting bandwidth/space on including the table here.
349 # Pass without the "&" but with the ";" examples:
350 # for "&" pass "amp;"
351 # for "′" pass "x2032;"
354 textarea: document.createElement('textarea')
356 # TODO test this in IE8
357 decode_named_char_ref = (txt) ->
359 decoded = g_dncr.cache[txt]
360 return decoded if decoded?
361 g_dncr.textarea.innerHTML = txt
362 decoded = g_dncr.textarea.value
363 return null if decoded is txt
364 return g_dncr.cache[txt] = decoded
366 parse_html = (txt, parse_error_cb = null) ->
367 cur = 0 # index of next char in txt to be parsed
368 # declare doc and tokenizer variables so they're in scope below
370 open_els = null # stack of open elements
371 afe = null # active formatting elements
372 template_insertion_modes = null
373 insertion_mode = null
374 original_insertion_mode = null
376 tok_cur_tag = null # partially parsed tag
377 flag_scripting = null
378 flag_frameset_ok = null
380 flag_foster_parenting = null
381 form_element_pointer = null
382 temporary_buffer = null
383 pending_table_character_tokens = null
384 head_element_pointer = null
385 flag_fragment_parsing = null
386 context_element = null
395 console.log "Parse error at character #{cur} of #{txt.length}"
397 afe_push = (new_el) ->
400 if el.name is new_el.name and el.namespace is new_el.namespace
402 continue unless new_el.attrs[k] is v
403 for k, v of new_el.attrs
404 continue unless el.attrs[k] is v
411 afe.unshift new_afe_marker()
413 # the functions below implement the Tree Construction algorithm
414 # http://www.w3.org/TR/html5/syntax.html#tree-construction
416 # But first... the helpers
417 template_tag_is_open = ->
419 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
422 is_in_scope_x = (tag_name, scope, namespace) ->
424 if t.name is tag_name and (namespace is null or namespace is t.namespace)
426 if scope[t.name] is t.namespace
429 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
431 if t.name is tag_name and (namespace is null or namespace is t.namespace)
433 if scope[t.name] is t.namespace
435 if scope2[t.name] is t.namespace
439 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
440 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
441 template: NS_HTML, mi: NS_MATHML,
443 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
444 'annotation-xml': NS_MATHML,
446 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
448 button_scopers = button: NS_HTML
449 li_scopers = ol: NS_HTML, ul: NS_HTML
450 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Scope test using the standard_scopers table as the set of boundary elements.
# `namespace` defaults to null, which is_in_scope_x treats as "any namespace".
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, standard_scopers, namespace
# Scope test whose boundary elements are standard_scopers plus button_scopers.
# `namespace` defaults to null, which is_in_scope_x_y treats as "any namespace".
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
# Scope test using the table_scopers table as the set of boundary elements.
# `namespace` defaults to null, which is_in_scope_x treats as "any namespace".
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, table_scopers, namespace
# "Has an element in select scope" (HTML5 tree construction).
#
# Walks open_els starting at index 0 and returns true as soon as an element
# named `tag_name` is found (restricted to `namespace` unless it is null).
# For select scope, EVERY element is a scope boundary except optgroup and
# option in the HTML namespace, so the walk gives up (returns false) at the
# first element that is not one of those two.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		# Nodes keep their namespace in `namespace` (there is no `ns` property),
		# and an element is transparent only when it is BOTH in the HTML
		# namespace AND named optgroup/option.
		unless t.namespace is NS_HTML and (t.name is 'optgroup' or t.name is 'option')
			return false
	return false
464 # this checks for a particular element, not by name
465 el_is_in_scope = (el) ->
469 if standard_scopers[t.name] is t.namespace
473 clear_to_table_stopers = {
478 clear_stack_to_table_context = ->
480 if clear_to_table_stopers[open_els[0].name]?
484 clear_to_table_body_stopers = {
491 clear_stack_to_table_body_context = ->
493 if clear_to_table_body_stopers[open_els[0].name]?
497 clear_to_table_row_stopers = {
502 clear_stack_to_table_row_context = ->
504 if clear_to_table_row_stopers[open_els[0].name]?
508 clear_afe_to_marker = ->
510 return unless afe.length > 0 # this happens in fragment case, ?spec error
512 if el.type is TYPE_AFE_MARKER
517 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
# Sets insertion_mode from the current contents of open_els, following the
# numbered steps of the spec's "reset the insertion mode appropriately"
# algorithm. open_els[0] is the current node; open_els[open_els.length - 1]
# is the first (topmost) node. Takes no arguments and returns nothing.
reset_insertion_mode = ->
	# 1. Let last be false.
	last = false
	# 2. Let node be the last node in the stack of open elements.
	node_i = 0
	node = open_els[node_i]
	# 3. Loop: If node is the first node in the stack of open elements,
	# then set last to true, and, if the parser was originally created as
	# part of the HTML fragment parsing algorithm (fragment case) set node
	# to the context element.
	loop
		if node_i is open_els.length - 1
			last = true
			# fixfull (fragment case)

		# 4. If node is a select element, run these substeps:
		if node.name is 'select'
			# 1. If last is true, jump to the step below labeled done.
			unless last
				# 2. Let ancestor be node.
				ancestor_i = node_i
				ancestor = node
				# 3. Loop: If ancestor is the first node in the stack of
				# open elements, jump to the step below labeled done.
				loop
					if ancestor_i is open_els.length - 1
						break
					# 4. Let ancestor be the node before ancestor in the stack
					# of open elements.
					ancestor_i += 1
					ancestor = open_els[ancestor_i]
					# 5. If ancestor is a template node, jump to the step below
					# labeled done.
					if ancestor.name is 'template'
						break
					# 6. If ancestor is a table node, switch the insertion mode
					# to "in select in table" and abort these steps.
					if ancestor.name is 'table'
						insertion_mode = ins_mode_in_select_in_table
						return
					# 7. Jump back to the step labeled loop.
			# 8. Done: Switch the insertion mode to "in select" and abort
			# these steps.
			insertion_mode = ins_mode_in_select
			return
		# 5. If node is a td or th element and last is false, then switch
		# the insertion mode to "in cell" and abort these steps.
		if (node.name is 'td' or node.name is 'th') and last is false
			insertion_mode = ins_mode_in_cell
			return
		# 6. If node is a tr element, then switch the insertion mode to "in
		# row" and abort these steps.
		if node.name is 'tr'
			insertion_mode = ins_mode_in_row
			return
		# 7. If node is a tbody, thead, or tfoot element, then switch the
		# insertion mode to "in table body" and abort these steps.
		if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
			insertion_mode = ins_mode_in_table_body
			return
		# 8. If node is a caption element, then switch the insertion mode
		# to "in caption" and abort these steps.
		if node.name is 'caption'
			insertion_mode = ins_mode_in_caption
			return
		# 9. If node is a colgroup element, then switch the insertion mode
		# to "in column group" and abort these steps.
		if node.name is 'colgroup'
			insertion_mode = ins_mode_in_column_group
			return
		# 10. If node is a table element, then switch the insertion mode to
		# "in table" and abort these steps.
		if node.name is 'table'
			insertion_mode = ins_mode_in_table
			return
		# 11. If node is a template element, then switch the insertion mode
		# to the current template insertion mode and abort these steps.
		# fixfull (template insertion mode stack)

		# 12. If node is a head element and last is true, then switch the
		# insertion mode to "in body" ("in body"! not "in head"!) and abort
		# these steps. (fragment case)
		if node.name is 'head' and last
			insertion_mode = ins_mode_in_body
			return
		# 13. If node is a head element and last is false, then switch the
		# insertion mode to "in head" and abort these steps.
		if node.name is 'head' and last is false
			insertion_mode = ins_mode_in_head
			return
		# 14. If node is a body element, then switch the insertion mode to
		# "in body" and abort these steps.
		if node.name is 'body'
			insertion_mode = ins_mode_in_body
			return
		# 15. If node is a frameset element, then switch the insertion mode
		# to "in frameset" and abort these steps. (fragment case)
		if node.name is 'frameset'
			insertion_mode = ins_mode_in_frameset
			return
		# 16. If node is an html element, run these substeps:
		if node.name is 'html'
			# 1. If the head element pointer is null, switch the insertion
			# mode to "before head" and abort these steps. (fragment case)
			if head_element_pointer is null
				# This must assign insertion_mode itself; assigning any other
				# name would leave the parser in its previous mode.
				insertion_mode = ins_mode_before_head
				return
			# 2. Otherwise, the head element pointer is not null,
			# switch the insertion mode to "after head" and abort these
			# steps.
			insertion_mode = ins_mode_after_head
			return
		# 17. If last is true, then switch the insertion mode to "in body"
		# and abort these steps. (fragment case)
		if last
			insertion_mode = ins_mode_in_body
			return
		# 18. Let node now be the node before node in the stack of open
		# elements.
		node_i += 1
		node = open_els[node_i]
		# 19. Return to the step labeled loop.
643 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
644 adjusted_current_node = ->
645 if open_els.length is 1 and flag_fragment_parsing
646 return context_element
649 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
650 # this implementation is structured (mostly) as described at the link above.
651 # capitalized comments are the "labels" described at the link above.
652 reconstruct_active_formatting_elements = ->
653 return if afe.length is 0
654 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
659 if i is afe.length - 1
662 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
667 el = insert_html_element afe[i].token
672 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
673 # adoption agency algorithm
675 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
676 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
677 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
678 adoption_agency = (subject) ->
679 debug_log "adoption_agency()"
680 debug_log "tree: #{serialize_els doc.children, false, true}"
681 debug_log "open_els: #{serialize_els open_els, true, true}"
682 debug_log "afe: #{serialize_els afe, true, true}"
683 if open_els[0].name is subject
686 # remove it from the list of active formatting elements (if found)
691 debug_log "aaa: starting off with subject on top of stack, exiting"
698 # 5. Let formatting element be the last element in the list of
699 # active formatting elements that: is between the end of the list
700 # and the last scope marker in the list, if any, or the start of
701 # the list otherwise, and has the tag name subject.
703 for t, fe_of_afe in afe
704 if t.type is TYPE_AFE_MARKER
709 # If there is no such element, then abort these steps and instead
710 # act as described in the "any other end tag" entry above.
712 debug_log "aaa: fe not found in afe"
713 in_body_any_other_end_tag subject
715 # 6. If formatting element is not in the stack of open elements,
716 # then this is a parse error; remove the element from the list, and
719 for t, fe_of_open_els in open_els
724 debug_log "aaa: fe not found in open_els"
726 # "remove it from the list" must mean afe, since it's not in open_els
727 afe.splice fe_of_afe, 1
729 # 7. If formatting element is in the stack of open elements, but
730 # the element is not in scope, then this is a parse error; abort
732 unless el_is_in_scope fe
733 debug_log "aaa: fe not in scope"
736 # 8. If formatting element is not the current node, this is a parse
737 # error. (But do not abort these steps.)
738 unless open_els[0] is fe
741 # 9. Let furthest block be the topmost node in the stack of open
742 # elements that is lower in the stack than formatting element, and
743 # is an element in the special category. There might not be one.
745 fb_of_open_els = null
752 # and continue, to see if there's one that's more "topmost"
753 # 10. If there is no furthest block, then the UA must first pop all
754 # the nodes from the bottom of the stack of open elements, from the
755 # current node up to and including formatting element, then remove
756 # formatting element from the list of active formatting elements,
757 # and finally abort these steps.
759 debug_log "aaa: no fb"
763 afe.splice fe_of_afe, 1
765 # 11. Let common ancestor be the element immediately above
766 # formatting element in the stack of open elements.
767 ca = open_els[fe_of_open_els + 1] # common ancestor
769 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
770 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
771 bookmark = new_aaa_bookmark()
774 afe.splice i, 0, bookmark
776 node = last_node = fb
780 # 3. Let node be the element immediately above node in the
781 # stack of open elements, or if node is no longer in the stack
782 # of open elements (e.g. because it got removed by this
783 # algorithm), the element that was immediately above node in
784 # the stack of open elements before node was removed.
788 node_next = open_els[i + 1]
790 node = node_next ? node_above
791 debug_log "inner loop #{inner}"
792 debug_log "tree: #{serialize_els doc.children, false, true}"
793 debug_log "open_els: #{serialize_els open_els, true, true}"
794 debug_log "afe: #{serialize_els afe, true, true}"
795 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
796 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
797 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
798 debug_log "node: #{node.serialize true, true}"
799 # TODO make sure node_above gets re-set if/when node is removed from open_els
801 # 4. If node is formatting element, then go to the next step in
802 # the overall algorithm.
806 # 5. If inner loop counter is greater than three and node is in
807 # the list of active formatting elements, then remove node from
808 # the list of active formatting elements.
814 debug_log "max out inner"
819 # 6. If node is not in the list of active formatting elements,
820 # then remove node from the stack of open elements and then go
821 # back to the step labeled inner loop.
823 debug_log "not in afe"
826 node_above = open_els[i + 1]
830 debug_log "the bones"
831 # 7. create an element for the token for which the element node
832 # was created, in the HTML namespace, with common ancestor as
833 # the intended parent; replace the entry for node in the list
834 # of active formatting elements with an entry for the new
835 # element, replace the entry for node in the stack of open
836 # elements with an entry for the new element, and let node be
838 new_node = token_to_element node.token, NS_HTML, ca
842 debug_log "replaced in afe"
846 node_above = open_els[i + 1]
847 open_els[i] = new_node
848 debug_log "replaced in open_els"
851 # 8. If last node is furthest block, then move the
852 # aforementioned bookmark to be immediately after the new node
853 # in the list of active formatting elements.
858 debug_log "removed bookmark"
862 # "after" means lower
863 afe.splice i, 0, bookmark # "after as <-
864 debug_log "placed bookmark after node"
865 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
867 # 9. Insert last node into node, first removing it from its
868 # previous parent node if any.
870 debug_log "last_node has parent"
871 for c, i in last_node.parent.children
873 debug_log "removing last_node from parent"
874 last_node.parent.children.splice i, 1
876 node.children.push last_node
877 last_node.parent = node
878 # 10. Let last node be node.
881 # 11. Return to the step labeled inner loop.
882 # 14. Insert whatever last node ended up being in the previous step
883 # at the appropriate place for inserting a node, but using common
884 # ancestor as the override target.
886 # In the case where fe is immediately followed by fb:
887 # * inner loop exits out early (node==fe)
889 # * last_node is still in the tree (not a duplicate)
891 debug_log "FEFIRST? last_node has parent"
892 for c, i in last_node.parent.children
894 debug_log "removing last_node from parent"
895 last_node.parent.children.splice i, 1
898 debug_log "after aaa inner loop"
899 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
900 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
901 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
902 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
903 debug_log "tree: #{serialize_els doc.children, false, true}"
908 # can't use standard insert token thing, because it's already in
909 # open_els and must stay at its current position in open_els
910 dest = adjusted_insertion_location ca
911 dest[0].children.splice dest[1], 0, last_node
912 last_node.parent = dest[0]
915 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
916 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
917 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
918 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
919 debug_log "tree: #{serialize_els doc.children, false, true}"
921 # 15. Create an element for the token for which formatting element
922 # was created, in the HTML namespace, with furthest block as the
924 new_element = token_to_element fe.token, NS_HTML, fb
925 # 16. Take all of the child nodes of furthest block and append them
926 # to the element created in the last step.
927 while fb.children.length
928 t = fb.children.shift()
929 t.parent = new_element
930 new_element.children.push t
931 # 17. Append that new element to furthest block.
932 new_element.parent = fb
933 fb.children.push new_element
934 # 18. Remove formatting element from the list of active formatting
935 # elements, and insert the new element into the list of active
936 # formatting elements at the position of the aforementioned
946 # 19. Remove formatting element from the stack of open elements,
947 # and insert the new element into the stack of open elements
948 # immediately below the position of furthest block in that stack.
955 open_els.splice i, 0, new_element
957 # 20. Jump back to the step labeled outer loop.
958 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
959 debug_log "tree: #{serialize_els doc.children, false, true}"
960 debug_log "open_els: #{serialize_els open_els, true, true}"
961 debug_log "afe: #{serialize_els afe, true, true}"
964 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
966 generate_implied_end_tags 'p' # arg is exception
967 if open_els[0].name isnt 'p'
969 while open_els.length > 1 # just in case
970 el = open_els.shift()
973 close_p_if_in_button_scope = ->
974 if is_in_button_scope 'p'
977 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
978 # aka insert_a_character = (t) ->
979 insert_character = (t) ->
980 dest = adjusted_insertion_location()
981 # fixfull check for Document node
983 prev = dest[0].children[dest[1] - 1]
984 if prev.type is TYPE_TEXT
987 dest[0].children.splice dest[1], 0, t
990 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
991 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
992 adjusted_insertion_location = (override_target = null) ->
993 # 1. If there was an override target specified, then let target be the
996 target = override_target
997 else # Otherwise, let target be the current node.
999 # 2. Determine the adjusted insertion location using the first matching
1000 # steps from the following list:
1002 # If foster parenting is enabled and target is a table, tbody, tfoot,
1003 # thead, or tr element Foster parenting happens when content is
1004 # misnested in tables.
1005 if flag_foster_parenting and foster_parenting_targets[target.name]
1006 loop # once. this is here so we can ``break`` to "abort these substeps"
1007 # 1. Let last template be the last template element in the
1008 # stack of open elements, if any.
1009 last_template = null
1010 last_template_i = null
1011 for el, i in open_els
1012 if el.name is 'template'
1016 # 2. Let last table be the last table element in the stack of
1017 # open elements, if any.
1020 for el, i in open_els
1021 if el.name is 'table'
1025 # 3. If there is a last template and either there is no last
1026 # table, or there is one, but last template is lower (more
1027 # recently added) than last table in the stack of open
1028 # elements, then: let adjusted insertion location be inside
1029 # last template's template contents, after its last child (if
1030 # any), and abort these substeps.
1031 if last_template and (last_table is null or last_template_i < last_table_i)
1032 target = last_template # fixfull should be its contents
1033 target_i = target.children.length
1035 # 4. If there is no last table, then let adjusted insertion
1036 # location be inside the first element in the stack of open
1037 # elements (the html element), after its last child (if any),
1038 # and abort these substeps. (fragment case)
1039 if last_table is null
1041 target = open_els[open_els.length - 1]
1042 target_i = target.children.length
1043 # 5. If last table has a parent element, then let adjusted
1044 # insertion location be inside last table's parent element,
1045 # immediately before last table, and abort these substeps.
1046 if last_table.parent?
1047 for c, i in last_table.parent.children
1049 target = last_table.parent
1053 # 6. Let previous element be the element immediately above last
1054 # table in the stack of open elements.
1056 # huh? how could it not have a parent?
1057 previous_element = open_els[last_table_i + 1]
1058 # 7. Let adjusted insertion location be inside previous
1059 # element, after its last child (if any).
1060 target = previous_element
1061 target_i = target.children.length
1062 # Note: These steps are involved in part because it's possible
1063 # for elements, the table element in this case in particular,
1064 # to have been moved by a script around in the DOM, or indeed
1065 # removed from the DOM entirely, after the element was inserted
1067 break # don't really loop
1069 # Otherwise Let adjusted insertion location be inside target, after
1070 # its last child (if any).
1071 target_i = target.children.length
1073 # 3. If the adjusted insertion location is inside a template element,
1074 # let it instead be inside the template element's template contents,
1075 # after its last child (if any).
1076 # fixfull (template)
1078 # 4. Return the adjusted insertion location.
1079 return [target, target_i]
1081 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1082 # aka create_an_element_for_token
1083 token_to_element = (t, namespace, intended_parent) ->
1084 # convert attributes into a hash
1087 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1088 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1090 # TODO 2. If the newly created element has an xmlns attribute in the
1091 # XMLNS namespace whose value is not exactly the same as the element's
1092 # namespace, that is a parse error. Similarly, if the newly created
1093 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1094 # value is not the XLink Namespace, that is a parse error.
1096 # fixfull: the spec says stuff about form pointers and ownerDocument
1100 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1101 insert_foreign_element = (token, namespace) ->
1102 ail = adjusted_insertion_location()
1105 el = token_to_element token, namespace, ail_el
1106 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1108 ail_el.children.splice ail_i, 0, el
1111 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1112 insert_html_element = insert_foreign_element # (token, namespace) ->
1114 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1115 # position should be [node, index_within_children]
# Splices comment token/node t into position[0].children at index position[1].
# When no position is passed (null/undefined), the adjusted insertion location
# is used instead.
1116 insert_comment = (t, position = null) ->
1117 position ?= adjusted_insertion_location()
1118 position[0].children.splice position[1], 0, t
1121 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an HTML element for token t, points the tokenizer at the rawtext
# state, saves the current insertion mode in original_insertion_mode, and
# then switches the insertion mode to ins_mode_text.
1122 parse_generic_raw_text = (t) ->
1123 insert_html_element t
1124 tok_state = tok_state_rawtext
1125 original_insertion_mode = insertion_mode
1126 insertion_mode = ins_mode_text
# Same sequence as parse_generic_raw_text, except the tokenizer is pointed at
# the rcdata state: insert an HTML element for token t, save the current
# insertion mode in original_insertion_mode, then switch to ins_mode_text.
1127 parse_generic_rcdata_text = (t) ->
1128 insert_html_element t
1129 tok_state = tok_state_rcdata
1130 original_insertion_mode = insertion_mode
1131 insertion_mode = ins_mode_text
1133 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1134 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1135 generate_implied_end_tags = (except = null) ->
1136 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1139 # 8.2.5.4 The rules for parsing tokens in HTML content
1140 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1142 # 8.2.5.4.1 The "initial" insertion mode
1143 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1144 ins_mode_initial = (t) ->
1147 if t.type is TYPE_COMMENT
1151 if t.type is TYPE_DOCTYPE
1152 # FIXME check identifiers, set quirks, etc
1155 insertion_mode = ins_mode_before_html
1158 #fixfull (iframe, quirks)
1159 insertion_mode = ins_mode_before_html
1160 insertion_mode t # reprocess the token
1163 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1164 ins_mode_before_html = (t) ->
1165 if t.type is TYPE_DOCTYPE
1168 if t.type is TYPE_COMMENT
1173 if t.type is TYPE_START_TAG and t.name is 'html'
1174 el = token_to_element t, NS_HTML, doc
1175 doc.children.push el
1176 open_els.unshift(el)
1177 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1178 insertion_mode = ins_mode_before_head
1180 if t.type is TYPE_END_TAG
1181 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1182 # fall through to "anything else"
1187 html_tok = new_open_tag 'html'
1188 el = token_to_element html_tok, NS_HTML, doc
1189 doc.children.push el
1191 # ?fixfull browsing context
1192 insertion_mode = ins_mode_before_head
1196 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1197 ins_mode_before_head = (t) ->
1200 if t.type is TYPE_COMMENT
1203 if t.type is TYPE_DOCTYPE
1206 if t.type is TYPE_START_TAG and t.name is 'html'
1209 if t.type is TYPE_START_TAG and t.name is 'head'
1210 el = insert_html_element t
1211 head_element_pointer = el
1212 insertion_mode = ins_mode_in_head
1213 if t.type is TYPE_END_TAG
1214 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1215 # fall through to Anything else below
1220 head_tok = new_open_tag 'head'
1221 el = insert_html_element head_tok
1222 head_element_pointer = el
1223 insertion_mode = ins_mode_in_head
1224 insertion_mode t # reprocess current token
1226 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1227 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1228 open_els.shift() # spec says this will be a 'head' node
1229 insertion_mode = ins_mode_after_head
1231 ins_mode_in_head = (t) ->
1232 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1235 if t.type is TYPE_COMMENT
1238 if t.type is TYPE_DOCTYPE
1241 if t.type is TYPE_START_TAG and t.name is 'html'
1244 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1245 el = insert_html_element t
1247 t.acknowledge_self_closing()
1249 if t.type is TYPE_START_TAG and t.name is 'meta'
1250 el = insert_html_element t
1252 t.acknowledge_self_closing()
1253 # fixfull encoding stuff
1255 if t.type is TYPE_START_TAG and t.name is 'title'
1256 parse_generic_rcdata_text t
1258 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1259 parse_generic_raw_text t
1261 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1262 insert_html_element t
1263 insertion_mode = ins_mode_in_head_noscript # FIXME implement
1265 if t.type is TYPE_START_TAG and t.name is 'script'
1266 ail = adjusted_insertion_location()
1267 el = token_to_element t, NS_HTML, ail[0] # intended parent is the node half of the [node, index] pair
1268 el.flag 'parser-inserted', true # FIXME implement
1269 # fixfull fragment case
1270 ail[0].children.splice ail[1], 0, el
1272 tok_state = tok_state_script_data
1273 original_insertion_mode = insertion_mode # make sure orig... is defined
1274 insertion_mode = ins_mode_text # FIXME implement
1276 if t.type is TYPE_END_TAG and t.name is 'head'
1277 open_els.shift() # will be a head element... spec says so
1278 insertion_mode = ins_mode_after_head
1280 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1281 ins_mode_in_head_else t
1283 if t.type is TYPE_START_TAG and t.name is 'template'
1284 insert_html_element t
1286 flag_frameset_ok = false
1287 insertion_mode = ins_mode_in_template
1288 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1290 if t.type is TYPE_END_TAG and t.name is 'template'
1291 if template_tag_is_open()
1292 generate_implied_end_tags() # parens required: a bare name only references the function, it does not call it
1293 if open_els[0].name isnt 'template'
1296 el = open_els.shift()
1297 if el.name is 'template'
1299 clear_afe_to_marker()
1300 template_insertion_modes.shift()
1301 reset_insertion_mode()
1305 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1308 ins_mode_in_head_else t
1310 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1311 ins_mode_in_head_noscript = (t) ->
1313 console.log "ins_mode_in_head_noscript unimplemented"
1315 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# Fallback used by ins_mode_after_head: builds an open-tag token named 'body',
# inserts an HTML element for it, switches the insertion mode to
# ins_mode_in_body and then passes the original token t to that mode.
1316 ins_mode_after_head_else = (t) ->
1317 body_tok = new_open_tag 'body'
1318 insert_html_element body_tok
1319 insertion_mode = ins_mode_in_body
1320 insertion_mode t # reprocess token
1322 ins_mode_after_head = (t) ->
1326 if t.type is TYPE_COMMENT
1329 if t.type is TYPE_DOCTYPE
1332 if t.type is TYPE_START_TAG and t.name is 'html'
1335 if t.type is TYPE_START_TAG and t.name is 'body'
1336 insert_html_element t
1337 flag_frameset_ok = false
1338 insertion_mode = ins_mode_in_body
1340 if t.type is TYPE_START_TAG and t.name is 'frameset'
1341 insert_html_element t
1342 insertion_mode = ins_mode_in_frameset
1344 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1346 open_els.unshift head_element_pointer
1348 for el, i in open_els # "in" walks array elements (el) with their index (i); "of" would bind el to the key
1349 if el is head_element_pointer
1350 open_els.splice i, 1
1352 console.log "warning: 23904 couldn't find head element in open_els"
1354 if t.type is TYPE_END_TAG and t.name is 'template'
1357 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1358 ins_mode_after_head_else t
1360 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1364 ins_mode_after_head_else t
1366 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1367 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1368 for node, i in open_els
1369 if node.name is name # FIXME check namespace too
1370 generate_implied_end_tags name # arg is exception
1371 parse_error() unless i is 0
1376 if special_elements[node.name]? # FIXME check namespace too
1379 ins_mode_in_body = (t) ->
1385 when "\t", "\u000a", "\u000c", "\u000d", ' '
1386 reconstruct_active_formatting_elements()
1389 reconstruct_active_formatting_elements()
1391 flag_frameset_ok = false
1400 return if template_tag_is_open()
1401 root_attrs = open_els[open_els.length - 1].attrs
1403 root_attrs[k] = v unless root_attrs[k]?
1404 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1405 # FIXME also do this for </template> (end tag)
1406 return ins_mode_in_head t
1413 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1414 close_p_if_in_button_scope()
1415 insert_html_element t
1416 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1417 close_p_if_in_button_scope()
1418 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1421 insert_html_element t
1422 # TODO lots more to implement here
1424 # If the list of active formatting elements
1425 # contains an a element between the end of the list and
1426 # the last marker on the list (or the start of the list
1427 # if there is no marker on the list), then this is a
1428 # parse error; run the adoption agency algorithm for
1429 # the tag name "a", then remove that element from the
1430 # list of active formatting elements and the stack of
1431 # open elements if the adoption agency algorithm didn't
1432 # already remove it (it might not have if the element
1433 # is not in table scope).
1436 if el.type is TYPE_AFE_MARKER
1446 for el, i in open_els
1448 open_els.splice i, 1
1449 reconstruct_active_formatting_elements()
1450 el = insert_html_element t
1452 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1453 reconstruct_active_formatting_elements()
1454 el = insert_html_element t
1457 # fixfull quirksmode thing
1458 close_p_if_in_button_scope()
1459 insert_html_element t
1460 insertion_mode = ins_mode_in_table
1461 # TODO lots more to implement here
1462 else # any other start tag
1463 reconstruct_active_formatting_elements()
1464 insert_html_element t
1467 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1468 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1471 unless ok_tags[t.name]?
1474 # TODO stack of template insertion modes thing
1479 unless is_in_scope 'body'
1482 # TODO implement parse error and move to tree_after_body
1484 unless is_in_scope 'body' # weird, but it's what the spec says
1487 # TODO implement parse error and move to tree_after_body, reprocess
1488 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1489 unless is_in_scope t.name, NS_HTML
1492 generate_implied_end_tags()
1493 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1496 el = open_els.shift()
1497 if el.name is t.name and el.namespace is NS_HTML
1499 # TODO lots more close tags to implement here
1501 unless is_in_button_scope 'p'
1503 insert_html_element new_open_tag 'p'
1505 # TODO lots more close tags to implement here
1506 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1507 adoption_agency t.name
1508 # TODO lots more close tags to implement here
1510 in_body_any_other_end_tag t.name
1513 ins_mode_in_table_else = (t) ->
1515 flag_foster_parenting = true # FIXME
1517 flag_foster_parenting = false
1518 can_in_table = { # FIXME do this inline like everywhere else
1526 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1527 ins_mode_text = (t) ->
1528 if t.type is TYPE_TEXT
1531 if t.type is TYPE_EOF
1533 if open_els[0].name is 'script'
1534 open_els[0].flag 'already started', true
1536 insertion_mode = original_insertion_mode
1539 if t.type is TYPE_END_TAG and t.name is 'script'
1541 insertion_mode = original_insertion_mode
1542 # fixfull the spec seems to assume that I'm going to run the script
1543 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1545 if t.type is TYPE_END_TAG
1547 insertion_mode = original_insertion_mode
1549 console.log 'warning: end of ins_mode_text reached'
1551 # the functions below implement the tokenizer states described here:
1552 # http://www.w3.org/TR/html5/syntax.html#tokenization
1554 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1555 ins_mode_in_table = (t) ->
1558 if can_in_table[t.name]
1559 original_insertion_mode = insertion_mode
1560 insertion_mode = ins_mode_in_table_text
1563 ins_mode_in_table_else t
1571 clear_stack_to_table_context()
1573 insert_html_element t
1574 insertion_mode = ins_mode_in_caption
1576 clear_stack_to_table_context()
1577 insert_html_element t
1578 insertion_mode = ins_mode_in_column_group
1580 clear_stack_to_table_context()
1581 insert_html_element new_open_tag 'colgroup'
1582 insertion_mode = ins_mode_in_column_group
1584 when 'tbody', 'tfoot', 'thead'
1585 clear_stack_to_table_context()
1586 insert_html_element t
1587 insertion_mode = ins_mode_in_table_body
1588 when 'td', 'th', 'tr'
1589 clear_stack_to_table_context()
1590 insert_html_element new_open_tag 'tbody'
1591 insertion_mode = ins_mode_in_table_body
1595 if is_in_table_scope 'table'
1597 el = open_els.shift()
1598 if el.name is 'table'
1600 reset_insertion_mode()
1602 when 'style', 'script', 'template'
1605 if is_input_hidden_tok t
1606 ins_mode_in_table_else t
1609 el = insert_html_element t
1611 t.acknowledge_self_closing()
1614 if form_element_pointer?
1616 if template_tag_is_open()
1618 form_element_pointer = insert_html_element t
1621 ins_mode_in_table_else t
1625 if is_in_table_scope 'table'
1627 el = open_els.shift()
1628 if el.name is 'table'
1630 reset_insertion_mode()
1633 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1638 ins_mode_in_table_else t
1642 ins_mode_in_table_else t
1645 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1646 ins_mode_in_table_text = (t) ->
1647 if t.type is TYPE_TEXT and t.text is "\u0000"
1648 # huh? I thought the tokenizer didn't emit these
1651 if t.type is TYPE_TEXT
1652 pending_table_character_tokens.push t
1656 for old in pending_table_character_tokens
1657 unless is_space_tok old
1661 for old in pending_table_character_tokens
1662 insert_character old
1664 for old in pending_table_character_tokens
1665 ins_mode_in_table_else old # reprocess via the in-table "anything else" handler (foster parenting)
1666 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1667 insertion_mode = original_insertion_mode
1670 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1671 ins_mode_in_caption = (t) ->
1672 if t.type is TYPE_END_TAG and t.name is 'caption'
1673 if is_in_table_scope 'caption'
1674 generate_implied_end_tags()
1675 if open_els[0].name isnt 'caption'
1678 el = open_els.shift()
1679 if el.name is 'caption'
1681 clear_afe_to_marker()
1682 insertion_mode = ins_mode_in_table
1687 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1689 if is_in_table_scope 'caption'
1691 el = open_els.shift()
1692 if el.name is 'caption'
1694 clear_afe_to_marker()
1695 insertion_mode = ins_mode_in_table
1697 # else fragment case
1699 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1705 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1706 ins_mode_in_column_group = (t) ->
1710 if t.type is TYPE_COMMENT
1713 if t.type is TYPE_DOCTYPE
1716 if t.type is TYPE_START_TAG and t.name is 'html'
1719 if t.type is TYPE_START_TAG and t.name is 'col'
1720 el = insert_html_element t
1722 t.acknowledge_self_closing()
1724 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1725 if open_els[0].name is 'colgroup'
1727 insertion_mode = ins_mode_in_table
1731 if t.type is TYPE_END_TAG and t.name is 'col'
1734 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1737 if t.type is TYPE_EOF
1741 if open_els[0].name isnt 'colgroup'
1745 insertion_mode = ins_mode_in_table
1749 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1750 ins_mode_in_table_body = (t) ->
1751 if t.type is TYPE_START_TAG and t.name is 'tr'
1752 clear_stack_to_table_body_context()
1753 insert_html_element t
1754 insertion_mode = ins_mode_in_row
1756 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1758 clear_stack_to_table_body_context()
1759 insert_html_element new_open_tag 'tr'
1760 insertion_mode = ins_mode_in_row
1763 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1764 unless is_in_table_scope t.name # fixfull check namespace
1767 clear_stack_to_table_body_context()
1769 insertion_mode = ins_mode_in_table
1771 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1774 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1777 if table_scopers[el.name]
1782 clear_stack_to_table_body_context()
1784 insertion_mode = ins_mode_in_table
1787 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1793 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1794 ins_mode_in_row = (t) ->
1795 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1796 clear_stack_to_table_row_context()
1797 insert_html_element t
1798 insertion_mode = ins_mode_in_cell
1801 if t.type is TYPE_END_TAG and t.name is 'tr'
1802 if is_in_table_scope 'tr'
1803 clear_stack_to_table_row_context()
1805 insertion_mode = ins_mode_in_table_body
1809 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1810 if is_in_table_scope 'tr'
1811 clear_stack_to_table_row_context()
1813 insertion_mode = ins_mode_in_table_body
1818 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1819 if is_in_table_scope t.name # fixfull namespace
1820 if is_in_table_scope 'tr'
1821 clear_stack_to_table_row_context()
1823 insertion_mode = ins_mode_in_table_body
1828 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1834 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1836 generate_implied_end_tags()
1837 unless open_els[0].name is 'td' or open_els[0].name is 'th' # compare the tag name on both sides, not the node itself
1840 el = open_els.shift()
1841 if el.name is 'td' or el.name is 'th'
1843 clear_afe_to_marker()
1844 insertion_mode = ins_mode_in_row
1846 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1847 ins_mode_in_cell = (t) ->
1848 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1849 if is_in_table_scope t.name
1850 generate_implied_end_tags()
1851 if open_els[0].name isnt t.name
1854 el = open_els.shift()
1855 if el.name is t.name
1857 clear_afe_to_marker()
1858 insertion_mode = ins_mode_in_row
1862 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1865 if el.name is 'td' or el.name is 'th'
1868 if table_scopers[el.name]
1876 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1879 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1880 if is_in_table_scope t.name # fixfull namespace
1889 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1890 ins_mode_in_select = (t) ->
1891 if t.type is TYPE_TEXT and t.text is "\u0000"
1894 if t.type is TYPE_TEXT
1897 if t.type is TYPE_COMMENT
1900 if t.type is TYPE_DOCTYPE
1903 if t.type is TYPE_START_TAG and t.name is 'html'
1906 if t.type is TYPE_START_TAG and t.name is 'option'
1907 if open_els[0].name is 'option'
1909 insert_html_element t
1911 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1912 if open_els[0].name is 'option'
1914 if open_els[0].name is 'optgroup'
1916 insert_html_element t
1918 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1919 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1921 if open_els[0].name is 'optgroup'
1926 if t.type is TYPE_END_TAG and t.name is 'option'
1927 if open_els[0].name is 'option'
1932 if t.type is TYPE_END_TAG and t.name is 'select'
1933 if is_in_select_scope 'select'
1935 el = open_els.shift()
1936 if el.name is 'select'
1938 reset_insertion_mode()
1942 if t.type is TYPE_START_TAG and t.name is 'select'
1945 el = open_els.shift()
1946 if el.name is 'select'
1948 reset_insertion_mode()
1949 # spec says that this is the same as </select> but it doesn't say
1950 # to check scope first
1952 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1954 if is_in_select_scope 'select'
1957 el = open_els.shift()
1958 if el.name is 'select'
1960 reset_insertion_mode()
1963 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1966 if t.type is TYPE_EOF
1973 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1974 ins_mode_in_select_in_table = (t) ->
1975 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1978 el = open_els.shift()
1979 if el.name is 'select'
1981 reset_insertion_mode()
1984 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1986 unless is_in_table_scope t.name, NS_HTML
1989 el = open_els.shift()
1990 if el.name is 'select'
1992 reset_insertion_mode()
1996 ins_mode_in_select t
1999 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2000 ins_mode_in_template = (t) ->
2001 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2004 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2007 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2008 template_insertion_modes.shift()
2009 template_insertion_modes.unshift ins_mode_in_table
2010 insertion_mode = ins_mode_in_table
2013 if t.type is TYPE_START_TAG and t.name is 'col'
2014 template_insertion_modes.shift()
2015 template_insertion_modes.unshift ins_mode_in_column_group
2016 insertion_mode = ins_mode_in_column_group
2019 if t.type is TYPE_START_TAG and t.name is 'tr'
2020 template_insertion_modes.shift()
2021 template_insertion_modes.unshift ins_mode_in_table_body
2022 insertion_mode = ins_mode_in_table_body
2025 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2026 template_insertion_modes.shift()
2027 template_insertion_modes.unshift ins_mode_in_row
2028 insertion_mode = ins_mode_in_row
2031 if t.type is TYPE_START_TAG
2032 template_insertion_modes.shift()
2033 template_insertion_modes.unshift ins_mode_in_body
2034 insertion_mode = ins_mode_in_body
2037 if t.type is TYPE_END_TAG
2040 if t.type is TYPE_EOF
2041 unless template_tag_is_open()
2046 el = open_els.shift()
2047 if el.name is 'template' # fixfull check namespace
2049 clear_afe_to_marker()
2050 template_insertion_modes.shift()
2051 reset_insertion_mode()
2054 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2055 ins_mode_after_body = (t) ->
2059 if t.type is TYPE_COMMENT
2060 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length] # last child of the first (topmost) element in the stack, i.e. the html element
2062 if t.type is TYPE_DOCTYPE
2065 if t.type is TYPE_START_TAG and t.name is 'html'
2068 if t.type is TYPE_END_TAG and t.name is 'html'
2069 # fixfull fragment case
2070 insertion_mode = ins_mode_after_after_body
2072 if t.type is TYPE_EOF
2077 insertion_mode = ins_mode_in_body
2080 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2081 ins_mode_in_frameset = (t) ->
2085 if t.type is TYPE_COMMENT
2088 if t.type is TYPE_DOCTYPE
2091 if t.type is TYPE_START_TAG and t.name is 'html'
2094 if t.type is TYPE_START_TAG and t.name is 'frameset'
2095 insert_html_element t
2097 if t.type is TYPE_END_TAG and t.name is 'frameset'
2098 # TODO ?correct for: "if the current node is the root html element"
2099 if open_els.length is 1
2101 return # fragment case
2103 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2104 insertion_mode = ins_mode_after_frameset
2106 if t.type is TYPE_START_TAG and t.name is 'frame'
2107 insert_html_element t
2109 t.acknowledge_self_closing()
2111 if t.type is TYPE_START_TAG and t.name is 'noframes'
2114 if t.type is TYPE_EOF
2115 # TODO ?correct for: "if the current node is not the root html element"
2116 if open_els.length isnt 1
2124 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2125 ins_mode_after_frameset = (t) ->
2129 if t.type is TYPE_COMMENT
2132 if t.type is TYPE_DOCTYPE
2135 if t.type is TYPE_START_TAG and t.name is 'html'
2138 if t.type is TYPE_END_TAG and t.name is 'html'
2139 insertion_mode = ins_mode_after_after_frameset # assign the parser's insertion_mode, as every other mode switch does
2141 if t.type is TYPE_START_TAG and t.name is 'noframes'
2144 if t.type is TYPE_EOF
2151 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2152 ins_mode_after_after_body = (t) ->
2153 if t.type is TYPE_COMMENT
2154 insert_comment t, [doc, doc.children.length]
2156 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2159 if t.type is TYPE_EOF
2164 insertion_mode = ins_mode_in_body
2167 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2168 ins_mode_after_after_frameset = (t) ->
2169 if t.type is TYPE_COMMENT
2170 insert_comment t, [doc, doc.children.length]
2172 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2175 if t.type is TYPE_EOF
2178 if t.type is TYPE_START_TAG and t.name is 'noframes'
2189 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2191 switch c = txt.charAt(cur++)
2193 return new_text_node parse_character_reference()
2195 tok_state = tok_state_tag_open
2198 return new_text_node c
2200 return new_eof_token()
2202 return new_text_node c
2205 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2206 # not needed: tok_state_character_reference_in_data = ->
2207 # just call parse_character_reference()
2209 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2210 tok_state_rcdata = ->
2211 switch c = txt.charAt(cur++)
2213 return new_text_node parse_character_reference()
2215 tok_state = tok_state_rcdata_less_than_sign
2218 return new_character_token "\ufffd"
2220 return new_eof_token()
2222 return new_character_token c
2225 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2226 # not needed: tok_state_character_reference_in_rcdata = ->
2227 # just call parse_character_reference()
2229 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer "RAWTEXT" state: consumes one character and dispatches on it.
2230 tok_state_rawtext = ->
2231 switch c = txt.charAt(cur++)
2233 tok_state = tok_state_rawtext_less_than_sign
# U+FFFD (replacement character) is emitted in place of the consumed character
2236 return new_character_token "\ufffd"
2238 return new_eof_token()
2240 return new_character_token c
2243 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer "script data" state: consumes one character and dispatches on it.
2244 tok_state_script_data = ->
2245 switch c = txt.charAt(cur++)
2247 tok_state = tok_state_script_data_less_than_sign
# U+FFFD (replacement character) is emitted in place of the consumed character
2250 return new_character_token "\ufffd"
2252 return new_eof_token()
2254 return new_character_token c
2257 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer "PLAINTEXT" state: consumes one character and dispatches on it.
# No branch here assigns tok_state, so the tokenizer stays in this state.
2258 tok_state_plaintext = ->
2259 switch c = txt.charAt(cur++)
# U+FFFD (replacement character) is emitted in place of the consumed character
2262 return new_character_token "\ufffd"
2264 return new_eof_token()
2266 return new_character_token c
2270 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer "tag open" state (the character after '<'): decides between a
# markup declaration, an end tag, a bogus comment, a start tag, or literal text.
2271 tok_state_tag_open = ->
2272 switch c = txt.charAt(cur++)
2274 tok_state = tok_state_markup_declaration_open
2276 tok_state = tok_state_end_tag_open
# the bogus comment's text is seeded with '?'; tok_state_bogus_comment appends
# the rest
2279 tok_cur_tag = new_comment_token '?'
2280 tok_state = tok_state_bogus_comment
# start tag: tok_cur_tag becomes a new open tag whose name begins with c
2283 tok_cur_tag = new_open_tag c
2284 tok_state = tok_state_tag_name
2285 else if is_uc_alpha(c)
# tag names are stored lower-cased
2286 tok_cur_tag = new_open_tag c.toLowerCase()
2287 tok_state = tok_state_tag_name
# not a tag after all: the '<' is emitted as literal text
2290 tok_state = tok_state_data
2291 cur -= 1 # we didn't parse/handle the char after <
2292 return new_text_node '<'
2295 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer "end tag open" state (the character after "</"): starts an end tag
# token, emits "</" as text, or switches to the bogus comment state.
2296 tok_state_end_tag_open = ->
2297 switch c = txt.charAt(cur++)
2300 tok_state = tok_state_data
2303 tok_state = tok_state_data
2304 return new_text_node '</'
# tag names are stored lower-cased
2307 tok_cur_tag = new_end_tag c.toLowerCase()
2308 tok_state = tok_state_tag_name
2309 else if is_lc_alpha(c)
2310 tok_cur_tag = new_end_tag c
2311 tok_state = tok_state_tag_name
# Per the bogus comment state, the comment data starts with the character that
# caused the switch into that state. That is the character just consumed (c),
# not the '/' that preceded it; seeding with '/' turned "</ foo>" into the
# comment "/foo" instead of " foo".
2314 tok_cur_tag = new_comment_token c
2315 tok_state = tok_state_bogus_comment
2318 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer "tag name" state: accumulates the tag name on tok_cur_tag.name until
# a character that ends the name is consumed.
2319 tok_state_tag_name = ->
2320 switch c = txt.charAt(cur++)
2321 when "\t", "\n", "\u000c", ' '
2322 tok_state = tok_state_before_attribute_name
2324 tok_state = tok_state_self_closing_start_tag
2326 tok_state = tok_state_data
# U+FFFD (replacement character) is appended in place of the consumed character
2332 tok_cur_tag.name += "\ufffd"
2335 tok_state = tok_state_data
# tag names are stored lower-cased
2338 tok_cur_tag.name += c.toLowerCase()
2340 tok_cur_tag.name += c
2343 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer state for the character following '<' inside RCDATA.
2344 tok_state_rcdata_less_than_sign = ->
2345 c = txt.charAt(cur++)
# temporary_buffer records the raw characters of a candidate end tag so they can
# be re-emitted as text if the tag turns out not to close the element
2347 temporary_buffer = ''
2348 tok_state = tok_state_rcdata_end_tag_open
# otherwise the '<' was literal text
2351 tok_state = tok_state_rcdata
2352 cur -= 1 # reconsume the input character
2353 return new_character_token '<'
2355 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer state for the character following "</" inside RCDATA.
2356 tok_state_rcdata_end_tag_open = ->
2357 c = txt.charAt(cur++)
# the tag name is lower-cased; temporary_buffer keeps the character as written
2359 tok_cur_tag = new_end_tag c.toLowerCase()
2360 temporary_buffer += c
2361 tok_state = tok_state_rcdata_end_tag_name
2364 tok_cur_tag = new_end_tag c
2365 temporary_buffer += c
2366 tok_state = tok_state_rcdata_end_tag_name
# not an end tag: "</" is emitted as literal text
2369 tok_state = tok_state_rcdata
2370 cur -= 1 # reconsume the input character
2371 return new_character_token "</" # fixfull separate these
2373 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when t is an end tag token whose name equals the name of
# open_els[0].
# NOTE(review): open_els[0] is dereferenced unconditionally, so this assumes
# open_els is non-empty whenever it is called -- confirm against the callers.
2374 is_appropriate_end_tag = (t) ->
2375 # spec says to check against "the tag name of the last start tag to
2376 # have been emitted from this tokenizer", but this is only called from
2377 # the various "raw" states, which I'm pretty sure all push the start
2378 # token onto open_els. TODO: verify this after the script data states
# NOTE(review): the argument string (including serialize_els) is built on every
# call before debug_log sees it.
2380 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2381 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2383 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer state that accumulates a candidate end tag name inside RCDATA.
# Whitespace, '/' and '>' only terminate the tag when is_appropriate_end_tag
# accepts tok_cur_tag; otherwise the buffered text is emitted as characters.
2384 tok_state_rcdata_end_tag_name = ->
2385 c = txt.charAt(cur++)
2386 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2387 if is_appropriate_end_tag tok_cur_tag
2388 tok_state = tok_state_before_attribute_name
2390 # else fall through to "Anything else"
2392 if is_appropriate_end_tag tok_cur_tag
2393 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2395 # else fall through to "Anything else"
2397 if is_appropriate_end_tag tok_cur_tag
2398 tok_state = tok_state_data
2400 # else fall through to "Anything else"
# the tag name is lower-cased; temporary_buffer keeps the character as written
2402 tok_cur_tag.name += c.toLowerCase()
2403 temporary_buffer += c
2406 tok_cur_tag.name += c
2407 temporary_buffer += c
# "Anything else": give up on the end tag and emit what was buffered as text
2410 tok_state = tok_state_rcdata
2411 cur -= 1 # reconsume the input character
2412 return new_character_token '</' + temporary_buffer # fixfull separate these
2414 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer state for the character following '<' inside RAWTEXT.
2415 tok_state_rawtext_less_than_sign = ->
2416 c = txt.charAt(cur++)
# temporary_buffer records the raw characters of a candidate end tag
2418 temporary_buffer = ''
2419 tok_state = tok_state_rawtext_end_tag_open
# otherwise the '<' was literal text
2422 tok_state = tok_state_rawtext
2423 cur -= 1 # reconsume the input character
2424 return new_character_token '<'
2426 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer state for the character following "</" inside RAWTEXT.
2427 tok_state_rawtext_end_tag_open = ->
2428 c = txt.charAt(cur++)
# the tag name is lower-cased; temporary_buffer keeps the character as written
2430 tok_cur_tag = new_end_tag c.toLowerCase()
2431 temporary_buffer += c
2432 tok_state = tok_state_rawtext_end_tag_name
2435 tok_cur_tag = new_end_tag c
2436 temporary_buffer += c
2437 tok_state = tok_state_rawtext_end_tag_name
# not an end tag: "</" is emitted as literal text
2440 tok_state = tok_state_rawtext
2441 cur -= 1 # reconsume the input character
2442 return new_character_token "</" # fixfull separate these
2444 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer state that accumulates a candidate end tag name inside RAWTEXT.
# Whitespace, '/' and '>' only terminate the tag when is_appropriate_end_tag
# accepts tok_cur_tag; otherwise the buffered text is emitted as characters.
2445 tok_state_rawtext_end_tag_name = ->
2446 c = txt.charAt(cur++)
2447 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2448 if is_appropriate_end_tag tok_cur_tag
2449 tok_state = tok_state_before_attribute_name
2451 # else fall through to "Anything else"
2453 if is_appropriate_end_tag tok_cur_tag
2454 tok_state = tok_state_self_closing_start_tag
2456 # else fall through to "Anything else"
2458 if is_appropriate_end_tag tok_cur_tag
2459 tok_state = tok_state_data
2461 # else fall through to "Anything else"
# the tag name is lower-cased; temporary_buffer keeps the character as written
2463 tok_cur_tag.name += c.toLowerCase()
2464 temporary_buffer += c
2467 tok_cur_tag.name += c
2468 temporary_buffer += c
# "Anything else": give up on the end tag and emit what was buffered as text
2471 tok_state = tok_state_rawtext
2472 cur -= 1 # reconsume the input character
2473 return new_character_token '</' + temporary_buffer # fixfull separate these
2475 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer state for the character following '<' inside script data.
2476 tok_state_script_data_less_than_sign = ->
2477 c = txt.charAt(cur++)
# temporary_buffer records the raw characters of a candidate end tag
2479 temporary_buffer = ''
2480 tok_state = tok_state_script_data_end_tag_open
# "<!" may begin an escaped section; both characters are emitted as text
2483 tok_state = tok_state_script_data_escape_start
2484 return new_character_token '<!' # fixfull split
# otherwise the '<' was literal text
2486 tok_state = tok_state_script_data
2487 cur -= 1 # Reconsume
2488 return new_character_token '<'
2490 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Tokenizer state for the character following "</" inside script data.
2491 tok_state_script_data_end_tag_open = ->
2492 c = txt.charAt(cur++)
# the tag name is lower-cased; temporary_buffer keeps the character as written
2494 tok_cur_tag = new_end_tag c.toLowerCase()
2495 temporary_buffer += c
2496 tok_state = tok_state_script_data_end_tag_name
2499 tok_cur_tag = new_end_tag c
2500 temporary_buffer += c
2501 tok_state = tok_state_script_data_end_tag_name
# not an end tag: "</" is emitted as literal text
2504 tok_state = tok_state_script_data
2505 cur -= 1 # Reconsume
2506 return new_character_token '</'
2508 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Tokenizer state that accumulates a candidate end tag name inside script data.
# Whitespace and '/' only terminate the tag when is_appropriate_end_tag accepts
# tok_cur_tag; otherwise the buffered text is emitted as characters.
# NOTE(review): unlike the RCDATA and RAWTEXT end-tag-name states, no branch for
# '>' guarded by is_appropriate_end_tag is visible here. If it is really absent,
# "</script>" would fall into the "anything else" path -- confirm.
2509 tok_state_script_data_end_tag_name = ->
2510 c = txt.charAt(cur++)
2511 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2512 if is_appropriate_end_tag tok_cur_tag
2513 tok_state = tok_state_before_attribute_name
2517 if is_appropriate_end_tag tok_cur_tag
2518 tok_state = tok_state_self_closing_start_tag
# the tag name is lower-cased; temporary_buffer keeps the character as written
2522 tok_cur_tag.name += c.toLowerCase()
2523 temporary_buffer += c
2526 tok_cur_tag.name += c
2527 temporary_buffer += c
# "Anything else": give up on the end tag and emit what was buffered as text
2530 tok_state = tok_state_script_data
2531 cur -= 1 # Reconsume
2532 return new_character_token "</#{temporary_buffer}" # fixfull split
2534 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Tokenizer state after "<!" inside script data: looks for the first '-'.
2535 tok_state_script_data_escape_start = ->
2536 c = txt.charAt(cur++)
2538 tok_state = tok_state_script_data_escape_start_dash
2539 return new_character_token '-'
# anything else: back to plain script data, re-reading the same character
2541 tok_state = tok_state_script_data
2542 cur -= 1 # Reconsume
2545 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Tokenizer state after "<!-" inside script data: looks for the second '-'.
2546 tok_state_script_data_escape_start_dash = ->
2547 c = txt.charAt(cur++)
2549 tok_state = tok_state_script_data_escaped_dash_dash
2550 return new_character_token '-'
# anything else: back to plain script data, re-reading the same character
2552 tok_state = tok_state_script_data
2553 cur -= 1 # Reconsume
2556 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Tokenizer "script data escaped" state: consumes one character and dispatches
# on it.
2557 tok_state_script_data_escaped = ->
2558 c = txt.charAt(cur++)
2560 tok_state = tok_state_script_data_escaped_dash
2561 return new_character_token '-'
2563 tok_state = tok_state_script_data_escaped_less_than_sign
# U+FFFD (replacement character) is emitted in place of the consumed character
2567 return new_character_token "\ufffd"
# hand the same position back to the data state
2569 tok_state = tok_state_data
2571 cur -= 1 # Reconsume
2574 return new_character_token c
2576 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Tokenizer state after a single '-' inside escaped script data.
2577 tok_state_script_data_escaped_dash = ->
2578 c = txt.charAt(cur++)
2580 tok_state = tok_state_script_data_escaped_dash_dash
2581 return new_character_token '-'
2583 tok_state = tok_state_script_data_escaped_less_than_sign
# U+FFFD (replacement character) is emitted in place of the consumed character
2587 tok_state = tok_state_script_data_escaped
2588 return new_character_token "\ufffd"
# hand the same position back to the data state
2590 tok_state = tok_state_data
2592 cur -= 1 # Reconsume
2595 tok_state = tok_state_script_data_escaped
2596 return new_character_token c
2598 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Tokenizer state after "--" inside escaped script data.
2599 tok_state_script_data_escaped_dash_dash = ->
2600 c = txt.charAt(cur++)
# this branch emits '-' without assigning tok_state, so the state is unchanged
2602 return new_character_token '-'
2604 tok_state = tok_state_script_data_escaped_less_than_sign
# '>' is emitted and the tokenizer returns to plain script data
2607 tok_state = tok_state_script_data
2608 return new_character_token '>'
# U+FFFD (replacement character) is emitted in place of the consumed character
2611 tok_state = tok_state_script_data_escaped
2612 return new_character_token "\ufffd"
2615 tok_state = tok_state_data
2616 cur -= 1 # Reconsume
2619 tok_state = tok_state_script_data_escaped
2620 return new_character_token c
2622 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer state for the character following '<' inside escaped script data.
2623 tok_state_script_data_escaped_less_than_sign = ->
2624 c = txt.charAt(cur++)
# temporary_buffer records the raw characters of a candidate end tag
2626 temporary_buffer = ''
2627 tok_state = tok_state_script_data_escaped_end_tag_open
# a letter may start a nested "<script"; the buffer collects the lower-cased
# name so it can later be compared against 'script'
2630 temporary_buffer = c.toLowerCase() # yes, really
2631 tok_state = tok_state_script_data_double_escape_start
2632 return new_character_token "<#{c}" # fixfull split
2634 temporary_buffer = c
2635 tok_state = tok_state_script_data_double_escape_start
2636 return new_character_token "<#{c}" # fixfull split
# Anything else: the pending '<' is emitted as text and c is reconsumed by the
# escaped state. Emitting c here (as before) dropped the '<' and produced c
# twice, because the reconsume makes the next state emit c again.
2638 tok_state = tok_state_script_data_escaped
2639 cur -= 1 # Reconsume
2640 return new_character_token '<'
2642 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Tokenizer state for the character following "</" inside escaped script data.
2643 tok_state_script_data_escaped_end_tag_open = ->
2644 c = txt.charAt(cur++)
# the tag name is lower-cased; temporary_buffer keeps the character as written
2646 tok_cur_tag = new_end_tag c.toLowerCase()
2647 temporary_buffer += c
2648 tok_state = tok_state_script_data_escaped_end_tag_name
2651 tok_cur_tag = new_end_tag c
2652 temporary_buffer += c
2653 tok_state = tok_state_script_data_escaped_end_tag_name
# not an end tag: "</" is emitted as literal text
2656 tok_state = tok_state_script_data_escaped
2657 cur -= 1 # Reconsume
2658 return new_character_token '</' # fixfull split
2660 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer state that accumulates a candidate end tag name inside escaped
# script data. Whitespace and '/' only terminate the tag when
# is_appropriate_end_tag accepts tok_cur_tag; otherwise the buffered text is
# emitted as characters.
# NOTE(review): unlike the RCDATA and RAWTEXT end-tag-name states, no branch for
# '>' guarded by is_appropriate_end_tag is visible here -- confirm.
2661 tok_state_script_data_escaped_end_tag_name = ->
2662 c = txt.charAt(cur++)
2663 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2664 if is_appropriate_end_tag tok_cur_tag
2665 tok_state = tok_state_before_attribute_name
2669 if is_appropriate_end_tag tok_cur_tag
2670 tok_state = tok_state_self_closing_start_tag
# Only the tag name is lower-cased. temporary_buffer must keep the character
# exactly as written, because it is re-emitted verbatim as script text when the
# candidate turns out not to be the closing tag (this matches the other
# end-tag-name states in this file).
2674 tok_cur_tag.name += c.toLowerCase()
2675 temporary_buffer += c
2678 tok_cur_tag.name += c
2679 temporary_buffer += c
# "Anything else": give up on the end tag and emit what was buffered as text
2682 tok_state = tok_state_script_data_escaped
2683 cur -= 1 # Reconsume
2684 return new_character_token "</#{temporary_buffer}" # fixfull split
2686 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Tokenizer state that collects a tag name after '<' inside escaped script data
# to decide whether a nested "<script" has begun.
2687 tok_state_script_data_double_escape_start = ->
2688 c = txt.charAt(cur++)
2689 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
# the name is complete: only the exact name 'script' enters double-escaped mode
2690 if temporary_buffer is 'script'
2691 tok_state = tok_state_script_data_double_escaped
2693 tok_state = tok_state_script_data_escaped
2694 return new_character_token c
# the buffer is lower-cased for the comparison above; the emitted character
# keeps its original case
2696 temporary_buffer += c.toLowerCase() # yes, really lowercase
2697 return new_character_token c
2699 temporary_buffer += c
2700 return new_character_token c
2702 tok_state = tok_state_script_data_escaped
2703 cur -= 1 # Reconsume
2706 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer "script data double escaped" state: consumes one character and
# dispatches on it.
2707 tok_state_script_data_double_escaped = ->
2708 c = txt.charAt(cur++)
2710 tok_state = tok_state_script_data_double_escaped_dash
2711 return new_character_token '-'
2713 tok_state = tok_state_script_data_double_escaped_less_than_sign
2714 return new_character_token '<'
# U+FFFD (replacement character) is emitted in place of the consumed character
2717 return new_character_token "\ufffd"
# hand the same position back to the data state
2720 tok_state = tok_state_data
2721 cur -= 1 # Reconsume
2724 return new_character_token c
2726 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer state after a single '-' inside double-escaped script data.
2727 tok_state_script_data_double_escaped_dash = ->
2728 c = txt.charAt(cur++)
2730 tok_state = tok_state_script_data_double_escaped_dash_dash
2731 return new_character_token '-'
2733 tok_state = tok_state_script_data_double_escaped_less_than_sign
2734 return new_character_token '<'
# U+FFFD (replacement character) is emitted in place of the consumed character
2737 tok_state = tok_state_script_data_double_escaped
2738 return new_character_token "\ufffd"
# hand the same position back to the data state
2741 tok_state = tok_state_data
2742 cur -= 1 # Reconsume
2745 tok_state = tok_state_script_data_double_escaped
2746 return new_character_token c
2748 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer state after "--" inside double-escaped script data.
2749 tok_state_script_data_double_escaped_dash_dash = ->
2750 c = txt.charAt(cur++)
# this branch emits '-' without assigning tok_state, so the state is unchanged
2752 return new_character_token '-'
2754 tok_state = tok_state_script_data_double_escaped_less_than_sign
2755 return new_character_token '<'
# '>' is emitted and the tokenizer returns to plain script data
2757 tok_state = tok_state_script_data
2758 return new_character_token '>'
# U+FFFD (replacement character) is emitted in place of the consumed character
2761 tok_state = tok_state_script_data_double_escaped
2762 return new_character_token "\ufffd"
2765 tok_state = tok_state_data
2766 cur -= 1 # Reconsume
2769 tok_state = tok_state_script_data_double_escaped
2770 return new_character_token c
2772 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer state for the character following '<' inside double-escaped script
# data.
2773 tok_state_script_data_double_escaped_less_than_sign = ->
2774 c = txt.charAt(cur++)
# temporary_buffer is cleared so the next state can collect a tag name in it
2776 temporary_buffer = ''
2777 tok_state = tok_state_script_data_double_escape_end
2778 return new_character_token '/'
# anything else: back to double-escaped data, re-reading the same character
2780 tok_state = tok_state_script_data_double_escaped
2781 cur -= 1 # Reconsume
2784 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer state that collects a tag name after "</" inside double-escaped
# script data to decide whether the nested script has ended.
2785 tok_state_script_data_double_escape_end = ->
2786 c = txt.charAt(cur++)
2787 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
# the name is complete: only the exact name 'script' leaves double-escaped mode
2788 if temporary_buffer is 'script'
2789 tok_state = tok_state_script_data_escaped
2791 tok_state = tok_state_script_data_double_escaped
2792 return new_character_token c
# the buffer is lower-cased for the comparison above; the emitted character
# keeps its original case
2794 temporary_buffer += c.toLowerCase() # yes, really lowercase
2795 return new_character_token c
2797 temporary_buffer += c
2798 return new_character_token c
2800 tok_state = tok_state_script_data_double_escaped
2801 cur -= 1 # Reconsume
2804 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer state between a tag name (or a previous attribute) and the next
# attribute name.
2805 tok_state_before_attribute_name = ->
2807 switch c = txt.charAt(cur++)
2808 when "\t", "\n", "\u000c", ' '
2811 tok_state = tok_state_self_closing_start_tag
2814 tok_state = tok_state_data
# U+FFFD (replacement character) becomes the first character of the name
2820 attr_name = "\ufffd"
2821 when '"', "'", '<', '='
2826 tok_state = tok_state_data
# attribute names are stored lower-cased
2829 attr_name = c.toLowerCase()
# a new [name, value] pair is pushed at the FRONT of attrs_a, so attrs_a[0] is
# always the attribute currently being built
2833 tok_cur_tag.attrs_a.unshift [attr_name, '']
2834 tok_state = tok_state_attribute_name
2837 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state that accumulates the current attribute's name.
# tok_cur_tag.attrs_a[0] is the [name, value] pair being built, so
# attrs_a[0][0] is its name.
2838 tok_state_attribute_name = ->
2839 switch c = txt.charAt(cur++)
2840 when "\t", "\n", "\u000c", ' '
2841 tok_state = tok_state_after_attribute_name
2843 tok_state = tok_state_self_closing_start_tag
2845 tok_state = tok_state_before_attribute_value
2847 tok_state = tok_state_data
# Every branch below APPENDS to the name. The three branches that previously
# assigned with `=` discarded the characters collected so far (e.g. "aBc" ended
# up as "bc").
2853 tok_cur_tag.attrs_a[0][0] += "\ufffd"
2856 tok_cur_tag.attrs_a[0][0] += c
2859 tok_state = tok_state_data
# attribute names are stored lower-cased
2862 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
2864 tok_cur_tag.attrs_a[0][0] += c
2867 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer state after an attribute name and trailing whitespace: either a
# value follows ('='), the tag ends, or a new attribute begins.
2868 tok_state_after_attribute_name = ->
2869 c = txt.charAt(cur++)
2870 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2873 tok_state = tok_state_self_closing_start_tag
2876 tok_state = tok_state_before_attribute_value
2879 tok_state = tok_state_data
# a new attribute starts: its [name, value] pair is pushed at the front of
# attrs_a, with the name lower-cased
2882 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2883 tok_state = tok_state_attribute_name
# U+FFFD (replacement character) becomes the first character of the new name
2887 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2888 tok_state = tok_state_attribute_name
2892 tok_state = tok_state_data
2893 cur -= 1 # reconsume
2895 if c is '"' or c is "'" or c is '<'
2897 # fall through to Anything else
2899 tok_cur_tag.attrs_a.unshift [c, '']
2900 tok_state = tok_state_attribute_name
2902 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer state after '=': picks the quoting style of the attribute value.
# tok_cur_tag.attrs_a[0][1] is the value of the attribute being built.
2903 tok_state_before_attribute_value = ->
2904 switch c = txt.charAt(cur++)
2905 when "\t", "\n", "\u000c", ' '
2908 tok_state = tok_state_attribute_value_double_quoted
2910 tok_state = tok_state_attribute_value_unquoted
2913 tok_state = tok_state_attribute_value_single_quoted
# U+FFFD (replacement character) becomes the first character of the value
2916 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2917 tok_state = tok_state_attribute_value_unquoted
2920 tok_state = tok_state_data
2926 tok_state = tok_state_data
# any other character is the first character of an unquoted value
2928 tok_cur_tag.attrs_a[0][1] += c
2929 tok_state = tok_state_attribute_value_unquoted
2932 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer state that accumulates a double-quoted attribute value into
# tok_cur_tag.attrs_a[0][1].
2933 tok_state_attribute_value_double_quoted = ->
2934 switch c = txt.charAt(cur++)
2936 tok_state = tok_state_after_attribute_value_quoted
# '"' is passed as the additional allowed character, with in_attr = true
2938 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
# U+FFFD (replacement character) is appended in place of the consumed character
2941 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2944 tok_state = tok_state_data
2946 tok_cur_tag.attrs_a[0][1] += c
2949 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer state that accumulates a single-quoted attribute value into
# tok_cur_tag.attrs_a[0][1].
2950 tok_state_attribute_value_single_quoted = ->
2951 switch c = txt.charAt(cur++)
2953 tok_state = tok_state_after_attribute_value_quoted
# "'" is passed as the additional allowed character, with in_attr = true
2955 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
# U+FFFD (replacement character) is appended in place of the consumed character
2958 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2961 tok_state = tok_state_data
2963 tok_cur_tag.attrs_a[0][1] += c
2966 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer state that accumulates an unquoted attribute value into
# tok_cur_tag.attrs_a[0][1].
2967 tok_state_attribute_value_unquoted = ->
2968 switch c = txt.charAt(cur++)
2969 when "\t", "\n", "\u000c", ' '
2970 tok_state = tok_state_before_attribute_name
# '>' is passed as the additional allowed character, with in_attr = true
2972 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2974 tok_state = tok_state_data
# U+FFFD (replacement character) is appended in place of the consumed character
2979 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2982 tok_state = tok_state_data
2984 # Parse Error if ", ', <, = or ` (backtick)
2985 tok_cur_tag.attrs_a[0][1] += c
2988 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state immediately after the closing quote of an attribute value.
2989 tok_state_after_attribute_value_quoted = ->
2990 switch c = txt.charAt(cur++)
2991 when "\t", "\n", "\u000c", ' '
2992 tok_state = tok_state_before_attribute_name
2994 tok_state = tok_state_self_closing_start_tag
2996 tok_state = tok_state_data
3002 tok_state = tok_state_data
# anything else: treated as if whitespace had separated the attributes; the
# character is re-read by the before-attribute-name state
3005 tok_state = tok_state_before_attribute_name
3006 cur -= 1 # we didn't handle that char
3009 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer state after '/' inside a tag: expects '>' to finish a self-closing
# tag.
3010 tok_state_self_closing_start_tag = ->
3011 c = txt.charAt(cur++)
# NOTE(review): flag is called with one argument here, whereas the doctype
# states call it as flag 'force-quirks', true. Confirm that the one-argument
# form sets the flag rather than only reading it.
3013 tok_cur_tag.flag 'self-closing'
3014 tok_state = tok_state_data
3018 tok_state = tok_state_data
3019 cur -= 1 # Reconsume
# anything else: the '/' is ignored and the character is re-read as the start
# of an attribute
3023 tok_state = tok_state_before_attribute_name
3024 cur -= 1 # Reconsume
3027 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3028 # WARNING: put a comment token in tok_cur_tag before setting this state
# Consumes everything up to the next '>' (or to the end of txt when there is
# none) and appends it to tok_cur_tag.text.
3029 tok_state_bogus_comment = ->
3030 next_gt = txt.indexOf '>', cur
# no '>' left: the comment runs to the end of the input
3032 val = txt.substr cur
# otherwise take the text up to, but not including, the '>'
3035 val = txt.substr cur, (next_gt - cur)
# Every U+0000 must become U+FFFD. String#replace with a string pattern only
# replaces the first match, so a global regular expression is used instead.
3037 val = val.replace(/\u0000/g, "\ufffd")
3038 tok_cur_tag.text += val
3039 tok_state = tok_state_data
3042 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer state after "<!": looks ahead in txt (without consuming via
# charAt) for "--", "doctype" or "[CDATA[" and falls back to a bogus comment.
3043 tok_state_markup_declaration_open = ->
3044 if txt.substr(cur, 2) is '--'
3046 tok_cur_tag = new_comment_token ''
3047 tok_state = tok_state_comment_start
# "doctype" is matched case-insensitively
3049 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3051 tok_state = tok_state_doctype
# CDATA sections are only recognized when the adjusted current node exists and
# is not in the HTML namespace; the marker itself is matched case-sensitively
3053 acn = adjusted_current_node()
3054 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3056 tok_state = tok_state_cdata_section
# The spec says the next character consumed is the first character of the
# bogus comment, so the already-consumed '!' is not part of its data
# ("<!foo>" yields the comment "foo"). The comment therefore starts empty.
3060 tok_cur_tag = new_comment_token ''
3061 tok_state = tok_state_bogus_comment
3064 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state right after "<!--": handles the first character of the
# comment. tok_cur_tag is the comment token being built.
3065 tok_state_comment_start = ->
3066 switch c = txt.charAt(cur++)
3068 tok_state = tok_state_comment_start_dash
# NOTE(review): this branch emits a U+FFFD character token. The spec's comment
# start state instead appends U+FFFD to the comment data and switches to the
# comment state -- confirm which is intended.
3071 return new_character_token "\ufffd"
3074 tok_state = tok_state_data
3078 tok_state = tok_state_data
3079 cur -= 1 # Reconsume
# NOTE(review): no switch to tok_state_comment is visible after this append,
# unlike the default branch of tok_state_comment_start_dash. If it is really
# absent, the tokenizer stays in this state and "<!--a->" would end the
# comment early -- confirm.
3082 tok_cur_tag.text += c
3085 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer state after "<!--" followed by one '-'.
3086 tok_state_comment_start_dash = ->
3087 switch c = txt.charAt(cur++)
3089 tok_state = tok_state_comment_end
# the pending '-' is written out together with U+FFFD (replacement character)
3092 tok_cur_tag.text += "-\ufffd"
3093 tok_state = tok_state_comment
3096 tok_state = tok_state_data
3100 tok_state = tok_state_data
3101 cur -= 1 # Reconsume
# the pending '-' was ordinary comment text after all
3104 tok_cur_tag.text += "-#{c}"
3105 tok_state = tok_state_comment
3108 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer state for the body of a comment; characters accumulate in
# tok_cur_tag.text.
3109 tok_state_comment = ->
3110 switch c = txt.charAt(cur++)
3112 tok_state = tok_state_comment_end_dash
# U+FFFD (replacement character) is appended in place of the consumed character
3115 tok_cur_tag.text += "\ufffd"
3118 tok_state = tok_state_data
3119 cur -= 1 # Reconsume
3122 tok_cur_tag.text += c
3125 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer state after a single '-' inside a comment body.
3126 tok_state_comment_end_dash = ->
3127 switch c = txt.charAt(cur++)
3129 tok_state = tok_state_comment_end
# the pending '-' is written out together with U+FFFD (replacement character)
3132 tok_cur_tag.text += "-\ufffd"
3133 tok_state = tok_state_comment
3136 tok_state = tok_state_data
3137 cur -= 1 # Reconsume
# the pending '-' was ordinary comment text after all
3140 tok_cur_tag.text += "-#{c}"
3141 tok_state = tok_state_comment
3144 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer state after "--" inside a comment body.
3145 tok_state_comment_end = ->
3146 switch c = txt.charAt(cur++)
3148 tok_state = tok_state_data
# the pending "--" is written out together with U+FFFD (replacement character)
3152 tok_cur_tag.text += "--\ufffd"
3153 tok_state = tok_state_comment
3156 tok_state = tok_state_comment_end_bang
# a third '-': one dash becomes comment text; tok_state is not reassigned here
3159 tok_cur_tag.text += '-'
3162 tok_state = tok_state_data
3163 cur -= 1 # Reconsume
# the pending "--" was ordinary comment text after all
3167 tok_cur_tag.text += "--#{c}"
3168 tok_state = tok_state_comment
3171 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer state after "--!" inside a comment body.
3172 tok_state_comment_end_bang = ->
3173 switch c = txt.charAt(cur++)
# Only the pending "--!" is appended here. The '-' just consumed is the pending
# dash that the comment-end-dash state accounts for, so appending it as well
# (as "--!#{c}" did) wrote one '-' too many.
3175 tok_cur_tag.text += "--!"
3176 tok_state = tok_state_comment_end_dash
3178 tok_state = tok_state_data
# the pending "--!" is written out together with U+FFFD (replacement character)
3182 tok_cur_tag.text += "--!\ufffd"
3183 tok_state = tok_state_comment
3186 tok_state = tok_state_data
3187 cur -= 1 # Reconsume
# the pending "--!" was ordinary comment text after all
3190 tok_cur_tag.text += "--!#{c}"
3191 tok_state = tok_state_comment
3194 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer state right after "<!doctype".
3195 tok_state_doctype = ->
3196 switch c = txt.charAt(cur++)
3197 when "\t", "\u000a", "\u000c", ' '
3198 tok_state = tok_state_before_doctype_name
# a doctype token with an empty name and the force-quirks flag set is created
# in a local (el), not in tok_cur_tag
3201 tok_state = tok_state_data
3202 el = new_doctype_token ''
3203 el.flag 'force-quirks', true
3204 cur -= 1 # Reconsume
# anything else: the character is re-read as the start of the name
3208 tok_state = tok_state_before_doctype_name
3209 cur -= 1 # Reconsume
3212 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer state that skips whitespace before the doctype name and creates
# the doctype token from the first name character.
3213 tok_state_before_doctype_name = ->
3214 c = txt.charAt(cur++)
3215 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# the doctype name is stored lower-cased
3218 tok_cur_tag = new_doctype_token c.toLowerCase()
3219 tok_state = tok_state_doctype_name
# U+FFFD (replacement character) becomes the first character of the name
3223 tok_cur_tag = new_doctype_token "\ufffd"
3224 tok_state = tok_state_doctype_name
# a doctype token with an empty name and the force-quirks flag set is created
# in a local (el), not in tok_cur_tag
3228 el = new_doctype_token ''
3229 el.flag 'force-quirks', true
3230 tok_state = tok_state_data
3234 tok_state = tok_state_data
3235 el = new_doctype_token ''
3236 el.flag 'force-quirks', true
3237 cur -= 1 # Reconsume
3240 tok_cur_tag = new_doctype_token c
3241 tok_state = tok_state_doctype_name
3244 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer state that accumulates the doctype name in tok_cur_tag.name.
3245 tok_state_doctype_name = ->
3246 c = txt.charAt(cur++)
3247 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3248 tok_state = tok_state_after_doctype_name
3251 tok_state = tok_state_data
# the doctype name is stored lower-cased
3254 tok_cur_tag.name += c.toLowerCase()
# U+FFFD (replacement character) is appended in place of the consumed character
3258 tok_cur_tag.name += "\ufffd"
# the force-quirks flag is set on the doctype token before leaving
3262 tok_state = tok_state_data
3263 tok_cur_tag.flag 'force-quirks', true
3264 cur -= 1 # Reconsume
3267 tok_cur_tag.name += c
3270 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer state after the doctype name: looks for '>', or the PUBLIC / SYSTEM
# keywords.
3271 tok_state_after_doctype_name = ->
3272 c = txt.charAt(cur++)
3273 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3276 tok_state = tok_state_data
3280 tok_state = tok_state_data
3281 tok_cur_tag.flag 'force-quirks', true
3282 cur -= 1 # Reconsume
# cur was already advanced past c, so the keyword starts at cur - 1; the
# comparison is case-insensitive
3285 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3287 tok_state = tok_state_after_doctype_public_keyword
3289 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3291 tok_state = tok_state_after_doctype_system_keyword
# neither keyword matched: flag the token and skip the rest of the doctype
3294 tok_cur_tag.flag 'force-quirks', true
3295 tok_state = tok_state_bogus_doctype
3298 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer state right after the PUBLIC keyword of a doctype.
3299 tok_state_after_doctype_public_keyword = ->
3300 c = txt.charAt(cur++)
3301 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3302 tok_state = tok_state_before_doctype_public_identifier
# the public identifier starts out empty and is filled in by the quoted state
3306 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3307 tok_state = tok_state_doctype_public_identifier_double_quoted
3311 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3312 tok_state = tok_state_doctype_public_identifier_single_quoted
3316 tok_cur_tag.flag 'force-quirks', true
3317 tok_state = tok_state_data
3321 tok_state = tok_state_data
3322 tok_cur_tag.flag 'force-quirks', true
3323 cur -= 1 # Reconsume
# anything else: flag the token and skip the rest of the doctype
3327 tok_cur_tag.flag 'force-quirks', true
3328 tok_state = tok_state_bogus_doctype
3331 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer state that skips whitespace between PUBLIC and the quoted public
# identifier.
3332 tok_state_before_doctype_public_identifier = ->
3333 c = txt.charAt(cur++)
3334 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# the public identifier starts out empty and is filled in by the quoted state
3338 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3339 tok_state = tok_state_doctype_public_identifier_double_quoted
3343 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3344 tok_state = tok_state_doctype_public_identifier_single_quoted
3348 tok_cur_tag.flag 'force-quirks', true
3349 tok_state = tok_state_data
3353 tok_state = tok_state_data
3354 tok_cur_tag.flag 'force-quirks', true
3355 cur -= 1 # Reconsume
# anything else: flag the token and skip the rest of the doctype
3359 tok_cur_tag.flag 'force-quirks', true
3360 tok_state = tok_state_bogus_doctype
3364 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer state that accumulates a double-quoted public identifier in
# tok_cur_tag.public_identifier.
3365 tok_state_doctype_public_identifier_double_quoted = ->
3366 c = txt.charAt(cur++)
3368 tok_state = tok_state_after_doctype_public_identifier
# U+FFFD (replacement character) is appended in place of the consumed character
3372 tok_cur_tag.public_identifier += "\ufffd"
3376 tok_cur_tag.flag 'force-quirks', true
3377 tok_state = tok_state_data
3381 tok_state = tok_state_data
3382 tok_cur_tag.flag 'force-quirks', true
3383 cur -= 1 # Reconsume
3386 tok_cur_tag.public_identifier += c
3389 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer state that accumulates a single-quoted public identifier in
# tok_cur_tag.public_identifier.
3390 tok_state_doctype_public_identifier_single_quoted = ->
3391 c = txt.charAt(cur++)
3393 tok_state = tok_state_after_doctype_public_identifier
# U+FFFD (replacement character) is appended in place of the consumed character
3397 tok_cur_tag.public_identifier += "\ufffd"
3401 tok_cur_tag.flag 'force-quirks', true
3402 tok_state = tok_state_data
3406 tok_state = tok_state_data
3407 tok_cur_tag.flag 'force-quirks', true
3408 cur -= 1 # Reconsume
3411 tok_cur_tag.public_identifier += c
3414 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer state right after the closing quote of the public identifier.
3415 tok_state_after_doctype_public_identifier = ->
3416 c = txt.charAt(cur++)
3417 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3418 tok_state = tok_state_between_doctype_public_and_system_identifiers
3421 tok_state = tok_state_data
# the system identifier starts out empty and is filled in by the quoted state
3425 tok_cur_tag.system_identifier = ''
3426 tok_state = tok_state_doctype_system_identifier_double_quoted
3430 tok_cur_tag.system_identifier = ''
3431 tok_state = tok_state_doctype_system_identifier_single_quoted
3435 tok_state = tok_state_data
3436 tok_cur_tag.flag 'force-quirks', true
3437 cur -= 1 # Reconsume
# anything else: flag the token and skip the rest of the doctype
3441 tok_cur_tag.flag 'force-quirks', true
3442 tok_state = tok_state_bogus_doctype
3445 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer state that skips whitespace between the public identifier and an
# optional system identifier.
3446 tok_state_between_doctype_public_and_system_identifiers = ->
3447 c = txt.charAt(cur++)
3448 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3451 tok_state = tok_state_data
# the system identifier starts out empty and is filled in by the quoted state
3455 tok_cur_tag.system_identifier = ''
3456 tok_state = tok_state_doctype_system_identifier_double_quoted
3460 tok_cur_tag.system_identifier = ''
3461 tok_state = tok_state_doctype_system_identifier_single_quoted
3465 tok_state = tok_state_data
3466 tok_cur_tag.flag 'force-quirks', true
3467 cur -= 1 # Reconsume
# anything else: flag the token and skip the rest of the doctype
3471 tok_cur_tag.flag 'force-quirks', true
3472 tok_state = tok_state_bogus_doctype
3475 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer state right after the SYSTEM keyword of a doctype.
3476 tok_state_after_doctype_system_keyword = ->
3477 c = txt.charAt(cur++)
3478 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3479 tok_state = tok_state_before_doctype_system_identifier
# the system identifier starts out empty and is filled in by the quoted state
3483 tok_cur_tag.system_identifier = ''
3484 tok_state = tok_state_doctype_system_identifier_double_quoted
3488 tok_cur_tag.system_identifier = ''
3489 tok_state = tok_state_doctype_system_identifier_single_quoted
3493 tok_cur_tag.flag 'force-quirks', true
3494 tok_state = tok_state_data
3498 tok_state = tok_state_data
3499 tok_cur_tag.flag 'force-quirks', true
3500 cur -= 1 # Reconsume
# anything else: flag the token and skip the rest of the doctype
3504 tok_cur_tag.flag 'force-quirks', true
3505 tok_state = tok_state_bogus_doctype
3508 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer state that skips whitespace between SYSTEM and the quoted system
# identifier.
3509 tok_state_before_doctype_system_identifier = ->
3510 c = txt.charAt(cur++)
3511 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# the system identifier starts out empty and is filled in by the quoted state
3514 tok_cur_tag.system_identifier = ''
3515 tok_state = tok_state_doctype_system_identifier_double_quoted
3518 tok_cur_tag.system_identifier = ''
3519 tok_state = tok_state_doctype_system_identifier_single_quoted
3523 tok_cur_tag.flag 'force-quirks', true
3524 tok_state = tok_state_data
3528 tok_state = tok_state_data
3529 tok_cur_tag.flag 'force-quirks', true
3530 cur -= 1 # Reconsume
# anything else: flag the token and skip the rest of the doctype
3534 tok_cur_tag.flag 'force-quirks', true
3535 tok_state = tok_state_bogus_doctype
3538 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state that accumulates a double-quoted system identifier in
# tok_cur_tag.system_identifier.
3539 tok_state_doctype_system_identifier_double_quoted = ->
3540 c = txt.charAt(cur++)
3542 tok_state = tok_state_after_doctype_system_identifier
# U+FFFD (replacement character) is appended in place of the consumed character
3546 tok_cur_tag.system_identifier += "\ufffd"
3550 tok_cur_tag.flag 'force-quirks', true
3551 tok_state = tok_state_data
3555 tok_state = tok_state_data
3556 tok_cur_tag.flag 'force-quirks', true
3557 cur -= 1 # Reconsume
3560 tok_cur_tag.system_identifier += c
3563 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state that accumulates a single-quoted system identifier in
# tok_cur_tag.system_identifier.
3564 tok_state_doctype_system_identifier_single_quoted = ->
3565 c = txt.charAt(cur++)
3567 tok_state = tok_state_after_doctype_system_identifier
# U+FFFD (replacement character) is appended in place of the consumed character
3571 tok_cur_tag.system_identifier += "\ufffd"
3575 tok_cur_tag.flag 'force-quirks', true
3576 tok_state = tok_state_data
3580 tok_state = tok_state_data
3581 tok_cur_tag.flag 'force-quirks', true
3582 cur -= 1 # Reconsume
3585 tok_cur_tag.system_identifier += c
3588 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state function: reads one character from txt at cur (advancing
# cur) and switches tok_state depending on it.
# NOTE(review): the conditions guarding the branches after the whitespace
# test, and the return statements, are missing from this listing; the branch
# labels below are assumptions taken from the spec section this state
# implements and must be confirmed.
3589 tok_state_after_doctype_system_identifier = ->
3590 c = txt.charAt(cur++)
# tab, line feed, form feed or space
3591 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# presumably the '>' branch (confirm): back to the data state; note that no
# force-quirks flag is set on this path
3594 tok_state = tok_state_data
# presumably the end-of-input branch (confirm): back to the data state with
# force-quirks set; cur is stepped back so the character is seen again
3598 tok_state = tok_state_data
3599 tok_cur_tag.flag 'force-quirks', true
3600 cur -= 1 # Reconsume
# remaining case: switch to the bogus doctype state; unlike the neighbouring
# doctype states, force-quirks is deliberately left untouched here
3604 # do _not_ tok_cur_tag.flag 'force-quirks', true
3605 tok_state = tok_state_bogus_doctype
3608 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state function: reads one character from txt at cur (advancing
# cur); both visible branches switch tok_state back to tok_state_data.
# NOTE(review): the branch conditions and return statements are missing from
# this listing; the branch labels below are assumptions taken from the spec
# section this state implements and must be confirmed.
3609 tok_state_bogus_doctype = ->
3610 c = txt.charAt(cur++)
# presumably the '>' branch (confirm)
3612 tok_state = tok_state_data
# presumably the end-of-input branch (confirm): cur is stepped back so the
# character is seen again by the data state
3615 tok_state = tok_state_data
3616 cur -= 1 # Reconsume
3622 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3623 # Don't set this as a state, just call it
3624 # returns a string (NOT a text node)
# Parameters:
#   allowed_char - optional (default null); when the next character equals
#                  it, that character is handled by the same switch case as
#                  whitespace, '<' and '&' below.
#   in_attr      - optional (default false). NOTE(review): presumably enables
#                  the attribute-specific exceptions described in the
#                  comments near the legacy lookup - confirm against callers.
# Reads txt starting at cur without consuming the first character up front
# (charAt(cur), not cur++); cur is advanced only where shown (cur += i).
# NOTE(review): many lines of this function (returns, the assignments of
# start, i, prefix, charset and max, and several branch headers) are missing
# from this listing, so the comments below describe only what is visible.
3625 parse_character_reference = (allowed_char = null, in_attr = false) ->
# nothing left to read
3626 if cur >= txt.length
3628 switch c = txt.charAt(cur)
3629 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3630 # explicitly not a parse error
3633 # there has to be "one or more" alnums between & and ; to be a parse error
# no character after the current one
3636 if cur + 1 >= txt.length
# case-insensitive check for an 'x' right after the current character
# (presumably the hexadecimal numeric form - confirm)
3638 if txt.charAt(cur + 1).toLowerCase() is 'x'
# advance over characters that belong to charset, stopping at end of input
3647 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
# is the scanned run terminated by ';'?
3651 if txt.charAt(start + i) is ';'
3653 # FIXME This is supposed to generate parse errors for some chars
# the scanned run is lower-cased, prepended with prefix and decoded via
# decode_named_char_ref
3654 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
# character at offset i from cur is not in alnum
3661 if alnum.indexOf(txt.charAt(cur + i)) is -1
3664 # exit early, because parse_error() below needs at least one alnum
3666 if txt.charAt(cur + i) is ';'
3667 i += 1 # include ';' terminator in value
# decode the i characters starting at cur
3668 decoded = decode_named_char_ref txt.substr(cur, i)
3675 # no ';' terminator (only legacy char refs)
3677 for i in [2..max] # no prefix matches, so ok to check shortest first
# look up the i-character candidate in the legacy_char_refs table
3678 c = legacy_char_refs[txt.substr(cur, i)]
3681 if txt.charAt(cur + i) is '='
3682 # "because some legacy user agents will
3683 # misinterpret the markup in those cases"
3686 if alnum.indexOf(txt.charAt(cur + i)) > -1
3687 # this makes attributes forgiving about url args
3689 # ok, and besides the weird exceptions for attributes...
3690 # return the matching char
3691 cur += i # consume entity chars
3692 parse_error() # because no terminating ";"
3696 return # never reached
3698 # tree constructor initialization
3699 # see comments on TYPE_TAG/etc for the structure of this data
# root node: a TYPE_TAG Node named 'html' in the NS_HTML namespace
3700 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3702 afe = [] # active formatting elements
# starts out as an empty list
3703 template_insertion_modes = []
# the tree constructor starts in ins_mode_initial
3704 insertion_mode = ins_mode_initial
3705 original_insertion_mode = insertion_mode # TODO check spec
3706 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3707 flag_frameset_ok = true
3709 flag_foster_parenting = false
# the two element pointers and the temporary buffer start out unset (null)
3710 form_element_pointer = null
3711 temporary_buffer = null
3712 pending_table_character_tokens = []
3713 head_element_pointer = null
3714 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3715 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
3717 # tokenizer initialization
# the tokenizer starts in the data state
3718 tok_state = tok_state_data
3725 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a string from els by appending the result of each element's
# serialize method, passing shallow and show_ids through unchanged.
# NOTE(review): the loop header, the initialization of serialized and the
# return statement are missing from this listing - confirm against the full
# file.
3728 serialize_els = (els, shallow, show_ids) ->
# t is one element of els (presumably the loop variable - confirm)
3734 serialized += t.serialize shallow, show_ids
# Public interface of this module: the parser entry point, two debug-log
# helpers, and the four node type constants.
# NOTE(review): the TODO below may be stale, since TYPE_TAG, TYPE_TEXT,
# TYPE_COMMENT and TYPE_DOCTYPE are exported here - confirm whether other
# TYPE_* constants still need exporting.
3737 # TODO export TYPE_*
3738 module.exports.parse_html = parse_html
3739 module.exports.debug_log_reset = debug_log_reset
3740 module.exports.debug_log_each = debug_log_each
3741 module.exports.TYPE_TAG = TYPE_TAG
3742 module.exports.TYPE_TEXT = TYPE_TEXT
3743 module.exports.TYPE_COMMENT = TYPE_COMMENT
3744 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE