1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
98 @id = "#{++prev_node_id}"
99 shallow_clone: -> # return a new node that's the same except without the children or parent
100 # WARNING this doesn't work right on open tags that are still being parsed
102 attrs[k] = v for k, v of @attrs
103 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
104 acknowledge_self_closing: ->
106 @token.flag 'did_self_close'
108 @flag 'did_self_close', true
111 serialize: (shallow = false, show_ids = false) -> # for unit tests
116 ret += JSON.stringify @name
131 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
137 ret += c.serialize shallow, show_ids
141 ret += JSON.stringify @text
144 ret += JSON.stringify @text
150 when TYPE_AAA_BOOKMARK
151 ret += 'aaa_bookmark'
154 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
157 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token: a Node of type TYPE_START_TAG that carries only the
# tag name. All other Node fields are left to the constructor's defaults.
new_open_tag = (name) ->
  new Node TYPE_START_TAG, {name}
# Build an end-tag token: a Node of type TYPE_END_TAG that carries only the
# tag name.
new_end_tag = (name) ->
  new Node TYPE_END_TAG, {name}
# Build an element node (TYPE_TAG) with the given tag name. Attributes and
# children start out as whatever the Node constructor defaults them to.
new_element = (name) ->
  new Node TYPE_TAG, {name}
# Build a text node (TYPE_TEXT) whose text is `txt`.
new_text_node = (txt) ->
  new Node TYPE_TEXT, {text: txt}
# Character tokens are created through the very same function as text nodes.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose text is `txt`.
new_comment_token = (txt) ->
  new Node TYPE_COMMENT, {text: txt}
# Build a doctype token (TYPE_DOCTYPE) that records only the doctype name.
new_doctype_token = (name) ->
  new Node TYPE_DOCTYPE, {name}
172 return new Node TYPE_EOF
174 return new Node TYPE_AFE_MARKER
# Build the placeholder Node (TYPE_AAA_BOOKMARK) that the adoption agency
# algorithm splices into the list of active formatting elements.
new_aaa_bookmark = ->
  new Node TYPE_AAA_BOOKMARK
# Character classes, each stored as a plain string so that membership can be
# tested with indexOf().
digits = '0123456789'
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = lc_alpha.toUpperCase()
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# some SVG elements have dashes in them
tag_name_chars = "#{alnum}-"

# http://www.w3.org/TR/html5/infrastructure.html#space-character
# (U+0009 tab, U+000A line feed, U+000C form feed, U+000D carriage return,
# U+0020 space -- in that order)
space_chars = "\t\n\f\r "
190 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True only when token `t` is a text token (TYPE_TEXT) whose text is exactly
# one character long and that character is listed in space_chars.
is_space_tok = (t) ->
  return false unless t.type is TYPE_TEXT
  return false unless t.text.length is 1
  space_chars.indexOf(t.text) isnt -1
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
# The string is assembled from four runs, in code point order:
#   U+0009..U+000D and U+0020
#   U+0085, U+00A0, U+1680
#   U+2000..U+200A
#   U+2028, U+2029, U+202F, U+205F, U+3000
whitespace_chars =
  "\u0009\u000a\u000b\u000c\u000d\u0020" +
  "\u0085\u00a0\u1680" +
  "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a" +
  "\u2028\u2029\u202f\u205f\u3000"
197 # These are the character references that don't need a terminating semicolon
198 # min length: 2, max: 6, none are a prefix of any other.
200 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
201 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
202 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
203 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
204 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
205 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
206 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
207 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
208 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
209 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
210 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
211 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
212 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
213 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
214 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
215 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
216 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists, one per element category. Each is an array of lower-case
# tag names, written as a space-separated string and split into an array.
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
raw_text_elements = 'script style'.split ' '
escapable_raw_text_elements = 'textarea title'.split ' '
223 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
225 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
226 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
227 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
228 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
229 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
230 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
231 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
232 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
233 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
234 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
235 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
236 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
237 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
238 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
242 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
244 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
245 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
246 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
247 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
248 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
249 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
250 'determinant', 'diff', 'divergence', 'divide', 'domain',
251 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
252 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
253 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
254 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
255 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
256 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
257 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
258 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
259 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
260 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
261 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
262 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
263 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
264 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
265 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
266 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
267 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
268 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
269 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
270 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
271 'vectorproduct', 'xor'
273 # foreign_elements = [svg_elements..., mathml_elements...]
274 #normal_elements = All other allowed HTML elements are normal elements.
278 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
279 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
280 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
281 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
282 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
283 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
284 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
285 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
286 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
287 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
288 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
289 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
290 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
291 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
292 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
293 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
294 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
295 wbr:NS_HTML, xmp:NS_HTML,
298 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
299 'annotation-xml':NS_MATHML,
302 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
305 formatting_elements = {
306 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
307 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
311 foster_parenting_targets = {
# True when the namespace recorded in special_elements under e.name is the
# same as e.namespace.
# NOTE(review): the lookup is keyed by tag name alone, so the table can hold
# only one namespace per name -- confirm that is acceptable for names that
# exist in more than one namespace.
el_is_special = (e) ->
  {name, namespace} = e
  special_elements[name] is namespace
336 # decode_named_char_ref()
338 # The list of named character references is _huge_ so ask the browser to decode
339 # for us instead of wasting bandwidth/space on including the table here.
341 # Pass without the "&" but with the ";" examples:
342 # for "&" pass "amp;"
343 # for "′" pass "x2032;"
346 textarea: document.createElement('textarea')
348 # TODO test this in IE8
349 decode_named_char_ref = (txt) ->
351 decoded = g_dncr.cache[txt]
352 return decoded if decoded?
353 g_dncr.textarea.innerHTML = txt
354 decoded = g_dncr.textarea.value
355 return null if decoded is txt
356 return g_dncr.cache[txt] = decoded
358 parse_html = (txt, parse_error_cb = null) ->
359 cur = 0 # index of next char in txt to be parsed
360 # declare doc and tokenizer variables so they're in scope below
362 open_els = null # stack of open elements
363 afe = null # active formatting elements
364 template_insertion_modes = null
365 insertion_mode = null
366 original_insertion_mode = null
368 tok_cur_tag = null # partially parsed tag
369 flag_scripting = null
370 flag_frameset_ok = null
372 flag_foster_parenting = null
373 form_element_pointer = null
374 temporary_buffer = null
375 pending_table_character_tokens = null
376 head_element_pointer = null
377 flag_fragment_parsing = null
378 context_element = null
387 console.log "Parse error at character #{cur} of #{txt.length}"
389 afe_push = (new_el) ->
392 if el.name is new_el.name and el.namespace is new_el.namespace
394 continue unless new_el.attrs[k] is v
395 for k, v of new_el.attrs
396 continue unless el.attrs[k] is v
403 afe.unshift new_afe_marker()
405 # the functions below implement the Tree Construction algorithm
406 # http://www.w3.org/TR/html5/syntax.html#tree-construction
408 # But first... the helpers
409 template_tag_is_open = ->
411 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
414 is_in_scope_x = (tag_name, scope, namespace) ->
416 if t.name is tag_name and (namespace is null or namespace is t.namespace)
418 if scope[t.name] is t.namespace
421 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
423 if t.name is tag_name and (namespace is null or namespace is t.namespace)
425 if scope[t.name] is t.namespace
427 if scope2[t.name] is t.namespace
430 standard_scopers = { # FIXME these are supposed to be namespace specific
431 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
432 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
433 template: NS_HTML, mi: NS_MATHML,
435 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
436 'annotation-xml': NS_MATHML,
438 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope tables mapping tag name -> namespace.
# button_scopers is consulted together with standard_scopers by
# is_in_button_scope; table_scopers is used on its own by is_in_table_scope.
# li_scopers has no user in the visible part of this file -- presumably it
# backs a list-item scope check; verify against its callers.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check against the standard scope table.
#   tag_name  - tag name to look for among the open elements
#   namespace - optional; null (the default) lets an element of any namespace
#               match by name
# Returns whatever is_in_scope_x returns for standard_scopers.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check against two tables at once: standard_scopers and button_scopers.
#   tag_name  - tag name to look for among the open elements
#   namespace - optional; null (the default) lets an element of any namespace
#               match by name
# Returns whatever is_in_scope_x_y returns for that pair of tables.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check against table_scopers only (html, table, template).
#   tag_name  - tag name to look for among the open elements
#   namespace - optional; null (the default) lets an element of any namespace
#               match by name
# Returns whatever is_in_scope_x returns for table_scopers.
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
449 is_in_select_scope = (tag_name, namespace = null) ->
451 if t.name is tag_name and (namespace is null or namespace is t.namespace)
453 if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
456 # this checks for a particular element, not by name
457 el_is_in_scope = (el) ->
461 if standard_scopers[t.name] is t.namespace
465 clear_to_table_stopers = {
470 clear_stack_to_table_context = ->
472 if clear_to_table_stopers[open_els[0].name]?
476 clear_to_table_body_stopers = {
483 clear_stack_to_table_body_context = ->
485 if clear_to_table_body_stopers[open_els[0].name]?
489 clear_to_table_row_stopers = {
494 clear_stack_to_table_row_context = ->
496 if clear_to_table_row_stopers[open_els[0].name]?
500 clear_afe_to_marker = ->
503 if el.type is TYPE_AFE_MARKER
507 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
508 reset_insertion_mode = ->
509 # 1. Let last be false.
511 # 2. Let node be the last node in the stack of open elements.
513 node = open_els[node_i]
514 # 3. Loop: If node is the first node in the stack of open elements,
515 # then set last to true, and, if the parser was originally created as
516 # part of the HTML fragment parsing algorithm (fragment case) set node
517 # to the context element.
519 if node_i is open_els.length - 1
521 # fixfull (fragment case)
523 # 4. If node is a select element, run these substeps:
524 if node.name is 'select'
525 # 1. If last is true, jump to the step below labeled done.
527 # 2. Let ancestor be node.
530 # 3. Loop: If ancestor is the first node in the stack of
531 # open elements, jump to the step below labeled done.
533 if ancestor_i is open_els.length - 1
535 # 4. Let ancestor be the node before ancestor in the stack
538 ancestor = open_els[ancestor_i]
539 # 5. If ancestor is a template node, jump to the step below
541 if ancestor.name is 'template'
543 # 6. If ancestor is a table node, switch the insertion mode
544 # to "in select in table" and abort these steps.
545 if ancestor.name is 'table'
546 insertion_mode = ins_mode_in_select_in_table
548 # 7. Jump back to the step labeled loop.
549 # 8. Done: Switch the insertion mode to "in select" and abort
551 insertion_mode = ins_mode_in_select
553 # 5. If node is a td or th element and last is false, then switch
554 # the insertion mode to "in cell" and abort these steps.
555 if (node.name is 'td' or node.name is 'th') and last is false
556 insertion_mode = ins_mode_in_cell
558 # 6. If node is a tr element, then switch the insertion mode to "in
559 # row" and abort these steps.
561 insertion_mode = ins_mode_in_row
563 # 7. If node is a tbody, thead, or tfoot element, then switch the
564 # insertion mode to "in table body" and abort these steps.
565 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
566 insertion_mode = ins_mode_in_table_body
568 # 8. If node is a caption element, then switch the insertion mode
569 # to "in caption" and abort these steps.
570 if node.name is 'caption'
571 insertion_mode = ins_mode_in_caption
573 # 9. If node is a colgroup element, then switch the insertion mode
574 # to "in column group" and abort these steps.
575 if node.name is 'colgroup'
576 insertion_mode = ins_mode_in_column_group
578 # 10. If node is a table element, then switch the insertion mode to
579 # "in table" and abort these steps.
580 if node.name is 'table'
581 insertion_mode = ins_mode_in_table
583 # 11. If node is a template element, then switch the insertion mode
584 # to the current template insertion mode and abort these steps.
585 # fixfull (template insertion mode stack)
587 # 12. If node is a head element and last is true, then switch the
588 # insertion mode to "in body" ("in body"! not "in head"!) and abort
589 # these steps. (fragment case)
590 if node.name is 'head' and last
591 insertion_mode = ins_mode_in_body
593 # 13. If node is a head element and last is false, then switch the
594 # insertion mode to "in head" and abort these steps.
595 if node.name is 'head' and last is false
596 insertion_mode = ins_mode_in_head
598 # 14. If node is a body element, then switch the insertion mode to
599 # "in body" and abort these steps.
600 if node.name is 'body'
601 insertion_mode = ins_mode_in_body
603 # 15. If node is a frameset element, then switch the insertion mode
604 # to "in frameset" and abort these steps. (fragment case)
605 if node.name is 'frameset'
606 insertion_mode = ins_mode_in_frameset
608 # 16. If node is an html element, run these substeps:
609 if node.name is 'html'
610 # 1. If the head element pointer is null, switch the insertion
611 # mode to "before head" and abort these steps. (fragment case)
612 # fixfull (fragment case)
614 # 2. Otherwise, the head element pointer is not null, switch
615 # the insertion mode to "after head" and abort these steps.
616 insertion_mode = ins_mode_in_body # FIXME fixfull
618 # 17. If last is true, then switch the insertion mode to "in body"
619 # and abort these steps. (fragment case)
621 insertion_mode = ins_mode_in_body
623 # 18. Let node now be the node before node in the stack of open
626 node = open_els[node_i]
627 # 19. Return to the step labeled loop.
631 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
632 adjusted_current_node = ->
633 if open_els.length is 1 and flag_fragment_parsing
634 return context_element
637 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
638 # this implementation is structured (mostly) as described at the link above.
639 # capitalized comments are the "labels" described at the link above.
640 reconstruct_active_formatting_elements = ->
641 return if afe.length is 0
642 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
647 if i is afe.length - 1
650 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
655 el = afe[i].shallow_clone()
656 tree_insert_element el
661 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
662 # adoption agency algorithm
664 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
665 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
666 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
667 adoption_agency = (subject) ->
668 debug_log "adoption_agency()"
669 debug_log "tree: #{serialize_els doc.children, false, true}"
670 debug_log "open_els: #{serialize_els open_els, true, true}"
671 debug_log "afe: #{serialize_els afe, true, true}"
672 if open_els[0].name is subject
675 # remove it from the list of active formatting elements (if found)
680 debug_log "aaa: starting off with subject on top of stack, exiting"
687 # 5. Let formatting element be the last element in the list of
688 # active formatting elements that: is between the end of the list
689 # and the last scope marker in the list, if any, or the start of
690 # the list otherwise, and has the tag name subject.
692 for t, fe_of_afe in afe
693 if t.type is TYPE_AFE_MARKER
698 # If there is no such element, then abort these steps and instead
699 # act as described in the "any other end tag" entry above.
701 debug_log "aaa: fe not found in afe"
702 in_body_any_other_end_tag subject
704 # 6. If formatting element is not in the stack of open elements,
705 # then this is a parse error; remove the element from the list, and
708 for t, fe_of_open_els in open_els
713 debug_log "aaa: fe not found in open_els"
715 # "remove it from the list" must mean afe, since it's not in open_els
716 afe.splice fe_of_afe, 1
718 # 7. If formatting element is in the stack of open elements, but
719 # the element is not in scope, then this is a parse error; abort
721 unless el_is_in_scope fe
722 debug_log "aaa: fe not in scope"
725 # 8. If formatting element is not the current node, this is a parse
726 # error. (But do not abort these steps.)
727 unless open_els[0] is fe
730 # 9. Let furthest block be the topmost node in the stack of open
731 # elements that is lower in the stack than formatting element, and
732 # is an element in the special category. There might not be one.
734 fb_of_open_els = null
741 # and continue, to see if there's one that's more "topmost"
742 # 10. If there is no furthest block, then the UA must first pop all
743 # the nodes from the bottom of the stack of open elements, from the
744 # current node up to and including formatting element, then remove
745 # formatting element from the list of active formatting elements,
746 # and finally abort these steps.
748 debug_log "aaa: no fb"
752 afe.splice fe_of_afe, 1
754 # 11. Let common ancestor be the element immediately above
755 # formatting element in the stack of open elements.
756 ca = open_els[fe_of_open_els + 1] # common ancestor
758 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
759 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
760 bookmark = new_aaa_bookmark()
763 afe.splice i, 0, bookmark
765 node = last_node = fb
769 # 3. Let node be the element immediately above node in the
770 # stack of open elements, or if node is no longer in the stack
771 # of open elements (e.g. because it got removed by this
772 # algorithm), the element that was immediately above node in
773 # the stack of open elements before node was removed.
777 node_next = open_els[i + 1]
779 node = node_next ? node_above
780 debug_log "inner loop #{inner}"
781 debug_log "tree: #{serialize_els doc.children, false, true}"
782 debug_log "open_els: #{serialize_els open_els, true, true}"
783 debug_log "afe: #{serialize_els afe, true, true}"
784 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
785 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
786 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
787 debug_log "node: #{node.serialize true, true}"
788 # TODO make sure node_above gets re-set if/when node is removed from open_els
790 # 4. If node is formatting element, then go to the next step in
791 # the overall algorithm.
795 # 5. If inner loop counter is greater than three and node is in
796 # the list of active formatting elements, then remove node from
797 # the list of active formatting elements.
803 debug_log "max out inner"
808 # 6. If node is not in the list of active formatting elements,
809 # then remove node from the stack of open elements and then go
810 # back to the step labeled inner loop.
812 debug_log "not in afe"
815 node_above = open_els[i + 1]
819 debug_log "the bones"
820 # 7. create an element for the token for which the element node
821 # was created, in the HTML namespace, with common ancestor as
822 # the intended parent; replace the entry for node in the list
823 # of active formatting elements with an entry for the new
824 # element, replace the entry for node in the stack of open
825 # elements with an entry for the new element, and let node be
827 new_node = node.shallow_clone()
831 debug_log "replaced in afe"
835 node_above = open_els[i + 1]
836 open_els[i] = new_node
837 debug_log "replaced in open_els"
840 # 8. If last node is furthest block, then move the
841 # aforementioned bookmark to be immediately after the new node
842 # in the list of active formatting elements.
847 debug_log "removed bookmark"
851 # "after" means lower
852 afe.splice i, 0, bookmark # "after as <-
853 debug_log "placed bookmark after node"
854 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
856 # 9. Insert last node into node, first removing it from its
857 # previous parent node if any.
859 debug_log "last_node has parent"
860 for c, i in last_node.parent.children
862 debug_log "removing last_node from parent"
863 last_node.parent.children.splice i, 1
865 node.children.push last_node
866 last_node.parent = node
867 # 10. Let last node be node.
870 # 11. Return to the step labeled inner loop.
871 # 14. Insert whatever last node ended up being in the previous step
872 # at the appropriate place for inserting a node, but using common
873 # ancestor as the override target.
875 # In the case where fe is immediately followed by fb:
876 # * inner loop exits out early (node==fe)
878 # * last_node is still in the tree (not a duplicate)
880 debug_log "FEFIRST? last_node has parent"
881 for c, i in last_node.parent.children
883 debug_log "removing last_node from parent"
884 last_node.parent.children.splice i, 1
887 debug_log "after aaa inner loop"
888 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
889 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
890 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
891 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
892 debug_log "tree: #{serialize_els doc.children, false, true}"
897 # can't use standard insert token thing, because it's already in
898 # open_els and must stay at its current position in open_els
899 dest = adjusted_insertion_location ca
900 dest[0].children.splice dest[1], 0, last_node
901 last_node.parent = dest[0]
904 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
905 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
906 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
907 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
908 debug_log "tree: #{serialize_els doc.children, false, true}"
910 # 15. Create an element for the token for which formatting element
911 # was created, in the HTML namespace, with furthest block as the
913 new_element = fe.shallow_clone() # FIXME intended parent thing
914 # 16. Take all of the child nodes of furthest block and append them
915 # to the element created in the last step.
916 while fb.children.length
917 t = fb.children.shift()
918 t.parent = new_element
919 new_element.children.push t
920 # 17. Append that new element to furthest block.
921 new_element.parent = fb
922 fb.children.push new_element
923 # 18. Remove formatting element from the list of active formatting
924 # elements, and insert the new element into the list of active
925 # formatting elements at the position of the aforementioned
935 # 19. Remove formatting element from the stack of open elements,
936 # and insert the new element into the stack of open elements
937 # immediately below the position of furthest block in that stack.
944 open_els.splice i, 0, new_element
946 # 20. Jump back to the step labeled outer loop.
947 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
948 debug_log "tree: #{serialize_els doc.children, false, true}"
949 debug_log "open_els: #{serialize_els open_els, true, true}"
950 debug_log "afe: #{serialize_els afe, true, true}"
953 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
955 generate_implied_end_tags 'p' # arg is exception
956 if open_els[0].name isnt 'p'
958 while open_els.length > 1 # just in case
959 el = open_els.shift()
962 close_p_if_in_button_scope = ->
963 if is_in_button_scope 'p'
966 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
967 # aka insert_a_character = (t) ->
968 insert_character = (t) ->
969 dest = adjusted_insertion_location()
970 # fixfull check for Document node
972 prev = dest[0].children[dest[1] - 1]
973 if prev.type is TYPE_TEXT
976 dest[0].children.splice dest[1], 0, t
979 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
980 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the "appropriate place for inserting a node".
#
# override_target - optional node to use as the target instead of the current
#                   node (defaults to null)
#
# Returns a two-element array [target, target_i]: the parent node and the
# index within target.children at which a new node should be spliced.
#
# When flag_foster_parenting is set and the target's name is listed in
# foster_parenting_targets, the numbered substeps below choose the location
# relative to the last template / last table found in open_els. Otherwise the
# location is simply after the target's last child.
981 adjusted_insertion_location = (override_target = null) ->
982 # 1. If there was an override target specified, then let target be the
985 target = override_target
986 else # Otherwise, let target be the current node.
988 # 2. Determine the adjusted insertion location using the first matching
989 # steps from the following list:
991 # If foster parenting is enabled and target is a table, tbody, tfoot,
992 # thead, or tr element Foster parenting happens when content is
993 # misnested in tables.
994 if flag_foster_parenting and foster_parenting_targets[target.name]
995 loop # once. this is here so we can ``break`` to "abort these substeps"
996 # 1. Let last template be the last template element in the
997 # stack of open elements, if any.
999 last_template_i = null
1000 for el, i in open_els
1001 if el.name is 'template'
1005 # 2. Let last table be the last table element in the stack of
1006 # open elements, if any.
1009 for el, i in open_els
1010 if el.name is 'table'
1014 # 3. If there is a last template and either there is no last
1015 # table, or there is one, but last template is lower (more
1016 # recently added) than last table in the stack of open
1017 # elements, then: let adjusted insertion location be inside
1018 # last template's template contents, after its last child (if
1019 # any), and abort these substeps.
1020 if last_template and (last_table is null or last_template_i < last_table_i)
# Use the template element found above. This previously read the undefined
# name `template`, which would throw a ReferenceError at runtime.
1021 target = last_template # fixfull should be its contents
1022 target_i = target.children.length
1024 # 4. If there is no last table, then let adjusted insertion
1025 # location be inside the first element in the stack of open
1026 # elements (the html element), after its last child (if any),
1027 # and abort these substeps. (fragment case)
1028 if last_table is null
1030 target = open_els[open_els.length - 1]
1031 target_i = target.children.length
1032 # 5. If last table has a parent element, then let adjusted
1033 # insertion location be inside last table's parent element,
1034 # immediately before last table, and abort these substeps.
1035 if last_table.parent?
1036 for c, i in last_table.parent.children
1038 target = last_table.parent
1042 # 6. Let previous element be the element immediately above last
1043 # table in the stack of open elements.
1045 # huh? how could it not have a parent?
1046 previous_element = open_els[last_table_i + 1]
1047 # 7. Let adjusted insertion location be inside previous
1048 # element, after its last child (if any).
1049 target = previous_element
1050 target_i = target.children.length
1051 # Note: These steps are involved in part because it's possible
1052 # for elements, the table element in this case in particular,
1053 # to have been moved by a script around in the DOM, or indeed
1054 # removed from the DOM entirely, after the element was inserted
1056 break # don't really loop
1058 # Otherwise Let adjusted insertion location be inside target, after
1059 # its last child (if any).
1060 target_i = target.children.length
1062 # 3. If the adjusted insertion location is inside a template element,
1063 # let it instead be inside the template element's template contents,
1064 # after its last child (if any).
1065 # fixfull (template)
1067 # 4. Return the adjusted insertion location.
1068 return [target, target_i]
1070 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1071 # aka create_an_element_for_token
# Builds an element Node from tag token t.
#
# t               - the token; its type is overwritten with TYPE_TAG, and its
#                   attrs_a list of [name, value] pairs is walked to fill the
#                   attrs hash
# namespace       - stored as the new Node's namespace
# intended_parent - not referenced by any line visible in this listing
#
# The new Node keeps a reference to t in its `token` property.
# NOTE(review): the lines that initialise attrs, take each pair `a` from
# t.attrs_a, and return el are not visible here; callers do use the result as
# the created element.
1072 token_to_element = (t, namespace, intended_parent) ->
1073 t.type = TYPE_TAG # not TYPE_START_TAG
1074 # convert attributes into a hash
1076 while t.attrs_a.length
1078 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1079 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1081 # TODO 2. If the newly created element has an xmlns attribute in the
1082 # XMLNS namespace whose value is not exactly the same as the element's
1083 # namespace, that is a parse error. Similarly, if the newly created
1084 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1085 # value is not the XLink Namespace, that is a parse error.
1087 # fixfull: the spec says stuff about form pointers and ownerDocument
1091 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the
# children of the node chosen by adjusted_insertion_location().
# NOTE(review): the lines that unpack ail into ail_el / ail_i and whatever
# follows the splice (callers use the return value as the new element) are not
# visible in this listing — confirm.
1092 insert_foreign_element = (token, namespace) ->
1093 ail = adjusted_insertion_location()
1096 el = token_to_element token, namespace, ail_el
1097 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1099 ail_el.children.splice ail_i, 0, el
1102 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# insert_html_element is a plain alias of insert_foreign_element, so a call
# with only a token leaves the namespace argument undefined.
1103 insert_html_element = insert_foreign_element # (token, namespace) ->
1105 # FIXME read/implement "foster parenting" part
1106 # FIXME read spec, do this right
1107 # FIXME implement the override target thing
1108 # note: this assumes it's an open tag
1109 # FIXME what part of the spec is this?
1110 # TODO look through all callers of this, and see what they should really be doing.
1111 # eg probably insert_html_element for tokens
# Inserts el (an element Node or a start-tag token) at the adjusted insertion
# location.
#
# el              - Node, or a token whose type is TYPE_START_TAG (converted
#                   with token_to_element)
# override_target - forwarded to adjusted_insertion_location (default null)
# namespace       - namespace to assign to el (default null)
#
# NOTE(review): the line guarding `el.namespace = namespace` and everything
# after the splice are not visible in this listing.
1112 tree_insert_element = (el, override_target = null, namespace = null) ->
1114 el.namespace = namespace
1115 dest = adjusted_insertion_location override_target
1116 if el.type is TYPE_START_TAG # means it's a "token"
1117 el = token_to_element el, namespace, dest[0]
1118 unless el.namespace?
# Inherit the namespace of the node we are inserting into. dest is a
# [node, index] pair, so the node is dest[0]. This previously assigned
# `dest.namespace` (always undefined) to the local `namespace`, which had no
# effect on el.
1119 el.namespace = dest[0].namespace
1120 # fixfull: Document nodes sometimes can't accept more children
1121 dest[0].children.splice dest[1], 0, el
1126 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1127 # position should be [node, index_within_children]
# Splices comment token t into position[0].children at index position[1].
# When position is null, the adjusted insertion location is used instead.
1128 insert_comment = (t, position = null) ->
1129 position ?= adjusted_insertion_location()
1130 position[0].children.splice position[1], 0, t
1133 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an HTML element for token t, switches the tokenizer to
# tok_state_rawtext, saves the current insertion mode in
# original_insertion_mode and then switches to ins_mode_text.
1134 parse_generic_raw_text = (t) ->
1135 insert_html_element t
1136 tok_state = tok_state_rawtext
1137 original_insertion_mode = insertion_mode
1138 insertion_mode = ins_mode_text
# RCDATA counterpart of the generic raw text algorithm: inserts an HTML element
# for token t, switches the tokenizer to tok_state_rcdata, saves the current
# insertion mode in original_insertion_mode and then switches to ins_mode_text.
1139 parse_generic_rcdata_text = (t) ->
1140 insert_html_element t
1141 tok_state = tok_state_rcdata
1142 original_insertion_mode = insertion_mode
1143 insertion_mode = ins_mode_text
1145 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1146 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) has a name listed in
# end_tag_implied and that name differs from `except`.
#
# except - tag name that must be left open (default null: no exception)
#
# NOTE(review): the loop body is not visible in this listing — presumably it
# pops the current node off open_els; confirm.
1147 generate_implied_end_tags = (except = null) ->
1148 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1151 # 8.2.5.4 The rules for parsing tokens in HTML content
1152 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1154 # 8.2.5.4.1 The "initial" insertion mode
1155 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Token handler for the "initial" insertion mode.
#
# Visible branches: comment tokens, doctype tokens (after which the mode
# becomes ins_mode_before_html), and a fallback that switches to
# ins_mode_before_html and reprocesses t in that mode.
# NOTE(review): the bodies of the comment and doctype branches and any earlier
# whitespace handling are not visible in this listing.
1156 ins_mode_initial = (t) ->
1159 if t.type is TYPE_COMMENT
1160 # fixfull this is supposed to be "the last child of the document object"
1163 if t.type is TYPE_DOCTYPE
1164 # FIXME check identifiers, set quirks, etc
1167 insertion_mode = ins_mode_before_html
1170 #fixfull (iframe, quirks)
1171 insertion_mode = ins_mode_before_html
1172 insertion_mode t # reprocess the token
1175 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Token handler for the "before html" insertion mode.
#
# Visible branches:
#   * doctype and comment tokens (bodies not visible in this listing)
#   * <html> start tag: an element is created with doc as intended parent,
#     pushed onto the front of open_els, and the mode becomes
#     ins_mode_before_head
#   * end tags: head/body/html/br fall through to "anything else"
#   * anything else: an 'html' element is synthesised from a new open tag,
#     appended to doc.children, and the mode becomes ins_mode_before_head
# NOTE(review): returns and reprocessing of the token after the fallback are
# not visible here.
1176 ins_mode_before_html = (t) ->
1177 if t.type is TYPE_DOCTYPE
1180 if t.type is TYPE_COMMENT
1185 if t.type is TYPE_START_TAG and t.name is 'html'
1186 el = token_to_element t, NS_HTML, doc
1187 open_els.unshift(el)
1188 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1189 insertion_mode = ins_mode_before_head
1191 if t.type is TYPE_END_TAG
1192 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1193 # fall through to "anything else"
1198 html_tok = new_open_tag 'html'
1199 el = token_to_element html_tok, NS_HTML, doc
1200 doc.children.push el
1202 # ?fixfull browsing context
1203 insertion_mode = ins_mode_before_head
1207 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Token handler for the "before head" insertion mode.
#
# Visible branches:
#   * comment, doctype and <html> start tag (bodies not visible in this listing)
#   * <head> start tag: the element is inserted, remembered in
#     head_element_pointer, and the mode becomes ins_mode_in_head
#   * end tags: head/body/html/br fall through to "anything else"
#   * anything else: a 'head' element is synthesised and inserted, remembered
#     in head_element_pointer, the mode becomes ins_mode_in_head and t is
#     reprocessed in that mode
1208 ins_mode_before_head = (t) ->
1211 if t.type is TYPE_COMMENT
1214 if t.type is TYPE_DOCTYPE
1217 if t.type is TYPE_START_TAG and t.name is 'html'
1220 if t.type is TYPE_START_TAG and t.name is 'head'
1221 el = insert_html_element t
1222 head_element_pointer = el
1223 insertion_mode = ins_mode_in_head
1224 if t.type is TYPE_END_TAG
1225 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1226 # fall through to Anything else below
1231 head_tok = new_open_tag 'head'
1232 el = insert_html_element head_tok
1233 head_element_pointer = el
1234 insertion_mode = ins_mode_in_head
1235 insertion_mode t # reprocess current token
1237 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode: pops the current node
# off open_els and switches to ins_mode_after_head.
# NOTE(review): any reprocessing of t after the mode switch is not visible in
# this listing.
1238 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1239 open_els.shift() # spec says this will be a 'head' node
1240 insertion_mode = ins_mode_after_head
# Token handler for the "in head" insertion mode.
#
# t - the token to process
#
# Visible branches, in order: single whitespace characters; comment; doctype;
# <html>; void head elements (base/basefont/bgsound/link) and <meta>, which are
# inserted and acknowledged as self-closing; <title> (generic RCDATA parsing);
# <noscript> with scripting on, <noframes> and <style> (generic raw text
# parsing); <noscript> with scripting off; <script>; </head>;
# </body>, </html>, </br>; <template>; </template>; stray <head> or other end
# tags; and finally the "anything else" fallback (ins_mode_in_head_else).
# NOTE(review): several branch bodies and all early returns are missing from
# this listing.
1242 ins_mode_in_head = (t) ->
1243 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1246 if t.type is TYPE_COMMENT
1249 if t.type is TYPE_DOCTYPE
1252 if t.type is TYPE_START_TAG and t.name is 'html'
1255 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1256 el = insert_html_element t
1258 t.acknowledge_self_closing()
1260 if t.type is TYPE_START_TAG and t.name is 'meta'
1261 el = insert_html_element t
1263 t.acknowledge_self_closing()
1264 # fixfull encoding stuff
1266 if t.type is TYPE_START_TAG and t.name is 'title'
1267 parse_generic_rcdata_text t
1269 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1270 parse_generic_raw_text t
1272 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1273 insert_html_element t
# Use the handler's real name. The previous `in_head_noscript` is not defined
# anywhere, so the next token would have crashed the parser.
1274 insertion_mode = ins_mode_in_head_noscript # FIXME implement
1276 if t.type is TYPE_START_TAG and t.name is 'script'
1277 ail = adjusted_insertion_location()
# ail is a [node, index] pair; the intended parent is the node itself.
1278 el = token_to_element t, NS_HTML, ail[0]
1279 el.flag 'parser-inserted', true # FIXME implement
1280 # fixfull fragment case
1281 ail[0].children.splice ail[1], 0, el
1283 tok_state = tok_state_script_data
1284 original_insertion_mode = insertion_mode # make sure orig... is defined
1285 insertion_mode = ins_mode_text # FIXME implement
1287 if t.type is TYPE_END_TAG and t.name is 'head'
1288 open_els.shift() # will be a head element... spec says so
1289 insertion_mode = ins_mode_after_head
1291 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1292 ins_mode_in_head_else t
1294 if t.type is TYPE_START_TAG and t.name is 'template'
1295 insert_html_element t
1297 flag_frameset_ok = false
1298 insertion_mode = ins_mode_in_template
1299 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1301 if t.type is TYPE_END_TAG and t.name is 'template'
1302 if template_tag_is_open()
# Parentheses are required: in CoffeeScript a bare function name is only a
# reference, so the implied end tags were never generated before.
1303 generate_implied_end_tags()
1304 if open_els[0].name isnt 'template'
1307 el = open_els.shift()
1308 if el.name is 'template'
1310 clear_afe_to_marker()
1311 template_insertion_modes.shift()
1312 reset_insertion_mode()
1316 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1319 ins_mode_in_head_else t
1321 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# Placeholder handler for the "in head noscript" insertion mode: the only
# visible statement logs that the mode is unimplemented.
1322 ins_mode_in_head_noscript = (t) ->
1324 console.log "ins_mode_in_head_noscript unimplemented"
1326 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode: synthesises and
# inserts a 'body' element, switches to ins_mode_in_body and reprocesses t in
# that mode.
1327 ins_mode_after_head_else = (t) ->
1328 body_tok = new_open_tag 'body'
1329 insert_html_element body_tok
1330 insertion_mode = ins_mode_in_body
1331 insertion_mode t # reprocess token
# Token handler for the "after head" insertion mode.
#
# t - the token to process
#
# Visible branches: comment; doctype; <html>; <body> (inserted, frameset-ok
# cleared, mode becomes ins_mode_in_body); <frameset> (inserted, mode becomes
# ins_mode_in_frameset); head-only start tags, for which head_element_pointer
# is temporarily pushed back onto open_els and removed again afterwards;
# </template>; </body>, </html>, </br>; stray <head> or other end tags; and
# the "anything else" fallback (ins_mode_after_head_else).
# NOTE(review): several branch bodies and all early returns are missing from
# this listing.
1333 ins_mode_after_head = (t) ->
1337 if t.type is TYPE_COMMENT
1340 if t.type is TYPE_DOCTYPE
1343 if t.type is TYPE_START_TAG and t.name is 'html'
1346 if t.type is TYPE_START_TAG and t.name is 'body'
1347 insert_html_element t
1348 flag_frameset_ok = false
1349 insertion_mode = ins_mode_in_body
1351 if t.type is TYPE_START_TAG and t.name is 'frameset'
1352 insert_html_element t
1353 insertion_mode = ins_mode_in_frameset
1355 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1357 open_els.unshift head_element_pointer
# Array iteration needs `in`. With `of`, el was the index key and i the
# element, so the head element was never matched or removed.
1359 for el, i in open_els
1360 if el is head_element_pointer
1361 open_els.splice i, 1
1363 console.log "warning: 23904 couldn't find head element in open_els"
1365 if t.type is TYPE_END_TAG and t.name is 'template'
1368 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1369 ins_mode_after_head_else t
1371 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1375 ins_mode_after_head_else t
1377 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" insertion mode.
#
# name - tag name of the end tag being processed
#
# Walks open_els from the current node (index 0). When a node with the given
# name is found, implied end tags are generated (except for `name`) and a
# parse error is reported unless the match is the current node. Nodes listed
# in special_elements are also tested for.
# NOTE(review): the popping of the matched node and the body of the
# special-element branch are not visible in this listing — confirm.
1378 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1379 for node, i in open_els
1380 if node.name is name # FIXME check namespace too
1381 generate_implied_end_tags name # arg is exception
1382 parse_error() unless i is 0
1387 if special_elements[node.name]? # FIXME check namespace too
# Token handler for the "in body" insertion mode.
#
# t - the token to process
#
# This listing shows only fragments of the handler (the switch headers and
# many `when` lines are missing). The visible fragments cover:
#   * text: active formatting elements are reconstructed, and
#     flag_frameset_ok is cleared
#   * attribute merging into the root element (open_els' last entry), skipped
#     when a template is open
#   * head-only start tags, delegated to ins_mode_in_head
#   * block-level and heading start tags, which first close an open p that is
#     in button scope
#   * start tags that touch the list of active formatting elements (the long
#     comment below describes the 'a' case; then b, big, code, ...)
#   * a branch that switches to ins_mode_in_table
#   * any other start tag
#   * end tags: scope checks for 'body', block-level end tags, an end tag that
#     synthesises a 'p' when none is in button scope, formatting end tags
#     (adoption_agency), and the generic in_body_any_other_end_tag fallback
# NOTE(review): which tag each fragment belongs to is inferred from nearby
# lines where the `when` line is missing; confirm against the full file.
1390 ins_mode_in_body = (t) ->
1396 when "\t", "\u000a", "\u000c", "\u000d", ' '
1397 reconstruct_active_formatting_elements()
1400 reconstruct_active_formatting_elements()
1402 flag_frameset_ok = false
1411 return if template_tag_is_open()
1412 root_attrs = open_els[open_els.length - 1].attrs
1414 root_attrs[k] = v unless root_attrs[k]?
1415 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1416 # FIXME also do this for </template> (end tag)
1417 return ins_mode_in_head t
1424 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1425 close_p_if_in_button_scope()
1426 insert_html_element t
1427 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1428 close_p_if_in_button_scope()
1429 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1432 insert_html_element t
1433 # TODO lots more to implement here
1435 # If the list of active formatting elements
1436 # contains an a element between the end of the list and
1437 # the last marker on the list (or the start of the list
1438 # if there is no marker on the list), then this is a
1439 # parse error; run the adoption agency algorithm for
1440 # the tag name "a", then remove that element from the
1441 # list of active formatting elements and the stack of
1442 # open elements if the adoption agency algorithm didn't
1443 # already remove it (it might not have if the element
1444 # is not in table scope).
1447 if el.type is TYPE_AFE_MARKER
1457 for el, i in open_els
1459 open_els.splice i, 1
1460 reconstruct_active_formatting_elements()
1461 el = insert_html_element t
1463 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1464 reconstruct_active_formatting_elements()
1465 el = insert_html_element t
1468 # fixfull quirksmode thing
1469 close_p_if_in_button_scope()
1470 insert_html_element t
1471 insertion_mode = ins_mode_in_table
1472 # TODO lots more to implement here
1473 else # any other start tag
1474 reconstruct_active_formatting_elements()
1475 insert_html_element t
1478 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1479 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1482 unless ok_tags[t.name]?
1485 # TODO stack of template insertion modes thing
1490 unless is_in_scope 'body'
1493 # TODO implement parse error and move to tree_after_body
1495 unless is_in_scope 'body' # weird, but it's what the spec says
1498 # TODO implement parse error and move to tree_after_body, reprocess
1499 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1500 unless is_in_scope t.name, NS_HTML
1503 generate_implied_end_tags()
1504 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1507 el = open_els.shift()
1508 if el.name is t.name and el.namespace is NS_HTML
1510 # TODO lots more close tags to implement here
1512 unless is_in_button_scope 'p'
1514 insert_html_element new_open_tag 'p'
1516 # TODO lots more close tags to implement here
1517 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1518 adoption_agency t.name
1519 # TODO lots more close tags to implement here
1521 in_body_any_other_end_tag t.name
# "Anything else" branch of the "in table" insertion mode: turns
# flag_foster_parenting on, and turns it off again afterwards.
# NOTE(review): the work done while foster parenting is enabled is not visible
# in this listing — presumably t is processed with the "in body" rules; confirm.
1524 ins_mode_in_table_else = (t) ->
1526 flag_foster_parenting = true # FIXME
1528 flag_foster_parenting = false
1529 can_in_table = { # FIXME do this inline like everywhere else
1537 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Token handler for the "text" insertion mode (entered by the generic raw text
# / RCDATA algorithms and by <script>).
#
# Visible branches:
#   * text tokens (body not visible in this listing)
#   * EOF: a current 'script' node gets its 'already started' flag set, then
#     the mode returns to original_insertion_mode
#   * </script> and any other end tag: the mode returns to
#     original_insertion_mode
# Falling off the end logs a warning.
# NOTE(review): popping of the current node and early returns are not visible
# in this listing.
1538 ins_mode_text = (t) ->
1539 if t.type is TYPE_TEXT
1542 if t.type is TYPE_EOF
1544 if open_els[0].name is 'script'
1545 open_els[0].flag 'already started', true
1547 insertion_mode = original_insertion_mode
1550 if t.type is TYPE_END_TAG and t.name is 'script'
1552 insertion_mode = original_insertion_mode
1553 # fixfull the spec seems to assume that I'm going to run the script
1554 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1556 if t.type is TYPE_END_TAG
1558 insertion_mode = original_insertion_mode
1560 console.log 'warning: end of ins_mode_text reached'
1562 # the functions below implement the tokenizer states described here:
1563 # http://www.w3.org/TR/html5/syntax.html#tokenization
1565 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Token handler for the "in table" insertion mode.
#
# t - the token to process
#
# Visible fragments:
#   * when the tested name is listed in can_in_table, the current mode is
#     saved and ins_mode_in_table_text takes over; otherwise the "anything
#     else" handler runs
#   * table-structure start tags: the stack is cleared back to a table
#     context, the element (or a synthesised colgroup/tbody) is inserted, and
#     the mode becomes in_caption / in_column_group / in_table_body
#   * two branches that pop open_els up to a 'table' element when one is in
#     table scope, then reset the insertion mode
#   * style/script/template, hidden <input>, and a branch that sets
#     form_element_pointer
#   * everything else goes to ins_mode_in_table_else
# NOTE(review): the switch headers and several `when` lines are missing from
# this listing, so which tag some fragments belong to is inferred; confirm.
1566 ins_mode_in_table = (t) ->
1569 if can_in_table[t.name]
1570 original_insertion_mode = insertion_mode
1571 insertion_mode = ins_mode_in_table_text
1574 ins_mode_in_table_else t
1582 clear_stack_to_table_context()
1584 insert_html_element t
1585 insertion_mode = ins_mode_in_caption
1587 clear_stack_to_table_context()
1588 insert_html_element t
1589 insertion_mode = ins_mode_in_column_group
1591 clear_stack_to_table_context()
1592 insert_html_element new_open_tag 'colgroup'
1593 insertion_mode = ins_mode_in_column_group
1595 when 'tbody', 'tfoot', 'thead'
1596 clear_stack_to_table_context()
1597 insert_html_element t
1598 insertion_mode = ins_mode_in_table_body
1599 when 'td', 'th', 'tr'
1600 clear_stack_to_table_context()
1601 insert_html_element new_open_tag 'tbody'
1602 insertion_mode = ins_mode_in_table_body
1606 if is_in_table_scope 'table'
1608 el = open_els.shift()
1609 if el.name is 'table'
1611 reset_insertion_mode()
1613 when 'style', 'script', 'template'
1616 if token_is_input_hidden t
1617 ins_mode_in_table_else t
1620 el = insert_html_element t
1622 t.acknowledge_self_closing()
1625 if form_element_pointer?
1627 if template_tag_is_open()
1629 form_element_pointer = insert_html_element t
1632 ins_mode_in_table_else t
1636 if is_in_table_scope 'table'
1638 el = open_els.shift()
1639 if el.name is 'table'
1641 reset_insertion_mode()
1644 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1649 ins_mode_in_table_else t
1653 ins_mode_in_table_else t
1656 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Token handler for the "in table text" insertion mode.
#
# t - the token to process
#
# Text tokens are queued in pending_table_character_tokens. On any other
# token the queue is scanned for non-whitespace: the queued tokens are then
# either inserted as characters or sent through the in-table "anything else"
# handler. Finally the queue is emptied and the saved insertion mode is
# restored.
# NOTE(review): the branch structure deciding between those two loops, the
# early returns and the reprocessing of t are missing from this listing.
1657 ins_mode_in_table_text = (t) ->
1658 if t.type is TYPE_TEXT and t.text is "\u0000"
1659 # huh? I thought the tokenizer didn't emit these
1662 if t.type is TYPE_TEXT
1663 pending_table_character_tokens.push t
1667 for old in pending_table_character_tokens
1668 unless is_space_tok old
1672 for old in pending_table_character_tokens
1673 insert_character old
1675 for old in pending_table_character_tokens
# Call the handler by its real name. The previous `ins_mode_table_else` is
# undefined and would throw as soon as non-whitespace text appeared in a table.
1676 ins_mode_in_table_else old
1677 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1678 insertion_mode = original_insertion_mode
1681 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Token handler for the "in caption" insertion mode.
#
# t - the token to process
#
# Visible branches:
#   * </caption>: when a caption is in table scope, implied end tags are
#     generated, open_els is popped up to the caption, the active formatting
#     list is cleared to the last marker and the mode returns to "in table"
#   * table-structure start tags or </table>: same popping/cleanup, then back
#     to "in table"
#   * stray end tags (body, col, colgroup, html, tbody, ...)
# NOTE(review): the loops around open_els.shift(), parse-error calls, returns
# and the final fallback are missing from this listing.
1682 ins_mode_in_caption = (t) ->
1683 if t.type is TYPE_END_TAG and t.name is 'caption'
1684 if is_in_table_scope 'caption'
1685 generate_implied_end_tags()
1686 if open_els[0].name isnt 'caption'
1689 el = open_els.shift()
1690 if el.name is 'caption'
1692 clear_afe_to_marker()
# The handler is named ins_mode_in_table; the previous `in_table` is undefined.
1693 insertion_mode = ins_mode_in_table
1698 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1700 if is_in_table_scope 'caption'
1702 el = open_els.shift()
1703 if el.name is 'caption'
1705 clear_afe_to_marker()
# Same fix as above: use the real handler name.
1706 insertion_mode = ins_mode_in_table
1708 # else fragment case
1710 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1716 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Token handler for the "in column group" insertion mode.
#
# Visible branches: comment; doctype; <html>; <col> (inserted and acknowledged
# as self-closing); </colgroup> (mode returns to ins_mode_in_table when the
# current node is a colgroup); </col>; <template> / </template>; EOF; and a
# fallback that checks the current node for 'colgroup' and returns to
# ins_mode_in_table.
# NOTE(review): most branch bodies, pops of open_els and returns are missing
# from this listing.
1717 ins_mode_in_column_group = (t) ->
1721 if t.type is TYPE_COMMENT
1724 if t.type is TYPE_DOCTYPE
1727 if t.type is TYPE_START_TAG and t.name is 'html'
1730 if t.type is TYPE_START_TAG and t.name is 'col'
1731 el = insert_html_element t
1733 t.acknowledge_self_closing()
1735 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1736 if open_els[0].name is 'colgroup'
1738 insertion_mode = ins_mode_in_table
1742 if t.type is TYPE_END_TAG and t.name is 'col'
1745 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1748 if t.type is TYPE_EOF
1752 if open_els[0].name isnt 'colgroup'
1756 insertion_mode = ins_mode_in_table
1760 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Token handler for the "in table body" insertion mode.
#
# Visible branches:
#   * <tr>: stack cleared to a table body context, element inserted, mode
#     becomes ins_mode_in_row
#   * <th>/<td>: same, but a 'tr' element is synthesised first
#   * </tbody>, </tfoot>, </thead>: requires the element in table scope, then
#     clears the stack and returns to ins_mode_in_table
#   * table-structure start tags or </table>: scans for an open
#     tbody/tfoot/thead (stopping at table_scopers), then clears the stack and
#     returns to ins_mode_in_table
#   * stray end tags (body, caption, col, ...)
# NOTE(review): loops, pops of open_els, reprocessing and the final fallback
# are missing from this listing.
1761 ins_mode_in_table_body = (t) ->
1762 if t.type is TYPE_START_TAG and t.name is 'tr'
1763 clear_stack_to_table_body_context()
1764 insert_html_element t
1765 insertion_mode = ins_mode_in_row
1767 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1769 clear_stack_to_table_body_context()
1770 insert_html_element new_open_tag 'tr'
1771 insertion_mode = ins_mode_in_row
1774 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1775 unless is_in_table_scope t.name # fixfull check namespace
1778 clear_stack_to_table_body_context()
1780 insertion_mode = ins_mode_in_table
1782 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1785 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1788 if table_scopers[el.name]
1793 clear_stack_to_table_body_context()
1795 insertion_mode = ins_mode_in_table
1798 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1804 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Token handler for the "in row" insertion mode.
#
# Visible branches:
#   * <th>/<td>: stack cleared to a table row context, element inserted, mode
#     becomes ins_mode_in_cell
#   * </tr>: when a tr is in table scope, the stack is cleared and the mode
#     returns to ins_mode_in_table_body
#   * table-structure start tags or </table>: same handling as </tr>
#   * </tbody>, </tfoot>, </thead>: requires both that element and a tr in
#     table scope before clearing the stack and returning to
#     ins_mode_in_table_body
#   * stray end tags (body, caption, col, ...)
# NOTE(review): pops of open_els, marker insertion, reprocessing and the final
# fallback are missing from this listing.
1805 ins_mode_in_row = (t) ->
1806 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1807 clear_stack_to_table_row_context()
1808 insert_html_element t
1809 insertion_mode = ins_mode_in_cell
1812 if t.type is TYPE_END_TAG and t.name is 'tr'
1813 if is_in_table_scope 'tr'
1814 clear_stack_to_table_row_context()
1816 insertion_mode = ins_mode_in_table_body
1820 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1821 if is_in_table_scope 'tr'
1822 clear_stack_to_table_row_context()
1824 insertion_mode = ins_mode_in_table_body
1829 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1830 if is_in_table_scope t.name # fixfull namespace
1831 if is_in_table_scope 'tr'
1832 clear_stack_to_table_row_context()
1834 insertion_mode = ins_mode_in_table_body
1839 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1845 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell": generates implied end tags, checks that the current node is
# a td or th, pops open_els up to a td/th, clears the active formatting
# elements to the last marker and switches to ins_mode_in_row.
# NOTE(review): the definition line, the loop around open_els.shift() and the
# body of the check below are missing from this listing.
1847 generate_implied_end_tags()
# Compare the node's name. The previous `open_els[0] is 'th'` compared the node
# object to a string, so an open th always failed this check.
1848 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1851 el = open_els.shift()
1852 if el.name is 'td' or el.name is 'th'
1854 clear_afe_to_marker()
1855 insertion_mode = ins_mode_in_row
1857 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Token handler for the "in cell" insertion mode.
#
# Visible branches:
#   * </td>, </th>: when that element is in table scope, implied end tags are
#     generated, open_els is popped up to it, the active formatting list is
#     cleared to the last marker and the mode returns to ins_mode_in_row
#   * table-structure start tags: scans for an open td/th, stopping at
#     table_scopers
#   * stray end tags (body, caption, col, colgroup, html)
#   * </table>, </tbody>, </tfoot>, </thead>, </tr>: requires the element in
#     table scope
# NOTE(review): loops, parse-error calls, cell closing/reprocessing and the
# final fallback are missing from this listing.
1858 ins_mode_in_cell = (t) ->
1859 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1860 if is_in_table_scope t.name
1861 generate_implied_end_tags()
1862 if open_els[0].name isnt t.name
1865 el = open_els.shift()
1866 if el.name is t.name
1868 clear_afe_to_marker()
1869 insertion_mode = ins_mode_in_row
1873 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1876 if el.name is 'td' or el.name is 'th'
1879 if table_scopers[el.name]
1887 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1890 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1891 if is_in_table_scope t.name # fixfull namespace
1900 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Token handler for the "in select" insertion mode.
#
# Visible branches: NUL text; other text; comment; doctype; <html>;
# <option> and <optgroup> (inserted after testing the current node for an open
# option/optgroup); </optgroup>; </option>; </select> (when a select is in
# select scope, open_els is popped up to it and the insertion mode is reset);
# a nested <select> (same popping, without the scope check — see the comment
# below); <input>/<keygen>/<textarea>; <script>/<template>; and EOF.
# NOTE(review): most branch bodies, the loops around open_els.shift(), returns
# and the final fallback are missing from this listing.
1901 ins_mode_in_select = (t) ->
1902 if t.type is TYPE_TEXT and t.text is "\u0000"
1905 if t.type is TYPE_TEXT
1908 if t.type is TYPE_COMMENT
1911 if t.type is TYPE_DOCTYPE
1914 if t.type is TYPE_START_TAG and t.name is 'html'
1917 if t.type is TYPE_START_TAG and t.name is 'option'
1918 if open_els[0].name is 'option'
1920 insert_html_element t
1922 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1923 if open_els[0].name is 'option'
1925 if open_els[0].name is 'optgroup'
1927 insert_html_element t
1929 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1930 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1932 if open_els[0].name is 'optgroup'
1937 if t.type is TYPE_END_TAG and t.name is 'option'
1938 if open_els[0].name is 'option'
1943 if t.type is TYPE_END_TAG and t.name is 'select'
1944 if is_in_select_scope 'select'
1946 el = open_els.shift()
1947 if el.name is 'select'
1949 reset_insertion_mode()
1953 if t.type is TYPE_START_TAG and t.name is 'select'
1956 el = open_els.shift()
1957 if el.name is 'select'
1959 reset_insertion_mode()
1960 # spec says that this is the same as </select> but it doesn't say
1961 # to check scope first
1963 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1965 if is_in_select_scope 'select'
1968 el = open_els.shift()
1969 if el.name is 'select'
1971 reset_insertion_mode()
1974 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1977 if t.type is TYPE_EOF
1984 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Token handler for the "in select in table" insertion mode.
#
# Table-related start tags pop open_els up to the select and reset the
# insertion mode. The matching end tags do the same, but only after a test
# that the named element is in table scope. Everything else is delegated to
# ins_mode_in_select.
# NOTE(review): the loops around open_els.shift(), parse-error calls, returns
# and reprocessing of t are missing from this listing.
1985 ins_mode_in_select_in_table = (t) ->
1986 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1989 el = open_els.shift()
1990 if el.name is 'select'
1992 reset_insertion_mode()
1995 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1997 unless is_in_table_scope t.name, NS_HTML
2000 el = open_els.shift()
2001 if el.name is 'select'
2003 reset_insertion_mode()
2007 ins_mode_in_select t
2010 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Token handler for the "in template" insertion mode.
#
# Visible branches:
#   * text, comment, doctype; head-only start tags and </template> (bodies not
#     visible in this listing)
#   * start tags that pick a new mode: the front entry of
#     template_insertion_modes is replaced and insertion_mode is set to
#     in_table (caption/colgroup/tbody/tfoot/thead), in_column_group (col),
#     in_table_body (tr), in_row (td/th) or in_body (any other start tag)
#   * other end tags
#   * a final section that, when a template is open, pops open_els up to the
#     template, clears the active formatting list to the last marker, drops
#     the front template insertion mode and resets the insertion mode
# NOTE(review): returns, reprocessing of t and the loop around
# open_els.shift() are missing from this listing.
2011 ins_mode_in_template = (t) ->
2012 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2015 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2018 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2019 template_insertion_modes.shift()
2020 template_insertion_modes.unshift ins_mode_in_table
2021 insertion_mode = ins_mode_in_table
2024 if t.type is TYPE_START_TAG and t.name is 'col'
2025 template_insertion_modes.shift()
2026 template_insertion_modes.unshift ins_mode_in_column_group
2027 insertion_mode = ins_mode_in_column_group
2030 if t.type is TYPE_START_TAG and t.name is 'tr'
2031 template_insertion_modes.shift()
2032 template_insertion_modes.unshift ins_mode_in_table_body
2033 insertion_mode = ins_mode_in_table_body
2036 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2037 template_insertion_modes.shift()
2038 template_insertion_modes.unshift ins_mode_in_row
2039 insertion_mode = ins_mode_in_row
2042 if t.type is TYPE_START_TAG
2043 template_insertion_modes.shift()
2044 template_insertion_modes.unshift ins_mode_in_body
2045 insertion_mode = ins_mode_in_body
2048 if t.type is TYPE_END_TAG
2052 unless template_tag_is_open()
2057 el = open_els.shift()
2058 if el.name is 'template' # fixfull check namespace
2060 clear_afe_to_marker()
2061 template_insertion_modes.shift()
2062 reset_insertion_mode()
2065 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Token handler for the "after body" insertion mode.
#
# Visible branches: comments are appended as the last child of the current
# node (open_els[0]); doctype; <html>; </html> switches to
# ins_mode_after_after_body; EOF; and a fallback that switches back to
# ins_mode_in_body.
# NOTE(review): whitespace handling, returns and reprocessing of t after the
# fallback are missing from this listing.
2066 ins_mode_after_body = (t) ->
2070 if t.type is TYPE_COMMENT
2071 insert_comment t, [open_els[0], open_els[0].children.length]
2073 if t.type is TYPE_DOCTYPE
2076 if t.type is TYPE_START_TAG and t.name is 'html'
2079 if t.type is TYPE_END_TAG and t.name is 'html'
2080 # fixfull fragment case
2081 insertion_mode = ins_mode_after_after_body
2083 if t.type is TYPE_EOF
2088 insertion_mode = ins_mode_in_body
2091 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Token handler for the "in frameset" insertion mode.
#
# t - the token to process
#
# Visible branches: comment; doctype; <html>; <frameset> (inserted);
# </frameset> (ignored in the fragment case where open_els has a single entry;
# otherwise the mode may become ins_mode_after_frameset); <frame> (inserted
# and acknowledged as self-closing); <noframes>; and EOF.
# NOTE(review): several branch bodies, the pop of the current node and all
# returns are missing from this listing.
2092 ins_mode_in_frameset = (t) ->
2096 if t.type is TYPE_COMMENT
2099 if t.type is TYPE_DOCTYPE
2102 if t.type is TYPE_START_TAG and t.name is 'html'
2105 if t.type is TYPE_START_TAG and t.name is 'frameset'
2106 insert_html_element t
2108 if t.type is TYPE_END_TAG and t.name is 'frameset'
2109 # TODO ?correct for: "if the current node is the root html element"
2110 if open_els.length is 1
2112 return # fragment case
2114 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2115 insertion_mode = ins_mode_after_frameset
2117 if t.type is TYPE_START_TAG and t.name is 'frame'
2118 insert_html_element t
2120 t.acknowledge_self_closing()
# The constant is TYPE_START_TAG. The previous `TYPE_START TAG` parsed as a
# call TYPE_START(TAG and ...) on two undefined names.
2122 if t.type is TYPE_START_TAG and t.name is 'noframes'
2125 if t.type is TYPE_EOF
2126 # TODO ?correct for: "if the current node is not the root html element"
2127 if open_els.length isnt 1
2135 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Token handler for the "after frameset" insertion mode.
#
# t - the token to process
#
# Visible branches: comment; doctype; <html>; </html> (switches to
# ins_mode_after_after_frameset); <noframes>; and EOF.
# NOTE(review): the branch bodies other than the </html> one, and all returns,
# are missing from this listing.
2136 ins_mode_after_frameset = (t) ->
2140 if t.type is TYPE_COMMENT
2143 if t.type is TYPE_DOCTYPE
2146 if t.type is TYPE_START_TAG and t.name is 'html'
2149 if t.type is TYPE_END_TAG and t.name is 'html'
# Assign the shared insertion_mode variable. The previous `insert_mode` only
# created an unused local, so the mode never changed.
2150 insertion_mode = ins_mode_after_after_frameset
2152 if t.type is TYPE_START_TAG and t.name is 'noframes'
2155 if t.type is TYPE_EOF
2162 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Token handler for the "after after body" insertion mode.
#
# Comments are appended as the last child of doc. Doctype, whitespace and
# <html> tokens share one branch; EOF has its own; anything else switches back
# to ins_mode_in_body.
# NOTE(review): the bodies of the shared and EOF branches, returns, and
# reprocessing of t after the fallback are missing from this listing.
2163 ins_mode_after_after_body = (t) ->
2164 if t.type is TYPE_COMMENT
2165 insert_comment t, [doc, doc.children.length]
2167 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2170 if t.type is TYPE_EOF
2175 insertion_mode = ins_mode_in_body
2178 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Token handler for the "after after frameset" insertion mode.
#
# Comments are appended as the last child of doc. Doctype, whitespace and
# <html> tokens share one branch; EOF and <noframes> have their own.
# NOTE(review): the bodies of those branches and the final fallback are missing
# from this listing.
2179 ins_mode_after_after_frameset = (t) ->
2180 if t.type is TYPE_COMMENT
2181 insert_comment t, [doc, doc.children.length]
2183 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2186 if t.type is TYPE_EOF
2189 if t.type is TYPE_START_TAG and t.name is 'noframes'
2200 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer "data" state: consumes one character from txt at cur (advancing
# cur) and dispatches on it. The visible outcomes are a text node built from
# parse_character_reference(), a switch to tok_state_tag_open, a text node for
# the character itself, or an EOF token.
# NOTE(review): the definition line and the `when` lines are missing from this
# listing, so which character triggers which outcome is not shown here.
2202 switch c = txt.charAt(cur++)
2204 return new_text_node parse_character_reference()
2206 tok_state = tok_state_tag_open
2209 return new_text_node c
2211 return new_eof_token()
2213 return new_text_node c
2216 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2217 # not needed: tok_state_character_reference_in_data = ->
2218 # just call parse_character_reference()
2220 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2221 tok_state_rcdata = ->
2222 switch c = txt.charAt(cur++)
2224 return new_text_node parse_character_reference()
2226 tok_state = tok_state_rcdata_less_than_sign
2229 return new_character_token "\ufffd"
2231 return new_eof_token()
2233 return new_character_token c
2236 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2237 # not needed: tok_state_character_reference_in_rcdata = ->
2238 # just call parse_character_reference()
2240 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2241 tok_state_rawtext = ->
2242 switch c = txt.charAt(cur++)
2244 tok_state = tok_state_rawtext_less_than_sign
2247 return new_character_token "\ufffd"
2249 return new_eof_token()
2251 return new_character_token c
2254 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2255 tok_state_script_data = ->
2256 switch c = txt.charAt(cur++)
2258 tok_state = tok_state_script_data_less_than_sign
2261 return new_character_token "\ufffd"
2263 return new_eof_token()
2265 return new_character_token c
2268 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2269 tok_state_plaintext = ->
2270 switch c = txt.charAt(cur++)
2273 return new_character_token "\ufffd"
2275 return new_eof_token()
2277 return new_character_token c
2281 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2282 tok_state_tag_open = ->
2283 switch c = txt.charAt(cur++)
2285 tok_state = tok_state_markup_declaration_open
2287 tok_state = tok_state_end_tag_open
2290 tok_cur_tag = new_comment_token '?'
2291 tok_state = tok_state_bogus_comment
2293 if lc_alpha.indexOf(c) > -1
2294 tok_cur_tag = new_open_tag c
2295 tok_state = tok_state_tag_name
2296 else if uc_alpha.indexOf(c) > -1
2297 tok_cur_tag = new_open_tag c.toLowerCase()
2298 tok_state = tok_state_tag_name
2301 tok_state = tok_state_data
2302 cur -= 1 # we didn't parse/handle the char after <
2303 return new_text_node '<'
2306 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer state entered after "</" has been read.
# Consumes one character from `txt` (advancing `cur`): a letter starts an end
# tag token (upper case is lower-cased) and moves to the tag name state;
# any character not handled earlier starts a bogus comment.
2307 tok_state_end_tag_open = ->
2308 switch c = txt.charAt(cur++)
2311 tok_state = tok_state_data
2314 tok_state = tok_state_data
2315 return new_text_node '</'
2317 if uc_alpha.indexOf(c) > -1
2318 tok_cur_tag = new_end_tag c.toLowerCase()
2319 tok_state = tok_state_tag_name
2320 else if lc_alpha.indexOf(c) > -1
2321 tok_cur_tag = new_end_tag c
2322 tok_state = tok_state_tag_name
# The bogus comment's data starts with the character that triggered it (c),
# not with the preceding "/"; e.g. "</ x>" yields the comment " x".
# c has already been consumed, so seed the token with it here (NUL -> U+FFFD).
2325 tok_cur_tag = new_comment_token c.replace("\u0000", "\ufffd")
2326 tok_state = tok_state_bogus_comment
2329 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2330 tok_state_tag_name = ->
2331 switch c = txt.charAt(cur++)
2332 when "\t", "\n", "\u000c", ' '
2333 tok_state = tok_state_before_attribute_name
2335 tok_state = tok_state_self_closing_start_tag
2337 tok_state = tok_state_data
2343 tok_cur_tag.name += "\ufffd"
2346 tok_state = tok_state_data
2348 if uc_alpha.indexOf(c) > -1
2349 tok_cur_tag.name += c.toLowerCase()
2351 tok_cur_tag.name += c
2354 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2355 tok_state_rcdata_less_than_sign = ->
2356 c = txt.charAt(cur++)
2358 temporary_buffer = ''
2359 tok_state = tok_state_rcdata_end_tag_open
2362 tok_state = tok_state_rcdata
2363 cur -= 1 # reconsume the input character
2364 return new_character_token '<'
2366 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2367 tok_state_rcdata_end_tag_open = ->
2368 c = txt.charAt(cur++)
2369 if uc_alpha.indexOf(c) > -1
2370 tok_cur_tag = new_end_tag c.toLowerCase()
2371 temporary_buffer += c
2372 tok_state = tok_state_rcdata_end_tag_name
2374 if lc_alpha.indexOf(c) > -1
2375 tok_cur_tag = new_end_tag c
2376 temporary_buffer += c
2377 tok_state = tok_state_rcdata_end_tag_name
2380 tok_state = tok_state_rcdata
2381 cur -= 1 # reconsume the input character
2382 return new_character_token "</" # fixfull separate these
2384 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of
# open_els[0] (index 0 is the current node in this file's stack layout).
# Also writes the token and the open-element stack to the debug log.
2385 is_appropriate_end_tag = (t) ->
2386 # spec says to check against "the tag name of the last start tag to
2387 # have been emitted from this tokenizer", but this is only called from
2388 # the various "raw" states, which I'm pretty sure all push the start
2389 # token onto open_els. TODO: verify this after the script data states
# NOTE(review): assumes open_els is non-empty whenever this is called;
# open_els[0].name would throw otherwise -- confirm against callers.
2391 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2392 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2394 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2395 tok_state_rcdata_end_tag_name = ->
2396 c = txt.charAt(cur++)
2397 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2398 if is_appropriate_end_tag tok_cur_tag
2399 tok_state = tok_state_before_attribute_name
2401 # else fall through to "Anything else"
2403 if is_appropriate_end_tag tok_cur_tag
2404 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2406 # else fall through to "Anything else"
2408 if is_appropriate_end_tag tok_cur_tag
2409 tok_state = tok_state_data
2411 # else fall through to "Anything else"
2412 if uc_alpha.indexOf(c) > -1
2413 tok_cur_tag.name += c.toLowerCase()
2414 temporary_buffer += c
2416 if lc_alpha.indexOf(c) > -1
2417 tok_cur_tag.name += c
2418 temporary_buffer += c
2421 tok_state = tok_state_rcdata
2422 cur -= 1 # reconsume the input character
2423 return new_character_token '</' + temporary_buffer # fixfull separate these
2425 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2426 tok_state_rawtext_less_than_sign = ->
2427 c = txt.charAt(cur++)
2429 temporary_buffer = ''
2430 tok_state = tok_state_rawtext_end_tag_open
2433 tok_state = tok_state_rawtext
2434 cur -= 1 # reconsume the input character
2435 return new_character_token '<'
2437 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2438 tok_state_rawtext_end_tag_open = ->
2439 c = txt.charAt(cur++)
2440 if uc_alpha.indexOf(c) > -1
2441 tok_cur_tag = new_end_tag c.toLowerCase()
2442 temporary_buffer += c
2443 tok_state = tok_state_rawtext_end_tag_name
2445 if lc_alpha.indexOf(c) > -1
2446 tok_cur_tag = new_end_tag c
2447 temporary_buffer += c
2448 tok_state = tok_state_rawtext_end_tag_name
2451 tok_state = tok_state_rawtext
2452 cur -= 1 # reconsume the input character
2453 return new_character_token "</" # fixfull separate these
2455 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2456 tok_state_rawtext_end_tag_name = ->
2457 c = txt.charAt(cur++)
2458 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2459 if is_appropriate_end_tag tok_cur_tag
2460 tok_state = tok_state_before_attribute_name
2462 # else fall through to "Anything else"
2464 if is_appropriate_end_tag tok_cur_tag
2465 tok_state = tok_state_self_closing_start_tag
2467 # else fall through to "Anything else"
2469 if is_appropriate_end_tag tok_cur_tag
2470 tok_state = tok_state_data
2472 # else fall through to "Anything else"
2473 if uc_alpha.indexOf(c) > -1
2474 tok_cur_tag.name += c.toLowerCase()
2475 temporary_buffer += c
2477 if lc_alpha.indexOf(c) > -1
2478 tok_cur_tag.name += c
2479 temporary_buffer += c
2482 tok_state = tok_state_rawtext
2483 cur -= 1 # reconsume the input character
2484 return new_character_token '</' + temporary_buffer # fixfull separate these
2486 # TODO _all_ of the missing states here (17-33) are for parsing script tags
2488 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2489 tok_state_before_attribute_name = ->
2491 switch c = txt.charAt(cur++)
2492 when "\t", "\n", "\u000c", ' '
2495 tok_state = tok_state_self_closing_start_tag
2498 tok_state = tok_state_data
2504 attr_name = "\ufffd"
2505 when '"', "'", '<', '='
2510 tok_state = tok_state_data
2512 if uc_alpha.indexOf(c) > -1
2513 attr_name = c.toLowerCase()
2517 tok_cur_tag.attrs_a.unshift [attr_name, '']
2518 tok_state = tok_state_attribute_name
2521 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state that accumulates the name of the attribute being parsed.
# The current attribute is tok_cur_tag.attrs_a[0], a [name, value] pair that
# was unshifted onto the list when the attribute was started.
# Consumes one character from `txt` (advancing `cur`) per call.
2522 tok_state_attribute_name = ->
2523 switch c = txt.charAt(cur++)
2524 when "\t", "\n", "\u000c", ' '
2525 tok_state = tok_state_after_attribute_name
2527 tok_state = tok_state_self_closing_start_tag
2529 tok_state = tok_state_before_attribute_value
2531 tok_state = tok_state_data
# Every branch below that touches the name must APPEND to it; plain
# assignment would discard the characters collected so far.
2537 tok_cur_tag.attrs_a[0][0] += "\ufffd"
2540 tok_cur_tag.attrs_a[0][0] += c
2543 tok_state = tok_state_data
2545 if uc_alpha.indexOf(c) > -1
# upper-case letters are stored lower-cased
2546 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
2548 tok_cur_tag.attrs_a[0][0] += c
2551 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2552 tok_state_after_attribute_name = ->
2553 c = txt.charAt(cur++)
2554 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2557 tok_state = tok_state_self_closing_start_tag
2560 tok_state = tok_state_before_attribute_value
2563 tok_state = tok_state_data
2565 if uc_alpha.indexOf(c) > -1
2566 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2567 tok_state = tok_state_attribute_name
2571 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2572 tok_state = tok_state_attribute_name
2576 tok_state = tok_state_data
2577 cur -= 1 # reconsume
2579 if c is '"' or c is "'" or c is '<'
2581 # fall through to Anything else
2583 tok_cur_tag.attrs_a.unshift [c, '']
2584 tok_state = tok_state_attribute_name
2586 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2587 tok_state_before_attribute_value = ->
2588 switch c = txt.charAt(cur++)
2589 when "\t", "\n", "\u000c", ' '
2592 tok_state = tok_state_attribute_value_double_quoted
2594 tok_state = tok_state_attribute_value_unquoted
2597 tok_state = tok_state_attribute_value_single_quoted
2600 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2601 tok_state = tok_state_attribute_value_unquoted
2604 tok_state = tok_state_data
2610 tok_state = tok_state_data
2612 tok_cur_tag.attrs_a[0][1] += c
2613 tok_state = tok_state_attribute_value_unquoted
2616 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2617 tok_state_attribute_value_double_quoted = ->
2618 switch c = txt.charAt(cur++)
2620 tok_state = tok_state_after_attribute_value_quoted
2622 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2625 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2628 tok_state = tok_state_data
2630 tok_cur_tag.attrs_a[0][1] += c
2633 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2634 tok_state_attribute_value_single_quoted = ->
2635 switch c = txt.charAt(cur++)
2637 tok_state = tok_state_after_attribute_value_quoted
2639 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2642 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2645 tok_state = tok_state_data
2647 tok_cur_tag.attrs_a[0][1] += c
2650 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2651 tok_state_attribute_value_unquoted = ->
2652 switch c = txt.charAt(cur++)
2653 when "\t", "\n", "\u000c", ' '
2654 tok_state = tok_state_before_attribute_name
2656 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2658 tok_state = tok_state_data
2663 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2666 tok_state = tok_state_data
2668 # Parse Error if ', <, = or ` (backtick)
2669 tok_cur_tag.attrs_a[0][1] += c
2672 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2673 tok_state_after_attribute_value_quoted = ->
2674 switch c = txt.charAt(cur++)
2675 when "\t", "\n", "\u000c", ' '
2676 tok_state = tok_state_before_attribute_name
2678 tok_state = tok_state_self_closing_start_tag
2680 tok_state = tok_state_data
2686 tok_state = tok_state_data
2689 tok_state = tok_state_before_attribute_name
2690 cur -= 1 # we didn't handle that char
2693 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
2694 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer state that swallows input as comment text: looks for the next
# '>' at or after `cur`, takes the text before it (or the rest of `txt`),
# appends it to the comment token already stored in tok_cur_tag and returns
# the tokenizer to the data state.
2695 tok_state_bogus_comment = ->
2696 next_gt = txt.indexOf '>', cur
2698 val = txt.substr cur
2701 val = txt.substr cur, (next_gt - cur)
# Replace EVERY NUL with U+FFFD; a string pattern would only replace the
# first occurrence, so a global regex is required.
2703 val = val.replace(/\u0000/g, "\ufffd")
2704 tok_cur_tag.text += val
2705 tok_state = tok_state_data
2708 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer state entered after "<!" has been read. Looks ahead in `txt` at
# `cur` to choose what follows: "--" starts a comment, "doctype" (any case)
# starts a doctype, "[CDATA[" starts a CDATA section when the adjusted current
# node is not in the HTML namespace, and anything else is a bogus comment.
2709 tok_state_markup_declaration_open = ->
2710 if txt.substr(cur, 2) is '--'
2712 tok_cur_tag = new_comment_token ''
2713 tok_state = tok_state_comment_start
2715 if txt.substr(cur, 7).toLowerCase() is 'doctype'
2717 tok_state = tok_state_doctype
2719 acn = adjusted_current_node()
# the input buffer is `txt` (there is no `text` variable in this parser)
2720 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
2722 tok_state = tok_state_cdata_section
# The bogus comment starts empty: per the spec the next character consumed is
# the first character of the comment, so "<!BAR>" yields the comment "BAR"
# (the "!" is not part of it).
2726 tok_cur_tag = new_comment_token ''
2727 tok_state = tok_state_bogus_comment
2730 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state for the first character after "<!--".
# Consumes one character from `txt` (advancing `cur`). The comment token
# being built is tok_cur_tag; its data is accumulated in tok_cur_tag.text.
2731 tok_state_comment_start = ->
2732 switch c = txt.charAt(cur++)
2734 tok_state = tok_state_comment_start_dash
# NUL: append U+FFFD to the comment data and continue in the comment state
# (it must not be emitted as a separate character token).
2737 tok_cur_tag.text += "\ufffd"
tok_state = tok_state_comment
2740 tok_state = tok_state_data
2744 tok_state = tok_state_data
2745 cur -= 1 # Reconsume
# Anything else: append the character and move on to the comment state, so
# that a later '>' is treated as comment text rather than ending the comment.
2748 tok_cur_tag.text += c
tok_state = tok_state_comment
2751 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
2752 tok_state_comment_start_dash = ->
2753 switch c = txt.charAt(cur++)
2755 tok_state = tok_state_comment_end
2758 tok_cur_tag.text += "-\ufffd"
2759 tok_state = tok_state_comment
2762 tok_state = tok_state_data
2766 tok_state = tok_state_data
2767 cur -= 1 # Reconsume
2770 tok_cur_tag.text += "-#{c}"
2771 tok_state = tok_state_comment
2774 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
2775 tok_state_comment = ->
2776 switch c = txt.charAt(cur++)
2778 tok_state = tok_state_comment_end_dash
2781 tok_cur_tag.text += "\ufffd"
2784 tok_state = tok_state_data
2785 cur -= 1 # Reconsume
2788 tok_cur_tag.text += c
2791 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
2792 tok_state_comment_end_dash = ->
2793 switch c = txt.charAt(cur++)
2795 tok_state = tok_state_comment_end
2798 tok_cur_tag.text += "-\ufffd"
2799 tok_state = tok_state_comment
2802 tok_state = tok_state_data
2803 cur -= 1 # Reconsume
2806 tok_cur_tag.text += "-#{c}"
2807 tok_state = tok_state_comment
2810 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
2811 tok_state_comment_end = ->
2812 switch c = txt.charAt(cur++)
2814 tok_state = tok_state_data
2818 tok_cur_tag.text += "--\ufffd"
2819 tok_state = tok_state_comment
2822 tok_state = tok_state_comment_end_bang
2825 tok_cur_tag.text += '-'
2828 tok_state = tok_state_data
2829 cur -= 1 # Reconsume
2833 tok_cur_tag.text += "--#{c}"
2834 tok_state = tok_state_comment
2837 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer state entered after "--!" has been read inside a comment.
# Consumes one character from `txt` (advancing `cur`) and either finishes the
# comment or puts the "--!" seen so far back into the comment data
# (tok_cur_tag.text) and carries on.
2838 tok_state_comment_end_bang = ->
2839 switch c = txt.charAt(cur++)
# Moving to the comment-end-dash state: only "--!" is appended here. The dash
# just consumed is accounted for by that state and must not be added as well.
2841 tok_cur_tag.text += "--!"
2842 tok_state = tok_state_comment_end_dash
2844 tok_state = tok_state_data
2848 tok_cur_tag.text += "--!\ufffd"
2849 tok_state = tok_state_comment
2852 tok_state = tok_state_data
2853 cur -= 1 # Reconsume
2856 tok_cur_tag.text += "--!#{c}"
2857 tok_state = tok_state_comment
2860 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
2861 tok_state_doctype = ->
2862 switch c = txt.charAt(cur++)
2863 when "\t", "\u000a", "\u000c", ' '
2864 tok_state = tok_state_before_doctype_name
2867 tok_state = tok_state_data
2868 el = new_doctype_token ''
2869 el.flag 'force-quirks', true
2870 cur -= 1 # Reconsume
2874 tok_state = tok_state_before_doctype_name
2875 cur -= 1 # Reconsume
2878 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
2879 tok_state_before_doctype_name = ->
2880 c = txt.charAt(cur++)
2881 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2883 if uc_alpha.indexOf(c) > -1
2884 tok_cur_tag = new_doctype_token c.toLowerCase()
2885 tok_state = tok_state_doctype_name
2889 tok_cur_tag = new_doctype_token "\ufffd"
2890 tok_state = tok_state_doctype_name
2894 el = new_doctype_token ''
2895 el.flag 'force-quirks', true
2896 tok_state = tok_state_data
2900 tok_state = tok_state_data
2901 el = new_doctype_token ''
2902 el.flag 'force-quirks', true
2903 cur -= 1 # Reconsume
2906 tok_cur_tag = new_doctype_token c
2907 tok_state = tok_state_doctype_name
2910 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
2911 tok_state_doctype_name = ->
2912 c = txt.charAt(cur++)
2913 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2914 tok_state = tok_state_after_doctype_name
2917 tok_state = tok_state_data
2919 if uc_alpha.indexOf(c) > -1
2920 tok_cur_tag.name += c.toLowerCase()
2924 tok_cur_tag.name += "\ufffd"
2928 tok_state = tok_state_data
2929 tok_cur_tag.flag 'force-quirks', true
2930 cur -= 1 # Reconsume
2933 tok_cur_tag.name += c
2936 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
2937 tok_state_after_doctype_name = ->
2938 c = txt.charAt(cur++)
2939 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2942 tok_state = tok_state_data
2946 tok_state = tok_state_data
2947 tok_cur_tag.flag 'force-quirks', true
2948 cur -= 1 # Reconsume
2951 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
2953 tok_state = tok_state_after_doctype_public_keyword
2955 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
2957 tok_state = tok_state_after_doctype_system_keyword
2960 tok_cur_tag.flag 'force-quirks', true
2961 tok_state = tok_state_bogus_doctype
2964 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
2965 tok_state_after_doctype_public_keyword = ->
2966 c = txt.charAt(cur++)
2967 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2968 tok_state = tok_state_before_doctype_public_identifier
2972 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
2973 tok_state = tok_state_doctype_public_identifier_double_quoted
2977 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
2978 tok_state = tok_state_doctype_public_identifier_single_quoted
2982 tok_cur_tag.flag 'force-quirks', true
2983 tok_state = tok_state_data
2987 tok_state = tok_state_data
2988 tok_cur_tag.flag 'force-quirks', true
2989 cur -= 1 # Reconsume
2993 tok_cur_tag.flag 'force-quirks', true
2994 tok_state = tok_state_bogus_doctype
2997 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
2998 tok_state_before_doctype_public_identifier = ->
2999 c = txt.charAt(cur++)
3000 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3004 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3005 tok_state = tok_state_doctype_public_identifier_double_quoted
3009 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3010 tok_state = tok_state_doctype_public_identifier_single_quoted
3014 tok_cur_tag.flag 'force-quirks', true
3015 tok_state = tok_state_data
3019 tok_state = tok_state_data
3020 tok_cur_tag.flag 'force-quirks', true
3021 cur -= 1 # Reconsume
3025 tok_cur_tag.flag 'force-quirks', true
3026 tok_state = tok_state_bogus_doctype
3030 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3031 tok_state_doctype_public_identifier_double_quoted = ->
3032 c = txt.charAt(cur++)
3034 tok_state = tok_state_after_doctype_public_identifier
3038 tok_cur_tag.public_identifier += "\ufffd"
3042 tok_cur_tag.flag 'force-quirks', true
3043 tok_state = tok_state_data
3047 tok_state = tok_state_data
3048 tok_cur_tag.flag 'force-quirks', true
3049 cur -= 1 # Reconsume
3052 tok_cur_tag.public_identifier += c
3055 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3056 tok_state_doctype_public_identifier_single_quoted = ->
3057 c = txt.charAt(cur++)
3059 tok_state = tok_state_after_doctype_public_identifier
3063 tok_cur_tag.public_identifier += "\ufffd"
3067 tok_cur_tag.flag 'force-quirks', true
3068 tok_state = tok_state_data
3072 tok_state = tok_state_data
3073 tok_cur_tag.flag 'force-quirks', true
3074 cur -= 1 # Reconsume
3077 tok_cur_tag.public_identifier += c
3080 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3081 tok_state_after_doctype_public_identifier = ->
3082 c = txt.charAt(cur++)
3083 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3084 tok_state = tok_state_between_doctype_public_and_system_identifiers
3087 tok_state = tok_state_data
3091 tok_cur_tag.system_identifier = ''
3092 tok_state = tok_state_doctype_system_identifier_double_quoted
3096 tok_cur_tag.system_identifier = ''
3097 tok_state = tok_state_doctype_system_identifier_single_quoted
3101 tok_state = tok_state_data
3102 tok_cur_tag.flag 'force-quirks', true
3103 cur -= 1 # Reconsume
3107 tok_cur_tag.flag 'force-quirks', true
3108 tok_state = tok_state_bogus_doctype
3111 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3112 tok_state_between_doctype_public_and_system_identifiers = ->
3113 c = txt.charAt(cur++)
3114 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3117 tok_state = tok_state_data
3121 tok_cur_tag.system_identifier = ''
3122 tok_state = tok_state_doctype_system_identifier_double_quoted
3126 tok_cur_tag.system_identifier = ''
3127 tok_state = tok_state_doctype_system_identifier_single_quoted
3131 tok_state = tok_state_data
3132 tok_cur_tag.flag 'force-quirks', true
3133 cur -= 1 # Reconsume
3137 tok_cur_tag.flag 'force-quirks', true
3138 tok_state = tok_state_bogus_doctype
3141 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3142 tok_state_after_doctype_system_keyword = ->
3143 c = txt.charAt(cur++)
3144 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3145 tok_state = tok_state_before_doctype_system_identifier
3149 tok_cur_tag.system_identifier = ''
3150 tok_state = tok_state_doctype_system_identifier_double_quoted
3154 tok_cur_tag.system_identifier = ''
3155 tok_state = tok_state_doctype_system_identifier_single_quoted
3159 tok_cur_tag.flag 'force-quirks', true
3160 tok_state = tok_state_data
3164 tok_state = tok_state_data
3165 tok_cur_tag.flag 'force-quirks', true
3166 cur -= 1 # Reconsume
3170 tok_cur_tag.flag 'force-quirks', true
3171 tok_state = tok_state_bogus_doctype
3174 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
3175 tok_state_before_doctype_system_identifier = ->
3176 c = txt.charAt(cur++)
3177 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3180 tok_cur_tag.system_identifier = ''
3181 tok_state = tok_state_doctype_system_identifier_double_quoted
3184 tok_cur_tag.system_identifier = ''
3185 tok_state = tok_state_doctype_system_identifier_single_quoted
3189 tok_cur_tag.flag 'force-quirks', true
3190 tok_state = tok_state_data
3194 tok_state = tok_state_data
3195 tok_cur_tag.flag 'force-quirks', true
3196 cur -= 1 # Reconsume
3200 tok_cur_tag.flag 'force-quirks', true
3201 tok_state = tok_state_bogus_doctype
3204 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
3205 tok_state_doctype_system_identifier_double_quoted = ->
3206 c = txt.charAt(cur++)
3208 tok_state = tok_state_after_doctype_system_identifier
3212 tok_cur_tag.system_identifier += "\ufffd"
3216 tok_cur_tag.flag 'force-quirks', true
3217 tok_state = tok_state_data
3221 tok_state = tok_state_data
3222 tok_cur_tag.flag 'force-quirks', true
3223 cur -= 1 # Reconsume
3226 tok_cur_tag.system_identifier += c
3229 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
3230 tok_state_doctype_system_identifier_single_quoted = ->
3231 c = txt.charAt(cur++)
3233 tok_state = tok_state_after_doctype_system_identifier
3237 tok_cur_tag.system_identifier += "\ufffd"
3241 tok_cur_tag.flag 'force-quirks', true
3242 tok_state = tok_state_data
3246 tok_state = tok_state_data
3247 tok_cur_tag.flag 'force-quirks', true
3248 cur -= 1 # Reconsume
3251 tok_cur_tag.system_identifier += c
3254 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
3255 tok_state_after_doctype_system_identifier = ->
3256 c = txt.charAt(cur++)
3257 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3260 tok_state = tok_state_data
3264 tok_state = tok_state_data
3265 tok_cur_tag.flag 'force-quirks', true
3266 cur -= 1 # Reconsume
3270 # do _not_ tok_cur_tag.flag 'force-quirks', true
3271 tok_state = tok_state_bogus_doctype
3274 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
3275 tok_state_bogus_doctype = ->
3276 c = txt.charAt(cur++)
3278 tok_state = tok_state_data
3281 tok_state = tok_state_data
3282 cur -= 1 # Reconsume
3288 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3289 # Don't set this as a state, just call it
3290 # returns a string (NOT a text node)
3291 parse_character_reference = (allowed_char = null, in_attr = false) ->
3292 if cur >= txt.length
3294 switch c = txt.charAt(cur)
3295 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3296 # explicitly not a parse error
3299 # there has to be "one or more" alnums between & and ; to be a parse error
3302 if cur + 1 >= txt.length
3304 if txt.charAt(cur + 1).toLowerCase() is 'x'
3313 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3317 if txt.charAt(start + i) is ';'
3319 # FIXME This is supposed to generate parse errors for some chars
3320 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
3327 if alnum.indexOf(txt.charAt(cur + i)) is -1
3330 # exit early, because parse_error() below needs at least one alnum
3332 if txt.charAt(cur + i) is ';'
3333 i += 1 # include ';' terminator in value
3334 decoded = decode_named_char_ref txt.substr(cur, i)
3341 # no ';' terminator (only legacy char refs)
3343 for i in [2..max] # no prefix matches, so ok to check shortest first
3344 c = legacy_char_refs[txt.substr(cur, i)]
3347 if txt.charAt(cur + i) is '='
3348 # "because some legacy user agents will
3349 # misinterpret the markup in those cases"
3352 if alnum.indexOf(txt.charAt(cur + i)) > -1
3353 # this makes attributes forgiving about url args
3355 # ok, and besides the weird exceptions for attributes...
3356 # return the matching char
3357 cur += i # consume entity chars
3358 parse_error() # because no terminating ";"
3362 return # never reached
3364 # tree constructor initialization
3365 # see comments on TYPE_TAG/etc for the structure of this data
3366 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3368 afe = [] # active formatting elements
3369 template_insertion_modes = []
3370 insertion_mode = ins_mode_initial
3371 original_insertion_mode = insertion_mode # TODO check spec
3372 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3373 flag_frameset_ok = true
3375 flag_foster_parenting = false
3376 form_element_pointer = null
3377 temporary_buffer = null
3378 pending_table_character_tokens = []
3379 head_element_pointer = null
3380 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3381 context_element = null # FIXME initialize from args.fragment
3383 # tokenizer initialization
3384 tok_state = tok_state_data
3391 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
3394 serialize_els = (els, shallow, show_ids) ->
3400 serialized += t.serialize shallow, show_ids
# Public interface, attached to module.exports: the parser entry point, two
# debug-log helpers and four of the TYPE_* node type constants.
3403 # TODO export TYPE_*
# (so far only the four TYPE_* constants listed below are exported)
3404 module.exports.parse_html = parse_html
3405 module.exports.debug_log_reset = debug_log_reset
3406 module.exports.debug_log_each = debug_log_each
3407 module.exports.TYPE_TAG = TYPE_TAG
3408 module.exports.TYPE_TEXT = TYPE_TEXT
3409 module.exports.TYPE_COMMENT = TYPE_COMMENT
3410 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE