1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
98 @id = "#{++prev_node_id}"
99 shallow_clone: -> # return a new node that's the same except without the children or parent
100 # WARNING this doesn't work right on open tags that are still being parsed
102 attrs[k] = v for k, v of @attrs
103 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
104 acknowledge_self_closing: ->
106 @token.flag 'did_self_close'
108 @flag 'did_self_close', true
111 serialize: (shallow = false, show_ids = false) -> # for unit tests
116 ret += JSON.stringify @name
131 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
137 ret += c.serialize shallow, show_ids
141 ret += JSON.stringify @text
144 ret += JSON.stringify @text
150 when TYPE_AAA_BOOKMARK
151 ret += 'aaa_bookmark'
154 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
157 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) that carries only the tag name;
# every other field takes the Node constructor's default.
new_open_tag = (name) ->
	new Node TYPE_START_TAG, {name}
# Build an end-tag token (TYPE_END_TAG) for the given tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, name: name)
# Build a tree element node (TYPE_TAG) with the given tag name and no
# attributes or children yet.
new_element = (name) ->
	new Node TYPE_TAG, {name}
# Build a text node (TYPE_TEXT) whose contents are txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, text: txt)
# Character tokens are represented as text nodes, so this is the same function.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose contents are txt.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, text: txt)
# Build a doctype token (TYPE_DOCTYPE) with the given name.
new_doctype_token = (name) ->
	new Node TYPE_DOCTYPE, {name}
172 return new Node TYPE_EOF
174 return new Node TYPE_AFE_MARKER
# Build a placeholder node (TYPE_AAA_BOOKMARK) that the adoption agency
# algorithm uses to remember a position in the list of active formatting
# elements.
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# ASCII character classes, kept as plain strings.
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# letters of either case plus decimal digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# decimal digits plus a-f in both cases
hex_chars = "#{digits}abcdefABCDEF"
184 # some SVG elements have dashes in them
185 tag_name_chars = alnum + "-"
187 # http://www.w3.org/TR/html5/infrastructure.html#space-character
188 space_chars = "\u0009\u000a\u000c\u000d\u0020"
190 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token t is a text token whose text is exactly one character and
# that character is listed in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) isnt -1
194 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
195 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
197 # These are the character references that don't need a terminating semicolon
198 # min length: 2, max: 6, none are a prefix of any other.
200 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
201 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
202 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
203 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
204 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
205 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
206 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
207 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
208 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
209 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
210 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
211 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
212 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
213 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
214 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
215 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
216 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists grouped by element kind: void, raw text, escapable raw text.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
	'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script', 'style'
]
escapable_raw_text_elements = [
	'textarea', 'title'
]
223 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
225 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
226 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
227 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
228 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
229 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
230 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
231 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
232 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
233 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
234 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
235 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
236 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
237 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
238 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
242 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
244 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
245 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
246 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
247 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
248 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
249 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
250 'determinant', 'diff', 'divergence', 'divide', 'domain',
251 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
252 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
253 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
254 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
255 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
256 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
257 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
258 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
259 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
260 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
261 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
262 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
263 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
264 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
265 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
266 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
267 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
268 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
269 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
270 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
271 'vectorproduct', 'xor'
273 # foreign_elements = [svg_elements..., mathml_elements...]
274 #normal_elements = All other allowed HTML elements are normal elements.
278 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
279 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
280 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
281 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
282 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
283 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
284 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
285 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
286 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
287 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
288 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
289 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
290 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
291 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
292 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
293 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
294 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
295 wbr:NS_HTML, xmp:NS_HTML,
298 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
299 'annotation-xml':NS_MATHML,
302 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
305 formatting_elements = {
306 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
307 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
311 foster_parenting_targets = {
# True when special_elements lists e's tag name under e's own namespace.
# A name that is absent from the table yields undefined, which never equals
# a node's namespace, so the result is false.
el_is_special = (e) ->
	special_ns = special_elements[e.name]
	special_ns is e.namespace
336 # decode_named_char_ref()
338 # The list of named character references is _huge_ so ask the browser to decode
339 # for us instead of wasting bandwidth/space on including the table here.
341 # Pass without the "&" but with the ";" examples:
342 # for "&amp;" pass "amp;"
343 # for "&#x2032;" pass "#x2032;"
346 textarea: document.createElement('textarea')
348 # TODO test this in IE8
349 decode_named_char_ref = (txt) ->
351 decoded = g_dncr.cache[txt]
352 return decoded if decoded?
353 g_dncr.textarea.innerHTML = txt
354 decoded = g_dncr.textarea.value
355 return null if decoded is txt
356 return g_dncr.cache[txt] = decoded
358 parse_html = (txt, parse_error_cb = null) ->
359 cur = 0 # index of next char in txt to be parsed
360 # declare doc and tokenizer variables so they're in scope below
362 open_els = null # stack of open elements
363 afe = null # active formatting elements
364 template_insertion_modes = null
365 insertion_mode = null
366 original_insertion_mode = null
368 tok_cur_tag = null # partially parsed tag
369 flag_scripting = null
370 flag_frameset_ok = null
372 flag_foster_parenting = null
373 form_element_pointer = null
374 temporary_buffer = null
375 pending_table_character_tokens = null
376 head_element_pointer = null
377 flag_fragment_parsing = null
378 context_element = null
387 console.log "Parse error at character #{cur} of #{txt.length}"
389 afe_push = (new_el) ->
392 if el.name is new_el.name and el.namespace is new_el.namespace
394 continue unless new_el.attrs[k] is v
395 for k, v of new_el.attrs
396 continue unless el.attrs[k] is v
403 afe.unshift new_afe_marker()
405 # the functions below implement the Tree Construction algorithm
406 # http://www.w3.org/TR/html5/syntax.html#tree-construction
408 # But first... the helpers
409 template_tag_is_open = ->
411 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
414 is_in_scope_x = (tag_name, scope, namespace) ->
416 if t.name is tag_name and (namespace is null or namespace is t.namespace)
418 if scope[t.name] is t.namespace
421 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
423 if t.name is tag_name and (namespace is null or namespace is t.namespace)
425 if scope[t.name] is t.namespace
427 if scope2[t.name] is t.namespace
430 standard_scopers = { # FIXME these are supposed to be namespace specific
431 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
432 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
433 template: NS_HTML, mi: NS_MATHML,
435 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
436 'annotation-xml': NS_MATHML,
438 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope-boundary tables: each maps a tag name to the namespace an element
# must be in to act as a boundary for that kind of scope.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# "Has an element in scope": delegates to is_in_scope_x with standard_scopers
# as the boundary table.
#   tag_name  - element name to look for
#   namespace - when non-null, the match must also be in this namespace
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# "Has an element in button scope": delegates to is_in_scope_x_y, with both
# standard_scopers and button_scopers as boundary tables.
#   tag_name  - element name to look for
#   namespace - when non-null, the match must also be in this namespace
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# "Has an element in table scope": delegates to is_in_scope_x with
# table_scopers as the boundary table.
#   tag_name  - element name to look for
#   namespace - when non-null, the match must also be in this namespace
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# "Has an element in select scope".
#   tag_name  - element name to look for
#   namespace - when non-null, the match must also be in this namespace
# Walks open_els starting at index 0 (the current node). Returns true on the
# first matching element, and false as soon as a scope boundary is reached or
# the stack is exhausted.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		# In select scope, every element is a boundary except HTML <optgroup>
		# and <option>. Nodes store their namespace in .namespace, and the two
		# tests must be joined with "or" (without it, CoffeeScript treats
		# NS_HTML as a function being called).
		if t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt 'option')
			return false
	return false
456 # this checks for a particular element, not by name
457 el_is_in_scope = (el) ->
461 if standard_scopers[t.name] is t.namespace
465 clear_to_table_stopers = {
470 clear_stack_to_table_context = ->
472 if clear_to_table_stopers[open_els[0].name]?
476 clear_to_table_body_stopers = {
483 clear_stack_to_table_body_context = ->
485 if clear_to_table_body_stopers[open_els[0].name]?
489 clear_to_table_row_stopers = {
494 clear_stack_to_table_row_context = ->
496 if clear_to_table_row_stopers[open_els[0].name]?
500 clear_afe_to_marker = ->
503 if el.type is TYPE_AFE_MARKER
507 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
508 reset_insertion_mode = ->
509 # 1. Let last be false.
511 # 2. Let node be the last node in the stack of open elements.
513 node = open_els[node_i]
514 # 3. Loop: If node is the first node in the stack of open elements,
515 # then set last to true, and, if the parser was originally created as
516 # part of the HTML fragment parsing algorithm (fragment case) set node
517 # to the context element.
519 if node_i is open_els.length - 1
521 # fixfull (fragment case)
523 # 4. If node is a select element, run these substeps:
524 if node.name is 'select'
525 # 1. If last is true, jump to the step below labeled done.
527 # 2. Let ancestor be node.
530 # 3. Loop: If ancestor is the first node in the stack of
531 # open elements, jump to the step below labeled done.
533 if ancestor_i is open_els.length - 1
535 # 4. Let ancestor be the node before ancestor in the stack
538 ancestor = open_els[ancestor_i]
539 # 5. If ancestor is a template node, jump to the step below
541 if ancestor.name is 'template'
543 # 6. If ancestor is a table node, switch the insertion mode
544 # to "in select in table" and abort these steps.
545 if ancestor.name is 'table'
546 insertion_mode = ins_mode_in_select_in_table
548 # 7. Jump back to the step labeled loop.
549 # 8. Done: Switch the insertion mode to "in select" and abort
551 insertion_mode = ins_mode_in_select
553 # 5. If node is a td or th element and last is false, then switch
554 # the insertion mode to "in cell" and abort these steps.
555 if (node.name is 'td' or node.name is 'th') and last is false
556 insertion_mode = ins_mode_in_cell
558 # 6. If node is a tr element, then switch the insertion mode to "in
559 # row" and abort these steps.
561 insertion_mode = ins_mode_in_row
563 # 7. If node is a tbody, thead, or tfoot element, then switch the
564 # insertion mode to "in table body" and abort these steps.
565 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
566 insertion_mode = ins_mode_in_table_body
568 # 8. If node is a caption element, then switch the insertion mode
569 # to "in caption" and abort these steps.
570 if node.name is 'caption'
571 insertion_mode = ins_mode_in_caption
573 # 9. If node is a colgroup element, then switch the insertion mode
574 # to "in column group" and abort these steps.
575 if node.name is 'colgroup'
576 insertion_mode = ins_mode_in_column_group
578 # 10. If node is a table element, then switch the insertion mode to
579 # "in table" and abort these steps.
580 if node.name is 'table'
581 insertion_mode = ins_mode_in_table
583 # 11. If node is a template element, then switch the insertion mode
584 # to the current template insertion mode and abort these steps.
585 # fixfull (template insertion mode stack)
587 # 12. If node is a head element and last is true, then switch the
588 # insertion mode to "in body" ("in body"! not "in head"!) and abort
589 # these steps. (fragment case)
590 if node.name is 'head' and last
591 insertion_mode = ins_mode_in_body
593 # 13. If node is a head element and last is false, then switch the
594 # insertion mode to "in head" and abort these steps.
595 if node.name is 'head' and last is false
596 insertion_mode = ins_mode_in_head
598 # 14. If node is a body element, then switch the insertion mode to
599 # "in body" and abort these steps.
600 if node.name is 'body'
601 insertion_mode = ins_mode_in_body
603 # 15. If node is a frameset element, then switch the insertion mode
604 # to "in frameset" and abort these steps. (fragment case)
605 if node.name is 'frameset'
606 insertion_mode = ins_mode_in_frameset
608 # 16. If node is an html element, run these substeps:
609 if node.name is 'html'
610 # 1. If the head element pointer is null, switch the insertion
611 # mode to "before head" and abort these steps. (fragment case)
612 # fixfull (fragment case)
614 # 2. Otherwise, the head element pointer is not null, switch
615 # the insertion mode to "after head" and abort these steps.
616 insertion_mode = ins_mode_in_body # FIXME fixfull
618 # 17. If last is true, then switch the insertion mode to "in body"
619 # and abort these steps. (fragment case)
621 insertion_mode = ins_mode_in_body
623 # 18. Let node now be the node before node in the stack of open
626 node = open_els[node_i]
627 # 19. Return to the step labeled loop.
631 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
632 adjusted_current_node = ->
633 if open_els.length is 1 and flag_fragment_parsing
634 return context_element
637 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
638 # this implementation is structured (mostly) as described at the link above.
639 # capitalized comments are the "labels" described at the link above.
640 reconstruct_active_formatting_elements = ->
641 return if afe.length is 0
642 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
647 if i is afe.length - 1
650 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
655 el = afe[i].shallow_clone()
656 tree_insert_element el
661 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
662 # adoption agency algorithm
664 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
665 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
666 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
667 adoption_agency = (subject) ->
668 debug_log "adoption_agency()"
669 debug_log "tree: #{serialize_els doc.children, false, true}"
670 debug_log "open_els: #{serialize_els open_els, true, true}"
671 debug_log "afe: #{serialize_els afe, true, true}"
672 if open_els[0].name is subject
675 # remove it from the list of active formatting elements (if found)
680 debug_log "aaa: starting off with subject on top of stack, exiting"
687 # 5. Let formatting element be the last element in the list of
688 # active formatting elements that: is between the end of the list
689 # and the last scope marker in the list, if any, or the start of
690 # the list otherwise, and has the tag name subject.
692 for t, fe_of_afe in afe
693 if t.type is TYPE_AFE_MARKER
698 # If there is no such element, then abort these steps and instead
699 # act as described in the "any other end tag" entry above.
701 debug_log "aaa: fe not found in afe"
702 in_body_any_other_end_tag subject
704 # 6. If formatting element is not in the stack of open elements,
705 # then this is a parse error; remove the element from the list, and
708 for t, fe_of_open_els in open_els
713 debug_log "aaa: fe not found in open_els"
715 # "remove it from the list" must mean afe, since it's not in open_els
716 afe.splice fe_of_afe, 1
718 # 7. If formatting element is in the stack of open elements, but
719 # the element is not in scope, then this is a parse error; abort
721 unless el_is_in_scope fe
722 debug_log "aaa: fe not in scope"
725 # 8. If formatting element is not the current node, this is a parse
726 # error. (But do not abort these steps.)
727 unless open_els[0] is fe
730 # 9. Let furthest block be the topmost node in the stack of open
731 # elements that is lower in the stack than formatting element, and
732 # is an element in the special category. There might not be one.
734 fb_of_open_els = null
741 # and continue, to see if there's one that's more "topmost"
742 # 10. If there is no furthest block, then the UA must first pop all
743 # the nodes from the bottom of the stack of open elements, from the
744 # current node up to and including formatting element, then remove
745 # formatting element from the list of active formatting elements,
746 # and finally abort these steps.
748 debug_log "aaa: no fb"
752 afe.splice fe_of_afe, 1
754 # 11. Let common ancestor be the element immediately above
755 # formatting element in the stack of open elements.
756 ca = open_els[fe_of_open_els + 1] # common ancestor
758 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
759 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
760 bookmark = new_aaa_bookmark()
763 afe.splice i, 0, bookmark
765 node = last_node = fb
769 # 3. Let node be the element immediately above node in the
770 # stack of open elements, or if node is no longer in the stack
771 # of open elements (e.g. because it got removed by this
772 # algorithm), the element that was immediately above node in
773 # the stack of open elements before node was removed.
777 node_next = open_els[i + 1]
779 node = node_next ? node_above
780 debug_log "inner loop #{inner}"
781 debug_log "tree: #{serialize_els doc.children, false, true}"
782 debug_log "open_els: #{serialize_els open_els, true, true}"
783 debug_log "afe: #{serialize_els afe, true, true}"
784 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
785 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
786 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
787 debug_log "node: #{node.serialize true, true}"
788 # TODO make sure node_above gets re-set if/when node is removed from open_els
790 # 4. If node is formatting element, then go to the next step in
791 # the overall algorithm.
795 # 5. If inner loop counter is greater than three and node is in
796 # the list of active formatting elements, then remove node from
797 # the list of active formatting elements.
803 debug_log "max out inner"
808 # 6. If node is not in the list of active formatting elements,
809 # then remove node from the stack of open elements and then go
810 # back to the step labeled inner loop.
812 debug_log "not in afe"
815 node_above = open_els[i + 1]
819 debug_log "the bones"
820 # 7. create an element for the token for which the element node
821 # was created, in the HTML namespace, with common ancestor as
822 # the intended parent; replace the entry for node in the list
823 # of active formatting elements with an entry for the new
824 # element, replace the entry for node in the stack of open
825 # elements with an entry for the new element, and let node be
827 new_node = node.shallow_clone()
831 debug_log "replaced in afe"
835 node_above = open_els[i + 1]
836 open_els[i] = new_node
837 debug_log "replaced in open_els"
840 # 8. If last node is furthest block, then move the
841 # aforementioned bookmark to be immediately after the new node
842 # in the list of active formatting elements.
847 debug_log "removed bookmark"
851 # "after" means lower
852 afe.splice i, 0, bookmark # "after as <-
853 debug_log "placed bookmark after node"
854 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
856 # 9. Insert last node into node, first removing it from its
857 # previous parent node if any.
859 debug_log "last_node has parent"
860 for c, i in last_node.parent.children
862 debug_log "removing last_node from parent"
863 last_node.parent.children.splice i, 1
865 node.children.push last_node
866 last_node.parent = node
867 # 10. Let last node be node.
870 # 11. Return to the step labeled inner loop.
871 # 14. Insert whatever last node ended up being in the previous step
872 # at the appropriate place for inserting a node, but using common
873 # ancestor as the override target.
875 # In the case where fe is immediately followed by fb:
876 # * inner loop exits out early (node==fe)
878 # * last_node is still in the tree (not a duplicate)
880 debug_log "FEFIRST? last_node has parent"
881 for c, i in last_node.parent.children
883 debug_log "removing last_node from parent"
884 last_node.parent.children.splice i, 1
887 debug_log "after aaa inner loop"
888 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
889 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
890 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
891 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
892 debug_log "tree: #{serialize_els doc.children, false, true}"
897 # can't use standard insert token thing, because it's already in
898 # open_els and must stay at its current position in open_els
899 dest = adjusted_insertion_location ca
900 dest[0].children.splice dest[1], 0, last_node
901 last_node.parent = dest[0]
904 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
905 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
906 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
907 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
908 debug_log "tree: #{serialize_els doc.children, false, true}"
910 # 15. Create an element for the token for which formatting element
911 # was created, in the HTML namespace, with furthest block as the
913 new_element = fe.shallow_clone() # FIXME intended parent thing
914 # 16. Take all of the child nodes of furthest block and append them
915 # to the element created in the last step.
916 while fb.children.length
917 t = fb.children.shift()
918 t.parent = new_element
919 new_element.children.push t
920 # 17. Append that new element to furthest block.
921 new_element.parent = fb
922 fb.children.push new_element
923 # 18. Remove formatting element from the list of active formatting
924 # elements, and insert the new element into the list of active
925 # formatting elements at the position of the aforementioned
935 # 19. Remove formatting element from the stack of open elements,
936 # and insert the new element into the stack of open elements
937 # immediately below the position of furthest block in that stack.
944 open_els.splice i, 0, new_element
946 # 20. Jump back to the step labeled outer loop.
947 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
948 debug_log "tree: #{serialize_els doc.children, false, true}"
949 debug_log "open_els: #{serialize_els open_els, true, true}"
950 debug_log "afe: #{serialize_els afe, true, true}"
953 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
955 generate_implied_end_tags 'p' # arg is exception
956 if open_els[0].name isnt 'p'
958 while open_els.length > 1 # just in case
959 el = open_els.shift()
962 close_p_if_in_button_scope = ->
963 if is_in_button_scope 'p'
966 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
967 # aka insert_a_character = (t) ->
968 insert_character = (t) ->
969 dest = adjusted_insertion_location()
970 # fixfull check for Document node
972 prev = dest[0].children[dest[1] - 1]
973 if prev.type is TYPE_TEXT
976 dest[0].children.splice dest[1], 0, t
979 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
980 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the "appropriate place for inserting a node".
#   override_target: optional node to use instead of the current node.
#   returns: [target, target_i] -- the parent node and the index within
#            target.children at which the new node should be spliced.
# When foster parenting is enabled and the target is one of
# foster_parenting_targets, the location is derived from the last template /
# last table found in open_els (index 0 is the current node, so the first
# match in a forward scan is the most recently opened one).
981 adjusted_insertion_location = (override_target = null) ->
982 # 1. If there was an override target specified, then let target be the
985 target = override_target
986 else # Otherwise, let target be the current node.
988 # 2. Determine the adjusted insertion location using the first matching
989 # steps from the following list:
991 # If foster parenting is enabled and target is a table, tbody, tfoot,
992 # thead, or tr element Foster parenting happens when content is
993 # misnested in tables.
994 if flag_foster_parenting and foster_parenting_targets[target.name]
995 loop # once. this is here so we can ``break`` to "abort these substeps"
996 # 1. Let last template be the last template element in the
997 # stack of open elements, if any.
999 last_template_i = null
1000 for el, i in open_els
1001 if el.name is 'template'
1005 # 2. Let last table be the last table element in the stack of
1006 # open elements, if any.
1009 for el, i in open_els
1010 if el.name is 'table'
1014 # 3. If there is a last template and either there is no last
1015 # table, or there is one, but last template is lower (more
1016 # recently added) than last table in the stack of open
1017 # elements, then: let adjusted insertion location be inside
1018 # last template's template contents, after its last child (if
1019 # any), and abort these substeps.
1020 if last_template and (last_table is null or last_template_i < last_table_i)
# fixed: this used the undefined name ``template``; the element found in
# substep 1 is ``last_template``
1021 target = last_template # fixfull should be its contents
1022 target_i = target.children.length
1024 # 4. If there is no last table, then let adjusted insertion
1025 # location be inside the first element in the stack of open
1026 # elements (the html element), after its last child (if any),
1027 # and abort these substeps. (fragment case)
# NOTE(review): no ``break`` is visible after this branch, so control would
# reach ``last_table.parent?`` below with last_table being null -- confirm
# that the substeps are really aborted here.
1028 if last_table is null
1030 target = open_els[open_els.length - 1]
1031 target_i = target.children.length
1032 # 5. If last table has a parent element, then let adjusted
1033 # insertion location be inside last table's parent element,
1034 # immediately before last table, and abort these substeps.
1035 if last_table.parent?
1036 for c, i in last_table.parent.children
1038 target = last_table.parent
1042 # 6. Let previous element be the element immediately above last
1043 # table in the stack of open elements.
1045 # huh? how could it not have a parent?
1046 previous_element = open_els[last_table_i + 1]
1047 # 7. Let adjusted insertion location be inside previous
1048 # element, after its last child (if any).
1049 target = previous_element
1050 target_i = target.children.length
1051 # Note: These steps are involved in part because it's possible
1052 # for elements, the table element in this case in particular,
1053 # to have been moved by a script around in the DOM, or indeed
1054 # removed from the DOM entirely, after the element was inserted
1056 break # don't really loop
1058 # Otherwise Let adjusted insertion location be inside target, after
1059 # its last child (if any).
1060 target_i = target.children.length
1062 # 3. If the adjusted insertion location is inside a template element,
1063 # let it instead be inside the template element's template contents,
1064 # after its last child (if any).
1065 # fixfull (template)
1067 # 4. Return the adjusted insertion location.
1068 return [target, target_i]
1070 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1071 # aka create_an_element_for_token
# Builds a Node of TYPE_TAG from start-tag token t in the given namespace.
# The token is mutated: its type becomes TYPE_TAG and its attrs_a list is
# consumed while the attribute hash is built. The token is kept on the new
# node as ``token``.
# NOTE(review): intended_parent is not used by any statement visible here.
1072 token_to_element = (t, namespace, intended_parent) ->
1073 t.type = TYPE_TAG # not TYPE_START_TAG
1074 # convert attributes into a hash
1076 while t.attrs_a.length
1078 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1079 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1081 # TODO 2. If the newly created element has an xmlns attribute in the
1082 # XMLNS namespace whose value is not exactly the same as the element's
1083 # namespace, that is a parse error. Similarly, if the newly created
1084 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1085 # value is not the XLink Namespace, that is a parse error.
1087 # fixfull: the spec says stuff about form pointers and ownerDocument
1091 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for ``token`` in ``namespace`` and splices it into the
# children of the node chosen by adjusted_insertion_location().
# NOTE(review): ail_el / ail_i are presumably ail[0] / ail[1]; their
# assignments are not visible in this listing.
1092 insert_foreign_element = (token, namespace) ->
1093 ail = adjusted_insertion_location()
1096 el = token_to_element token, namespace, ail_el
1097 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1099 ail_el.children.splice ail_i, 0, el
1102 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Plain alias of insert_foreign_element (same function object, same arguments).
# NOTE(review): many callers pass only the token, which leaves ``namespace``
# undefined -- confirm whether NS_HTML should be supplied here.
1103 insert_html_element = insert_foreign_element # (token, namespace) ->
1105 # FIXME read implement "foster parenting" part
1106 # FIXME read spec, do this right
1107 # FIXME implement the override target thing
1108 # note: this assumes it's an open tag
1109 # FIXME what part of the spec is this?
1110 # TODO look through all callers of this, and see what they should really be doing.
1111 # eg probably insert_html_element for tokens
# Inserts el (an element, or a start-tag token that is first converted with
# token_to_element) at the adjusted insertion location.
#   override_target: passed through to adjusted_insertion_location
#   namespace: assigned to el.namespace (the guarding condition is not
#              visible in this listing)
1112 tree_insert_element = (el, override_target = null, namespace = null) ->
1114 el.namespace = namespace
1115 dest = adjusted_insertion_location override_target
1116 if el.type is TYPE_START_TAG # means it's a "token"
1117 el = token_to_element el, namespace, dest[0]
1118 unless el.namespace?
# NOTE(review): dest is the [node, index] pair, so ``dest.namespace`` is
# undefined, and the result goes to the local ``namespace`` rather than to
# el.namespace -- possibly meant ``el.namespace = dest[0].namespace``; confirm.
1119 namespace = dest.namespace
1120 # fixfull: Document nodes sometimes can't accept more children
1121 dest[0].children.splice dest[1], 0, el
1126 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1127 # position should be [node, index_within_children]
# Splices comment token t into position[0].children at index position[1].
# When position is null/undefined, the adjusted insertion location is used.
1128 insert_comment = (t, position = null) ->
1129 position ?= adjusted_insertion_location()
1130 position[0].children.splice position[1], 0, t
1133 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an element for token t, switches the tokenizer to the RAWTEXT
# state, remembers the current insertion mode in original_insertion_mode and
# switches the tree builder to the "text" insertion mode.
1134 parse_generic_raw_text = (t) ->
1135 insert_html_element t
1136 tok_state = tok_state_rawtext
1137 original_insertion_mode = insertion_mode
1138 insertion_mode = ins_mode_text
# RCDATA counterpart of parse_generic_raw_text: inserts an element for token
# t, switches the tokenizer to the RCDATA state, saves the current insertion
# mode and switches to the "text" insertion mode.
1139 parse_generic_rcdata_text = (t) ->
1140 insert_html_element t
1141 tok_state = tok_state_rcdata
1142 original_insertion_mode = insertion_mode
1143 insertion_mode = ins_mode_text
1145 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1146 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) is listed in end_tag_implied and
# its name differs from ``except`` (a tag name to leave open, or null).
# NOTE(review): the loop body is not visible in this listing; presumably it
# pops the current node.
1147 generate_implied_end_tags = (except = null) ->
1148 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1151 # 8.2.5.4 The rules for parsing tokens in HTML content
1152 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1154 # 8.2.5.4.1 The "initial" insertion mode
1155 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handles token t in the "initial" insertion mode. Comments and doctypes get
# their own branches; the final path switches to "before html" and reprocesses
# the same token there.
1156 ins_mode_initial = (t) ->
1159 if t.type is TYPE_COMMENT
1163 if t.type is TYPE_DOCTYPE
1164 # FIXME check identifiers, set quirks, etc
1167 insertion_mode = ins_mode_before_html
1170 #fixfull (iframe, quirks)
1171 insertion_mode = ins_mode_before_html
1172 insertion_mode t # reprocess the token
1175 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handles token t in the "before html" insertion mode. An explicit <html>
# start tag becomes the root element; in the fall-through path a synthetic
# html start tag is created with new_open_tag. In both cases the element is
# appended to doc.children and the mode becomes "before head".
1176 ins_mode_before_html = (t) ->
1177 if t.type is TYPE_DOCTYPE
1180 if t.type is TYPE_COMMENT
1185 if t.type is TYPE_START_TAG and t.name is 'html'
1186 el = token_to_element t, NS_HTML, doc
1187 doc.children.push el
1188 open_els.unshift(el) # the root becomes the current node
1189 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1190 insertion_mode = ins_mode_before_head
1192 if t.type is TYPE_END_TAG
1193 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1194 # fall through to "anything else"
1199 html_tok = new_open_tag 'html'
1200 el = token_to_element html_tok, NS_HTML, doc
1201 doc.children.push el
1203 # ?fixfull browsing context
1204 insertion_mode = ins_mode_before_head
1208 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handles token t in the "before head" insertion mode. A <head> start tag is
# inserted and remembered in head_element_pointer; in the fall-through path a
# synthetic head element is inserted instead and the current token is
# reprocessed in the "in head" mode.
1209 ins_mode_before_head = (t) ->
1212 if t.type is TYPE_COMMENT
1215 if t.type is TYPE_DOCTYPE
1218 if t.type is TYPE_START_TAG and t.name is 'html'
1221 if t.type is TYPE_START_TAG and t.name is 'head'
1222 el = insert_html_element t
1223 head_element_pointer = el
1224 insertion_mode = ins_mode_in_head
1225 if t.type is TYPE_END_TAG
1226 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1227 # fall through to Anything else below
1232 head_tok = new_open_tag 'head'
1233 el = insert_html_element head_tok
1234 head_element_pointer = el
1235 insertion_mode = ins_mode_in_head
1236 insertion_mode t # reprocess current token
1238 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" mode: pops the current node and
# switches to the "after head" insertion mode.
1239 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1240 open_els.shift() # spec says this will be a 'head' node
1241 insertion_mode = ins_mode_after_head
# Handles token t in the "in head" insertion mode: void head elements
# (base, link, meta, ...), title (RCDATA), noscript/noframes/style (RAWTEXT),
# script, template start/end tags and the </head> end tag. Unhandled tokens
# end up in ins_mode_in_head_else.
1243 ins_mode_in_head = (t) ->
1244 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1247 if t.type is TYPE_COMMENT
1250 if t.type is TYPE_DOCTYPE
1253 if t.type is TYPE_START_TAG and t.name is 'html'
1256 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1257 el = insert_html_element t
1259 t.acknowledge_self_closing()
1261 if t.type is TYPE_START_TAG and t.name is 'meta'
1262 el = insert_html_element t
1264 t.acknowledge_self_closing()
1265 # fixfull encoding stuff
1267 if t.type is TYPE_START_TAG and t.name is 'title'
1268 parse_generic_rcdata_text t
1270 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1271 parse_generic_raw_text t
1273 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1274 insert_html_element t
# fixed: ``in_head_noscript`` is not defined; the mode function is
# ins_mode_in_head_noscript
1275 insertion_mode = ins_mode_in_head_noscript # FIXME implement
1277 if t.type is TYPE_START_TAG and t.name is 'script'
1278 ail = adjusted_insertion_location()
# fixed: pass the parent node (ail[0]) as intended parent, not the
# [node, index] pair
1279 el = token_to_element t, NS_HTML, ail[0]
1280 el.flag 'parser-inserted', true # FIXME implement
1281 # fixfull fragment case
1282 ail[0].children.splice ail[1], 0, el
1284 tok_state = tok_state_script_data
1285 original_insertion_mode = insertion_mode # make sure orig... is defined
1286 insertion_mode = ins_mode_text # FIXME implement
1288 if t.type is TYPE_END_TAG and t.name is 'head'
1289 open_els.shift() # will be a head element... spec says so
1290 insertion_mode = ins_mode_after_head
1292 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1293 ins_mode_in_head_else t
1295 if t.type is TYPE_START_TAG and t.name is 'template'
1296 insert_html_element t
1298 flag_frameset_ok = false
1299 insertion_mode = ins_mode_in_template
1300 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1302 if t.type is TYPE_END_TAG and t.name is 'template'
1303 if template_tag_is_open()
# fixed: a bare function name is only a reference in CoffeeScript; the
# parentheses are required to actually call it
1304 generate_implied_end_tags()
1305 if open_els[0].name isnt 'template'
1308 el = open_els.shift()
1309 if el.name is 'template'
1311 clear_afe_to_marker()
1312 template_insertion_modes.shift()
1313 reset_insertion_mode()
1317 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1320 ins_mode_in_head_else t
1322 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# Placeholder for the "in head noscript" insertion mode: logs that it is
# unimplemented.
1323 ins_mode_in_head_noscript = (t) ->
1325 console.log "ins_mode_in_head_noscript unimplemented"
1327 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" mode: inserts a synthetic body
# element, switches to "in body" and reprocesses token t in that mode.
1328 ins_mode_after_head_else = (t) ->
1329 body_tok = new_open_tag 'body'
1330 insert_html_element body_tok
1331 insertion_mode = ins_mode_in_body
1332 insertion_mode t # reprocess token
# Handles token t in the "after head" insertion mode. <body> and <frameset>
# start tags open the corresponding element; head-only start tags are
# processed with the head element temporarily pushed back onto open_els;
# unhandled tokens go to ins_mode_after_head_else.
1334 ins_mode_after_head = (t) ->
1338 if t.type is TYPE_COMMENT
1341 if t.type is TYPE_DOCTYPE
1344 if t.type is TYPE_START_TAG and t.name is 'html'
1347 if t.type is TYPE_START_TAG and t.name is 'body'
1348 insert_html_element t
1349 flag_frameset_ok = false
1350 insertion_mode = ins_mode_in_body
1352 if t.type is TYPE_START_TAG and t.name is 'frameset'
1353 insert_html_element t
1354 insertion_mode = ins_mode_in_frameset
1356 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1358 open_els.unshift head_element_pointer
# fixed: ``for el, i of`` iterates keys (el would be the index, i the
# element), so the head element was never matched; ``in`` iterates the
# array's elements with their index
1360 for el, i in open_els
1361 if el is head_element_pointer
1362 open_els.splice i, 1
1364 console.log "warning: 23904 couldn't find head element in open_els"
1366 if t.type is TYPE_END_TAG and t.name is 'template'
1369 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1370 ins_mode_after_head_else t
1372 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1376 ins_mode_after_head_else t
1378 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling for the "in body" mode. Walks open_els from the
# current node upward looking for an element called ``name``; on a match it
# generates implied end tags (except for ``name``) and reports a parse error
# when the match is not the current node. Elements are also tested against
# special_elements during the walk.
1379 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1380 for node, i in open_els
1381 if node.name is name # FIXME check namespace too
1382 generate_implied_end_tags name # arg is exception
1383 parse_error() unless i is 0
1388 if special_elements[node.name]? # FIXME check namespace too
# Handles token t in the "in body" insertion mode. Dispatches on the token
# (``when`` branches): whitespace and other text, <html> attribute merging,
# head-only tags (delegated to ins_mode_in_head), block-level and heading
# start tags, <a> and the other formatting elements, <table>, and the matching
# end tags. Formatting end tags run the adoption agency algorithm; remaining
# end tags go to in_body_any_other_end_tag.
# NOTE(review): many branch headers of this function are not visible in this
# listing.
1391 ins_mode_in_body = (t) ->
1397 when "\t", "\u000a", "\u000c", "\u000d", ' '
1398 reconstruct_active_formatting_elements()
1401 reconstruct_active_formatting_elements()
1403 flag_frameset_ok = false
1412 return if template_tag_is_open()
1413 root_attrs = open_els[open_els.length - 1].attrs # attributes of the root (last entry of open_els)
1415 root_attrs[k] = v unless root_attrs[k]? # only fills in attributes that are missing
1416 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1417 # FIXME also do this for </template> (end tag)
1418 return ins_mode_in_head t
1425 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1426 close_p_if_in_button_scope()
1427 insert_html_element t
1428 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1429 close_p_if_in_button_scope()
1430 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1433 insert_html_element t
1434 # TODO lots more to implement here
1436 # If the list of active formatting elements
1437 # contains an a element between the end of the list and
1438 # the last marker on the list (or the start of the list
1439 # if there is no marker on the list), then this is a
1440 # parse error; run the adoption agency algorithm for
1441 # the tag name "a", then remove that element from the
1442 # list of active formatting elements and the stack of
1443 # open elements if the adoption agency algorithm didn't
1444 # already remove it (it might not have if the element
1445 # is not in table scope).
1448 if el.type is TYPE_AFE_MARKER
1458 for el, i in open_els
1460 open_els.splice i, 1
1461 reconstruct_active_formatting_elements()
1462 el = insert_html_element t
1464 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1465 reconstruct_active_formatting_elements()
1466 el = insert_html_element t
1469 # fixfull quirksmode thing
1470 close_p_if_in_button_scope()
1471 insert_html_element t
1472 insertion_mode = ins_mode_in_table
1473 # TODO lots more to implement here
1474 else # any other start tag
1475 reconstruct_active_formatting_elements()
1476 insert_html_element t
1479 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1480 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1483 unless ok_tags[t.name]?
1486 # TODO stack of template insertion modes thing
1491 unless is_in_scope 'body'
1494 # TODO implement parse error and move to tree_after_body
1496 unless is_in_scope 'body' # weird, but it's what the spec says
1499 # TODO implement parse error and move to tree_after_body, reprocess
1500 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1501 unless is_in_scope t.name, NS_HTML
1504 generate_implied_end_tags()
1505 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1508 el = open_els.shift()
1509 if el.name is t.name and el.namespace is NS_HTML
1511 # TODO lots more close tags to implement here
1513 unless is_in_button_scope 'p'
1515 insert_html_element new_open_tag 'p'
1517 # TODO lots more close tags to implement here
1518 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1519 adoption_agency t.name
1520 # TODO lots more close tags to implement here
1522 in_body_any_other_end_tag t.name
# "Anything else" branch of the "in table" mode: foster parenting is switched
# on, then off again.
# NOTE(review): the statement executed while the flag is set is not visible
# in this listing.
1525 ins_mode_in_table_else = (t) ->
1527 flag_foster_parenting = true # FIXME
1529 flag_foster_parenting = false
# Lookup table keyed by element name; ins_mode_in_table tests
# ``can_in_table[t.name]``. The entries are not visible in this listing.
1530 can_in_table = { # FIXME do this inline like everywhere else
1538 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Handles token t in the "text" insertion mode (used for RCDATA, RAWTEXT and
# script content). On EOF and on end tags the insertion mode saved in
# original_insertion_mode is restored. A warning is logged if no branch
# handled the token.
1539 ins_mode_text = (t) ->
1540 if t.type is TYPE_TEXT
1543 if t.type is TYPE_EOF
1545 if open_els[0].name is 'script'
1546 open_els[0].flag 'already started', true
1548 insertion_mode = original_insertion_mode
1551 if t.type is TYPE_END_TAG and t.name is 'script'
1553 insertion_mode = original_insertion_mode
1554 # fixfull the spec seems to assume that I'm going to run the script
1555 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1557 if t.type is TYPE_END_TAG
1559 insertion_mode = original_insertion_mode
1561 console.log 'warning: end of ins_mode_text reached'
1563 # the functions below implement the tokenizer states described here:
1564 # http://www.w3.org/TR/html5/syntax.html#tokenization
1566 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Handles token t in the "in table" insertion mode. Table-structure start tags
# clear the stack back to a table context, insert the (possibly synthetic)
# element and switch to the matching mode (caption, column group, table
# body). Tokens that do not belong in a table go to ins_mode_in_table_else.
# NOTE(review): several branch headers are not visible in this listing.
1567 ins_mode_in_table = (t) ->
1570 if can_in_table[t.name]
1571 original_insertion_mode = insertion_mode
1572 insertion_mode = ins_mode_in_table_text
1575 ins_mode_in_table_else t
1583 clear_stack_to_table_context()
1585 insert_html_element t
1586 insertion_mode = ins_mode_in_caption
1588 clear_stack_to_table_context()
1589 insert_html_element t
1590 insertion_mode = ins_mode_in_column_group
1592 clear_stack_to_table_context()
1593 insert_html_element new_open_tag 'colgroup'
1594 insertion_mode = ins_mode_in_column_group
1596 when 'tbody', 'tfoot', 'thead'
1597 clear_stack_to_table_context()
1598 insert_html_element t
1599 insertion_mode = ins_mode_in_table_body
1600 when 'td', 'th', 'tr'
1601 clear_stack_to_table_context()
1602 insert_html_element new_open_tag 'tbody' # synthetic tbody to hold the row/cell
1603 insertion_mode = ins_mode_in_table_body
1607 if is_in_table_scope 'table'
1609 el = open_els.shift()
1610 if el.name is 'table'
1612 reset_insertion_mode()
1614 when 'style', 'script', 'template'
1617 if token_is_input_hidden t
1618 ins_mode_in_table_else t
1621 el = insert_html_element t
1623 t.acknowledge_self_closing()
1626 if form_element_pointer?
1628 if template_tag_is_open()
1630 form_element_pointer = insert_html_element t
1633 ins_mode_in_table_else t
1637 if is_in_table_scope 'table'
1639 el = open_els.shift()
1640 if el.name is 'table'
1642 reset_insertion_mode()
1645 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1650 ins_mode_in_table_else t
1654 ins_mode_in_table_else t
1657 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Handles token t in the "in table text" insertion mode. Character tokens are
# queued in pending_table_character_tokens. The queue is later scanned for
# non-whitespace tokens (is_space_tok) and its tokens are either inserted as
# characters or handed to the "in table" anything-else handler. The queue is
# then emptied and the saved insertion mode restored.
1658 ins_mode_in_table_text = (t) ->
1659 if t.type is TYPE_TEXT and t.text is "\u0000"
1660 # huh? I thought the tokenizer didn't emit these
1663 if t.type is TYPE_TEXT
1664 pending_table_character_tokens.push t
1668 for old in pending_table_character_tokens
1669 unless is_space_tok old
1673 for old in pending_table_character_tokens
1674 insert_character old
1676 for old in pending_table_character_tokens
# fixed: ``ins_mode_table_else`` is not defined; the handler is named
# ins_mode_in_table_else
1677 ins_mode_in_table_else old
1678 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1679 insertion_mode = original_insertion_mode
1682 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Handles token t in the "in caption" insertion mode. A </caption> end tag,
# or any table-structure start tag / </table> end tag, pops open_els up to
# the caption when one is in table scope, clears the active formatting
# elements to the last marker and returns to the "in table" mode.
1683 ins_mode_in_caption = (t) ->
1684 if t.type is TYPE_END_TAG and t.name is 'caption'
1685 if is_in_table_scope 'caption'
1686 generate_implied_end_tags()
1687 if open_els[0].name isnt 'caption'
1690 el = open_els.shift()
1691 if el.name is 'caption'
1693 clear_afe_to_marker()
# fixed: ``in_table`` is not defined; the mode function is ins_mode_in_table
1694 insertion_mode = ins_mode_in_table
1699 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1701 if is_in_table_scope 'caption'
1703 el = open_els.shift()
1704 if el.name is 'caption'
1706 clear_afe_to_marker()
# fixed: same undefined ``in_table`` name as above
1707 insertion_mode = ins_mode_in_table
1709 # else fragment case
1711 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1717 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Handles token t in the "in column group" insertion mode. <col> start tags
# are inserted and their self-closing flag acknowledged; a </colgroup> end
# tag with a colgroup as current node returns to the "in table" mode, as does
# the final fall-through path.
1718 ins_mode_in_column_group = (t) ->
1722 if t.type is TYPE_COMMENT
1725 if t.type is TYPE_DOCTYPE
1728 if t.type is TYPE_START_TAG and t.name is 'html'
1731 if t.type is TYPE_START_TAG and t.name is 'col'
1732 el = insert_html_element t
1734 t.acknowledge_self_closing()
1736 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1737 if open_els[0].name is 'colgroup'
1739 insertion_mode = ins_mode_in_table
1743 if t.type is TYPE_END_TAG and t.name is 'col'
1746 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1749 if t.type is TYPE_EOF
1753 if open_els[0].name isnt 'colgroup'
1757 insertion_mode = ins_mode_in_table
1761 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Handles token t in the "in table body" insertion mode. <tr> (or a synthetic
# tr for a stray <td>/<th>) is inserted after clearing the stack back to a
# table body context, switching to "in row". Section end tags and tokens that
# end the section clear the stack and return to "in table".
1762 ins_mode_in_table_body = (t) ->
1763 if t.type is TYPE_START_TAG and t.name is 'tr'
1764 clear_stack_to_table_body_context()
1765 insert_html_element t
1766 insertion_mode = ins_mode_in_row
1768 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1770 clear_stack_to_table_body_context()
1771 insert_html_element new_open_tag 'tr'
1772 insertion_mode = ins_mode_in_row
1775 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1776 unless is_in_table_scope t.name # fixfull check namespace
1779 clear_stack_to_table_body_context()
1781 insertion_mode = ins_mode_in_table
1783 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1786 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1789 if table_scopers[el.name]
1794 clear_stack_to_table_body_context()
1796 insertion_mode = ins_mode_in_table
1799 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1805 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Handles token t in the "in row" insertion mode. <td>/<th> start tags clear
# the stack back to a table row context, insert the cell and switch to
# "in cell". Tokens that end the row (when a tr is in table scope) clear the
# stack to a row context and return to "in table body".
1806 ins_mode_in_row = (t) ->
1807 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1808 clear_stack_to_table_row_context()
1809 insert_html_element t
1810 insertion_mode = ins_mode_in_cell
1813 if t.type is TYPE_END_TAG and t.name is 'tr'
1814 if is_in_table_scope 'tr'
1815 clear_stack_to_table_row_context()
1817 insertion_mode = ins_mode_in_table_body
1821 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1822 if is_in_table_scope 'tr'
1823 clear_stack_to_table_row_context()
1825 insertion_mode = ins_mode_in_table_body
1830 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1831 if is_in_table_scope t.name # fixfull namespace
1832 if is_in_table_scope 'tr'
1833 clear_stack_to_table_row_context()
1835 insertion_mode = ins_mode_in_table_body
1840 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1846 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell": generates implied end tags, checks that the current node
# is a td or th, pops open_els up to that cell, clears the active formatting
# elements to the last marker and switches to the "in row" mode.
# NOTE(review): the definition header of this routine is not visible in this
# listing.
1848 generate_implied_end_tags()
# fixed: the second test compared the node itself to 'th' (always false);
# it must compare the node's name
1849 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1852 el = open_els.shift()
1853 if el.name is 'td' or el.name is 'th'
1855 clear_afe_to_marker()
1856 insertion_mode = ins_mode_in_row
1858 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Handles token t in the "in cell" insertion mode. A matching </td> or </th>
# (when in table scope) generates implied end tags, pops open_els up to the
# cell, clears the active formatting elements to the last marker and returns
# to "in row". Table-structure tokens are checked against the open td/th and
# table scope.
1859 ins_mode_in_cell = (t) ->
1860 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1861 if is_in_table_scope t.name
1862 generate_implied_end_tags()
1863 if open_els[0].name isnt t.name
1866 el = open_els.shift()
1867 if el.name is t.name
1869 clear_afe_to_marker()
1870 insertion_mode = ins_mode_in_row
1874 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1877 if el.name is 'td' or el.name is 'th'
1880 if table_scopers[el.name]
1888 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1891 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1892 if is_in_table_scope t.name # fixfull namespace
1901 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Handles token t in the "in select" insertion mode: option / optgroup start
# and end tags, and the tokens that close the select (</select>, a nested
# <select>, <input>/<keygen>/<textarea>), which pop open_els up to the select
# element and reset the insertion mode.
1902 ins_mode_in_select = (t) ->
1903 if t.type is TYPE_TEXT and t.text is "\u0000"
1906 if t.type is TYPE_TEXT
1909 if t.type is TYPE_COMMENT
1912 if t.type is TYPE_DOCTYPE
1915 if t.type is TYPE_START_TAG and t.name is 'html'
1918 if t.type is TYPE_START_TAG and t.name is 'option'
1919 if open_els[0].name is 'option'
1921 insert_html_element t
1923 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1924 if open_els[0].name is 'option'
1926 if open_els[0].name is 'optgroup'
1928 insert_html_element t
1930 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1931 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1933 if open_els[0].name is 'optgroup'
1938 if t.type is TYPE_END_TAG and t.name is 'option'
1939 if open_els[0].name is 'option'
1944 if t.type is TYPE_END_TAG and t.name is 'select'
1945 if is_in_select_scope 'select'
1947 el = open_els.shift()
1948 if el.name is 'select'
1950 reset_insertion_mode()
1954 if t.type is TYPE_START_TAG and t.name is 'select'
1957 el = open_els.shift()
1958 if el.name is 'select'
1960 reset_insertion_mode()
1961 # spec says that this is the same as </select> but it doesn't say
1962 # to check scope first
1964 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1966 if is_in_select_scope 'select'
1969 el = open_els.shift()
1970 if el.name is 'select'
1972 reset_insertion_mode()
1975 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1978 if t.type is TYPE_EOF
1985 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Handles token t in the "in select in table" insertion mode. Table-structure
# start tags, and table-structure end tags, pop open_els up to the select
# element and reset the insertion mode (the end-tag branch first tests table
# scope); the final fall-through path delegates to ins_mode_in_select.
1986 ins_mode_in_select_in_table = (t) ->
1987 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1990 el = open_els.shift()
1991 if el.name is 'select'
1993 reset_insertion_mode()
1996 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1998 unless is_in_table_scope t.name, NS_HTML
2001 el = open_els.shift()
2002 if el.name is 'select'
2004 reset_insertion_mode()
2008 ins_mode_in_select t
2011 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Handles token t in the "in template" insertion mode. Depending on the start
# tag, the top entry of template_insertion_modes is replaced (shift +
# unshift) by the mode suited to that tag and the insertion mode is switched
# to it. The last visible branch pops open_els up to the template element,
# clears the active formatting elements to the last marker, pops the template
# mode and resets the insertion mode.
2012 ins_mode_in_template = (t) ->
2013 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2016 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2019 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2020 template_insertion_modes.shift()
2021 template_insertion_modes.unshift ins_mode_in_table
2022 insertion_mode = ins_mode_in_table
2025 if t.type is TYPE_START_TAG and t.name is 'col'
2026 template_insertion_modes.shift()
2027 template_insertion_modes.unshift ins_mode_in_column_group
2028 insertion_mode = ins_mode_in_column_group
2031 if t.type is TYPE_START_TAG and t.name is 'tr'
2032 template_insertion_modes.shift()
2033 template_insertion_modes.unshift ins_mode_in_table_body
2034 insertion_mode = ins_mode_in_table_body
2037 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2038 template_insertion_modes.shift()
2039 template_insertion_modes.unshift ins_mode_in_row
2040 insertion_mode = ins_mode_in_row
2043 if t.type is TYPE_START_TAG
2044 template_insertion_modes.shift()
2045 template_insertion_modes.unshift ins_mode_in_body
2046 insertion_mode = ins_mode_in_body
2049 if t.type is TYPE_END_TAG
2053 unless template_tag_is_open()
2058 el = open_els.shift()
2059 if el.name is 'template' # fixfull check namespace
2061 clear_afe_to_marker()
2062 template_insertion_modes.shift()
2063 reset_insertion_mode()
2066 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Handles token t in the "after body" insertion mode. Comments are appended to
# the html element; an </html> end tag switches to "after after body"; the
# final fall-through path switches back to "in body".
2067 ins_mode_after_body = (t) ->
2071 if t.type is TYPE_COMMENT
# fixed: the spec inserts the comment as the last child of the FIRST element
# in the stack of open elements (the html element). In this file's stack
# layout that is the last array entry, not open_els[0] (the current node).
2072 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2074 if t.type is TYPE_DOCTYPE
2077 if t.type is TYPE_START_TAG and t.name is 'html'
2080 if t.type is TYPE_END_TAG and t.name is 'html'
2081 # fixfull fragment case
2082 insertion_mode = ins_mode_after_after_body
2084 if t.type is TYPE_EOF
2089 insertion_mode = ins_mode_in_body
2092 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Handles token t in the "in frameset" insertion mode: nested <frameset> and
# <frame> start tags are inserted (frame acknowledges its self-closing flag);
# a </frameset> end tag may switch to "after frameset" once the current node
# is no longer a frameset.
2093 ins_mode_in_frameset = (t) ->
2097 if t.type is TYPE_COMMENT
2100 if t.type is TYPE_DOCTYPE
2103 if t.type is TYPE_START_TAG and t.name is 'html'
2106 if t.type is TYPE_START_TAG and t.name is 'frameset'
2107 insert_html_element t
2109 if t.type is TYPE_END_TAG and t.name is 'frameset'
2110 # TODO ?correct for: "if the current node is the root html element"
2111 if open_els.length is 1
2113 return # fragment case
2115 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2116 insertion_mode = ins_mode_after_frameset
2118 if t.type is TYPE_START_TAG and t.name is 'frame'
2119 insert_html_element t
2121 t.acknowledge_self_closing()
# fixed: ``TYPE_START TAG`` (with a space) parses as a call TYPE_START(TAG);
# the constant is TYPE_START_TAG
2123 if t.type is TYPE_START_TAG and t.name is 'noframes'
2126 if t.type is TYPE_EOF
2127 # TODO ?correct for: "if the current node is not the root html element"
2128 if open_els.length isnt 1
2136 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Handles token t in the "after frameset" insertion mode. An </html> end tag
# switches to the "after after frameset" mode.
2137 ins_mode_after_frameset = (t) ->
2141 if t.type is TYPE_COMMENT
2144 if t.type is TYPE_DOCTYPE
2147 if t.type is TYPE_START_TAG and t.name is 'html'
2150 if t.type is TYPE_END_TAG and t.name is 'html'
# fixed: this assigned a stray variable ``insert_mode``, so the parser's
# insertion mode never actually changed
2151 insertion_mode = ins_mode_after_after_frameset
2153 if t.type is TYPE_START_TAG and t.name is 'noframes'
2156 if t.type is TYPE_EOF
2163 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Handles token t in the "after after body" insertion mode. Comments are
# appended as the last child of doc; the final fall-through path switches back
# to "in body".
2164 ins_mode_after_after_body = (t) ->
2165 if t.type is TYPE_COMMENT
2166 insert_comment t, [doc, doc.children.length]
2168 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2171 if t.type is TYPE_EOF
2176 insertion_mode = ins_mode_in_body
2179 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Handles token t in the "after after frameset" insertion mode. Comments are
# appended as the last child of doc; doctype, whitespace, <html>, EOF and
# <noframes> tokens each get their own branch.
2180 ins_mode_after_after_frameset = (t) ->
2181 if t.type is TYPE_COMMENT
2182 insert_comment t, [doc, doc.children.length]
2184 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2187 if t.type is TYPE_EOF
2190 if t.type is TYPE_START_TAG and t.name is 'noframes'
2201 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Data state of the tokenizer: reads one character from txt (advancing cur)
# and dispatches on it, returning a text node, the result of a character
# reference, or an EOF token, or switching to the tag-open state.
# NOTE(review): the definition header and the case labels are not visible in
# this listing.
2203 switch c = txt.charAt(cur++)
2205 return new_text_node parse_character_reference()
2207 tok_state = tok_state_tag_open
2210 return new_text_node c
2212 return new_eof_token()
2214 return new_text_node c
2217 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2218 # not needed: tok_state_character_reference_in_data = ->
2219 # just call parse_character_reference()
2221 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2222 tok_state_rcdata = ->
2223 switch c = txt.charAt(cur++)
2225 return new_text_node parse_character_reference()
2227 tok_state = tok_state_rcdata_less_than_sign
2230 return new_character_token "\ufffd"
2232 return new_eof_token()
2234 return new_character_token c
2237 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2238 # not needed: tok_state_character_reference_in_rcdata = ->
2239 # just call parse_character_reference()
2241 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2242 tok_state_rawtext = ->
2243 switch c = txt.charAt(cur++)
2245 tok_state = tok_state_rawtext_less_than_sign
2248 return new_character_token "\ufffd"
2250 return new_eof_token()
2252 return new_character_token c
2255 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2256 tok_state_script_data = ->
2257 switch c = txt.charAt(cur++)
2259 tok_state = tok_state_script_data_less_than_sign
2262 return new_character_token "\ufffd"
2264 return new_eof_token()
2266 return new_character_token c
2269 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2270 tok_state_plaintext = ->
2271 switch c = txt.charAt(cur++)
2274 return new_character_token "\ufffd"
2276 return new_eof_token()
2278 return new_character_token c
2282 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2283 tok_state_tag_open = ->
2284 switch c = txt.charAt(cur++)
2286 tok_state = tok_state_markup_declaration_open
2288 tok_state = tok_state_end_tag_open
2291 tok_cur_tag = new_comment_token '?'
2292 tok_state = tok_state_bogus_comment
2294 if lc_alpha.indexOf(c) > -1
2295 tok_cur_tag = new_open_tag c
2296 tok_state = tok_state_tag_name
2297 else if uc_alpha.indexOf(c) > -1
2298 tok_cur_tag = new_open_tag c.toLowerCase()
2299 tok_state = tok_state_tag_name
2302 tok_state = tok_state_data
2303 cur -= 1 # we didn't parse/handle the char after <
2304 return new_text_node '<'
# 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
#
# Entered after "</" has been consumed. Returns a token to emit, or null when
# more input is needed before anything can be emitted.
tok_state_end_tag_open = ->
	switch c = txt.charAt(cur++)
		when '>'
			# "</>" is dropped entirely
			parse_error()
			tok_state = tok_state_data
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
			return new_text_node '</'
		else
			if uc_alpha.indexOf(c) > -1
				tok_cur_tag = new_end_tag c.toLowerCase()
				tok_state = tok_state_tag_name
			else if lc_alpha.indexOf(c) > -1
				tok_cur_tag = new_end_tag c
				tok_state = tok_state_tag_name
			else
				parse_error()
				# The bogus comment's data starts with the character that
				# caused the switch (the one just consumed), not with "/".
				# NUL is mapped here because the bogus comment state only
				# sanitizes the characters it consumes itself.
				first = if c is "\u0000" then "\ufffd" else c
				tok_cur_tag = new_comment_token first
				tok_state = tok_state_bogus_comment
	return null
2330 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2331 tok_state_tag_name = ->
2332 switch c = txt.charAt(cur++)
2333 when "\t", "\n", "\u000c", ' '
2334 tok_state = tok_state_before_attribute_name
2336 tok_state = tok_state_self_closing_start_tag
2338 tok_state = tok_state_data
2344 tok_cur_tag.name += "\ufffd"
2347 tok_state = tok_state_data
2349 if uc_alpha.indexOf(c) > -1
2350 tok_cur_tag.name += c.toLowerCase()
2352 tok_cur_tag.name += c
2355 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2356 tok_state_rcdata_less_than_sign = ->
2357 c = txt.charAt(cur++)
2359 temporary_buffer = ''
2360 tok_state = tok_state_rcdata_end_tag_open
2363 tok_state = tok_state_rcdata
2364 cur -= 1 # reconsume the input character
2365 return new_character_token '<'
2367 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2368 tok_state_rcdata_end_tag_open = ->
2369 c = txt.charAt(cur++)
2370 if uc_alpha.indexOf(c) > -1
2371 tok_cur_tag = new_end_tag c.toLowerCase()
2372 temporary_buffer += c
2373 tok_state = tok_state_rcdata_end_tag_name
2375 if lc_alpha.indexOf(c) > -1
2376 tok_cur_tag = new_end_tag c
2377 temporary_buffer += c
2378 tok_state = tok_state_rcdata_end_tag_name
2381 tok_state = tok_state_rcdata
2382 cur -= 1 # reconsume the input character
2383 return new_character_token "</" # fixfull separate these
# http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
#
# True when t is an end tag whose name matches the current node
# (open_els[0]).
#
# The spec compares against "the tag name of the last start tag to have been
# emitted from this tokenizer". This is only called from the various "raw"
# states, which presumably all push their start token onto open_els, so the
# current node is used instead. TODO: verify this once the script data
# states exist.
is_appropriate_end_tag = (t) ->
	debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
	return false unless t.type is TYPE_END_TAG
	return t.name is open_els[0].name
2395 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2396 tok_state_rcdata_end_tag_name = ->
2397 c = txt.charAt(cur++)
2398 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2399 if is_appropriate_end_tag tok_cur_tag
2400 tok_state = tok_state_before_attribute_name
2402 # else fall through to "Anything else"
2404 if is_appropriate_end_tag tok_cur_tag
2405 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2407 # else fall through to "Anything else"
2409 if is_appropriate_end_tag tok_cur_tag
2410 tok_state = tok_state_data
2412 # else fall through to "Anything else"
2413 if uc_alpha.indexOf(c) > -1
2414 tok_cur_tag.name += c.toLowerCase()
2415 temporary_buffer += c
2417 if lc_alpha.indexOf(c) > -1
2418 tok_cur_tag.name += c
2419 temporary_buffer += c
2422 tok_state = tok_state_rcdata
2423 cur -= 1 # reconsume the input character
2424 return new_character_token '</' + temporary_buffer # fixfull separate these
2426 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2427 tok_state_rawtext_less_than_sign = ->
2428 c = txt.charAt(cur++)
2430 temporary_buffer = ''
2431 tok_state = tok_state_rawtext_end_tag_open
2434 tok_state = tok_state_rawtext
2435 cur -= 1 # reconsume the input character
2436 return new_character_token '<'
2438 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2439 tok_state_rawtext_end_tag_open = ->
2440 c = txt.charAt(cur++)
2441 if uc_alpha.indexOf(c) > -1
2442 tok_cur_tag = new_end_tag c.toLowerCase()
2443 temporary_buffer += c
2444 tok_state = tok_state_rawtext_end_tag_name
2446 if lc_alpha.indexOf(c) > -1
2447 tok_cur_tag = new_end_tag c
2448 temporary_buffer += c
2449 tok_state = tok_state_rawtext_end_tag_name
2452 tok_state = tok_state_rawtext
2453 cur -= 1 # reconsume the input character
2454 return new_character_token "</" # fixfull separate these
2456 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2457 tok_state_rawtext_end_tag_name = ->
2458 c = txt.charAt(cur++)
2459 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2460 if is_appropriate_end_tag tok_cur_tag
2461 tok_state = tok_state_before_attribute_name
2463 # else fall through to "Anything else"
2465 if is_appropriate_end_tag tok_cur_tag
2466 tok_state = tok_state_self_closing_start_tag
2468 # else fall through to "Anything else"
2470 if is_appropriate_end_tag tok_cur_tag
2471 tok_state = tok_state_data
2473 # else fall through to "Anything else"
2474 if uc_alpha.indexOf(c) > -1
2475 tok_cur_tag.name += c.toLowerCase()
2476 temporary_buffer += c
2478 if lc_alpha.indexOf(c) > -1
2479 tok_cur_tag.name += c
2480 temporary_buffer += c
2483 tok_state = tok_state_rawtext
2484 cur -= 1 # reconsume the input character
2485 return new_character_token '</' + temporary_buffer # fixfull separate these
2487 # TODO _all_ of the missing states here (17-33) are for parsing script tags
2489 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2490 tok_state_before_attribute_name = ->
2492 switch c = txt.charAt(cur++)
2493 when "\t", "\n", "\u000c", ' '
2496 tok_state = tok_state_self_closing_start_tag
2499 tok_state = tok_state_data
2505 attr_name = "\ufffd"
2506 when '"', "'", '<', '='
2511 tok_state = tok_state_data
2513 if uc_alpha.indexOf(c) > -1
2514 attr_name = c.toLowerCase()
2518 tok_cur_tag.attrs_a.unshift [attr_name, '']
2519 tok_state = tok_state_attribute_name
# 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
#
# Accumulates the name of the attribute currently being read, which lives at
# tok_cur_tag.attrs_a[0][0]. Returns the finished tag token on '>', otherwise
# null.
tok_state_attribute_name = ->
	switch c = txt.charAt(cur++)
		when "\t", "\n", "\u000c", ' '
			tok_state = tok_state_after_attribute_name
		when '/'
			tok_state = tok_state_self_closing_start_tag
		when '='
			tok_state = tok_state_before_attribute_value
		when '>'
			tok_state = tok_state_data
			tmp = tok_cur_tag
			tok_cur_tag = null
			return tmp
		when "\u0000"
			parse_error()
			# append: the spec adds U+FFFD to the name, it does not replace it
			tok_cur_tag.attrs_a[0][0] += "\ufffd"
		when '"', "'", '<'
			parse_error()
			# append: treated like any other name character after the error
			tok_cur_tag.attrs_a[0][0] += c
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
		else
			if uc_alpha.indexOf(c) > -1
				# append the lowercased letter; assigning here would discard
				# everything before it (e.g. "dataFoo" would become "foo")
				tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
			else
				tok_cur_tag.attrs_a[0][0] += c
	return null
2552 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2553 tok_state_after_attribute_name = ->
2554 c = txt.charAt(cur++)
2555 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2558 tok_state = tok_state_self_closing_start_tag
2561 tok_state = tok_state_before_attribute_value
2564 tok_state = tok_state_data
2566 if uc_alpha.indexOf(c) > -1
2567 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2568 tok_state = tok_state_attribute_name
2572 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2573 tok_state = tok_state_attribute_name
2577 tok_state = tok_state_data
2578 cur -= 1 # reconsume
2580 if c is '"' or c is "'" or c is '<'
2582 # fall through to Anything else
2584 tok_cur_tag.attrs_a.unshift [c, '']
2585 tok_state = tok_state_attribute_name
2587 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2588 tok_state_before_attribute_value = ->
2589 switch c = txt.charAt(cur++)
2590 when "\t", "\n", "\u000c", ' '
2593 tok_state = tok_state_attribute_value_double_quoted
2595 tok_state = tok_state_attribute_value_unquoted
2598 tok_state = tok_state_attribute_value_single_quoted
2601 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2602 tok_state = tok_state_attribute_value_unquoted
2605 tok_state = tok_state_data
2611 tok_state = tok_state_data
2613 tok_cur_tag.attrs_a[0][1] += c
2614 tok_state = tok_state_attribute_value_unquoted
2617 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2618 tok_state_attribute_value_double_quoted = ->
2619 switch c = txt.charAt(cur++)
2621 tok_state = tok_state_after_attribute_value_quoted
2623 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2626 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2629 tok_state = tok_state_data
2631 tok_cur_tag.attrs_a[0][1] += c
2634 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2635 tok_state_attribute_value_single_quoted = ->
2636 switch c = txt.charAt(cur++)
2638 tok_state = tok_state_after_attribute_value_quoted
2640 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2643 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2646 tok_state = tok_state_data
2648 tok_cur_tag.attrs_a[0][1] += c
2651 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2652 tok_state_attribute_value_unquoted = ->
2653 switch c = txt.charAt(cur++)
2654 when "\t", "\n", "\u000c", ' '
2655 tok_state = tok_state_before_attribute_name
2657 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2659 tok_state = tok_state_data
2664 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2667 tok_state = tok_state_data
2669 # Parse Error if ', <, = or ` (backtick)
2670 tok_cur_tag.attrs_a[0][1] += c
2673 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2674 tok_state_after_attribute_value_quoted = ->
2675 switch c = txt.charAt(cur++)
2676 when "\t", "\n", "\u000c", ' '
2677 tok_state = tok_state_before_attribute_name
2679 tok_state = tok_state_self_closing_start_tag
2681 tok_state = tok_state_data
2687 tok_state = tok_state_data
2690 tok_state = tok_state_before_attribute_name
2691 cur -= 1 # we didn't handle that char
# 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
# WARNING: put a comment token in tok_cur_tag before setting this state
#
# Consumes everything up to and including the next '>' (or to end of input),
# appends it to the pending comment token, and returns that token.
tok_state_bogus_comment = ->
	next_gt = txt.indexOf '>', cur
	if next_gt is -1
		# no terminator: the comment runs to the end of the input
		val = txt.substr cur
		cur = txt.length
	else
		val = txt.substr cur, (next_gt - cur)
		cur = next_gt + 1
	# Global regex: a plain string pattern replaces only the first NUL.
	val = val.replace(/\u0000/g, "\ufffd")
	tok_cur_tag.text += val
	tok_state = tok_state_data
	return tok_cur_tag
# 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
#
# Entered after "<!" has been consumed. Looks ahead (without a per-character
# state step) for "--", "doctype" or "[CDATA[" and selects the next tokenizer
# state. Never emits a token itself.
tok_state_markup_declaration_open = ->
	if txt.substr(cur, 2) is '--'
		cur += 2
		tok_cur_tag = new_comment_token ''
		tok_state = tok_state_comment_start
		return null
	if txt.substr(cur, 7).toLowerCase() is 'doctype'
		cur += 7
		tok_state = tok_state_doctype
		return null
	# CDATA sections are only recognised in foreign (non-HTML) content
	acn = adjusted_current_node()
	if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
		cur += 7
		tok_state = tok_state_cdata_section
		return null
	# Otherwise
	parse_error()
	# Per spec, the next character consumed is the first character of the
	# comment, so the data starts empty ("<!foo>" yields the comment "foo").
	tok_cur_tag = new_comment_token ''
	tok_state = tok_state_bogus_comment
	return null
# 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
#
# Entered right after "<!--". Returns the comment token when the comment ends
# here ("<!-->" or EOF), otherwise null.
tok_state_comment_start = ->
	switch c = txt.charAt(cur++)
		when '-'
			tok_state = tok_state_comment_start_dash
		when "\u0000"
			parse_error()
			# NUL becomes U+FFFD inside the comment's data; it is not
			# emitted as a separate character token
			tok_cur_tag.text += "\ufffd"
			tok_state = tok_state_comment
		when '>'
			parse_error()
			tok_state = tok_state_data
			return tok_cur_tag
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
			cur -= 1 # Reconsume
			return tok_cur_tag
		else
			tok_cur_tag.text += c
			# Leave the start state: otherwise a later '>' inside the comment
			# body (e.g. "<!--a>b-->") would be taken as the end of the comment
			tok_state = tok_state_comment
	return null
2752 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
2753 tok_state_comment_start_dash = ->
2754 switch c = txt.charAt(cur++)
2756 tok_state = tok_state_comment_end
2759 tok_cur_tag.text += "-\ufffd"
2760 tok_state = tok_state_comment
2763 tok_state = tok_state_data
2767 tok_state = tok_state_data
2768 cur -= 1 # Reconsume
2771 tok_cur_tag.text += "-#{c}"
2772 tok_state = tok_state_comment
2775 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
2776 tok_state_comment = ->
2777 switch c = txt.charAt(cur++)
2779 tok_state = tok_state_comment_end_dash
2782 tok_cur_tag.text += "\ufffd"
2785 tok_state = tok_state_data
2786 cur -= 1 # Reconsume
2789 tok_cur_tag.text += c
2792 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
2793 tok_state_comment_end_dash = ->
2794 switch c = txt.charAt(cur++)
2796 tok_state = tok_state_comment_end
2799 tok_cur_tag.text += "-\ufffd"
2800 tok_state = tok_state_comment
2803 tok_state = tok_state_data
2804 cur -= 1 # Reconsume
2807 tok_cur_tag.text += "-#{c}"
2808 tok_state = tok_state_comment
2811 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
2812 tok_state_comment_end = ->
2813 switch c = txt.charAt(cur++)
2815 tok_state = tok_state_data
2819 tok_cur_tag.text += "--\ufffd"
2820 tok_state = tok_state_comment
2823 tok_state = tok_state_comment_end_bang
2826 tok_cur_tag.text += '-'
2829 tok_state = tok_state_data
2830 cur -= 1 # Reconsume
2834 tok_cur_tag.text += "--#{c}"
2835 tok_state = tok_state_comment
# 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
#
# Entered after "--!" has been seen inside a comment. Returns the comment
# token when the comment ends here, otherwise null.
tok_state_comment_end_bang = ->
	switch c = txt.charAt(cur++)
		when '-'
			# Only "--!" joins the data here. This dash may begin the real
			# "-->" terminator, so the comment end dash state decides whether
			# it is part of the data.
			tok_cur_tag.text += "--!"
			tok_state = tok_state_comment_end_dash
		when '>'
			tok_state = tok_state_data
			return tok_cur_tag
		when "\u0000"
			parse_error()
			tok_cur_tag.text += "--!\ufffd"
			tok_state = tok_state_comment
		when '' # EOF
			parse_error()
			tok_state = tok_state_data
			cur -= 1 # Reconsume
			return tok_cur_tag
		else
			tok_cur_tag.text += "--!#{c}"
			tok_state = tok_state_comment
	return null
2861 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
2862 tok_state_doctype = ->
2863 switch c = txt.charAt(cur++)
2864 when "\t", "\u000a", "\u000c", ' '
2865 tok_state = tok_state_before_doctype_name
2868 tok_state = tok_state_data
2869 el = new_doctype_token ''
2870 el.flag 'force-quirks', true
2871 cur -= 1 # Reconsume
2875 tok_state = tok_state_before_doctype_name
2876 cur -= 1 # Reconsume
2879 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
2880 tok_state_before_doctype_name = ->
2881 c = txt.charAt(cur++)
2882 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2884 if uc_alpha.indexOf(c) > -1
2885 tok_cur_tag = new_doctype_token c.toLowerCase()
2886 tok_state = tok_state_doctype_name
2890 tok_cur_tag = new_doctype_token "\ufffd"
2891 tok_state = tok_state_doctype_name
2895 el = new_doctype_token ''
2896 el.flag 'force-quirks', true
2897 tok_state = tok_state_data
2901 tok_state = tok_state_data
2902 el = new_doctype_token ''
2903 el.flag 'force-quirks', true
2904 cur -= 1 # Reconsume
2907 tok_cur_tag = new_doctype_token c
2908 tok_state = tok_state_doctype_name
2911 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
2912 tok_state_doctype_name = ->
2913 c = txt.charAt(cur++)
2914 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2915 tok_state = tok_state_after_doctype_name
2918 tok_state = tok_state_data
2920 if uc_alpha.indexOf(c) > -1
2921 tok_cur_tag.name += c.toLowerCase()
2925 tok_cur_tag.name += "\ufffd"
2929 tok_state = tok_state_data
2930 tok_cur_tag.flag 'force-quirks', true
2931 cur -= 1 # Reconsume
2934 tok_cur_tag.name += c
2937 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
2938 tok_state_after_doctype_name = ->
2939 c = txt.charAt(cur++)
2940 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2943 tok_state = tok_state_data
2947 tok_state = tok_state_data
2948 tok_cur_tag.flag 'force-quirks', true
2949 cur -= 1 # Reconsume
2952 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
2954 tok_state = tok_state_after_doctype_public_keyword
2956 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
2958 tok_state = tok_state_after_doctype_system_keyword
2961 tok_cur_tag.flag 'force-quirks', true
2962 tok_state = tok_state_bogus_doctype
2965 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
2966 tok_state_after_doctype_public_keyword = ->
2967 c = txt.charAt(cur++)
2968 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2969 tok_state = tok_state_before_doctype_public_identifier
2973 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
2974 tok_state = tok_state_doctype_public_identifier_double_quoted
2978 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
2979 tok_state = tok_state_doctype_public_identifier_single_quoted
2983 tok_cur_tag.flag 'force-quirks', true
2984 tok_state = tok_state_data
2988 tok_state = tok_state_data
2989 tok_cur_tag.flag 'force-quirks', true
2990 cur -= 1 # Reconsume
2994 tok_cur_tag.flag 'force-quirks', true
2995 tok_state = tok_state_bogus_doctype
2998 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
2999 tok_state_before_doctype_public_identifier = ->
3000 c = txt.charAt(cur++)
3001 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3005 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3006 tok_state = tok_state_doctype_public_identifier_double_quoted
3010 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3011 tok_state = tok_state_doctype_public_identifier_single_quoted
3015 tok_cur_tag.flag 'force-quirks', true
3016 tok_state = tok_state_data
3020 tok_state = tok_state_data
3021 tok_cur_tag.flag 'force-quirks', true
3022 cur -= 1 # Reconsume
3026 tok_cur_tag.flag 'force-quirks', true
3027 tok_state = tok_state_bogus_doctype
3031 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3032 tok_state_doctype_public_identifier_double_quoted = ->
3033 c = txt.charAt(cur++)
3035 tok_state = tok_state_after_doctype_public_identifier
3039 tok_cur_tag.public_identifier += "\ufffd"
3043 tok_cur_tag.flag 'force-quirks', true
3044 tok_state = tok_state_data
3048 tok_state = tok_state_data
3049 tok_cur_tag.flag 'force-quirks', true
3050 cur -= 1 # Reconsume
3053 tok_cur_tag.public_identifier += c
3056 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3057 tok_state_doctype_public_identifier_single_quoted = ->
3058 c = txt.charAt(cur++)
3060 tok_state = tok_state_after_doctype_public_identifier
3064 tok_cur_tag.public_identifier += "\ufffd"
3068 tok_cur_tag.flag 'force-quirks', true
3069 tok_state = tok_state_data
3073 tok_state = tok_state_data
3074 tok_cur_tag.flag 'force-quirks', true
3075 cur -= 1 # Reconsume
3078 tok_cur_tag.public_identifier += c
3081 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3082 tok_state_after_doctype_public_identifier = ->
3083 c = txt.charAt(cur++)
3084 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3085 tok_state = tok_state_between_doctype_public_and_system_identifiers
3088 tok_state = tok_state_data
3092 tok_cur_tag.system_identifier = ''
3093 tok_state = tok_state_doctype_system_identifier_double_quoted
3097 tok_cur_tag.system_identifier = ''
3098 tok_state = tok_state_doctype_system_identifier_single_quoted
3102 tok_state = tok_state_data
3103 tok_cur_tag.flag 'force-quirks', true
3104 cur -= 1 # Reconsume
3108 tok_cur_tag.flag 'force-quirks', true
3109 tok_state = tok_state_bogus_doctype
3112 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3113 tok_state_between_doctype_public_and_system_identifiers = ->
3114 c = txt.charAt(cur++)
3115 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3118 tok_state = tok_state_data
3122 tok_cur_tag.system_identifier = ''
3123 tok_state = tok_state_doctype_system_identifier_double_quoted
3127 tok_cur_tag.system_identifier = ''
3128 tok_state = tok_state_doctype_system_identifier_single_quoted
3132 tok_state = tok_state_data
3133 tok_cur_tag.flag 'force-quirks', true
3134 cur -= 1 # Reconsume
3138 tok_cur_tag.flag 'force-quirks', true
3139 tok_state = tok_state_bogus_doctype
3142 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3143 tok_state_after_doctype_system_keyword = ->
3144 c = txt.charAt(cur++)
3145 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3146 tok_state = tok_state_before_doctype_system_identifier
3150 tok_cur_tag.system_identifier = ''
3151 tok_state = tok_state_doctype_system_identifier_double_quoted
3155 tok_cur_tag.system_identifier = ''
3156 tok_state = tok_state_doctype_system_identifier_single_quoted
3160 tok_cur_tag.flag 'force-quirks', true
3161 tok_state = tok_state_data
3165 tok_state = tok_state_data
3166 tok_cur_tag.flag 'force-quirks', true
3167 cur -= 1 # Reconsume
3171 tok_cur_tag.flag 'force-quirks', true
3172 tok_state = tok_state_bogus_doctype
3175 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
3176 tok_state_before_doctype_system_identifier = ->
3177 c = txt.charAt(cur++)
3178 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3181 tok_cur_tag.system_identifier = ''
3182 tok_state = tok_state_doctype_system_identifier_double_quoted
3185 tok_cur_tag.system_identifier = ''
3186 tok_state = tok_state_doctype_system_identifier_single_quoted
3190 tok_cur_tag.flag 'force-quirks', true
3191 tok_state = tok_state_data
3195 tok_state = tok_state_data
3196 tok_cur_tag.flag 'force-quirks', true
3197 cur -= 1 # Reconsume
3201 tok_cur_tag.flag 'force-quirks', true
3202 tok_state = tok_state_bogus_doctype
3205 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
3206 tok_state_doctype_system_identifier_double_quoted = ->
3207 c = txt.charAt(cur++)
3209 tok_state = tok_state_after_doctype_system_identifier
3213 tok_cur_tag.system_identifier += "\ufffd"
3217 tok_cur_tag.flag 'force-quirks', true
3218 tok_state = tok_state_data
3222 tok_state = tok_state_data
3223 tok_cur_tag.flag 'force-quirks', true
3224 cur -= 1 # Reconsume
3227 tok_cur_tag.system_identifier += c
3230 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
3231 tok_state_doctype_system_identifier_single_quoted = ->
3232 c = txt.charAt(cur++)
3234 tok_state = tok_state_after_doctype_system_identifier
3238 tok_cur_tag.system_identifier += "\ufffd"
3242 tok_cur_tag.flag 'force-quirks', true
3243 tok_state = tok_state_data
3247 tok_state = tok_state_data
3248 tok_cur_tag.flag 'force-quirks', true
3249 cur -= 1 # Reconsume
3252 tok_cur_tag.system_identifier += c
3255 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
3256 tok_state_after_doctype_system_identifier = ->
3257 c = txt.charAt(cur++)
3258 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3261 tok_state = tok_state_data
3265 tok_state = tok_state_data
3266 tok_cur_tag.flag 'force-quirks', true
3267 cur -= 1 # Reconsume
3271 # do _not_ tok_cur_tag.flag 'force-quirks', true
3272 tok_state = tok_state_bogus_doctype
3275 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
3276 tok_state_bogus_doctype = ->
3277 c = txt.charAt(cur++)
3279 tok_state = tok_state_data
3282 tok_state = tok_state_data
3283 cur -= 1 # Reconsume
3289 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3290 # Don't set this as a state, just call it
3291 # returns a string (NOT a text node)
3292 parse_character_reference = (allowed_char = null, in_attr = false) ->
3293 if cur >= txt.length
3295 switch c = txt.charAt(cur)
3296 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3297 # explicitly not a parse error
3300 # there has to be "one or more" alnums between & and ; to be a parse error
3303 if cur + 1 >= txt.length
3305 if txt.charAt(cur + 1).toLowerCase() is 'x'
3314 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3318 if txt.charAt(start + i) is ';'
3320 # FIXME This is supposed to generate parse errors for some chars
3321 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
3328 if alnum.indexOf(txt.charAt(cur + i)) is -1
3331 # exit early, because parse_error() below needs at least one alnum
3333 if txt.charAt(cur + i) is ';'
3334 i += 1 # include ';' terminator in value
3335 decoded = decode_named_char_ref txt.substr(cur, i)
3342 # no ';' terminator (only legacy char refs)
3344 for i in [2..max] # no prefix matches, so ok to check shortest first
3345 c = legacy_char_refs[txt.substr(cur, i)]
3348 if txt.charAt(cur + i) is '='
3349 # "because some legacy user agents will
3350 # misinterpret the markup in those cases"
3353 if alnum.indexOf(txt.charAt(cur + i)) > -1
3354 # this makes attributes forgiving about url args
3356 # ok, and besides the weird exceptions for attributes...
3357 # return the matching char
3358 cur += i # consume entity chars
3359 parse_error() # because no terminating ";"
3363 return # never reached
3365 # tree constructor initialization
3366 # see comments on TYPE_TAG/etc for the structure of this data
3367 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3369 afe = [] # active formatting elements
3370 template_insertion_modes = []
3371 insertion_mode = ins_mode_initial
3372 original_insertion_mode = insertion_mode # TODO check spec
3373 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3374 flag_frameset_ok = true
3376 flag_foster_parenting = false
3377 form_element_pointer = null
3378 temporary_buffer = null
3379 pending_table_character_tokens = []
3380 head_element_pointer = null
3381 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3382 context_element = null # FIXME initialize from args.fragment
3384 # tokenizer initialization
3385 tok_state = tok_state_data
3392 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
3395 serialize_els = (els, shallow, show_ids) ->
3401 serialized += t.serialize shallow, show_ids
3404 # TODO export TYPE_*
3405 module.exports.parse_html = parse_html
3406 module.exports.debug_log_reset = debug_log_reset
3407 module.exports.debug_log_each = debug_log_each
3408 module.exports.TYPE_TAG = TYPE_TAG
3409 module.exports.TYPE_TEXT = TYPE_TEXT
3410 module.exports.TYPE_COMMENT = TYPE_COMMENT
3411 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE