1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # The spec uses many different words to indicate which ends of lists/stacks
31 # it is talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
98 @id = "#{++prev_node_id}"
99 shallow_clone: -> # return a new node that's the same except without the children or parent
100 # WARNING this doesn't work right on open tags that are still being parsed
102 attrs[k] = v for k, v of @attrs
103 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
104 acknowledge_self_closing: ->
106 @token.flag 'did_self_close'
108 @flag 'did_self_close', true
111 serialize: (shallow = false, show_ids = false) -> # for unit tests
116 ret += JSON.stringify @name
131 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
137 ret += c.serialize shallow, show_ids
141 ret += JSON.stringify @text
144 ret += JSON.stringify @text
150 when TYPE_AAA_BOOKMARK
151 ret += 'aaa_bookmark'
154 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
157 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) carrying only the tag name.
new_open_tag = (name) ->
	new Node TYPE_START_TAG, {name}
# Build an end-tag token (TYPE_END_TAG) carrying only the tag name.
new_end_tag = (name) ->
	new Node TYPE_END_TAG, {name}
# Build an element node (TYPE_TAG) with the given tag name; every other
# field is left to the Node constructor's defaults.
new_element = (name) ->
	new Node TYPE_TAG, {name}
# Build a text node (TYPE_TEXT) whose text is txt.
new_text_node = (txt) ->
	new Node TYPE_TEXT, text: txt

# Character tokens are represented exactly like text nodes, so this is
# simply a second name for the same factory.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose text is txt.
new_comment_token = (txt) ->
	new Node TYPE_COMMENT, text: txt
170 return new Node TYPE_EOF
172 return new Node TYPE_AFE_MARKER
# Build a placeholder node of type TYPE_AAA_BOOKMARK (the bookmark used by
# the adoption agency algorithm).
new_aaa_bookmark = ->
	new Node TYPE_AAA_BOOKMARK
# ASCII character classes, each stored as a plain string of its members.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
# letters of either case followed by the digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# digits followed by the hex letters in both cases
hex_chars = "#{digits}abcdefABCDEF"

# some SVG elements have dashes in them, so "-" is allowed in tag names
tag_name_chars = "#{alnum}-"
185 # http://www.w3.org/TR/html5/infrastructure.html#space-character
186 space_chars = "\u0009\u000a\u000c\u000d\u0020"
188 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a text token (TYPE_TEXT) whose text is exactly one
# character and that character appears in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) isnt -1
192 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
193 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
195 # These are the character references that don't need a terminating semicolon
196 # min length: 2, max: 6, none are a prefix of any other.
198 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
199 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
200 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
201 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
202 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
203 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
204 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
205 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
206 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
207 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
208 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
209 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
210 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
211 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
212 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
213 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
214 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Lists of tag names, grouped by the kind of element they name.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
	'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script'
	'style'
]
escapable_raw_text_elements = [
	'textarea'
	'title'
]
221 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
223 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
224 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
225 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
226 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
227 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
228 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
229 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
230 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
231 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
232 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
233 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
234 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
235 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
236 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
240 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
242 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
243 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
244 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
245 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
246 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
247 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
248 'determinant', 'diff', 'divergence', 'divide', 'domain',
249 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
250 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
251 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
252 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
253 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
254 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
255 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
256 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
257 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
258 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
259 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
260 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
261 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
262 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
263 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
264 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
265 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
266 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
267 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
268 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
269 'vectorproduct', 'xor'
271 # foreign_elements = [svg_elements..., mathml_elements...]
272 #normal_elements = All other allowed HTML elements are normal elements.
276 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
277 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
278 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
279 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
280 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
281 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
282 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
283 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
284 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
285 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
286 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
287 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
288 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
289 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
290 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
291 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
292 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
293 wbr:NS_HTML, xmp:NS_HTML,
296 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
297 'annotation-xml':NS_MATHML,
300 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
303 formatting_elements = {
304 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
305 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
309 foster_parenting_targets = {
# An element counts as "special" when the special_elements table maps its
# tag name to the same namespace the element itself has.
el_is_special = (e) ->
	listed_namespace = special_elements[e.name]
	listed_namespace is e.namespace
334 # decode_named_char_ref()
336 # The list of named character references is _huge_ so ask the browser to decode
337 # for us instead of wasting bandwidth/space on including the table here.
339 # Pass without the "&" but with the ";" examples:
340 # for "&" pass "amp;"
341 # for "′" pass "x2032;"
344 textarea: document.createElement('textarea')
346 # TODO test this in IE8
347 decode_named_char_ref = (txt) ->
349 decoded = g_dncr.cache[txt]
350 return decoded if decoded?
351 g_dncr.textarea.innerHTML = txt
352 decoded = g_dncr.textarea.value
353 return null if decoded is txt
354 return g_dncr.cache[txt] = decoded
356 parse_html = (txt, parse_error_cb = null) ->
357 cur = 0 # index of next char in txt to be parsed
358 # declare doc and tokenizer variables so they're in scope below
360 open_els = null # stack of open elements
361 afe = null # active formatting elements
362 template_insertion_modes = null
363 insertion_mode = null
364 original_insertion_mode = null
366 tok_cur_tag = null # partially parsed tag
367 flag_scripting = null
368 flag_frameset_ok = null
370 flag_foster_parenting = null
371 form_element_pointer = null
372 temporary_buffer = null
373 pending_table_character_tokens = null
374 head_element_pointer = null
375 flag_fragment_parsing = null
376 context_element = null
385 console.log "Parse error at character #{cur} of #{txt.length}"
387 afe_push = (new_el) ->
390 if el.name is new_el.name and el.namespace is new_el.namespace
392 continue unless new_el.attrs[k] is v
393 for k, v of new_el.attrs
394 continue unless el.attrs[k] is v
401 afe.unshift new_afe_marker()
403 # the functions below implement the Tree Construction algorithm
404 # http://www.w3.org/TR/html5/syntax.html#tree-construction
406 # But first... the helpers
407 template_tag_is_open = ->
409 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
412 is_in_scope_x = (tag_name, scope, namespace) ->
414 if t.name is tag_name and (namespace is null or namespace is t.namespace)
416 if scope[t.name] is t.namespace
419 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
421 if t.name is tag_name and (namespace is null or namespace is t.namespace)
423 if scope[t.name] is t.namespace
425 if scope2[t.name] is t.namespace
428 standard_scopers = { # FIXME these are supposed to be namespace specific
429 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
430 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
431 template: NS_HTML, mi: NS_MATHML,
433 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
434 'annotation-xml': NS_MATHML,
436 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope tables: tag name -> namespace of an element that delimits a scope.
# button_scopers is consulted together with standard_scopers by
# is_in_button_scope; table_scopers is used on its own by is_in_table_scope.
# NOTE(review): li_scopers has no user in the visible code -- presumably
# meant for "list item scope"; confirm against callers.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope test for tag_name using standard_scopers as the scope-delimiting
# set. Leaving namespace as null makes the match ignore namespaces.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope test for tag_name where both standard_scopers and button_scopers
# delimit the scope. Leaving namespace as null makes the match ignore
# namespaces.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope test for tag_name where only table_scopers delimit the scope.
# Leaving namespace as null makes the match ignore namespaces.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
447 is_in_select_scope = (tag_name, namespace = null) ->
449 if t.name is tag_name and (namespace is null or namespace is t.namespace)
451 if t.ns isnt NS_HTML t.name isnt 'optgroup' and t.name isnt 'option'
454 # this checks for a particular element, not by name
455 el_is_in_scope = (el) ->
459 if standard_scopers[t.name] is t.namespace
463 clear_to_table_stopers = {
468 clear_stack_to_table_context = ->
470 if clear_to_table_stopers[open_els[0].name]?
474 clear_to_table_body_stopers = {
481 clear_stack_to_table_body_context = ->
483 if clear_to_table_body_stopers[open_els[0].name]?
487 clear_to_table_row_stopers = {
492 clear_stack_to_table_row_context = ->
494 if clear_to_table_row_stopers[open_els[0].name]?
498 clear_afe_to_marker = ->
501 if el.type is TYPE_AFE_MARKER
505 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
506 reset_insertion_mode = ->
507 # 1. Let last be false.
509 # 2. Let node be the last node in the stack of open elements.
511 node = open_els[node_i]
512 # 3. Loop: If node is the first node in the stack of open elements,
513 # then set last to true, and, if the parser was originally created as
514 # part of the HTML fragment parsing algorithm (fragment case) set node
515 # to the context element.
517 if node_i is open_els.length - 1
519 # fixfull (fragment case)
521 # 4. If node is a select element, run these substeps:
522 if node.name is 'select'
523 # 1. If last is true, jump to the step below labeled done.
525 # 2. Let ancestor be node.
528 # 3. Loop: If ancestor is the first node in the stack of
529 # open elements, jump to the step below labeled done.
531 if ancestor_i is open_els.length - 1
533 # 4. Let ancestor be the node before ancestor in the stack
536 ancestor = open_els[ancestor_i]
537 # 5. If ancestor is a template node, jump to the step below
539 if ancestor.name is 'template'
541 # 6. If ancestor is a table node, switch the insertion mode
542 # to "in select in table" and abort these steps.
543 if ancestor.name is 'table'
544 insertion_mode = ins_mode_in_select_in_table
546 # 7. Jump back to the step labeled loop.
547 # 8. Done: Switch the insertion mode to "in select" and abort
549 insertion_mode = ins_mode_in_select
551 # 5. If node is a td or th element and last is false, then switch
552 # the insertion mode to "in cell" and abort these steps.
553 if (node.name is 'td' or node.name is 'th') and last is false
554 insertion_mode = ins_mode_in_cell
556 # 6. If node is a tr element, then switch the insertion mode to "in
557 # row" and abort these steps.
559 insertion_mode = ins_mode_in_row
561 # 7. If node is a tbody, thead, or tfoot element, then switch the
562 # insertion mode to "in table body" and abort these steps.
563 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
564 insertion_mode = ins_mode_in_table_body
566 # 8. If node is a caption element, then switch the insertion mode
567 # to "in caption" and abort these steps.
568 if node.name is 'caption'
569 insertion_mode = ins_mode_in_caption
571 # 9. If node is a colgroup element, then switch the insertion mode
572 # to "in column group" and abort these steps.
573 if node.name is 'colgroup'
574 insertion_mode = ins_mode_in_column_group
576 # 10. If node is a table element, then switch the insertion mode to
577 # "in table" and abort these steps.
578 if node.name is 'table'
579 insertion_mode = ins_mode_in_table
581 # 11. If node is a template element, then switch the insertion mode
582 # to the current template insertion mode and abort these steps.
583 # fixfull (template insertion mode stack)
585 # 12. If node is a head element and last is true, then switch the
586 # insertion mode to "in body" ("in body"! not "in head"!) and abort
587 # these steps. (fragment case)
588 if node.name is 'head' and last
589 insertion_mode = ins_mode_in_body
591 # 13. If node is a head element and last is false, then switch the
592 # insertion mode to "in head" and abort these steps.
593 if node.name is 'head' and last is false
594 insertion_mode = ins_mode_in_head
596 # 14. If node is a body element, then switch the insertion mode to
597 # "in body" and abort these steps.
598 if node.name is 'body'
599 insertion_mode = ins_mode_in_body
601 # 15. If node is a frameset element, then switch the insertion mode
602 # to "in frameset" and abort these steps. (fragment case)
603 if node.name is 'frameset'
604 insertion_mode = ins_mode_in_frameset
606 # 16. If node is an html element, run these substeps:
607 if node.name is 'html'
608 # 1. If the head element pointer is null, switch the insertion
609 # mode to "before head" and abort these steps. (fragment case)
610 # fixfull (fragment case)
612 # 2. Otherwise, the head element pointer is not null, switch
613 # the insertion mode to "after head" and abort these steps.
614 insertion_mode = ins_mode_in_body # FIXME fixfull
616 # 17. If last is true, then switch the insertion mode to "in body"
617 # and abort these steps. (fragment case)
619 insertion_mode = ins_mode_in_body
621 # 18. Let node now be the node before node in the stack of open
624 node = open_els[node_i]
625 # 19. Return to the step labeled loop.
629 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
630 adjusted_current_node = ->
631 if open_els.length is 1 and flag_fragment_parsing
632 return context_element
635 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
636 # this implementation is structured (mostly) as described at the link above.
637 # capitalized comments are the "labels" described at the link above.
638 reconstruct_active_formatting_elements = ->
639 return if afe.length is 0
640 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
645 if i is afe.length - 1
648 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
653 el = afe[i].shallow_clone()
654 tree_insert_element el
659 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
660 # adoption agency algorithm
662 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
663 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
664 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
665 adoption_agency = (subject) ->
666 debug_log "adoption_agency()"
667 debug_log "tree: #{serialize_els doc.children, false, true}"
668 debug_log "open_els: #{serialize_els open_els, true, true}"
669 debug_log "afe: #{serialize_els afe, true, true}"
670 if open_els[0].name is subject
673 # remove it from the list of active formatting elements (if found)
678 debug_log "aaa: starting off with subject on top of stack, exiting"
685 # 5. Let formatting element be the last element in the list of
686 # active formatting elements that: is between the end of the list
687 # and the last scope marker in the list, if any, or the start of
688 # the list otherwise, and has the tag name subject.
690 for t, fe_of_afe in afe
691 if t.type is TYPE_AFE_MARKER
696 # If there is no such element, then abort these steps and instead
697 # act as described in the "any other end tag" entry above.
699 debug_log "aaa: fe not found in afe"
700 in_body_any_other_end_tag subject
702 # 6. If formatting element is not in the stack of open elements,
703 # then this is a parse error; remove the element from the list, and
706 for t, fe_of_open_els in open_els
711 debug_log "aaa: fe not found in open_els"
713 # "remove it from the list" must mean afe, since it's not in open_els
714 afe.splice fe_of_afe, 1
716 # 7. If formatting element is in the stack of open elements, but
717 # the element is not in scope, then this is a parse error; abort
719 unless el_is_in_scope fe
720 debug_log "aaa: fe not in scope"
723 # 8. If formatting element is not the current node, this is a parse
724 # error. (But do not abort these steps.)
725 unless open_els[0] is fe
728 # 9. Let furthest block be the topmost node in the stack of open
729 # elements that is lower in the stack than formatting element, and
730 # is an element in the special category. There might not be one.
732 fb_of_open_els = null
739 # and continue, to see if there's one that's more "topmost"
740 # 10. If there is no furthest block, then the UA must first pop all
741 # the nodes from the bottom of the stack of open elements, from the
742 # current node up to and including formatting element, then remove
743 # formatting element from the list of active formatting elements,
744 # and finally abort these steps.
746 debug_log "aaa: no fb"
750 afe.splice fe_of_afe, 1
752 # 11. Let common ancestor be the element immediately above
753 # formatting element in the stack of open elements.
754 ca = open_els[fe_of_open_els + 1] # common ancestor
756 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
757 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
758 bookmark = new_aaa_bookmark()
761 afe.splice i, 0, bookmark
763 node = last_node = fb
767 # 3. Let node be the element immediately above node in the
768 # stack of open elements, or if node is no longer in the stack
769 # of open elements (e.g. because it got removed by this
770 # algorithm), the element that was immediately above node in
771 # the stack of open elements before node was removed.
775 node_next = open_els[i + 1]
777 node = node_next ? node_above
778 debug_log "inner loop #{inner}"
779 debug_log "tree: #{serialize_els doc.children, false, true}"
780 debug_log "open_els: #{serialize_els open_els, true, true}"
781 debug_log "afe: #{serialize_els afe, true, true}"
782 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
783 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
784 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
785 debug_log "node: #{node.serialize true, true}"
786 # TODO make sure node_above gets re-set if/when node is removed from open_els
788 # 4. If node is formatting element, then go to the next step in
789 # the overall algorithm.
793 # 5. If inner loop counter is greater than three and node is in
794 # the list of active formatting elements, then remove node from
795 # the list of active formatting elements.
801 debug_log "max out inner"
806 # 6. If node is not in the list of active formatting elements,
807 # then remove node from the stack of open elements and then go
808 # back to the step labeled inner loop.
810 debug_log "not in afe"
813 node_above = open_els[i + 1]
817 debug_log "the bones"
818 # 7. create an element for the token for which the element node
819 # was created, in the HTML namespace, with common ancestor as
820 # the intended parent; replace the entry for node in the list
821 # of active formatting elements with an entry for the new
822 # element, replace the entry for node in the stack of open
823 # elements with an entry for the new element, and let node be
825 new_node = node.shallow_clone()
829 debug_log "replaced in afe"
833 node_above = open_els[i + 1]
834 open_els[i] = new_node
835 debug_log "replaced in open_els"
838 # 8. If last node is furthest block, then move the
839 # aforementioned bookmark to be immediately after the new node
840 # in the list of active formatting elements.
845 debug_log "removed bookmark"
849 # "after" means lower
850 afe.splice i, 0, bookmark # "after as <-
851 debug_log "placed bookmark after node"
852 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
854 # 9. Insert last node into node, first removing it from its
855 # previous parent node if any.
857 debug_log "last_node has parent"
858 for c, i in last_node.parent.children
860 debug_log "removing last_node from parent"
861 last_node.parent.children.splice i, 1
863 node.children.push last_node
864 last_node.parent = node
865 # 10. Let last node be node.
868 # 11. Return to the step labeled inner loop.
869 # 14. Insert whatever last node ended up being in the previous step
870 # at the appropriate place for inserting a node, but using common
871 # ancestor as the override target.
873 # In the case where fe is immediately followed by fb:
874 # * inner loop exits out early (node==fe)
876 # * last_node is still in the tree (not a duplicate)
878 debug_log "FEFIRST? last_node has parent"
879 for c, i in last_node.parent.children
881 debug_log "removing last_node from parent"
882 last_node.parent.children.splice i, 1
885 debug_log "after aaa inner loop"
886 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
887 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
888 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
889 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
890 debug_log "tree: #{serialize_els doc.children, false, true}"
895 # can't use standard insert token thing, because it's already in
896 # open_els and must stay at its current position in open_els
897 dest = adjusted_insertion_location ca
898 dest[0].children.splice dest[1], 0, last_node
899 last_node.parent = dest[0]
902 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
903 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
904 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
905 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
906 debug_log "tree: #{serialize_els doc.children, false, true}"
908 # 15. Create an element for the token for which formatting element
909 # was created, in the HTML namespace, with furthest block as the
911 new_element = fe.shallow_clone() # FIXME intended parent thing
912 # 16. Take all of the child nodes of furthest block and append them
913 # to the element created in the last step.
914 while fb.children.length
915 t = fb.children.shift()
916 t.parent = new_element
917 new_element.children.push t
918 # 17. Append that new element to furthest block.
919 new_element.parent = fb
920 fb.children.push new_element
921 # 18. Remove formatting element from the list of active formatting
922 # elements, and insert the new element into the list of active
923 # formatting elements at the position of the aforementioned
933 # 19. Remove formatting element from the stack of open elements,
934 # and insert the new element into the stack of open elements
935 # immediately below the position of furthest block in that stack.
942 open_els.splice i, 0, new_element
944 # 20. Jump back to the step labeled outer loop.
945 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
946 debug_log "tree: #{serialize_els doc.children, false, true}"
947 debug_log "open_els: #{serialize_els open_els, true, true}"
948 debug_log "afe: #{serialize_els afe, true, true}"
951 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
953 generate_implied_end_tags 'p' # arg is exception
954 if open_els[0].name isnt 'p'
956 while open_els.length > 1 # just in case
957 el = open_els.shift()
960 close_p_if_in_button_scope = ->
961 if is_in_button_scope 'p'
964 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
965 # aka insert_a_character = (t) ->
966 insert_character = (t) ->
967 dest = adjusted_insertion_location()
968 # fixfull check for Document node
970 prev = dest[0].children[dest[1] - 1]
971 if prev.type is TYPE_TEXT
974 dest[0].children.splice dest[1], 0, t
977 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
978 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Compute the "appropriate place for inserting a node".
# override_target: optional node to use instead of the current node.
# Returns a two element array: [target, target_i], where target_i is an index
# into target.children.
# FIX: the "last template" branch used to assign ``target = template``; no
# variable of that name appears in the visible code, and the surrounding
# condition tests ``last_template``, so that is what is used now.
979 adjusted_insertion_location = (override_target = null) ->
980 # 1. If there was an override target specified, then let target be the
983 target = override_target
984 else # Otherwise, let target be the current node.
986 # 2. Determine the adjusted insertion location using the first matching
987 # steps from the following list:
989 # If foster parenting is enabled and target is a table, tbody, tfoot,
990 # thead, or tr element Foster parenting happens when content is
991 # misnested in tables.
992 if flag_foster_parenting and foster_parenting_targets[target.name]
993 loop # once. this is here so we can ``break`` to "abort these substeps"
994 # 1. Let last template be the last template element in the
995 # stack of open elements, if any.
997 last_template_i = null
998 for el, i in open_els
999 if el.name is 'template'
1003 # 2. Let last table be the last table element in the stack of
1004 # open elements, if any.
1007 for el, i in open_els
1008 if el.name is 'table'
1012 # 3. If there is a last template and either there is no last
1013 # table, or there is one, but last template is lower (more
1014 # recently added) than last table in the stack of open
1015 # elements, then: let adjusted insertion location be inside
1016 # last template's template contents, after its last child (if
1017 # any), and abort these substeps.
1018 if last_template and (last_table is null or last_template_i < last_table_i)
1019 target = last_template # fixfull should be its contents
1020 target_i = target.children.length
1022 # 4. If there is no last table, then let adjusted insertion
1023 # location be inside the first element in the stack of open
1024 # elements (the html element), after its last child (if any),
1025 # and abort these substeps. (fragment case)
1026 if last_table is null
1028 target = open_els[open_els.length - 1]
1029 target_i = target.children.length
1030 # 5. If last table has a parent element, then let adjusted
1031 # insertion location be inside last table's parent element,
1032 # immediately before last table, and abort these substeps.
1033 if last_table.parent?
1034 for c, i in last_table.parent.children
1036 target = last_table.parent
1040 # 6. Let previous element be the element immediately above last
1041 # table in the stack of open elements.
1043 # huh? how could it not have a parent?
1044 previous_element = open_els[last_table_i + 1]
1045 # 7. Let adjusted insertion location be inside previous
1046 # element, after its last child (if any).
1047 target = previous_element
1048 target_i = target.children.length
1049 # Note: These steps are involved in part because it's possible
1050 # for elements, the table element in this case in particular,
1051 # to have been moved by a script around in the DOM, or indeed
1052 # removed from the DOM entirely, after the element was inserted
1054 break # don't really loop
1056 # Otherwise Let adjusted insertion location be inside target, after
1057 # its last child (if any).
1058 target_i = target.children.length
1060 # 3. If the adjusted insertion location is inside a template element,
1061 # let it instead be inside the template element's template contents,
1062 # after its last child (if any).
1063 # fixfull (template)
1065 # 4. Return the adjusted insertion location.
1066 return [target, target_i]
1068 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1069 # aka create_an_element_for_token
# Build a Node of TYPE_TAG from tag token ``t``.
# The token is mutated (its type becomes TYPE_TAG) and is stored on the new
# Node as ``token``. Attributes are gathered into a hash keyed by a[0] with
# value a[1] while ``t.attrs_a`` is non-empty.
# NOTE(review): ``intended_parent`` is not referenced on any visible line of
# this function -- confirm whether it is needed.
1070 token_to_element = (t, namespace, intended_parent) ->
1071 t.type = TYPE_TAG # not TYPE_START_TAG
1072 # convert attributes into a hash
1074 while t.attrs_a.length
1076 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1077 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1079 # TODO 2. If the newly created element has an xmlns attribute in the
1080 # XMLNS namespace whose value is not exactly the same as the element's
1081 # namespace, that is a parse error. Similarly, if the newly created
1082 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1083 # value is not the XLink Namespace, that is a parse error.
1085 # fixfull: the spec says stuff about form pointers and ownerDocument
1089 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1090 insert_foreign_element = (token, namespace) ->
1091 ail = adjusted_insertion_location()
1094 el = token_to_element token, namespace, ail_el
1095 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1097 ail_el.children.splice ail_i, 0, el
1100 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1101 insert_html_element = insert_foreign_element # (token, namespace) ->
1103 # FIXME read implement "foster parenting" part
1104 # FIXME read spec, do this right
1105 # FIXME implement the override target thing
1106 # note: this assumes it's an open tag
1107 # FIXME what part of the spec is this?
1108 # TODO look through all callers of this, and see what they should really be doing.
1109 # eg probably insert_html_element for tokens
# Insert ``el`` (a Node, or a start tag token which is first converted with
# token_to_element) at the adjusted insertion location.
# override_target: passed through to adjusted_insertion_location.
# namespace: optional namespace for the element.
# FIX: when the element has no namespace it now takes the namespace of the
# node it is inserted into. The old line assigned the ``namespace`` parameter
# from ``dest.namespace``; ``dest`` is the [node, index] pair, and the value
# was never used afterwards, so the guard had no effect.
1110 tree_insert_element = (el, override_target = null, namespace = null) ->
1112 el.namespace = namespace
1113 dest = adjusted_insertion_location override_target
1114 if el.type is TYPE_START_TAG # means it's a "token"
1115 el = token_to_element el, namespace, dest[0]
1116 unless el.namespace?
1117 el.namespace = dest[0].namespace
1118 # fixfull: Document nodes sometimes can't accept more children
1119 dest[0].children.splice dest[1], 0, el
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# Splice the comment token ``t`` into a node's children.
# ``position`` is an optional [node, index_within_children] pair; when it is
# null or omitted the adjusted insertion location is used instead.
insert_comment = (t, position = null) ->
  [parent, index] = position ? adjusted_insertion_location()
  parent.children.splice index, 0, t
# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Shared body of the two generic text element parsing algorithms: insert an
# element for start tag token ``t``, point the tokenizer at ``text_tok_state``,
# remember the current insertion mode and switch to the "text" insertion mode.
parse_generic_text = (t, text_tok_state) ->
  insert_html_element t
  tok_state = text_tok_state
  original_insertion_mode = insertion_mode
  insertion_mode = ins_mode_text
# raw text flavour: the tokenizer continues in tok_state_rawtext
parse_generic_raw_text = (t) ->
  parse_generic_text t, tok_state_rawtext
# RCDATA flavour: the tokenizer continues in tok_state_rcdata
parse_generic_rcdata_text = (t) ->
  parse_generic_text t, tok_state_rcdata
1143 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1144 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1145 generate_implied_end_tags = (except = null) ->
1146 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1149 # 8.2.5.4 The rules for parsing tokens in HTML content
1150 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1152 # 8.2.5.4.1 The "initial" insertion mode
1153 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1154 ins_mode_initial = (t) ->
1157 if t.type is TYPE_COMMENT
1158 # fixfull this is supposed to be "the last child of the document object"
1161 if t.type is TYPE_DOCTYPE
1165 insertion_mode = ins_mode_before_html
1168 #fixfull (iframe, quirks)
1169 insertion_mode = ins_mode_before_html
1170 insertion_mode t # reprocess the token
1173 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1174 ins_mode_before_html = (t) ->
1175 if t.type is TYPE_DOCTYPE
1178 if t.type is TYPE_COMMENT
1183 if t.type is TYPE_START_TAG and t.name is 'html'
1184 el = token_to_element t, NS_HTML, doc
1185 open_els.unshift(el)
1186 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1187 insertion_mode = ins_mode_before_head
1189 if t.type is TYPE_END_TAG
1190 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1191 # fall through to "anything else"
1196 html_tok = new_open_tag 'html'
1197 el = token_to_element html_tok, NS_HTML, doc
1198 doc.children.push el
1200 # ?fixfull browsing context
1201 insertion_mode = ins_mode_before_head
1205 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1206 ins_mode_before_head = (t) ->
1209 if t.type is TYPE_COMMENT
1212 if t.type is TYPE_DOCTYPE
1215 if t.type is TYPE_START_TAG and t.name is 'html'
1218 if t.type is TYPE_START_TAG and t.name is 'head'
1219 el = insert_html_element t
1220 head_element_pointer = el
1221 insertion_mode = ins_mode_in_head
1222 if t.type is TYPE_END_TAG
1223 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1224 # fall through to Anything else below
1229 head_tok = new_open_tag 'head'
1230 el = insert_html_element head_tok
1231 head_element_pointer = el
1232 insertion_mode = ins_mode_in_head
1233 insertion_mode t # reprocess current token
1235 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1236 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1237 open_els.shift() # spec says this will be a 'head' node
1238 insertion_mode = ins_mode_after_head
# Token handler for the "in head" insertion mode; ``t`` is the token to process.
# FIXES:
#  * <title> now calls parse_generic_rcdata_text (the name that is actually
#    defined) instead of parse_generic_rcdata_element
#  * <noscript> with scripting off switches to ins_mode_in_head_noscript
#    (was in_head_noscript, which is not defined)
#  * </template> really calls generate_implied_end_tags() (a bare name is a
#    no-op expression in CoffeeScript)
#  * <script> passes the insertion node (ail[0]) as intended parent, not the
#    whole [node, index] pair
1240 ins_mode_in_head = (t) ->
1241 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1244 if t.type is TYPE_COMMENT
1247 if t.type is TYPE_DOCTYPE
1250 if t.type is TYPE_START_TAG and t.name is 'html'
1253 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1254 el = insert_html_element t
1256 t.acknowledge_self_closing()
1258 if t.type is TYPE_START_TAG and t.name is 'meta'
1259 el = insert_html_element t
1261 t.acknowledge_self_closing()
1262 # fixfull encoding stuff
1264 if t.type is TYPE_START_TAG and t.name is 'title'
1265 parse_generic_rcdata_text t
1267 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1268 parse_generic_raw_text t
1270 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1271 insert_html_element t
1272 insertion_mode = ins_mode_in_head_noscript # FIXME implement
1274 if t.type is TYPE_START_TAG and t.name is 'script'
1275 ail = adjusted_insertion_location()
1276 el = token_to_element t, NS_HTML, ail[0]
1277 el.flag_parser_inserted true # FIXME implement
1278 # fixfull fragment case
1279 ail[0].children.splice ail[1], 0, el
1281 tok_state = tok_state_script_data
1282 original_insertion_mode = insertion_mode # make sure orig... is defined
1283 insertion_mode = ins_mode_text # FIXME implement
1285 if t.type is TYPE_END_TAG and t.name is 'head'
1286 open_els.shift() # will be a head element... spec says so
1287 insertion_mode = ins_mode_after_head
1289 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1290 ins_mode_in_head_else t
1292 if t.type is TYPE_START_TAG and t.name is 'template'
1293 insert_html_element t
1295 flag_frameset_ok = false
1296 insertion_mode = ins_mode_in_template
1297 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1299 if t.type is TYPE_END_TAG and t.name is 'template'
1300 if template_tag_is_open()
1301 generate_implied_end_tags()
1302 if open_els[0].name isnt 'template'
1305 el = open_els.shift()
1306 if el.name is 'template'
1308 clear_afe_to_marker()
1309 template_insertion_modes.shift()
1310 reset_insertion_mode()
1314 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1317 ins_mode_in_head_else t
1319 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1320 ins_mode_in_head_noscript = (t) ->
1322 console.log "ins_mode_in_head_noscript unimplemented"
1324 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# Fallback branch of the "after head" insertion mode: insert an element for a
# freshly made 'body' open tag, switch to the "in body" insertion mode and
# hand token ``t`` to it.
ins_mode_after_head_else = (t) ->
  insert_html_element new_open_tag 'body'
  insertion_mode = ins_mode_in_body
  insertion_mode t # reprocess token under the new mode
# Token handler for the "after head" insertion mode; ``t`` is the token to
# process.
# FIX: the search for head_element_pointer in open_els used ``for el, i of``,
# which iterates keys (el was the index, i the element), so the comparison
# with head_element_pointer could never match. It now uses ``for el, i in``
# so that el is the element and i its index.
1331 ins_mode_after_head = (t) ->
1335 if t.type is TYPE_COMMENT
1338 if t.type is TYPE_DOCTYPE
1341 if t.type is TYPE_START_TAG and t.name is 'html'
1344 if t.type is TYPE_START_TAG and t.name is 'body'
1345 insert_html_element t
1346 flag_frameset_ok = false
1347 insertion_mode = ins_mode_in_body
1349 if t.type is TYPE_START_TAG and t.name is 'frameset'
1350 insert_html_element t
1351 insertion_mode = ins_mode_in_frameset
1353 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1355 open_els.unshift head_element_pointer
1357 for el, i in open_els
1358 if el is head_element_pointer
1359 open_els.splice i, 1
1361 console.log "warning: 23904 couldn't find head element in open_els"
1363 if t.type is TYPE_END_TAG and t.name is 'template'
1366 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1367 ins_mode_after_head_else t
1369 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1373 ins_mode_after_head_else t
1375 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1376 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1377 for node, i in open_els
1378 if node.name is name # FIXME check namespace too
1379 generate_implied_end_tags name # arg is exception
1380 parse_error() unless i is 0
1385 if special_elements[node.name]? # FIXME check namespac too
1388 ins_mode_in_body = (t) ->
1394 when "\t", "\u000a", "\u000c", "\u000d", ' '
1395 reconstruct_active_formatting_elements()
1398 reconstruct_active_formatting_elements()
1400 flag_frameset_ok = false
1409 return if template_tag_is_open()
1410 root_attrs = open_els[open_els.length - 1].attrs
1412 root_attrs[k] = v unless root_attrs[k]?
1413 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1414 # FIXME also do this for </template> (end tag)
1415 return ins_mode_in_head t
1422 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1423 close_p_if_in_button_scope()
1424 insert_html_element t
1425 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1426 close_p_if_in_button_scope()
1427 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1430 insert_html_element t
1431 # TODO lots more to implement here
1433 # If the list of active formatting elements
1434 # contains an a element between the end of the list and
1435 # the last marker on the list (or the start of the list
1436 # if there is no marker on the list), then this is a
1437 # parse error; run the adoption agency algorithm for
1438 # the tag name "a", then remove that element from the
1439 # list of active formatting elements and the stack of
1440 # open elements if the adoption agency algorithm didn't
1441 # already remove it (it might not have if the element
1442 # is not in table scope).
1445 if el.type is TYPE_AFE_MARKER
1455 for el, i in open_els
1457 open_els.splice i, 1
1458 reconstruct_active_formatting_elements()
1459 el = insert_html_element t
1461 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1462 reconstruct_active_formatting_elements()
1463 el = insert_html_element t
1466 # fixfull quirksmode thing
1467 close_p_if_in_button_scope()
1468 insert_html_element t
1469 insertion_mode = ins_mode_in_table
1470 # TODO lots more to implement here
1471 else # any other start tag
1472 reconstruct_active_formatting_elements()
1473 insert_html_element t
1476 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1477 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1480 unless ok_tags[t.name]?
1483 # TODO stack of template insertion modes thing
1488 unless is_in_scope 'body'
1491 # TODO implement parse error and move to tree_after_body
1493 unless is_in_scope 'body' # weird, but it's what the spec says
1496 # TODO implement parse error and move to tree_after_body, reprocess
1497 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1498 unless is_in_scope t.name, NS_HTML
1501 generate_implied_end_tags()
1502 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1505 el = open_els.shift()
1506 if el.name is t.name and el.namespace is NS_HTML
1508 # TODO lots more close tags to implement here
1510 unless is_in_button_scope 'p'
1512 insert_html_element new_open_tag 'p'
1514 # TODO lots more close tags to implement here
1515 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1516 adoption_agency t.name
1517 # TODO lots more close tags to implement here
1519 in_body_any_other_end_tag t.name
1522 ins_mode_in_table_else = (t) ->
1524 flag_foster_parenting = true # FIXME
1526 flag_foster_parenting = false
1527 can_in_table = { # FIXME do this inline like everywhere else
1535 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1536 ins_mode_text = (t) ->
1537 if t.type is TYPE_TEXT
1540 if t.type is TYPE_EOF
1542 if open_els[0].name is 'script'
1543 open_els[0].flag 'already started', true
1545 insertion_mode = original_insertion_mode
1548 if t.type is TYPE_END_TAG and t.name is 'script'
1550 insertion_mode = original_insertion_mode
1551 # fixfull the spec seems to assume that I'm going to run the script
1552 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1554 if t.type is TYPE_END_TAG
1556 insertion_mode = original_insertion_mode
1558 console.log 'warning: end of ins_mode_text reached'
1560 # the functions below implement the tokenizer states described here:
1561 # http://www.w3.org/TR/html5/syntax.html#tokenization
1563 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1564 ins_mode_in_table = (t) ->
1567 if can_in_table[t.name]
1568 original_insertion_mode = insertion_mode
1569 insertion_mode = ins_mode_in_table_text
1572 ins_mode_in_table_else t
1580 clear_stack_to_table_context()
1582 insert_html_element t
1583 insertion_mode = ins_mode_in_caption
1585 clear_stack_to_table_context()
1586 insert_html_element t
1587 insertion_mode = ins_mode_in_column_group
1589 clear_stack_to_table_context()
1590 insert_html_element new_open_tag 'colgroup'
1591 insertion_mode = ins_mode_in_column_group
1593 when 'tbody', 'tfoot', 'thead'
1594 clear_stack_to_table_context()
1595 insert_html_element t
1596 insertion_mode = ins_mode_in_table_body
1597 when 'td', 'th', 'tr'
1598 clear_stack_to_table_context()
1599 insert_html_element new_open_tag 'tbody'
1600 insertion_mode = ins_mode_in_table_body
1604 if is_in_table_scope 'table'
1606 el = open_els.shift()
1607 if el.name is 'table'
1609 reset_insertion_mode()
1611 when 'style', 'script', 'template'
1614 if token_is_input_hidden t
1615 ins_mode_in_table_else t
1618 el = insert_html_element t
1620 t.acknowledge_self_closing()
1623 if form_element_pointer?
1625 if template_tag_is_open()
1627 form_element_pointer = insert_html_element t
1630 ins_mode_in_table_else t
1634 if is_in_table_scope 'table'
1636 el = open_els.shift()
1637 if el.name is 'table'
1639 reset_insertion_mode()
1642 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1647 ins_mode_in_table_else t
1651 ins_mode_in_table_else t
1654 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Token handler for the "in table text" insertion mode. Text tokens are
# collected in pending_table_character_tokens; the collected tokens are later
# either inserted with insert_character or passed one by one to the "in table"
# fallback, after which the list is emptied and the original insertion mode is
# restored.
# FIX: the fallback call used the name ins_mode_table_else; the function
# defined in this file is ins_mode_in_table_else.
1655 ins_mode_in_table_text = (t) ->
1656 if t.type is TYPE_TEXT and t.text is "\u0000"
1657 # huh? I thought the tokenizer didn't emit these
1660 if t.type is TYPE_TEXT
1661 pending_table_character_tokens.push t
1665 for old in pending_table_character_tokens
1666 unless is_space_tok old
1670 for old in pending_table_character_tokens
1671 insert_character old
1673 for old in pending_table_character_tokens
1674 ins_mode_in_table_else old
1675 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1676 insertion_mode = original_insertion_mode
1679 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Token handler for the "in caption" insertion mode; ``t`` is the token to
# process.
# FIX: after the caption has been popped the parser must switch to the
# "in table" handler, which is named ins_mode_in_table in this file. Both
# branches used to assign the undefined name ``in_table``.
1680 ins_mode_in_caption = (t) ->
1681 if t.type is TYPE_END_TAG and t.name is 'caption'
1682 if is_in_table_scope 'caption'
1683 generate_implied_end_tags()
1684 if open_els[0].name isnt 'caption'
1687 el = open_els.shift()
1688 if el.name is 'caption'
1690 clear_afe_to_marker()
1691 insertion_mode = ins_mode_in_table
1696 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1698 if is_in_table_scope 'caption'
1700 el = open_els.shift()
1701 if el.name is 'caption'
1703 clear_afe_to_marker()
1704 insertion_mode = ins_mode_in_table
1706 # else fragment case
1708 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1714 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1715 ins_mode_in_column_group = (t) ->
1719 if t.type is TYPE_COMMENT
1722 if t.type is TYPE_DOCTYPE
1725 if t.type is TYPE_START_TAG and t.name is 'html'
1728 if t.type is TYPE_START_TAG and t.name is 'col'
1729 el = insert_html_element t
1731 t.acknowledge_self_closing()
1733 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1734 if open_els[0].name is 'colgroup'
1736 insertion_mode = ins_mode_in_table
1740 if t.type is TYPE_END_TAG and t.name is 'col'
1743 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1746 if t.type is TYPE_EOF
1750 if open_els[0].name isnt 'colgroup'
1754 insertion_mode = ins_mode_in_table
1758 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1759 ins_mode_in_table_body = (t) ->
1760 if t.type is TYPE_START_TAG and t.name is 'tr'
1761 clear_stack_to_table_body_context()
1762 insert_html_element t
1763 insertion_mode = ins_mode_in_row
1765 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1767 clear_stack_to_table_body_context()
1768 insert_html_element new_open_tag 'tr'
1769 insertion_mode = ins_mode_in_row
1772 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1773 unless is_in_table_scope t.name # fixfull check namespace
1776 clear_stack_to_table_body_context()
1778 insertion_mode = ins_mode_in_table
1780 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1783 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1786 if table_scopers[el.name]
1791 clear_stack_to_table_body_context()
1793 insertion_mode = ins_mode_in_table
1796 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1802 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1803 ins_mode_in_row = (t) ->
1804 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1805 clear_stack_to_table_row_context()
1806 insert_html_element t
1807 insertion_mode = ins_mode_in_cell
1810 if t.type is TYPE_END_TAG and t.name is 'tr'
1811 if is_in_table_scope 'tr'
1812 clear_stack_to_table_row_context()
1814 insertion_mode = ins_mode_in_table_body
1818 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1819 if is_in_table_scope 'tr'
1820 clear_stack_to_table_row_context()
1822 insertion_mode = ins_mode_in_table_body
1827 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1828 if is_in_table_scope t.name # fixfull namespace
1829 if is_in_table_scope 'tr'
1830 clear_stack_to_table_row_context()
1832 insertion_mode = ins_mode_in_table_body
1837 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1843 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Generates implied end tags, shifts elements off open_els (the loop compares
# each shifted element's name with 'td' / 'th'), clears the active formatting
# elements up to the last marker and switches to the "in row" insertion mode.
# FIX: the current-node check compared the node object itself with 'th'
# (always false); it now compares the node's name, like the 'td' half does.
1845 generate_implied_end_tags()
1846 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1849 el = open_els.shift()
1850 if el.name is 'td' or el.name is 'th'
1852 clear_afe_to_marker()
1853 insertion_mode = ins_mode_in_row
1855 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1856 ins_mode_in_cell = (t) ->
1857 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1858 if is_in_table_scope t.name
1859 generate_implied_end_tags()
1860 if open_els[0].name isnt t.name
1863 el = open_els.shift()
1864 if el.name is t.name
1866 clear_afe_to_marker()
1867 insertion_mode = ins_mode_in_row
1871 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1874 if el.name is 'td' or el.name is 'th'
1877 if table_scopers[el.name]
1885 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1888 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1889 if is_in_table_scope t.name # fixfull namespace
1898 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1899 ins_mode_in_select = (t) ->
1900 if t.type is TYPE_TEXT and t.text is "\u0000"
1903 if t.type is TYPE_TEXT
1906 if t.type is TYPE_COMMENT
1909 if t.type is TYPE_DOCTYPE
1912 if t.type is TYPE_START_TAG and t.name is 'html'
1915 if t.type is TYPE_START_TAG and t.name is 'option'
1916 if open_els[0].name is 'option'
1918 insert_html_element t
1920 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1921 if open_els[0].name is 'option'
1923 if open_els[0].name is 'optgroup'
1925 insert_html_element t
1927 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1928 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1930 if open_els[0].name is 'optgroup'
1935 if t.type is TYPE_END_TAG and t.name is 'option'
1936 if open_els[0].name is 'option'
1941 if t.type is TYPE_END_TAG and t.name is 'select'
1942 if is_in_select_scope 'select'
1944 el = open_els.shift()
1945 if el.name is 'select'
1947 reset_insertion_mode()
1951 if t.type is TYPE_START_TAG and t.name is 'select'
1954 el = open_els.shift()
1955 if el.name is 'select'
1957 reset_insertion_mode()
1958 # spec says that this is the same as </select> but it doesn't say
1959 # to check scope first
1961 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1963 if is_in_select_scope 'select'
1966 el = open_els.shift()
1967 if el.name is 'select'
1969 reset_insertion_mode()
1972 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1975 if t.type is TYPE_EOF
1982 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1983 ins_mode_in_select_in_table = (t) ->
1984 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1987 el = open_els.shift()
1988 if el.name is 'select'
1990 reset_insertion_mode()
1993 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1995 unless is_in_table_scope t.name, NS_HTML
1998 el = open_els.shift()
1999 if el.name is 'select'
2001 reset_insertion_mode()
2005 ins_mode_in_select t
2008 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2009 ins_mode_in_template = (t) ->
2010 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2013 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2016 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2017 template_insertion_modes.shift()
2018 template_insertion_modes.unshift ins_mode_in_table
2019 insertion_mode = ins_mode_in_table
2022 if t.type is TYPE_START_TAG and t.name is 'col'
2023 template_insertion_modes.shift()
2024 template_insertion_modes.unshift ins_mode_in_column_group
2025 insertion_mode = ins_mode_in_column_group
2028 if t.type is TYPE_START_TAG and t.name is 'tr'
2029 template_insertion_modes.shift()
2030 template_insertion_modes.unshift ins_mode_in_table_body
2031 insertion_mode = ins_mode_in_table_body
2034 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2035 template_insertion_modes.shift()
2036 template_insertion_modes.unshift ins_mode_in_row
2037 insertion_mode = ins_mode_in_row
2040 if t.type is TYPE_START_TAG
2041 template_insertion_modes.shift()
2042 template_insertion_modes.unshift ins_mode_in_body
2043 insertion_mode = ins_mode_in_body
2046 if t.type is TYPE_END_TAG
2050 unless template_tag_is_open()
2055 el = open_els.shift()
2056 if el.name is 'template' # fixfull check namespace
2058 clear_afe_to_marker()
2059 template_insertion_modes.shift()
2060 reset_insertion_mode()
2063 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2064 ins_mode_after_body = (t) ->
2068 if t.type is TYPE_COMMENT
2069 insert_comment t, [open_els[0], open_els[0].children.length]
2071 if t.type is TYPE_DOCTYPE
2074 if t.type is TYPE_START_TAG and t.name is 'html'
2077 if t.type is TYPE_END_TAG and t.name is 'html'
2078 # fixfull fragment case
2079 insertion_mode = ins_mode_after_after_body
2081 if t.type is TYPE_EOF
2086 insertion_mode = ins_mode_in_body
2089 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Token handler for the "in frameset" insertion mode; ``t`` is the token to
# process.
# FIX: the 'noframes' branch tested ``TYPE_START TAG`` (space instead of
# underscore), which CoffeeScript parses as a call to TYPE_START; it now tests
# TYPE_START_TAG like every other start tag branch.
2090 ins_mode_in_frameset = (t) ->
2094 if t.type is TYPE_COMMENT
2097 if t.type is TYPE_DOCTYPE
2100 if t.type is TYPE_START_TAG and t.name is 'html'
2103 if t.type is TYPE_START_TAG and t.name is 'frameset'
2104 insert_html_element t
2106 if t.type is TYPE_END_TAG and t.name is 'frameset'
2107 # TODO ?correct for: "if the current node is the root html element"
2108 if open_els.length is 1
2110 return # fragment case
2112 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2113 insertion_mode = ins_mode_after_frameset
2115 if t.type is TYPE_START_TAG and t.name is 'frame'
2116 insert_html_element t
2118 t.acknowledge_self_closing()
2120 if t.type is TYPE_START_TAG and t.name is 'noframes'
2123 if t.type is TYPE_EOF
2124 # TODO ?correct for: "if the current node is not the root html element"
2125 if open_els.length isnt 1
2133 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Token handler for the "after frameset" insertion mode; ``t`` is the token to
# process.
# FIX: the </html> branch assigned to ``insert_mode``, a name used nowhere
# else, so the parser never left this mode. It now assigns ``insertion_mode``.
2134 ins_mode_after_frameset = (t) ->
2138 if t.type is TYPE_COMMENT
2141 if t.type is TYPE_DOCTYPE
2144 if t.type is TYPE_START_TAG and t.name is 'html'
2147 if t.type is TYPE_END_TAG and t.name is 'html'
2148 insertion_mode = ins_mode_after_after_frameset
2150 if t.type is TYPE_START_TAG and t.name is 'noframes'
2153 if t.type is TYPE_EOF
2160 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2161 ins_mode_after_after_body = (t) ->
2162 if t.type is TYPE_COMMENT
2163 insert_comment t, [doc, doc.children.length]
2165 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2168 if t.type is TYPE_EOF
2173 insertion_mode = ins_mode_in_body
2176 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2177 ins_mode_after_after_frameset = (t) ->
2178 if t.type is TYPE_COMMENT
2179 insert_comment t, [doc, doc.children.length]
2181 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2184 if t.type is TYPE_EOF
2187 if t.type is TYPE_START_TAG and t.name is 'noframes'
2198 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2200 switch c = txt.charAt(cur++)
2202 return new_text_node parse_character_reference()
2204 tok_state = tok_state_tag_open
2207 return new_text_node c
2209 return new_eof_token()
2211 return new_text_node c
2214 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2215 # not needed: tok_state_character_reference_in_data = ->
2216 # just call parse_character_reference()
2218 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer "RCDATA" state. Reads one character (advancing cur) and either
# decodes a character reference, switches to the rcdata-less-than-sign state,
# emits U+FFFD, emits EOF, or emits the character unchanged.
2219 tok_state_rcdata = ->
2220 switch c = txt.charAt(cur++)
# NOTE(review): this branch builds its result with new_text_node while the
# other branches use new_character_token -- confirm the two are equivalent
2222 return new_text_node parse_character_reference()
2224 tok_state = tok_state_rcdata_less_than_sign
2227 return new_character_token "\ufffd"
2229 return new_eof_token()
2231 return new_character_token c
2234 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2235 # not needed: tok_state_character_reference_in_rcdata = ->
2236 # just call parse_character_reference()
2238 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer "RAWTEXT" state. Reads one character (advancing cur) and either
# switches to the rawtext-less-than-sign state, emits U+FFFD, emits EOF, or
# emits the character unchanged. No character references are decoded here.
2239 tok_state_rawtext = ->
2240 switch c = txt.charAt(cur++)
2242 tok_state = tok_state_rawtext_less_than_sign
2245 return new_character_token "\ufffd"
2247 return new_eof_token()
2249 return new_character_token c
2252 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer "script data" state. Reads one character (advancing cur) and
# either switches to the script-data-less-than-sign state, emits U+FFFD,
# emits EOF, or emits the character unchanged.
2253 tok_state_script_data = ->
2254 switch c = txt.charAt(cur++)
2256 tok_state = tok_state_script_data_less_than_sign
2259 return new_character_token "\ufffd"
2261 return new_eof_token()
2263 return new_character_token c
2266 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer "PLAINTEXT" state. Reads one character (advancing cur) and emits
# U+FFFD, EOF, or the character unchanged; it never assigns tok_state, so the
# tokenizer stays in this state.
2267 tok_state_plaintext = ->
2268 switch c = txt.charAt(cur++)
2271 return new_character_token "\ufffd"
2273 return new_eof_token()
2275 return new_character_token c
2279 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer "tag open" state. Reads one character (advancing cur) and decides
# what kind of markup follows: a markup declaration, an end tag, a bogus
# comment, or a start tag. Anything else is treated as literal text.
2280 tok_state_tag_open = ->
2281 switch c = txt.charAt(cur++)
2283 tok_state = tok_state_markup_declaration_open
2285 tok_state = tok_state_end_tag_open
# bogus comment whose text is seeded with '?'
2288 tok_cur_tag = new_comment_token '?'
2289 tok_state = tok_state_bogus_comment
# a character from lc_alpha / uc_alpha starts a new open tag; characters from
# uc_alpha are lowercased before being stored as the first letter of the name
2291 if lc_alpha.indexOf(c) > -1
2292 tok_cur_tag = new_open_tag c
2293 tok_state = tok_state_tag_name
2294 else if uc_alpha.indexOf(c) > -1
2295 tok_cur_tag = new_open_tag c.toLowerCase()
2296 tok_state = tok_state_tag_name
# not a tag after all: go back to the data state, un-consume the character
# and emit a literal '<' as text
2299 tok_state = tok_state_data
2300 cur -= 1 # we didn't parse/handle the char after <
2301 return new_text_node '<'
2304 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer "end tag open" state. Reads one character (advancing cur).
# Depending on the character it returns to the data state (in one case
# emitting a literal '</' as text), starts a new end tag token, or starts a
# bogus comment.
2305 tok_state_end_tag_open = ->
2306 switch c = txt.charAt(cur++)
2309 tok_state = tok_state_data
2312 tok_state = tok_state_data
2313 return new_text_node '</'
# a character from uc_alpha / lc_alpha starts the end tag's name (lowercased)
2315 if uc_alpha.indexOf(c) > -1
2316 tok_cur_tag = new_end_tag c.toLowerCase()
2317 tok_state = tok_state_tag_name
2318 else if lc_alpha.indexOf(c) > -1
2319 tok_cur_tag = new_end_tag c
2320 tok_state = tok_state_tag_name
# NOTE(review): the bogus comment is seeded with '/' rather than the consumed
# character c; the spec's bogus comment data starts at the character that
# caused the switch -- confirm this is intended
2323 tok_cur_tag = new_comment_token '/'
2324 tok_state = tok_state_bogus_comment
2327 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer "tag name" state. Reads one character (advancing cur) and either
# ends the name (whitespace leads to before-attribute-name; other branches
# lead to self-closing-start-tag or back to the data state) or appends to
# tok_cur_tag.name.
2328 tok_state_tag_name = ->
2329 switch c = txt.charAt(cur++)
2330 when "\t", "\n", "\u000c", ' '
2331 tok_state = tok_state_before_attribute_name
2333 tok_state = tok_state_self_closing_start_tag
2335 tok_state = tok_state_data
2341 tok_cur_tag.name += "\ufffd"
2344 tok_state = tok_state_data
# tag names are stored lowercased
2346 if uc_alpha.indexOf(c) > -1
2347 tok_cur_tag.name += c.toLowerCase()
2349 tok_cur_tag.name += c
2352 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer "RCDATA less-than sign" state. Reads one character (advancing
# cur). Either it clears temporary_buffer and moves to the rcdata-end-tag-open
# state, or it returns to RCDATA, un-consumes the character and emits '<'.
2353 tok_state_rcdata_less_than_sign = ->
2354 c = txt.charAt(cur++)
2356 temporary_buffer = ''
2357 tok_state = tok_state_rcdata_end_tag_open
2360 tok_state = tok_state_rcdata
2361 cur -= 1 # reconsume the input character
2362 return new_character_token '<'
2364 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer "RCDATA end tag open" state. A character from uc_alpha / lc_alpha
# starts a new end tag token (name lowercased) and moves to the
# rcdata-end-tag-name state. Otherwise the tokenizer returns to RCDATA,
# un-consumes the character and emits '</' as text.
2365 tok_state_rcdata_end_tag_open = ->
2366 c = txt.charAt(cur++)
2367 if uc_alpha.indexOf(c) > -1
2368 tok_cur_tag = new_end_tag c.toLowerCase()
# temporary_buffer keeps the character as written (original case), so the raw
# text can be emitted later if this turns out not to be a real end tag
2369 temporary_buffer += c
2370 tok_state = tok_state_rcdata_end_tag_name
2372 if lc_alpha.indexOf(c) > -1
2373 tok_cur_tag = new_end_tag c
2374 temporary_buffer += c
2375 tok_state = tok_state_rcdata_end_tag_name
2378 tok_state = tok_state_rcdata
2379 cur -= 1 # reconsume the input character
2380 return new_character_token "</" # fixfull separate these
2382 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when t is an end tag token whose name equals the name of
# open_els[0], false otherwise. Also writes a debug_log line on every call.
# t: the token to test (the callers in this file pass tok_cur_tag).
2383 is_appropriate_end_tag = (t) ->
2384 # spec says to check against "the tag name of the last start tag to
2385 # have been emitted from this tokenizer", but this is only called from
2386 # the various "raw" states, which I'm pretty sure all push the start
2387 # token onto open_els. TODO: verify this after the script data states
2389 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
# NOTE(review): open_els[0] is dereferenced unconditionally -- confirm
# open_els can never be empty when this is called
2390 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2392 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer "RCDATA end tag name" state. Reads one character (advancing cur).
# A terminating character only ends the tag when tok_cur_tag is the
# appropriate end tag (see is_appropriate_end_tag); letters extend the tag
# name; anything else abandons the tag and emits the buffered text.
2393 tok_state_rcdata_end_tag_name = ->
2394 c = txt.charAt(cur++)
2395 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2396 if is_appropriate_end_tag tok_cur_tag
2397 tok_state = tok_state_before_attribute_name
2399 # else fall through to "Anything else"
2401 if is_appropriate_end_tag tok_cur_tag
2402 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2404 # else fall through to "Anything else"
2406 if is_appropriate_end_tag tok_cur_tag
2407 tok_state = tok_state_data
2409 # else fall through to "Anything else"
# letters extend the (lowercased) tag name; temporary_buffer keeps them as
# written
2410 if uc_alpha.indexOf(c) > -1
2411 tok_cur_tag.name += c.toLowerCase()
2412 temporary_buffer += c
2414 if lc_alpha.indexOf(c) > -1
2415 tok_cur_tag.name += c
2416 temporary_buffer += c
# "Anything else": not an end tag -- return to RCDATA, un-consume the
# character and emit '</' plus the buffered characters as text
2419 tok_state = tok_state_rcdata
2420 cur -= 1 # reconsume the input character
2421 return new_character_token '</' + temporary_buffer # fixfull separate these
2423 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer "RAWTEXT less-than sign" state. Reads one character (advancing
# cur). Either it clears temporary_buffer and moves to the
# rawtext-end-tag-open state, or it returns to RAWTEXT, un-consumes the
# character and emits '<'.
2424 tok_state_rawtext_less_than_sign = ->
2425 c = txt.charAt(cur++)
2427 temporary_buffer = ''
2428 tok_state = tok_state_rawtext_end_tag_open
2431 tok_state = tok_state_rawtext
2432 cur -= 1 # reconsume the input character
2433 return new_character_token '<'
2435 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer "RAWTEXT end tag open" state. A character from uc_alpha / lc_alpha
# starts a new end tag token (name lowercased) and moves to the
# rawtext-end-tag-name state. Otherwise the tokenizer returns to RAWTEXT,
# un-consumes the character and emits '</' as text.
2436 tok_state_rawtext_end_tag_open = ->
2437 c = txt.charAt(cur++)
2438 if uc_alpha.indexOf(c) > -1
2439 tok_cur_tag = new_end_tag c.toLowerCase()
# temporary_buffer keeps the character as written (original case)
2440 temporary_buffer += c
2441 tok_state = tok_state_rawtext_end_tag_name
2443 if lc_alpha.indexOf(c) > -1
2444 tok_cur_tag = new_end_tag c
2445 temporary_buffer += c
2446 tok_state = tok_state_rawtext_end_tag_name
2449 tok_state = tok_state_rawtext
2450 cur -= 1 # reconsume the input character
2451 return new_character_token "</" # fixfull separate these
2453 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer "RAWTEXT end tag name" state. Reads one character (advancing
# cur). A terminating character only ends the tag when tok_cur_tag is the
# appropriate end tag (see is_appropriate_end_tag); letters extend the tag
# name; anything else abandons the tag and emits the buffered text.
2454 tok_state_rawtext_end_tag_name = ->
2455 c = txt.charAt(cur++)
2456 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2457 if is_appropriate_end_tag tok_cur_tag
2458 tok_state = tok_state_before_attribute_name
2460 # else fall through to "Anything else"
2462 if is_appropriate_end_tag tok_cur_tag
2463 tok_state = tok_state_self_closing_start_tag
2465 # else fall through to "Anything else"
2467 if is_appropriate_end_tag tok_cur_tag
2468 tok_state = tok_state_data
2470 # else fall through to "Anything else"
# letters extend the (lowercased) tag name; temporary_buffer keeps them as
# written
2471 if uc_alpha.indexOf(c) > -1
2472 tok_cur_tag.name += c.toLowerCase()
2473 temporary_buffer += c
2475 if lc_alpha.indexOf(c) > -1
2476 tok_cur_tag.name += c
2477 temporary_buffer += c
# "Anything else": not an end tag -- return to RAWTEXT, un-consume the
# character and emit '</' plus the buffered characters as text
2480 tok_state = tok_state_rawtext
2481 cur -= 1 # reconsume the input character
2482 return new_character_token '</' + temporary_buffer # fixfull separate these
2484 # TODO _all_ of the missing states here (17-33) are for parsing script tags
2486 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer "before attribute name" state. Reads one character (advancing
# cur). Depending on the character it moves to the self-closing-start-tag
# state, returns to the data state, or works out the first character of a new
# attribute name (attr_name) and starts that attribute.
2487 tok_state_before_attribute_name = ->
2489 switch c = txt.charAt(cur++)
2490 when "\t", "\n", "\u000c", ' '
2493 tok_state = tok_state_self_closing_start_tag
2496 tok_state = tok_state_data
2502 attr_name = "\ufffd"
2503 when '"', "'", '<', '='
2508 tok_state = tok_state_data
2510 if uc_alpha.indexOf(c) > -1
2511 attr_name = c.toLowerCase()
# new attributes are unshifted as [name, value] pairs, so attrs_a[0] is always
# the attribute currently being parsed
2515 tok_cur_tag.attrs_a.unshift [attr_name, '']
2516 tok_state = tok_state_attribute_name
2519 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer "attribute name" state. Reads one character (advancing cur) and
# either ends the attribute name (moving to after-attribute-name,
# self-closing-start-tag, before-attribute-value or the data state) or appends
# to the current attribute's name, tok_cur_tag.attrs_a[0][0].
2520 tok_state_attribute_name = ->
2521 switch c = txt.charAt(cur++)
2522 when "\t", "\n", "\u000c", ' '
2523 tok_state = tok_state_after_attribute_name
2525 tok_state = tok_state_self_closing_start_tag
2527 tok_state = tok_state_before_attribute_value
2529 tok_state = tok_state_data
# every character that stays inside the name is appended (+=); plain
# assignment would discard the part of the name collected so far
2535 tok_cur_tag.attrs_a[0][0] += "\ufffd"
2538 tok_cur_tag.attrs_a[0][0] += c
2541 tok_state = tok_state_data
# attribute names are stored lowercased
2543 if uc_alpha.indexOf(c) > -1
2544 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
2546 tok_cur_tag.attrs_a[0][0] += c
2549 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer "after attribute name" state. Reads one character (advancing
# cur). Depending on the character it moves to self-closing-start-tag,
# before-attribute-value or the data state, or it starts a new attribute (the
# previous one keeps its empty value) and returns to the attribute-name state.
2550 tok_state_after_attribute_name = ->
2551 c = txt.charAt(cur++)
2552 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2555 tok_state = tok_state_self_closing_start_tag
2558 tok_state = tok_state_before_attribute_value
2561 tok_state = tok_state_data
# a new attribute is unshifted as [name, ''], making it attrs_a[0]
2563 if uc_alpha.indexOf(c) > -1
2564 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2565 tok_state = tok_state_attribute_name
2569 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2570 tok_state = tok_state_attribute_name
2574 tok_state = tok_state_data
2575 cur -= 1 # reconsume
2577 if c is '"' or c is "'" or c is '<'
2579 # fall through to Anything else
2581 tok_cur_tag.attrs_a.unshift [c, '']
2582 tok_state = tok_state_attribute_name
2584 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer "before attribute value" state. Reads one character (advancing
# cur) and picks the value sub-state (double-quoted, single-quoted or
# unquoted), or returns to the data state. Value characters are appended to
# tok_cur_tag.attrs_a[0][1], the current attribute's value.
2585 tok_state_before_attribute_value = ->
2586 switch c = txt.charAt(cur++)
2587 when "\t", "\n", "\u000c", ' '
2590 tok_state = tok_state_attribute_value_double_quoted
2592 tok_state = tok_state_attribute_value_unquoted
2595 tok_state = tok_state_attribute_value_single_quoted
2598 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2599 tok_state = tok_state_attribute_value_unquoted
2602 tok_state = tok_state_data
2608 tok_state = tok_state_data
# anything else is the first character of an unquoted value
2610 tok_cur_tag.attrs_a[0][1] += c
2611 tok_state = tok_state_attribute_value_unquoted
2614 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer "attribute value (double-quoted)" state. Reads one character
# (advancing cur) and either ends the value, decodes a character reference
# into it, appends U+FFFD, returns to the data state, or appends the character
# to tok_cur_tag.attrs_a[0][1].
2615 tok_state_attribute_value_double_quoted = ->
2616 switch c = txt.charAt(cur++)
2618 tok_state = tok_state_after_attribute_value_quoted
# '"' is passed as the additional allowed character, true as in_attr
2620 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2623 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2626 tok_state = tok_state_data
2628 tok_cur_tag.attrs_a[0][1] += c
2631 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer "attribute value (single-quoted)" state. Reads one character
# (advancing cur) and either ends the value, decodes a character reference
# into it, appends U+FFFD, returns to the data state, or appends the character
# to tok_cur_tag.attrs_a[0][1].
2632 tok_state_attribute_value_single_quoted = ->
2633 switch c = txt.charAt(cur++)
2635 tok_state = tok_state_after_attribute_value_quoted
# "'" is passed as the additional allowed character, true as in_attr
2637 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2640 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2643 tok_state = tok_state_data
2645 tok_cur_tag.attrs_a[0][1] += c
2648 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer "attribute value (unquoted)" state. Reads one character
# (advancing cur). Whitespace ends the value and moves to
# before-attribute-name; other branches decode a character reference, return
# to the data state, append U+FFFD, or append the character to
# tok_cur_tag.attrs_a[0][1].
2649 tok_state_attribute_value_unquoted = ->
2650 switch c = txt.charAt(cur++)
2651 when "\t", "\n", "\u000c", ' '
2652 tok_state = tok_state_before_attribute_name
# '>' is passed as the additional allowed character, true as in_attr
2654 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2656 tok_state = tok_state_data
2661 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2664 tok_state = tok_state_data
2666 # Parse Error if ', <, = or ` (backtick)
2667 tok_cur_tag.attrs_a[0][1] += c
2670 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer "after attribute value (quoted)" state. Reads one character
# (advancing cur). Whitespace moves to before-attribute-name; other branches
# move to self-closing-start-tag or the data state. Any other character is
# un-consumed and handled again by the before-attribute-name state.
2671 tok_state_after_attribute_value_quoted = ->
2672 switch c = txt.charAt(cur++)
2673 when "\t", "\n", "\u000c", ' '
2674 tok_state = tok_state_before_attribute_name
2676 tok_state = tok_state_self_closing_start_tag
2678 tok_state = tok_state_data
2684 tok_state = tok_state_data
2687 tok_state = tok_state_before_attribute_name
2688 cur -= 1 # we didn't handle that char
2691 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
2692 # WARNING: put a comment token in tok_cur_tag before setting this state
# Collects the rest of a bogus comment in one step: the text from cur up to
# (not including) the next '>' -- or the remainder of txt -- is appended to
# tok_cur_tag.text, then the tokenizer returns to the data state.
2693 tok_state_bogus_comment = ->
2694 next_gt = txt.indexOf '>', cur
2696 val = txt.substr cur
2699 val = txt.substr cur, (next_gt - cur)
# every NUL becomes U+FFFD; a regex with the g flag is required because
# String::replace with a string pattern only replaces the first occurrence
2701 val = val.replace(/\u0000/g, "\ufffd")
2702 tok_cur_tag.text += val
2703 tok_state = tok_state_data
2706 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer "markup declaration open" state. Looks ahead in txt at cur
# (without consuming via charAt) to recognise '--' (comment), 'doctype' in any
# letter case, or '[CDATA[' when the adjusted current node is outside the HTML
# namespace. Anything else becomes a bogus comment.
2707 tok_state_markup_declaration_open = ->
2708 if txt.substr(cur, 2) is '--'
2710 tok_cur_tag = new_comment_token ''
2711 tok_state = tok_state_comment_start
2713 if txt.substr(cur, 7).toLowerCase() is 'doctype'
2715 tok_state = tok_state_doctype
2717 acn = adjusted_current_node()
# the CDATA lookahead reads the tokenizer input txt, like the checks above
2718 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
2720 tok_state = tok_state_cdata_section
2724 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
2725 tok_state = tok_state_bogus_comment
2728 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer "comment start" state. Reads one character (advancing cur).
# Depending on the character it moves to comment-start-dash, appends U+FFFD
# to the comment, returns to the data state (in one case un-consuming the
# character), or appends the character to tok_cur_tag.text.
2729 tok_state_comment_start = ->
2730 switch c = txt.charAt(cur++)
2732 tok_state = tok_state_comment_start_dash
# the replacement character belongs to the comment's text (it is not emitted
# as a character token), and the tokenizer continues in the comment state,
# as the other comment states do for U+FFFD
2735 tok_cur_tag.text += "\ufffd"
tok_state = tok_state_comment
2738 tok_state = tok_state_data
2742 tok_state = tok_state_data
2743 cur -= 1 # Reconsume
2746 tok_cur_tag.text += c
2749 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer "comment start dash" state. Reads one character (advancing cur).
# Depending on the character it moves to comment-end, returns to the data
# state (in one case un-consuming the character), or appends '-' plus the
# character (or U+FFFD) to tok_cur_tag.text and moves to the comment state.
2750 tok_state_comment_start_dash = ->
2751 switch c = txt.charAt(cur++)
2753 tok_state = tok_state_comment_end
2756 tok_cur_tag.text += "-\ufffd"
2757 tok_state = tok_state_comment
2760 tok_state = tok_state_data
2764 tok_state = tok_state_data
2765 cur -= 1 # Reconsume
# the '-' that led into this state is added to the text together with c
2768 tok_cur_tag.text += "-#{c}"
2769 tok_state = tok_state_comment
2772 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer "comment" state. Reads one character (advancing cur) and either
# moves to comment-end-dash, appends U+FFFD, returns to the data state while
# un-consuming the character, or appends the character to tok_cur_tag.text.
2773 tok_state_comment = ->
2774 switch c = txt.charAt(cur++)
2776 tok_state = tok_state_comment_end_dash
2779 tok_cur_tag.text += "\ufffd"
2782 tok_state = tok_state_data
2783 cur -= 1 # Reconsume
2786 tok_cur_tag.text += c
2789 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer "comment end dash" state. Reads one character (advancing cur) and
# either moves to comment-end, returns to the data state while un-consuming
# the character, or appends '-' plus the character (or U+FFFD) to
# tok_cur_tag.text and goes back to the comment state.
2790 tok_state_comment_end_dash = ->
2791 switch c = txt.charAt(cur++)
2793 tok_state = tok_state_comment_end
2796 tok_cur_tag.text += "-\ufffd"
2797 tok_state = tok_state_comment
2800 tok_state = tok_state_data
2801 cur -= 1 # Reconsume
# the pending '-' turned out to be ordinary comment text
2804 tok_cur_tag.text += "-#{c}"
2805 tok_state = tok_state_comment
2808 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer "comment end" state. Reads one character (advancing cur).
# Depending on the character it returns to the data state (in one case
# un-consuming the character), moves to comment-end-bang, appends a single
# '-', or appends '--' plus the character (or U+FFFD) and goes back to the
# comment state.
2809 tok_state_comment_end = ->
2810 switch c = txt.charAt(cur++)
2812 tok_state = tok_state_data
2816 tok_cur_tag.text += "--\ufffd"
2817 tok_state = tok_state_comment
2820 tok_state = tok_state_comment_end_bang
2823 tok_cur_tag.text += '-'
2826 tok_state = tok_state_data
2827 cur -= 1 # Reconsume
# the two pending dashes turned out to be ordinary comment text
2831 tok_cur_tag.text += "--#{c}"
2832 tok_state = tok_state_comment
2835 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer "comment end bang" state. Reads one character (advancing cur).
# Depending on the character it moves to comment-end-dash, returns to the
# data state (in one case un-consuming the character), or appends '--!' plus
# the character (or U+FFFD) to tok_cur_tag.text and goes back to the comment
# state.
2836 tok_state_comment_end_bang = ->
2837 switch c = txt.charAt(cur++)
# only '--!' is appended here: the character that leads to comment-end-dash
# is accounted for by that state, so adding c as well would duplicate it
2839 tok_cur_tag.text += "--!"
2840 tok_state = tok_state_comment_end_dash
2842 tok_state = tok_state_data
2846 tok_cur_tag.text += "--!\ufffd"
2847 tok_state = tok_state_comment
2850 tok_state = tok_state_data
2851 cur -= 1 # Reconsume
2854 tok_cur_tag.text += "--!#{c}"
2855 tok_state = tok_state_comment
2859 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2860 # Don't set this as a state, just call it
2861 # returns a string (NOT a text node)
# Reads a character reference from txt starting at cur.
# allowed_char: an extra character that, when it is the next character, means
#   there is no reference (it is listed alongside whitespace, '<' and '&'
#   below); the attribute-value states pass their delimiter here.
# in_attr: the attribute-value states pass true.
# NOTE(review): presumably the caller has already consumed the '&' -- confirm.
2862 parse_character_reference = (allowed_char = null, in_attr = false) ->
2863 if cur >= txt.length
# peek only: cur is not advanced by this read
2865 switch c = txt.charAt(cur)
2866 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2867 # explicitly not a parse error
2870 # there has to be "one or more" alnums between & and ; to be a parse error
2873 if cur + 1 >= txt.length
# an 'x' or 'X' after the current character is checked for here
2875 if txt.charAt(cur + 1).toLowerCase() is 'x'
# count how many consecutive characters from charset follow start
2884 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2888 if txt.charAt(start + i) is ';'
2890 # FIXME This is supposed to generate parse errors for some chars
2891 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2898 if alnum.indexOf(txt.charAt(cur + i)) is -1
2901 # exit early, because parse_error() below needs at least one alnum
2903 if txt.charAt(cur + i) is ';'
2904 i += 1 # include ';' terminator in value
2905 decoded = decode_named_char_ref txt.substr(cur, i)
2912 # no ';' terminator (only legacy char refs)
2914 for i in [2..max] # no prefix matches, so ok to check shortest first
2915 c = legacy_char_refs[txt.substr(cur, i)]
2918 if txt.charAt(cur + i) is '='
2919 # "because some legacy user agents will
2920 # misinterpret the markup in those cases"
2923 if alnum.indexOf(txt.charAt(cur + i)) > -1
2924 # this makes attributes forgiving about url args
2926 # ok, and besides the weird exceptions for attributes...
2927 # return the matching char
2928 cur += i # consume entity chars
2929 parse_error() # because no terminating ";"
2933 return # never reached
2935 # tree constructor initialization
2936 # see comments on TYPE_TAG/etc for the structure of this data
# doc is the root node: a TYPE_TAG node named 'html' in the HTML namespace
2937 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2939 afe = [] # active formatting elements
2940 template_insertion_modes = []
# the tree constructor starts in the "initial" insertion mode
2941 insertion_mode = ins_mode_initial
2942 original_insertion_mode = insertion_mode # TODO check spec
2943 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2944 flag_frameset_ok = true
2946 flag_foster_parenting = false
2947 form_element_pointer = null
# scratch string used by the rcdata/rawtext end-tag states
2948 temporary_buffer = null
2949 pending_table_character_tokens = []
2950 head_element_pointer = null
2951 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
2952 context_element = null # FIXME initialize from args.fragment
2954 # tokenizer initialization
# the tokenizer starts in the data state
2955 tok_state = tok_state_data
2962 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a string by appending the serialization of elements from els.
# shallow, show_ids: passed through unchanged to each element's serialize().
2965 serialize_els = (els, shallow, show_ids) ->
2971 serialized += t.serialize shallow, show_ids
2974 # TODO export TYPE_*
# Module exports: the parse_html entry point, the debug-log helpers, and four
# of the TYPE_* constants.
2975 module.exports.parse_html = parse_html
2976 module.exports.debug_log_reset = debug_log_reset
2977 module.exports.debug_log_each = debug_log_each
2978 module.exports.TYPE_TAG = TYPE_TAG
2979 module.exports.TYPE_TEXT = TYPE_TEXT
2980 module.exports.TYPE_COMMENT = TYPE_COMMENT
2981 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE