1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
# the spec uses many different words to indicate which ends of lists/stacks
# they are talking about (and relative movement within the lists/stacks). This
# section explains them. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
# Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
98 @id = "#{++prev_node_id}"
99 shallow_clone: -> # return a new node that's the same except without the children or parent
100 # WARNING this doesn't work right on open tags that are still being parsed
102 attrs[k] = v for k, v of @attrs
103 return new Node @type, name: @name, text: @text, attrs: attrs, namespace: @namespace, id: @id, token: @token
104 acknowledge_self_closing: ->
106 @token.flag 'did_self_close'
108 @flag 'did_self_close', true
111 serialize: (shallow = false, show_ids = false) -> # for unit tests
116 ret += JSON.stringify @name
131 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
137 ret += c.serialize shallow, show_ids
141 ret += JSON.stringify @text
144 ret += JSON.stringify @text
150 when TYPE_AAA_BOOKMARK
151 ret += 'aaa_bookmark'
154 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
157 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) that carries only the tag name.
#   name - tag name for the token
# Returns the freshly constructed Node.
new_open_tag = (name) ->
  new Node(TYPE_START_TAG, {name: name})
# Build an end-tag token (TYPE_END_TAG) that carries only the tag name.
#   name - tag name for the token
# Returns the freshly constructed Node.
new_end_tag = (name) ->
  new Node(TYPE_END_TAG, {name: name})
# Build an element node (TYPE_TAG) with the given tag name.
#   name - tag name for the element
# Returns the freshly constructed Node.
new_element = (name) ->
  new Node(TYPE_TAG, {name: name})
# Build a text node (TYPE_TEXT) whose contents are `txt`.
#   txt - the text the node should hold
# Returns the freshly constructed Node.
new_text_node = (txt) ->
  new Node(TYPE_TEXT, {text: txt})
# character tokens share the text-node representation, so this is just an alias
new_character_token = new_text_node
# Build a comment node (TYPE_COMMENT) whose contents are `txt`.
#   txt - the comment text
# Returns the freshly constructed Node.
new_comment_node = (txt) ->
  new Node(TYPE_COMMENT, {text: txt})
170 return new Node TYPE_EOF
172 return new Node TYPE_AFE_MARKER
# Build a placeholder node of type TYPE_AAA_BOOKMARK (the bookmark used by
# the adoption agency algorithm). Takes no arguments.
new_aaa_bookmark = ->
  new Node(TYPE_AAA_BOOKMARK)
# Character classes used for membership tests (via indexOf).
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# letters of either case followed by the decimal digits
alnum = [lc_alpha, uc_alpha, digits].join ''
# decimal digits followed by a-f in both cases
hex_chars = "#{digits}abcdefABCDEF"
182 # some SVG elements have dashes in them
183 tag_name_chars = alnum + "-"
185 # http://www.w3.org/TR/html5/infrastructure.html#space-character
186 space_chars = "\u0009\u000a\u000c\u000d\u0020"
188 return txt.length is 1 and space_chars.indexOf(txt) > -1
# Tell whether token `t` is a single "space character" text token.
#   t - a token/node with `type` and `text` fields
# Returns true only when t is TYPE_TEXT, its text is exactly one character
# long, and that character appears in space_chars.
is_space_tok = (t) ->
  return false unless t.type is TYPE_TEXT
  return false unless t.text.length is 1
  space_chars.indexOf(t.text) isnt -1
192 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
193 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
195 # These are the character references that don't need a terminating semicolon
196 # min length: 2, max: 6, none are a prefix of any other.
198 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
199 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
200 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
201 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
202 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
203 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
204 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
205 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
206 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
207 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
208 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
209 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
210 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
211 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
212 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
213 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
214 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists grouped by element kind.
void_elements = [
  'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
  'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
  'script'
  'style'
]
escapable_raw_text_elements = [
  'textarea'
  'title'
]
221 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
223 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
224 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
225 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
226 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
227 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
228 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
229 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
230 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
231 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
232 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
233 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
234 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
235 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
236 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
240 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
242 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
243 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
244 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
245 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
246 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
247 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
248 'determinant', 'diff', 'divergence', 'divide', 'domain',
249 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
250 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
251 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
252 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
253 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
254 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
255 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
256 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
257 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
258 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
259 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
260 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
261 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
262 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
263 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
264 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
265 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
266 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
267 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
268 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
269 'vectorproduct', 'xor'
271 # foreign_elements = [svg_elements..., mathml_elements...]
272 #normal_elements = All other allowed HTML elements are normal elements.
276 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
277 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
278 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
279 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
280 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
281 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
282 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
283 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
284 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
285 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
286 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
287 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
288 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
289 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
290 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
291 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
292 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
293 wbr:NS_HTML, xmp:NS_HTML,
296 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
297 'annotation-xml':NS_MATHML,
300 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
303 formatting_elements = {
304 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
305 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
309 foster_parenting_targets = {
# Tell whether element `e` is listed in special_elements.
#   e - a node with `name` and `namespace` fields
# special_elements maps a tag name to the namespace in which that name is
# special, so both the name and the namespace have to agree.
el_is_special = (e) ->
  expected_ns = special_elements[e.name]
  expected_ns is e.namespace
334 # decode_named_char_ref()
336 # The list of named character references is _huge_ so ask the browser to decode
337 # for us instead of wasting bandwidth/space on including the table here.
339 # Pass without the "&" but with the ";" examples:
# for "&amp;" pass "amp;"
341 # for "′" pass "x2032;"
344 textarea: document.createElement('textarea')
346 # TODO test this in IE8
347 decode_named_char_ref = (txt) ->
349 decoded = g_dncr.cache[txt]
350 return decoded if decoded?
351 g_dncr.textarea.innerHTML = txt
352 decoded = g_dncr.textarea.value
353 return null if decoded is txt
354 return g_dncr.cache[txt] = decoded
356 parse_html = (txt, parse_error_cb = null) ->
357 cur = 0 # index of next char in txt to be parsed
358 # declare doc and tokenizer variables so they're in scope below
360 open_els = null # stack of open elements
361 afe = null # active formatting elements
362 template_insertion_modes = null
363 insertion_mode = null
364 original_insertion_mode = null
366 tok_cur_tag = null # partially parsed tag
367 flag_scripting = null
368 flag_frameset_ok = null
370 flag_foster_parenting = null
371 form_element_pointer = null
372 temporary_buffer = null
373 pending_table_character_tokens = null
374 head_element_pointer = null
375 flag_fragment_parsing = null
384 console.log "Parse error at character #{cur} of #{txt.length}"
386 afe_push = (new_el) ->
389 if el.name is new_el.name and el.namespace is new_el.namespace
391 continue unless new_el.attrs[k] is v
392 for k, v of new_el.attrs
393 continue unless el.attrs[k] is v
400 afe.unshift new_afe_marker()
# the functions below implement the Tree Construction algorithm
403 # http://www.w3.org/TR/html5/syntax.html#tree-construction
405 # But first... the helpers
406 template_tag_is_open = ->
408 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
411 is_in_scope_x = (tag_name, scope, namespace) ->
413 if t.name is tag_name and (namespace is null or namespace is t.namespace)
415 if scope[t.name] is t.namespace
418 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
420 if t.name is tag_name and (namespace is null or namespace is t.namespace)
422 if scope[t.name] is t.namespace
424 if scope2[t.name] is t.namespace
427 standard_scopers = { # FIXME these are supposed to be namespace specific
428 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
429 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
430 template: NS_HTML, mi: NS_MATHML,
432 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
433 'annotation-xml': NS_MATHML,
435 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-boundary tables (tag name -> namespace), in the same shape as
# standard_scopers.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check against the standard scope boundaries.
#   tag_name  - name of the element to look for
#   namespace - optional namespace to require (null = don't care)
# Delegates to is_in_scope_x with standard_scopers and returns its result.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check against the standard boundaries plus the button boundary.
#   tag_name  - name of the element to look for
#   namespace - optional namespace to require (null = don't care)
# Delegates to is_in_scope_x_y with standard_scopers and button_scopers and
# returns its result.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check against the table scope boundaries only.
#   tag_name  - name of the element to look for
#   namespace - optional namespace to require (null = don't care)
# Delegates to is_in_scope_x with table_scopers and returns its result.
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
# http://www.w3.org/TR/html5/syntax.html#has-an-element-in-select-scope
#
# Tell whether an element named `tag_name` is in "select scope".
#   tag_name  - name of the element to look for
#   namespace - optional namespace to require (null = don't care)
#
# Walks open_els starting at the current node (index 0). In select scope
# every element EXCEPT an HTML <optgroup> or <option> is a scope boundary,
# so the search only continues past those two.
#
# Returns true if the element is found before hitting a boundary, false
# otherwise (including when open_els is exhausted).
is_in_select_scope = (tag_name, namespace = null) ->
  for t in open_els
    if t.name is tag_name and (namespace is null or namespace is t.namespace)
      return true
    # Nodes store their namespace in `namespace` (there is no `ns` field),
    # and the two halves of the boundary test must be joined with `or`;
    # without it CoffeeScript parses `NS_HTML t.name ...` as a function call.
    if t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt 'option')
      return false
  return false
453 # this checks for a particular element, not by name
454 el_is_in_scope = (el) ->
458 if standard_scopers[t.name] is t.namespace
462 clear_to_table_stopers = {
467 clear_stack_to_table_context = ->
469 if clear_to_table_stopers[open_els[0].name]?
473 clear_to_table_body_stopers = {
480 clear_stack_to_table_body_context = ->
482 if clear_to_table_body_stopers[open_els[0].name]?
486 clear_to_table_row_stopers = {
491 clear_stack_to_table_row_context = ->
493 if clear_to_table_row_stopers[open_els[0].name]?
497 clear_afe_to_marker = ->
500 if el.type is TYPE_AFE_MARKER
504 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
505 reset_insertion_mode = ->
506 # 1. Let last be false.
508 # 2. Let node be the last node in the stack of open elements.
510 node = open_els[node_i]
511 # 3. Loop: If node is the first node in the stack of open elements,
512 # then set last to true, and, if the parser was originally created as
513 # part of the HTML fragment parsing algorithm (fragment case) set node
514 # to the context element.
516 if node_i is open_els.length - 1
518 # fixfull (fragment case)
520 # 4. If node is a select element, run these substeps:
521 if node.name is 'select'
522 # 1. If last is true, jump to the step below labeled done.
524 # 2. Let ancestor be node.
527 # 3. Loop: If ancestor is the first node in the stack of
528 # open elements, jump to the step below labeled done.
530 if ancestor_i is open_els.length - 1
532 # 4. Let ancestor be the node before ancestor in the stack
535 ancestor = open_els[ancestor_i]
536 # 5. If ancestor is a template node, jump to the step below
538 if ancestor.name is 'template'
540 # 6. If ancestor is a table node, switch the insertion mode
541 # to "in select in table" and abort these steps.
542 if ancestor.name is 'table'
543 insertion_mode = ins_mode_in_select_in_table
545 # 7. Jump back to the step labeled loop.
546 # 8. Done: Switch the insertion mode to "in select" and abort
548 insertion_mode = ins_mode_in_select
550 # 5. If node is a td or th element and last is false, then switch
551 # the insertion mode to "in cell" and abort these steps.
552 if (node.name is 'td' or node.name is 'th') and last is false
553 insertion_mode = ins_mode_in_cell
555 # 6. If node is a tr element, then switch the insertion mode to "in
556 # row" and abort these steps.
558 insertion_mode = ins_mode_in_row
560 # 7. If node is a tbody, thead, or tfoot element, then switch the
561 # insertion mode to "in table body" and abort these steps.
562 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
563 insertion_mode = ins_mode_in_table_body
565 # 8. If node is a caption element, then switch the insertion mode
566 # to "in caption" and abort these steps.
567 if node.name is 'caption'
568 insertion_mode = ins_mode_in_caption
570 # 9. If node is a colgroup element, then switch the insertion mode
571 # to "in column group" and abort these steps.
572 if node.name is 'colgroup'
573 insertion_mode = ins_mode_in_column_group
575 # 10. If node is a table element, then switch the insertion mode to
576 # "in table" and abort these steps.
577 if node.name is 'table'
578 insertion_mode = ins_mode_in_table
580 # 11. If node is a template element, then switch the insertion mode
581 # to the current template insertion mode and abort these steps.
582 # fixfull (template insertion mode stack)
584 # 12. If node is a head element and last is true, then switch the
585 # insertion mode to "in body" ("in body"! not "in head"!) and abort
586 # these steps. (fragment case)
587 if node.name is 'head' and last
588 insertion_mode = ins_mode_in_body
590 # 13. If node is a head element and last is false, then switch the
591 # insertion mode to "in head" and abort these steps.
592 if node.name is 'head' and last is false
593 insertion_mode = ins_mode_in_head
595 # 14. If node is a body element, then switch the insertion mode to
596 # "in body" and abort these steps.
597 if node.name is 'body'
598 insertion_mode = ins_mode_in_body
600 # 15. If node is a frameset element, then switch the insertion mode
601 # to "in frameset" and abort these steps. (fragment case)
602 if node.name is 'frameset'
603 insertion_mode = ins_mode_in_frameset
605 # 16. If node is an html element, run these substeps:
606 if node.name is 'html'
607 # 1. If the head element pointer is null, switch the insertion
608 # mode to "before head" and abort these steps. (fragment case)
609 # fixfull (fragment case)
611 # 2. Otherwise, the head element pointer is not null, switch
612 # the insertion mode to "after head" and abort these steps.
613 insertion_mode = ins_mode_in_body # FIXME fixfull
615 # 17. If last is true, then switch the insertion mode to "in body"
616 # and abort these steps. (fragment case)
618 insertion_mode = ins_mode_in_body
620 # 18. Let node now be the node before node in the stack of open
623 node = open_els[node_i]
624 # 19. Return to the step labeled loop.
626 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
627 # this implementation is structured (mostly) as described at the link above.
628 # capitalized comments are the "labels" described at the link above.
629 reconstruct_active_formatting_elements = ->
630 return if afe.length is 0
631 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
636 if i is afe.length - 1
639 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
644 el = afe[i].shallow_clone()
645 tree_insert_element el
650 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
651 # adoption agency algorithm
653 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
654 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
655 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
656 adoption_agency = (subject) ->
657 debug_log "adoption_agency()"
658 debug_log "tree: #{serialize_els doc.children, false, true}"
659 debug_log "open_els: #{serialize_els open_els, true, true}"
660 debug_log "afe: #{serialize_els afe, true, true}"
661 if open_els[0].name is subject
664 # remove it from the list of active formatting elements (if found)
669 debug_log "aaa: starting off with subject on top of stack, exiting"
676 # 5. Let formatting element be the last element in the list of
677 # active formatting elements that: is between the end of the list
678 # and the last scope marker in the list, if any, or the start of
679 # the list otherwise, and has the tag name subject.
681 for t, fe_of_afe in afe
682 if t.type is TYPE_AFE_MARKER
687 # If there is no such element, then abort these steps and instead
688 # act as described in the "any other end tag" entry above.
690 debug_log "aaa: fe not found in afe"
691 in_body_any_other_end_tag subject
693 # 6. If formatting element is not in the stack of open elements,
694 # then this is a parse error; remove the element from the list, and
697 for t, fe_of_open_els in open_els
702 debug_log "aaa: fe not found in open_els"
704 # "remove it from the list" must mean afe, since it's not in open_els
705 afe.splice fe_of_afe, 1
707 # 7. If formatting element is in the stack of open elements, but
708 # the element is not in scope, then this is a parse error; abort
710 unless el_is_in_scope fe
711 debug_log "aaa: fe not in scope"
714 # 8. If formatting element is not the current node, this is a parse
715 # error. (But do not abort these steps.)
716 unless open_els[0] is fe
719 # 9. Let furthest block be the topmost node in the stack of open
720 # elements that is lower in the stack than formatting element, and
721 # is an element in the special category. There might not be one.
723 fb_of_open_els = null
730 # and continue, to see if there's one that's more "topmost"
731 # 10. If there is no furthest block, then the UA must first pop all
732 # the nodes from the bottom of the stack of open elements, from the
733 # current node up to and including formatting element, then remove
734 # formatting element from the list of active formatting elements,
735 # and finally abort these steps.
737 debug_log "aaa: no fb"
741 afe.splice fe_of_afe, 1
743 # 11. Let common ancestor be the element immediately above
744 # formatting element in the stack of open elements.
745 ca = open_els[fe_of_open_els + 1] # common ancestor
747 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
748 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
749 bookmark = new_aaa_bookmark()
752 afe.splice i, 0, bookmark
754 node = last_node = fb
758 # 3. Let node be the element immediately above node in the
759 # stack of open elements, or if node is no longer in the stack
760 # of open elements (e.g. because it got removed by this
761 # algorithm), the element that was immediately above node in
762 # the stack of open elements before node was removed.
766 node_next = open_els[i + 1]
768 node = node_next ? node_above
769 debug_log "inner loop #{inner}"
770 debug_log "tree: #{serialize_els doc.children, false, true}"
771 debug_log "open_els: #{serialize_els open_els, true, true}"
772 debug_log "afe: #{serialize_els afe, true, true}"
773 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
774 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
775 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
776 debug_log "node: #{node.serialize true, true}"
777 # TODO make sure node_above gets re-set if/when node is removed from open_els
779 # 4. If node is formatting element, then go to the next step in
780 # the overall algorithm.
784 # 5. If inner loop counter is greater than three and node is in
785 # the list of active formatting elements, then remove node from
786 # the list of active formatting elements.
792 debug_log "max out inner"
797 # 6. If node is not in the list of active formatting elements,
798 # then remove node from the stack of open elements and then go
799 # back to the step labeled inner loop.
801 debug_log "not in afe"
804 node_above = open_els[i + 1]
808 debug_log "the bones"
809 # 7. create an element for the token for which the element node
810 # was created, in the HTML namespace, with common ancestor as
811 # the intended parent; replace the entry for node in the list
812 # of active formatting elements with an entry for the new
813 # element, replace the entry for node in the stack of open
814 # elements with an entry for the new element, and let node be
816 new_node = node.shallow_clone()
820 debug_log "replaced in afe"
824 node_above = open_els[i + 1]
825 open_els[i] = new_node
826 debug_log "replaced in open_els"
829 # 8. If last node is furthest block, then move the
830 # aforementioned bookmark to be immediately after the new node
831 # in the list of active formatting elements.
836 debug_log "removed bookmark"
840 # "after" means lower
841 afe.splice i, 0, bookmark # "after as <-
842 debug_log "placed bookmark after node"
843 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
845 # 9. Insert last node into node, first removing it from its
846 # previous parent node if any.
848 debug_log "last_node has parent"
849 for c, i in last_node.parent.children
851 debug_log "removing last_node from parent"
852 last_node.parent.children.splice i, 1
854 node.children.push last_node
855 last_node.parent = node
856 # 10. Let last node be node.
859 # 11. Return to the step labeled inner loop.
860 # 14. Insert whatever last node ended up being in the previous step
861 # at the appropriate place for inserting a node, but using common
862 # ancestor as the override target.
864 # In the case where fe is immediately followed by fb:
865 # * inner loop exits out early (node==fe)
867 # * last_node is still in the tree (not a duplicate)
869 debug_log "FEFIRST? last_node has parent"
870 for c, i in last_node.parent.children
872 debug_log "removing last_node from parent"
873 last_node.parent.children.splice i, 1
876 debug_log "after aaa inner loop"
877 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
878 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
879 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
880 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
881 debug_log "tree: #{serialize_els doc.children, false, true}"
# can't use standard insert token thing, because it's already in
# open_els and must stay at its current position in open_els
888 dest = adjusted_insertion_location ca
889 dest[0].children.splice dest[1], 0, last_node
890 last_node.parent = dest[0]
893 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
894 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
895 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
896 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
897 debug_log "tree: #{serialize_els doc.children, false, true}"
899 # 15. Create an element for the token for which formatting element
900 # was created, in the HTML namespace, with furthest block as the
902 new_element = fe.shallow_clone() # FIXME intended parent thing
903 # 16. Take all of the child nodes of furthest block and append them
904 # to the element created in the last step.
905 while fb.children.length
906 t = fb.children.shift()
907 t.parent = new_element
908 new_element.children.push t
909 # 17. Append that new element to furthest block.
910 new_element.parent = fb
911 fb.children.push new_element
912 # 18. Remove formatting element from the list of active formatting
913 # elements, and insert the new element into the list of active
914 # formatting elements at the position of the aforementioned
924 # 19. Remove formatting element from the stack of open elements,
925 # and insert the new element into the stack of open elements
926 # immediately below the position of furthest block in that stack.
933 open_els.splice i, 0, new_element
935 # 20. Jump back to the step labeled outer loop.
936 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
937 debug_log "tree: #{serialize_els doc.children, false, true}"
938 debug_log "open_els: #{serialize_els open_els, true, true}"
939 debug_log "afe: #{serialize_els afe, true, true}"
942 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
944 generate_implied_end_tags 'p' # arg is exception
945 if open_els[0].name isnt 'p'
947 while open_els.length > 1 # just in case
948 el = open_els.shift()
951 close_p_if_in_button_scope = ->
952 if is_in_button_scope 'p'
955 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
956 # aka insert_a_character = (t) ->
957 insert_character = (t) ->
958 dest = adjusted_insertion_location()
959 # fixfull check for Document node
961 prev = dest[0].children[dest[1] - 1]
962 if prev.type is TYPE_TEXT
965 dest[0].children.splice dest[1], 0, t
968 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
969 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Works out where the next node should be inserted.
#   override_target: node to use instead of the current node (optional)
#   returns: [target, target_i] -- the parent node and the index within
#            target.children at which the new node should be spliced in
970 adjusted_insertion_location = (override_target = null) ->
971 # 1. If there was an override target specified, then let target be the
974 target = override_target
975 else # Otherwise, let target be the current node.
977 # 2. Determine the adjusted insertion location using the first matching
978 # steps from the following list:
980 # If foster parenting is enabled and target is a table, tbody, tfoot,
981 # thead, or tr element Foster parenting happens when content is
982 # misnested in tables.
983 if flag_foster_parenting and foster_parenting_targets[target.name]
984 loop # once. this is here so we can ``break`` to "abort these substeps"
985 # 1. Let last template be the last template element in the
986 # stack of open elements, if any.
988 last_template_i = null
# open_els[0] is the current node, so scanning from index 0 meets the most
# recently added element first
989 for el, i in open_els
990 if el.name is 'template'
994 # 2. Let last table be the last table element in the stack of
995 # open elements, if any.
998 for el, i in open_els
999 if el.name is 'table'
1003 # 3. If there is a last template and either there is no last
1004 # table, or there is one, but last template is lower (more
1005 # recently added) than last table in the stack of open
1006 # elements, then: let adjusted insertion location be inside
1007 # last template's template contents, after its last child (if
1008 # any), and abort these substeps.
1009 if last_template and (last_table is null or last_template_i < last_table_i)
# fixed: this read `target = template`, but the element tested by the
# condition above is last_template
1010 target = last_template # fixfull should be its contents
1011 target_i = target.children.length
1013 # 4. If there is no last table, then let adjusted insertion
1014 # location be inside the first element in the stack of open
1015 # elements (the html element), after its last child (if any),
1016 # and abort these substeps. (fragment case)
1017 if last_table is null
# the "first" element of the stack sits at the highest index
1019 target = open_els[open_els.length - 1]
1020 target_i = target.children.length
1021 # 5. If last table has a parent element, then let adjusted
1022 # insertion location be inside last table's parent element,
1023 # immediately before last table, and abort these substeps.
1024 if last_table.parent?
1025 for c, i in last_table.parent.children
1027 target = last_table.parent
1031 # 6. Let previous element be the element immediately above last
1032 # table in the stack of open elements.
1034 # huh? how could it not have a parent?
# "above" means one index higher in open_els
1035 previous_element = open_els[last_table_i + 1]
1036 # 7. Let adjusted insertion location be inside previous
1037 # element, after its last child (if any).
1038 target = previous_element
1039 target_i = target.children.length
1040 # Note: These steps are involved in part because it's possible
1041 # for elements, the table element in this case in particular,
1042 # to have been moved by a script around in the DOM, or indeed
1043 # removed from the DOM entirely, after the element was inserted
1045 break # don't really loop
1047 # Otherwise Let adjusted insertion location be inside target, after
1048 # its last child (if any).
1049 target_i = target.children.length
1051 # 3. If the adjusted insertion location is inside a template element,
1052 # let it instead be inside the template element's template contents,
1053 # after its last child (if any).
1054 # fixfull (template)
1056 # 4. Return the adjusted insertion location.
1057 return [target, target_i]
1059 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1060 # aka create_an_element_for_token
# Builds a TYPE_TAG Node from token t.
#   t: tag token; its type is overwritten with TYPE_TAG and its attrs_a list
#      is processed until it is empty
#   namespace: stored on the new Node
#   intended_parent: not used in the lines visible here
# NOTE(review): the return statement is not visible in this view -- callers
# use the result as the new element; confirm.
1061 token_to_element = (t, namespace, intended_parent) ->
1062 t.type = TYPE_TAG # not TYPE_START_TAG
1063 # convert attributes into a hash
1065 while t.attrs_a.length
# a is one [name, value] pair
1067 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1068 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1070 # TODO 2. If the newly created element has an xmlns attribute in the
1071 # XMLNS namespace whose value is not exactly the same as the element's
1072 # namespace, that is a parse error. Similarly, if the newly created
1073 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1074 # value is not the XLink Namespace, that is a parse error.
1076 # fixfull: the spec says stuff about form pointers and ownerDocument
1080 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for token in the given namespace and splices it into the
# children of the adjusted insertion location.
# NOTE(review): ail_el / ail_i are presumably the two halves of the
# [node, index] pair in ail -- their assignments are not visible here.
1081 insert_foreign_element = (token, namespace) ->
1082 ail = adjusted_insertion_location()
1085 el = token_to_element token, namespace, ail_el
1086 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1088 ail_el.children.splice ail_i, 0, el
1091 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Alias: insert_html_element refers to the very same function as
# insert_foreign_element, so it takes the same (token, namespace) arguments.
1092 insert_html_element = insert_foreign_element # (token, namespace) ->
1094 # FIXME read implement "foster parenting" part
1095 # FIXME read spec, do this right
1096 # FIXME implement the override target thing
1097 # note: this assumes it's an open tag
1098 # FIXME what part of the spec is this?
1099 # TODO look through all callers of this, and see what they should really be doing.
1100 # eg probably insert_html_element for tokens
# Inserts el at the adjusted insertion location.
#   el: an element, or a start-tag token (converted with token_to_element)
#   override_target: passed through to adjusted_insertion_location
#   namespace: namespace to give el
1101 tree_insert_element = (el, override_target = null, namespace = null) ->
1103 el.namespace = namespace
# dest is a [node, index] pair
1104 dest = adjusted_insertion_location override_target
1105 if el.type is TYPE_START_TAG # means it's a "token"
1106 el = token_to_element el, namespace, dest[0]
1107 unless el.namespace?
# fixed: this read `namespace = dest.namespace`, which looked up a property
# on the [node, index] array and only assigned a local that is never read
# again; take the namespace from the node el is inserted into instead
1108 el.namespace = dest[0].namespace
1109 # fixfull: Document nodes sometimes can't accept more children
1110 dest[0].children.splice dest[1], 0, el
1115 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1116 # position should be [node, index_within_children]
# Splices comment token t into position[0].children at index position[1].
# When position is omitted, the adjusted insertion location is used.
1117 insert_comment = (t, position = null) ->
1118 position ?= adjusted_insertion_location()
1119 position[0].children.splice position[1], 0, t
1122 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an element for token t, points the tokenizer at the rawtext state,
# saves the current insertion mode in original_insertion_mode, and switches
# to the text insertion mode.
1123 parse_generic_raw_text = (t) ->
1124 insert_html_element t
1125 tok_state = tok_state_rawtext
1126 original_insertion_mode = insertion_mode
1127 insertion_mode = ins_mode_text
# Same steps as parse_generic_raw_text, except that the tokenizer is pointed
# at the rcdata state.
1128 parse_generic_rcdata_text = (t) ->
1129 insert_html_element t
1130 tok_state = tok_state_rcdata
1131 original_insertion_mode = insertion_mode
1132 insertion_mode = ins_mode_text
1134 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1135 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops for as long as the current node (open_els[0]) has a name listed in
# end_tag_implied and that name is not `except`.
# NOTE(review): the loop body is not visible in this view -- presumably it
# removes the current node from open_els; confirm.
1136 generate_implied_end_tags = (except = null) ->
1137 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1140 # 8.2.5.4 The rules for parsing tokens in HTML content
1141 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1143 # 8.2.5.4.1 The "initial" insertion mode
1144 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handles token t in the "initial" insertion mode. Comment and doctype tokens
# are tested for explicitly; the final visible path switches to
# ins_mode_before_html and hands the same token to that mode.
1145 ins_mode_initial = (t) ->
1148 if t.type is TYPE_COMMENT
1149 # fixfull this is supposed to be "the last child of the document object"
1152 if t.type is TYPE_DOCTYPE
1156 insertion_mode = ins_mode_before_html
1159 #fixfull (iframe, quirks)
1160 insertion_mode = ins_mode_before_html
1161 insertion_mode t # reprocess the token
1164 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handles token t in the "before html" insertion mode. An explicit <html>
# start tag becomes the root element; otherwise an html token is synthesised
# with new_open_tag. Both paths move on to ins_mode_before_head.
1165 ins_mode_before_html = (t) ->
1166 if t.type is TYPE_DOCTYPE
1169 if t.type is TYPE_COMMENT
1174 if t.type is TYPE_START_TAG and t.name is 'html'
1175 el = token_to_element t, NS_HTML, doc
# unshift makes el the current node (index 0)
1176 open_els.unshift(el)
1177 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1178 insertion_mode = ins_mode_before_head
1180 if t.type is TYPE_END_TAG
1181 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1182 # fall through to "anything else"
# "anything else": create the html element without a token from the input
1187 html_tok = new_open_tag 'html'
1188 el = token_to_element html_tok, NS_HTML, doc
1189 doc.children.push el
1191 # ?fixfull browsing context
1192 insertion_mode = ins_mode_before_head
1196 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handles token t in the "before head" insertion mode. A <head> start tag is
# inserted and remembered in head_element_pointer; in the fallback a head
# token is synthesised, inserted the same way, and t is then reprocessed in
# ins_mode_in_head.
1197 ins_mode_before_head = (t) ->
1200 if t.type is TYPE_COMMENT
1203 if t.type is TYPE_DOCTYPE
1206 if t.type is TYPE_START_TAG and t.name is 'html'
1209 if t.type is TYPE_START_TAG and t.name is 'head'
1210 el = insert_html_element t
1211 head_element_pointer = el
1212 insertion_mode = ins_mode_in_head
1213 if t.type is TYPE_END_TAG
1214 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1215 # fall through to Anything else below
1220 head_tok = new_open_tag 'head'
1221 el = insert_html_element head_tok
1222 head_element_pointer = el
1223 insertion_mode = ins_mode_in_head
1224 insertion_mode t # reprocess current token
1226 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the in-head mode: removes the current node from
# open_els and switches to ins_mode_after_head.
# NOTE(review): whether t is reprocessed afterwards is not visible here.
1227 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1228 open_els.shift() # spec says this will be a 'head' node
1229 insertion_mode = ins_mode_after_head
# Handles token t in the "in head" insertion mode. Each `if` below matches
# one group of tokens; anything unmatched ends up in ins_mode_in_head_else.
1231 ins_mode_in_head = (t) ->
1232 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1235 if t.type is TYPE_COMMENT
1238 if t.type is TYPE_DOCTYPE
1241 if t.type is TYPE_START_TAG and t.name is 'html'
1244 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1245 el = insert_html_element t
1247 t.acknowledge_self_closing()
1249 if t.type is TYPE_START_TAG and t.name is 'meta'
1250 el = insert_html_element t
1252 t.acknowledge_self_closing()
1253 # fixfull encoding stuff
1255 if t.type is TYPE_START_TAG and t.name is 'title'
# fixed: called parse_generic_rcdata_element, but the helper is defined under
# the name parse_generic_rcdata_text
1256 parse_generic_rcdata_text t
1258 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1259 parse_generic_raw_text t
1261 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1262 insert_html_element t
# fixed: assigned in_head_noscript; the mode function is named
# ins_mode_in_head_noscript
1263 insertion_mode = ins_mode_in_head_noscript # FIXME implement
1265 if t.type is TYPE_START_TAG and t.name is 'script'
# ail is a [node, index] pair
1266 ail = adjusted_insertion_location()
# fixed: pass the node (ail[0]) as intended parent rather than the whole pair,
# matching the other token_to_element callers
1267 el = token_to_element t, NS_HTML, ail[0]
1268 el.flag_parser_inserted true # FIXME implement
1269 # fixfull fragment case
1270 ail[0].children.splice ail[1], 0, el
1272 tok_state = tok_state_script_data
1273 original_insertion_mode = insertion_mode # make sure orig... is defined
1274 insertion_mode = ins_mode_text # FIXME implement
1276 if t.type is TYPE_END_TAG and t.name is 'head'
1277 open_els.shift() # will be a head element... spec says so
1278 insertion_mode = ins_mode_after_head
1280 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1281 ins_mode_in_head_else t
1283 if t.type is TYPE_START_TAG and t.name is 'template'
1284 insert_html_element t
1286 flag_frameset_ok = false
1287 insertion_mode = ins_mode_in_template
1288 template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1290 if t.type is TYPE_END_TAG and t.name is 'template'
1291 if template_tag_is_open()
# fixed: a bare name only references the function; the parentheses are
# needed to actually call it when there are no arguments
1292 generate_implied_end_tags()
1293 if open_els[0].name isnt 'template'
1296 el = open_els.shift()
1297 if el.name is 'template'
1299 clear_afe_to_marker()
1300 template_insertion_modes.shift()
1301 reset_insertion_mode()
1305 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1308 ins_mode_in_head_else t
1310 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# Placeholder for the "in head noscript" insertion mode: the visible body only
# logs that the mode is unimplemented.
1311 ins_mode_in_head_noscript = (t) ->
1313 console.log "ins_mode_in_head_noscript unimplemented"
1315 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the after-head mode: inserts a synthesised body
# element, switches to ins_mode_in_body, and hands t to that mode.
1316 ins_mode_after_head_else = (t) ->
1317 body_tok = new_open_tag 'body'
1318 insert_html_element body_tok
1319 insertion_mode = ins_mode_in_body
1320 insertion_mode t # reprocess token
# Handles token t in the "after head" insertion mode. <body> and <frameset>
# start tags are inserted and select the next mode; head-only elements are
# processed with the head element temporarily back on the stack; unmatched
# tokens go to ins_mode_after_head_else.
1322 ins_mode_after_head = (t) ->
1326 if t.type is TYPE_COMMENT
1329 if t.type is TYPE_DOCTYPE
1332 if t.type is TYPE_START_TAG and t.name is 'html'
1335 if t.type is TYPE_START_TAG and t.name is 'body'
1336 insert_html_element t
1337 flag_frameset_ok = false
1338 insertion_mode = ins_mode_in_body
1340 if t.type is TYPE_START_TAG and t.name is 'frameset'
1341 insert_html_element t
1342 insertion_mode = ins_mode_in_frameset
1344 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# put the head element back on the stack as the current node
1346 open_els.unshift head_element_pointer
# fixed: `for el, i of open_els` yields (key, value), so el was an index
# string and never equal to head_element_pointer; `in` yields (value, index)
1348 for el, i in open_els
1349 if el is head_element_pointer
1350 open_els.splice i, 1
1352 console.log "warning: 23904 couldn't find head element in open_els"
1354 if t.type is TYPE_END_TAG and t.name is 'template'
1357 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1358 ins_mode_after_head_else t
1360 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1364 ins_mode_after_head_else t
1366 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the in-body mode.
#   name: tag name of the end tag being processed
# Walks open_els starting at the current node looking for an element called
# name; a match that is not the current node (i isnt 0) is a parse error.
1367 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1368 for node, i in open_els
1369 if node.name is name # FIXME check namespace too
1370 generate_implied_end_tags name # arg is exception
1371 parse_error() unless i is 0
1376 if special_elements[node.name]? # FIXME check namespace too
# Handles token t in the "in body" insertion mode. Tokens are dispatched
# through `when` clauses (whitespace characters first, then tag names); the
# TODO notes mark tag names that are not handled yet.
1379 ins_mode_in_body = (t) ->
1385 when "\t", "\u000a", "\u000c", "\u000d", ' '
1386 reconstruct_active_formatting_elements()
1389 reconstruct_active_formatting_elements()
1391 flag_frameset_ok = false
1400 return if template_tag_is_open()
# the root element sits at the highest index of open_els
1401 root_attrs = open_els[open_els.length - 1].attrs
# only attributes the root does not already have are copied over
1403 root_attrs[k] = v unless root_attrs[k]?
1404 when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1405 # FIXME also do this for </template> (end tag)
1406 return ins_mode_in_head t
1413 when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1414 close_p_if_in_button_scope()
1415 insert_html_element t
1416 when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1417 close_p_if_in_button_scope()
1418 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1421 insert_html_element t
1422 # TODO lots more to implement here
1424 # If the list of active formatting elements
1425 # contains an a element between the end of the list and
1426 # the last marker on the list (or the start of the list
1427 # if there is no marker on the list), then this is a
1428 # parse error; run the adoption agency algorithm for
1429 # the tag name "a", then remove that element from the
1430 # list of active formatting elements and the stack of
1431 # open elements if the adoption agency algorithm didn't
1432 # already remove it (it might not have if the element
1433 # is not in table scope).
1436 if el.type is TYPE_AFE_MARKER
1446 for el, i in open_els
1448 open_els.splice i, 1
1449 reconstruct_active_formatting_elements()
1450 el = insert_html_element t
1452 when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1453 reconstruct_active_formatting_elements()
1454 el = insert_html_element t
1457 # fixfull quirksmode thing
1458 close_p_if_in_button_scope()
1459 insert_html_element t
1460 insertion_mode = ins_mode_in_table
1461 # TODO lots more to implement here
1462 else # any other start tag
1463 reconstruct_active_formatting_elements()
1464 insert_html_element t
1467 dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1468 tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1471 unless ok_tags[t.name]?
1474 # TODO stack of template insertion modes thing
1479 unless is_in_scope 'body'
1482 # TODO implement parse error and move to tree_after_body
1484 unless is_in_scope 'body' # weird, but it's what the spec says
1487 # TODO implement parse error and move to tree_after_body, reprocess
1488 when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1489 unless is_in_scope t.name, NS_HTML
1492 generate_implied_end_tags()
1493 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1496 el = open_els.shift()
1497 if el.name is t.name and el.namespace is NS_HTML
1499 # TODO lots more close tags to implement here
1501 unless is_in_button_scope 'p'
# a p element is synthesised when none is in button scope
1503 insert_html_element new_open_tag 'p'
1505 # TODO lots more close tags to implement here
1506 when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1507 adoption_agency t.name
1508 # TODO lots more close tags to implement here
1510 in_body_any_other_end_tag t.name
# "Anything else" branch of the in-table mode: flag_foster_parenting is set to
# true and afterwards back to false.
# NOTE(review): the step performed while the flag is true is not visible in
# this view -- presumably the token is processed by the in-body rules; confirm.
1513 ins_mode_in_table_else = (t) ->
1515 flag_foster_parenting = true # FIXME
1517 flag_foster_parenting = false
# Lookup table keyed by element name; ins_mode_in_table indexes it with
# t.name. NOTE(review): its entries are not visible in this view.
1518 can_in_table = { # FIXME do this inline like everywhere else
1526 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Handles token t in the "text" insertion mode (entered by the generic raw
# text / rcdata helpers). The EOF and end-tag paths visible here restore
# insertion_mode from original_insertion_mode.
1527 ins_mode_text = (t) ->
1528 if t.type is TYPE_TEXT
1531 if t.type is TYPE_EOF
1533 if open_els[0].name is 'script'
1534 open_els[0].flag 'already started', true
1536 insertion_mode = original_insertion_mode
1539 if t.type is TYPE_END_TAG and t.name is 'script'
1541 insertion_mode = original_insertion_mode
1542 # fixfull the spec seems to assume that I'm going to run the script
1543 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1545 if t.type is TYPE_END_TAG
1547 insertion_mode = original_insertion_mode
# reaching this line means no branch above matched t
1549 console.log 'warning: end of ins_mode_text reached'
1551 # the functions below implement the tokenizer states described here:
1552 # http://www.w3.org/TR/html5/syntax.html#tokenization
1554 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Handles token t in the "in table" insertion mode. Table-structure start
# tags clear the stack back to a table context, insert the element (or a
# synthesised colgroup / tbody) and select the matching mode; tokens with no
# rule of their own are passed to ins_mode_in_table_else.
1555 ins_mode_in_table = (t) ->
1558 if can_in_table[t.name]
# remember the current mode so the table-text mode can return to it
1559 original_insertion_mode = insertion_mode
1560 insertion_mode = ins_mode_in_table_text
1563 ins_mode_in_table_else t
1571 clear_stack_to_table_context()
1573 insert_html_element t
1574 insertion_mode = ins_mode_in_caption
1576 clear_stack_to_table_context()
1577 insert_html_element t
1578 insertion_mode = ins_mode_in_column_group
1580 clear_stack_to_table_context()
1581 insert_html_element new_open_tag 'colgroup'
1582 insertion_mode = ins_mode_in_column_group
1584 when 'tbody', 'tfoot', 'thead'
1585 clear_stack_to_table_context()
1586 insert_html_element t
1587 insertion_mode = ins_mode_in_table_body
1588 when 'td', 'th', 'tr'
1589 clear_stack_to_table_context()
# a tbody is synthesised for cells/rows that appear directly in a table
1590 insert_html_element new_open_tag 'tbody'
1591 insertion_mode = ins_mode_in_table_body
1595 if is_in_table_scope 'table'
1597 el = open_els.shift()
1598 if el.name is 'table'
1600 reset_insertion_mode()
1602 when 'style', 'script', 'template'
1605 if token_is_input_hidden t
1606 ins_mode_in_table_else t
1609 el = insert_html_element t
1611 t.acknowledge_self_closing()
1614 if form_element_pointer?
1616 if template_tag_is_open()
1618 form_element_pointer = insert_html_element t
1621 ins_mode_in_table_else t
1625 if is_in_table_scope 'table'
1627 el = open_els.shift()
1628 if el.name is 'table'
1630 reset_insertion_mode()
1633 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1638 ins_mode_in_table_else t
1642 ins_mode_in_table_else t
1645 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Handles token t in the "in table text" insertion mode. Text tokens are
# queued in pending_table_character_tokens; on any other token the queue is
# flushed (inserted as characters, or routed through the in-table "anything
# else" rules), emptied, and the saved insertion mode is restored.
1646 ins_mode_in_table_text = (t) ->
1647 if t.type is TYPE_TEXT and t.text is "\u0000"
1648 # huh? I thought the tokenizer didn't emit these
1651 if t.type is TYPE_TEXT
1652 pending_table_character_tokens.push t
1656 for old in pending_table_character_tokens
1657 unless is_space_tok old
1661 for old in pending_table_character_tokens
1662 insert_character old
1664 for old in pending_table_character_tokens
# fixed: called ins_mode_table_else; the function is named
# ins_mode_in_table_else
1665 ins_mode_in_table_else old
1666 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1667 insertion_mode = original_insertion_mode
1670 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Handles token t in the "in caption" insertion mode. Closing the caption
# (explicitly, or implicitly through a table-structure tag) shifts elements
# off open_els, clears the active formatting elements up to the last marker,
# and returns to the in-table mode.
1671 ins_mode_in_caption = (t) ->
1672 if t.type is TYPE_END_TAG and t.name is 'caption'
1673 if is_in_table_scope 'caption'
1674 generate_implied_end_tags()
1675 if open_els[0].name isnt 'caption'
1678 el = open_els.shift()
1679 if el.name is 'caption'
1681 clear_afe_to_marker()
# fixed: assigned in_table; the mode function is named ins_mode_in_table
1682 insertion_mode = ins_mode_in_table
1687 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1689 if is_in_table_scope 'caption'
1691 el = open_els.shift()
1692 if el.name is 'caption'
1694 clear_afe_to_marker()
# fixed: same in_table -> ins_mode_in_table correction as above
1695 insertion_mode = ins_mode_in_table
1697 # else fragment case
1699 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1705 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Handles token t in the "in column group" insertion mode. <col> start tags
# are inserted and their self-closing flag acknowledged; the visible exits
# switch back to ins_mode_in_table.
1706 ins_mode_in_column_group = (t) ->
1710 if t.type is TYPE_COMMENT
1713 if t.type is TYPE_DOCTYPE
1716 if t.type is TYPE_START_TAG and t.name is 'html'
1719 if t.type is TYPE_START_TAG and t.name is 'col'
1720 el = insert_html_element t
1722 t.acknowledge_self_closing()
1724 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1725 if open_els[0].name is 'colgroup'
1727 insertion_mode = ins_mode_in_table
1731 if t.type is TYPE_END_TAG and t.name is 'col'
1734 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1737 if t.type is TYPE_EOF
1741 if open_els[0].name isnt 'colgroup'
1745 insertion_mode = ins_mode_in_table
1749 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Handles token t in the "in table body" insertion mode. A <tr> is inserted
# directly; a bare <td>/<th> first gets a synthesised tr. Both switch to
# ins_mode_in_row. Closing the section returns to ins_mode_in_table.
1750 ins_mode_in_table_body = (t) ->
1751 if t.type is TYPE_START_TAG and t.name is 'tr'
1752 clear_stack_to_table_body_context()
1753 insert_html_element t
1754 insertion_mode = ins_mode_in_row
1756 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1758 clear_stack_to_table_body_context()
1759 insert_html_element new_open_tag 'tr'
1760 insertion_mode = ins_mode_in_row
1763 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1764 unless is_in_table_scope t.name # fixfull check namespace
1767 clear_stack_to_table_body_context()
1769 insertion_mode = ins_mode_in_table
1771 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1774 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1777 if table_scopers[el.name]
1782 clear_stack_to_table_body_context()
1784 insertion_mode = ins_mode_in_table
1787 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1793 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Handles token t in the "in row" insertion mode. Cell start tags are
# inserted and switch to ins_mode_in_cell; the paths that end the row clear
# the stack to a row context and switch to ins_mode_in_table_body.
1794 ins_mode_in_row = (t) ->
1795 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1796 clear_stack_to_table_row_context()
1797 insert_html_element t
1798 insertion_mode = ins_mode_in_cell
1801 if t.type is TYPE_END_TAG and t.name is 'tr'
1802 if is_in_table_scope 'tr'
1803 clear_stack_to_table_row_context()
1805 insertion_mode = ins_mode_in_table_body
1809 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1810 if is_in_table_scope 'tr'
1811 clear_stack_to_table_row_context()
1813 insertion_mode = ins_mode_in_table_body
1818 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1819 if is_in_table_scope t.name # fixfull namespace
1820 if is_in_table_scope 'tr'
1821 clear_stack_to_table_row_context()
1823 insertion_mode = ins_mode_in_table_body
1828 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1834 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Closes the open table cell: generates implied end tags, shifts elements off
# open_els (the td/th name is tested on each one), clears the active
# formatting elements up to the last marker, and switches to ins_mode_in_row.
# NOTE(review): the function header is not visible in this view.
1836 generate_implied_end_tags()
# fixed: the second test compared the node itself to 'th' (never true for a
# node object); compare its name, like the 'td' test beside it
1837 unless open_els[0].name is 'td' or open_els[0].name is 'th'
1840 el = open_els.shift()
1841 if el.name is 'td' or el.name is 'th'
1843 clear_afe_to_marker()
1844 insertion_mode = ins_mode_in_row
1846 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Handles token t in the "in cell" insertion mode. A matching </td> or </th>
# that is in table scope closes the cell: implied end tags, shifting elements
# off open_els, clearing active formatting elements to the last marker, then
# back to ins_mode_in_row.
1847 ins_mode_in_cell = (t) ->
1848 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1849 if is_in_table_scope t.name
1850 generate_implied_end_tags()
1851 if open_els[0].name isnt t.name
1854 el = open_els.shift()
1855 if el.name is t.name
1857 clear_afe_to_marker()
1858 insertion_mode = ins_mode_in_row
1862 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1865 if el.name is 'td' or el.name is 'th'
1868 if table_scopers[el.name]
1876 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1879 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1880 if is_in_table_scope t.name # fixfull namespace
1889 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Handles token t in the "in select" insertion mode. option / optgroup tags
# are matched against the current node (open_els[0]); the paths that leave
# the select shift elements off open_els (testing for the 'select' name) and
# then call reset_insertion_mode().
1890 ins_mode_in_select = (t) ->
1891 if t.type is TYPE_TEXT and t.text is "\u0000"
1894 if t.type is TYPE_TEXT
1897 if t.type is TYPE_COMMENT
1900 if t.type is TYPE_DOCTYPE
1903 if t.type is TYPE_START_TAG and t.name is 'html'
1906 if t.type is TYPE_START_TAG and t.name is 'option'
1907 if open_els[0].name is 'option'
1909 insert_html_element t
1911 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1912 if open_els[0].name is 'option'
1914 if open_els[0].name is 'optgroup'
1916 insert_html_element t
1918 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# open_els[1] is the element just above the current node
1919 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1921 if open_els[0].name is 'optgroup'
1926 if t.type is TYPE_END_TAG and t.name is 'option'
1927 if open_els[0].name is 'option'
1932 if t.type is TYPE_END_TAG and t.name is 'select'
1933 if is_in_select_scope 'select'
1935 el = open_els.shift()
1936 if el.name is 'select'
1938 reset_insertion_mode()
1942 if t.type is TYPE_START_TAG and t.name is 'select'
1945 el = open_els.shift()
1946 if el.name is 'select'
1948 reset_insertion_mode()
1949 # spec says that this is the same as </select> but it doesn't say
1950 # to check scope first
1952 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1954 if is_in_select_scope 'select'
1957 el = open_els.shift()
1958 if el.name is 'select'
1960 reset_insertion_mode()
1963 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1966 if t.type is TYPE_EOF
1973 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Handles token t in the "in select in table" insertion mode. Table-structure
# start and end tags shift elements off open_els (testing for 'select') and
# reset the insertion mode; every other token is delegated to
# ins_mode_in_select.
1974 ins_mode_in_select_in_table = (t) ->
1975 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1978 el = open_els.shift()
1979 if el.name is 'select'
1981 reset_insertion_mode()
1984 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1986 unless is_in_table_scope t.name, NS_HTML
1989 el = open_els.shift()
1990 if el.name is 'select'
1992 reset_insertion_mode()
1996 ins_mode_in_select t
1999 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Handles token t in the "in template" insertion mode. For start tags the
# entry at the front of template_insertion_modes is replaced (shift, then
# unshift) with the mode that suits the tag, and insertion_mode is set to that
# same mode.
2000 ins_mode_in_template = (t) ->
2001 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2004 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2007 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2008 template_insertion_modes.shift()
2009 template_insertion_modes.unshift ins_mode_in_table
2010 insertion_mode = ins_mode_in_table
2013 if t.type is TYPE_START_TAG and t.name is 'col'
2014 template_insertion_modes.shift()
2015 template_insertion_modes.unshift ins_mode_in_column_group
2016 insertion_mode = ins_mode_in_column_group
2019 if t.type is TYPE_START_TAG and t.name is 'tr'
2020 template_insertion_modes.shift()
2021 template_insertion_modes.unshift ins_mode_in_table_body
2022 insertion_mode = ins_mode_in_table_body
2025 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2026 template_insertion_modes.shift()
2027 template_insertion_modes.unshift ins_mode_in_row
2028 insertion_mode = ins_mode_in_row
# any start tag not matched above
2031 if t.type is TYPE_START_TAG
2032 template_insertion_modes.shift()
2033 template_insertion_modes.unshift ins_mode_in_body
2034 insertion_mode = ins_mode_in_body
2037 if t.type is TYPE_END_TAG
2041 unless template_tag_is_open()
2046 el = open_els.shift()
2047 if el.name is 'template' # fixfull check namespace
2049 clear_afe_to_marker()
2050 template_insertion_modes.shift()
2051 reset_insertion_mode()
2054 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Handles token t in the "after body" insertion mode. Comments are appended
# to the html element; </html> moves on to ins_mode_after_after_body; the
# final visible path switches back to ins_mode_in_body.
2055 ins_mode_after_body = (t) ->
2059 if t.type is TYPE_COMMENT
# fixed: the spec puts this comment in "the first element in the stack of
# open elements (the html element)". In this file the first element lives at
# the highest index, while open_els[0] is the current node.
2060 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2062 if t.type is TYPE_DOCTYPE
2065 if t.type is TYPE_START_TAG and t.name is 'html'
2068 if t.type is TYPE_END_TAG and t.name is 'html'
2069 # fixfull fragment case
2070 insertion_mode = ins_mode_after_after_body
2072 if t.type is TYPE_EOF
2077 insertion_mode = ins_mode_in_body
2080 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Handles token t in the "in frameset" insertion mode. <frameset> and <frame>
# start tags are inserted (frame also acknowledges its self-closing flag);
# </frameset> can lead on to ins_mode_after_frameset.
2081 ins_mode_in_frameset = (t) ->
2085 if t.type is TYPE_COMMENT
2088 if t.type is TYPE_DOCTYPE
2091 if t.type is TYPE_START_TAG and t.name is 'html'
2094 if t.type is TYPE_START_TAG and t.name is 'frameset'
2095 insert_html_element t
2097 if t.type is TYPE_END_TAG and t.name is 'frameset'
2098 # TODO ?correct for: "if the current node is the root html element"
2099 if open_els.length is 1
2101 return # fragment case
2103 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2104 insertion_mode = ins_mode_after_frameset
2106 if t.type is TYPE_START_TAG and t.name is 'frame'
2107 insert_html_element t
2109 t.acknowledge_self_closing()
# fixed: `TYPE_START TAG` (with a space) parses as a call TYPE_START(TAG); the
# constant is TYPE_START_TAG
2111 if t.type is TYPE_START_TAG and t.name is 'noframes'
2114 if t.type is TYPE_EOF
2115 # TODO ?correct for: "if the current node is not the root html element"
2116 if open_els.length isnt 1
2124 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Handles token t in the "after frameset" insertion mode. </html> moves on to
# ins_mode_after_after_frameset.
2125 ins_mode_after_frameset = (t) ->
2129 if t.type is TYPE_COMMENT
2132 if t.type is TYPE_DOCTYPE
2135 if t.type is TYPE_START_TAG and t.name is 'html'
2138 if t.type is TYPE_END_TAG and t.name is 'html'
# fixed: assigned to `insert_mode`, which only creates an unused variable;
# the parser's mode variable is insertion_mode
2139 insertion_mode = ins_mode_after_after_frameset
2141 if t.type is TYPE_START_TAG and t.name is 'noframes'
2144 if t.type is TYPE_EOF
2151 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Handles token t in the "after after body" insertion mode. Comments are
# appended to doc's children; the final visible path switches back to
# ins_mode_in_body.
2152 ins_mode_after_after_body = (t) ->
2153 if t.type is TYPE_COMMENT
2154 insert_comment t, [doc, doc.children.length]
2156 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2159 if t.type is TYPE_EOF
2164 insertion_mode = ins_mode_in_body
2167 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Handles token t in the "after after frameset" insertion mode. Comments are
# appended to doc's children; doctype, whitespace, <html>, EOF and <noframes>
# tokens each have their own test.
2168 ins_mode_after_after_frameset = (t) ->
2169 if t.type is TYPE_COMMENT
2170 insert_comment t, [doc, doc.children.length]
2172 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2175 if t.type is TYPE_EOF
2178 if t.type is TYPE_START_TAG and t.name is 'noframes'
2189 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Reads one character of txt (advancing cur) and dispatches on it. The visible
# branches return a text node or an EOF token, or switch to the tag-open state.
# NOTE(review): the function header and the `when` labels are not visible in
# this view.
2191 switch c = txt.charAt(cur++)
2193 return new_text_node parse_character_reference()
2195 tok_state = tok_state_tag_open
2198 return new_text_node c
2200 return new_eof_token()
2202 return new_text_node c
2205 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2206 # not needed: tok_state_character_reference_in_data = ->
2207 # just call parse_character_reference()
2209 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Reads one character of txt (advancing cur) and dispatches on it. The visible
# branches parse a character reference, switch to the rcdata-less-than-sign
# state, or return a U+FFFD character token, an EOF token, or c itself.
2210 tok_state_rcdata = ->
2211 switch c = txt.charAt(cur++)
2213 return new_text_node parse_character_reference()
2215 tok_state = tok_state_rcdata_less_than_sign
2218 return new_character_token "\ufffd"
2220 return new_eof_token()
2222 return new_character_token c
2225 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2226 # not needed: tok_state_character_reference_in_rcdata = ->
2227 # just call parse_character_reference()
2229 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer handler for the RAWTEXT state: consumes one character from txt
# (post-incrementing cur) and dispatches on it. Unlike the RCDATA handler, no
# branch here calls parse_character_reference.
# NOTE(review): the `when` labels are not visible in this excerpt.
2230 tok_state_rawtext = ->
2231 switch c = txt.charAt(cur++)
2233 tok_state = tok_state_rawtext_less_than_sign
# U+FFFD REPLACEMENT CHARACTER is emitted in place of the consumed character
2236 return new_character_token "\ufffd"
2238 return new_eof_token()
2240 return new_character_token c
2243 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer handler for the script data state: consumes one character from txt
# (post-incrementing cur) and dispatches on it.
# NOTE(review): the `when` labels are not visible in this excerpt, and
# tok_state_script_data_less_than_sign is not defined in this excerpt (a later
# TODO says the script states are missing) -- confirm it exists.
2244 tok_state_script_data = ->
2245 switch c = txt.charAt(cur++)
2247 tok_state = tok_state_script_data_less_than_sign
# U+FFFD REPLACEMENT CHARACTER is emitted in place of the consumed character
2250 return new_character_token "\ufffd"
2252 return new_eof_token()
2254 return new_character_token c
2257 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer handler for the PLAINTEXT state: consumes one character from txt
# (post-incrementing cur) and returns a token for it. No visible branch changes
# tok_state.
# NOTE(review): the `when` labels are not visible in this excerpt.
2258 tok_state_plaintext = ->
2259 switch c = txt.charAt(cur++)
# U+FFFD REPLACEMENT CHARACTER is emitted in place of the consumed character
2262 return new_character_token "\ufffd"
2264 return new_eof_token()
2266 return new_character_token c
2270 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer handler for the tag open state (the character after '<').
# Consumes one character from txt (post-incrementing cur) and either switches
# to another state or, as a fallback, emits a literal '<'.
# NOTE(review): the `when` labels are not visible in this excerpt.
2271 tok_state_tag_open = ->
2272 switch c = txt.charAt(cur++)
2274 tok_state = tok_state_markup_declaration_open
2276 tok_state = tok_state_end_tag_open
2279 tok_state = tok_state_bogus_comment
# an ASCII letter starts a new open-tag token; its name is stored lowercased
2281 if lc_alpha.indexOf(c) > -1
2282 tok_cur_tag = new_open_tag c
2283 tok_state = tok_state_tag_name
2284 else if uc_alpha.indexOf(c) > -1
2285 tok_cur_tag = new_open_tag c.toLowerCase()
2286 tok_state = tok_state_tag_name
# fallback: return to the data state, un-consume the character and emit the '<'
# as text
2289 tok_state = tok_state_data
2290 cur -= 1 # we didn't parse/handle the char after <
2291 return new_text_node '<'
2294 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer handler for the end tag open state (the character after '</').
# Consumes one character from txt (post-incrementing cur) and dispatches on it.
# NOTE(review): the `when` labels are not visible in this excerpt.
2295 tok_state_end_tag_open = ->
2296 switch c = txt.charAt(cur++)
2299 tok_state = tok_state_data
2302 tok_state = tok_state_data
# the already-consumed '</' is emitted as text
2303 return new_text_node '</'
# an ASCII letter starts a new end-tag token; its name is stored lowercased
2305 if uc_alpha.indexOf(c) > -1
2306 tok_cur_tag = new_end_tag c.toLowerCase()
2307 tok_state = tok_state_tag_name
2308 else if lc_alpha.indexOf(c) > -1
2309 tok_cur_tag = new_end_tag c
2310 tok_state = tok_state_tag_name
2313 tok_state = tok_state_bogus_comment
2316 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer handler for the tag name state: consumes one character from txt
# (post-incrementing cur) and either ends the name (switching state) or appends
# to tok_cur_tag.name.
# NOTE(review): most `when` labels and any token returns are not visible in
# this excerpt.
2317 tok_state_tag_name = ->
2318 switch c = txt.charAt(cur++)
# tab, line feed, form feed or space ends the tag name
2319 when "\t", "\n", "\u000c", ' '
2320 tok_state = tok_state_before_attribute_name
2322 tok_state = tok_state_self_closing_start_tag
2324 tok_state = tok_state_data
# U+FFFD REPLACEMENT CHARACTER is appended to the name
2330 tok_cur_tag.name += "\ufffd"
2333 tok_state = tok_state_data
# uppercase ASCII letters are appended lowercased; other characters as-is
2335 if uc_alpha.indexOf(c) > -1
2336 tok_cur_tag.name += c.toLowerCase()
2338 tok_cur_tag.name += c
2341 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer handler for the RCDATA less-than sign state (the character after a
# '<' seen in RCDATA). Consumes one character from txt (post-incrementing cur).
# NOTE(review): the condition guarding the first branch is not visible in this
# excerpt.
2342 tok_state_rcdata_less_than_sign = ->
2343 c = txt.charAt(cur++)
# possible end tag: reset the buffer that records the raw characters of its name
2345 temporary_buffer = ''
2346 tok_state = tok_state_rcdata_end_tag_open
# otherwise: back to RCDATA, and the '<' is emitted as a character token
2349 tok_state = tok_state_rcdata
2350 cur -= 1 # reconsume the input character
2351 return new_character_token '<'
2353 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer handler for the RCDATA end tag open state (the character after
# '</' seen in RCDATA). Consumes one character from txt (post-incrementing cur).
# An ASCII letter starts an end-tag token: the token name gets the lowercased
# letter, while temporary_buffer gets the letter exactly as written.
# NOTE(review): the returns that end the two letter branches are not visible in
# this excerpt.
2354 tok_state_rcdata_end_tag_open = ->
2355 c = txt.charAt(cur++)
2356 if uc_alpha.indexOf(c) > -1
2357 tok_cur_tag = new_end_tag c.toLowerCase()
2358 temporary_buffer += c
2359 tok_state = tok_state_rcdata_end_tag_name
2361 if lc_alpha.indexOf(c) > -1
2362 tok_cur_tag = new_end_tag c
2363 temporary_buffer += c
2364 tok_state = tok_state_rcdata_end_tag_name
# not a letter: back to RCDATA, and the '</' is emitted as one character token
2367 tok_state = tok_state_rcdata
2368 cur -= 1 # reconsume the input character
2369 return new_character_token "</" # fixfull separate these
2371 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of
# open_els[0]; false otherwise. Also writes the token type/name and a
# serialization of open_els to the debug log on every call.
# NOTE(review): reads open_els[0].name unconditionally, so it relies on
# open_els being non-empty -- confirm callers guarantee that.
2372 is_appropriate_end_tag = (t) ->
2373 # spec says to check against "the tag name of the last start tag to
2374 # have been emitted from this tokenizer", but this is only called from
2375 # the various "raw" states, which I'm pretty sure all push the start
2376 # token onto open_els. TODO: verify this after the script data states
2378 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2379 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2381 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer handler for the RCDATA end tag name state. Consumes one character
# from txt (post-incrementing cur). A terminator character only ends the tag
# when is_appropriate_end_tag accepts tok_cur_tag; otherwise control falls
# through to the letter / "anything else" handling at the bottom.
# NOTE(review): the conditions of the second and third terminator branches and
# the returns inside the branches are not visible in this excerpt.
2382 tok_state_rcdata_end_tag_name = ->
2383 c = txt.charAt(cur++)
2384 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2385 if is_appropriate_end_tag tok_cur_tag
2386 tok_state = tok_state_before_attribute_name
2388 # else fall through to "Anything else"
2390 if is_appropriate_end_tag tok_cur_tag
2391 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2393 # else fall through to "Anything else"
2395 if is_appropriate_end_tag tok_cur_tag
2396 tok_state = tok_state_data
2398 # else fall through to "Anything else"
# letters extend the tag name (lowercased) and the raw temporary_buffer (as-is)
2399 if uc_alpha.indexOf(c) > -1
2400 tok_cur_tag.name += c.toLowerCase()
2401 temporary_buffer += c
2403 if lc_alpha.indexOf(c) > -1
2404 tok_cur_tag.name += c
2405 temporary_buffer += c
# anything else: not an end tag after all; back to RCDATA and emit the consumed
# '</' plus the buffered raw characters as one character token
2408 tok_state = tok_state_rcdata
2409 cur -= 1 # reconsume the input character
2410 return new_character_token '</' + temporary_buffer # fixfull separate these
2412 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer handler for the RAWTEXT less-than sign state (the character after a
# '<' seen in RAWTEXT). Consumes one character from txt (post-incrementing cur).
# NOTE(review): the condition guarding the first branch is not visible in this
# excerpt.
2413 tok_state_rawtext_less_than_sign = ->
2414 c = txt.charAt(cur++)
# possible end tag: reset the buffer that records the raw characters of its name
2416 temporary_buffer = ''
2417 tok_state = tok_state_rawtext_end_tag_open
# otherwise: back to RAWTEXT, and the '<' is emitted as a character token
2420 tok_state = tok_state_rawtext
2421 cur -= 1 # reconsume the input character
2422 return new_character_token '<'
2424 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer handler for the RAWTEXT end tag open state (the character after
# '</' seen in RAWTEXT). Consumes one character from txt (post-incrementing
# cur). An ASCII letter starts an end-tag token: the token name gets the
# lowercased letter, while temporary_buffer gets the letter exactly as written.
# NOTE(review): the returns that end the two letter branches are not visible in
# this excerpt.
2425 tok_state_rawtext_end_tag_open = ->
2426 c = txt.charAt(cur++)
2427 if uc_alpha.indexOf(c) > -1
2428 tok_cur_tag = new_end_tag c.toLowerCase()
2429 temporary_buffer += c
2430 tok_state = tok_state_rawtext_end_tag_name
2432 if lc_alpha.indexOf(c) > -1
2433 tok_cur_tag = new_end_tag c
2434 temporary_buffer += c
2435 tok_state = tok_state_rawtext_end_tag_name
# not a letter: back to RAWTEXT, and the '</' is emitted as one character token
2438 tok_state = tok_state_rawtext
2439 cur -= 1 # reconsume the input character
2440 return new_character_token "</" # fixfull separate these
2442 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer handler for the RAWTEXT end tag name state. Consumes one character
# from txt (post-incrementing cur). A terminator character only ends the tag
# when is_appropriate_end_tag accepts tok_cur_tag; otherwise control falls
# through to the letter / "anything else" handling at the bottom.
# NOTE(review): the conditions of the second and third terminator branches and
# the returns inside the branches are not visible in this excerpt.
2443 tok_state_rawtext_end_tag_name = ->
2444 c = txt.charAt(cur++)
2445 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2446 if is_appropriate_end_tag tok_cur_tag
2447 tok_state = tok_state_before_attribute_name
2449 # else fall through to "Anything else"
2451 if is_appropriate_end_tag tok_cur_tag
2452 tok_state = tok_state_self_closing_start_tag
2454 # else fall through to "Anything else"
2456 if is_appropriate_end_tag tok_cur_tag
2457 tok_state = tok_state_data
2459 # else fall through to "Anything else"
# letters extend the tag name (lowercased) and the raw temporary_buffer (as-is)
2460 if uc_alpha.indexOf(c) > -1
2461 tok_cur_tag.name += c.toLowerCase()
2462 temporary_buffer += c
2464 if lc_alpha.indexOf(c) > -1
2465 tok_cur_tag.name += c
2466 temporary_buffer += c
# anything else: not an end tag after all; back to RAWTEXT and emit the consumed
# '</' plus the buffered raw characters as one character token
2469 tok_state = tok_state_rawtext
2470 cur -= 1 # reconsume the input character
2471 return new_character_token '</' + temporary_buffer # fixfull separate these
2473 # TODO _all_ of the missing states here (17-33) are for parsing script tags
2475 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer handler for the before attribute name state. Consumes one character
# from txt (post-incrementing cur); when the character starts an attribute, a
# new [name, value] pair with an empty value is inserted at the FRONT of
# tok_cur_tag.attrs_a, so attrs_a[0] is the attribute currently being built.
# NOTE(review): several `when` labels, branch bodies and returns are not
# visible in this excerpt.
2476 tok_state_before_attribute_name = ->
2478 switch c = txt.charAt(cur++)
2479 when "\t", "\n", "\u000c", ' '
2482 tok_state = tok_state_self_closing_start_tag
2485 tok_state = tok_state_data
# attribute name starts with U+FFFD REPLACEMENT CHARACTER
2491 attr_name = "\ufffd"
2492 when '"', "'", '<', '='
2497 tok_state = tok_state_data
# uppercase ASCII letters are stored lowercased
2499 if uc_alpha.indexOf(c) > -1
2500 attr_name = c.toLowerCase()
2504 tok_cur_tag.attrs_a.unshift [attr_name, '']
2505 tok_state = tok_state_attribute_name
2508 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer handler for the attribute name state. Consumes one character from
# txt (post-incrementing cur) and either ends the attribute name (switching
# state) or APPENDS to the name of the current attribute, which is stored at
# tok_cur_tag.attrs_a[0][0].
# NOTE(review): most `when` labels and any token returns are not visible in
# this excerpt.
2509 tok_state_attribute_name = ->
2510 switch c = txt.charAt(cur++)
# tab, line feed, form feed or space ends the attribute name
2511 when "\t", "\n", "\u000c", ' '
2512 tok_state = tok_state_after_attribute_name
2514 tok_state = tok_state_self_closing_start_tag
2516 tok_state = tok_state_before_attribute_value
2518 tok_state = tok_state_data
# Every name-building branch below appends (+=). Plain assignment here would
# discard the part of the name collected so far (e.g. "ABC" would end up "c").
2524 tok_cur_tag.attrs_a[0][0] += "\ufffd"
2527 tok_cur_tag.attrs_a[0][0] += c
2530 tok_state = tok_state_data
# uppercase ASCII letters are appended lowercased; other characters as-is
2532 if uc_alpha.indexOf(c) > -1
2533 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
2535 tok_cur_tag.attrs_a[0][0] += c
2538 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer handler for the after attribute name state (whitespace was seen
# after an attribute name). Consumes one character from txt (post-incrementing
# cur). A character that starts another attribute inserts a new [name, '']
# pair at the front of tok_cur_tag.attrs_a and returns to the attribute name
# state.
# NOTE(review): most branch conditions and returns are not visible in this
# excerpt.
2539 tok_state_after_attribute_name = ->
2540 c = txt.charAt(cur++)
2541 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2544 tok_state = tok_state_self_closing_start_tag
2547 tok_state = tok_state_before_attribute_value
2550 tok_state = tok_state_data
# uppercase ASCII letter: new attribute whose name starts with it, lowercased
2552 if uc_alpha.indexOf(c) > -1
2553 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2554 tok_state = tok_state_attribute_name
# new attribute whose name starts with U+FFFD REPLACEMENT CHARACTER
2558 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2559 tok_state = tok_state_attribute_name
2563 tok_state = tok_state_data
2564 cur -= 1 # reconsume
2566 if c is '"' or c is "'" or c is '<'
2568 # fall through to Anything else
# anything else: new attribute whose name starts with the character as-is
2570 tok_cur_tag.attrs_a.unshift [c, '']
2571 tok_state = tok_state_attribute_name
2573 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer handler for the before attribute value state (after the '=' of an
# attribute). Consumes one character from txt (post-incrementing cur) and picks
# the double-quoted, single-quoted or unquoted value state; the value being
# built lives at tok_cur_tag.attrs_a[0][1].
# NOTE(review): most `when` labels and any returns are not visible in this
# excerpt.
2574 tok_state_before_attribute_value = ->
2575 switch c = txt.charAt(cur++)
2576 when "\t", "\n", "\u000c", ' '
2579 tok_state = tok_state_attribute_value_double_quoted
2581 tok_state = tok_state_attribute_value_unquoted
2584 tok_state = tok_state_attribute_value_single_quoted
# U+FFFD REPLACEMENT CHARACTER becomes the first character of an unquoted value
2587 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2588 tok_state = tok_state_attribute_value_unquoted
2591 tok_state = tok_state_data
2597 tok_state = tok_state_data
# anything else: the character is the first character of an unquoted value
2599 tok_cur_tag.attrs_a[0][1] += c
2600 tok_state = tok_state_attribute_value_unquoted
2603 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer handler for the double-quoted attribute value state. Consumes one
# character from txt (post-incrementing cur) and either ends the value or
# appends to tok_cur_tag.attrs_a[0][1].
# NOTE(review): the `when` labels and any returns are not visible in this
# excerpt.
2604 tok_state_attribute_value_double_quoted = ->
2605 switch c = txt.charAt(cur++)
2607 tok_state = tok_state_after_attribute_value_quoted
# character reference inside the value: '"' is passed as allowed_char and true
# as in_attr
2609 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2612 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2615 tok_state = tok_state_data
2617 tok_cur_tag.attrs_a[0][1] += c
2620 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer handler for the single-quoted attribute value state. Consumes one
# character from txt (post-incrementing cur) and either ends the value or
# appends to tok_cur_tag.attrs_a[0][1].
# NOTE(review): the `when` labels and any returns are not visible in this
# excerpt.
2621 tok_state_attribute_value_single_quoted = ->
2622 switch c = txt.charAt(cur++)
2624 tok_state = tok_state_after_attribute_value_quoted
# character reference inside the value: "'" is passed as allowed_char and true
# as in_attr
2626 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2629 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2632 tok_state = tok_state_data
2634 tok_cur_tag.attrs_a[0][1] += c
2637 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer handler for the unquoted attribute value state. Consumes one
# character from txt (post-incrementing cur) and either ends the value or
# appends to tok_cur_tag.attrs_a[0][1].
# NOTE(review): most `when` labels and any returns are not visible in this
# excerpt.
2638 tok_state_attribute_value_unquoted = ->
2639 switch c = txt.charAt(cur++)
# tab, line feed, form feed or space ends the value
2640 when "\t", "\n", "\u000c", ' '
2641 tok_state = tok_state_before_attribute_name
# character reference inside the value: '>' is passed as allowed_char and true
# as in_attr
2643 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2645 tok_state = tok_state_data
2650 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2653 tok_state = tok_state_data
2655 # Parse Error if ', <, = or ` (backtick)
2656 tok_cur_tag.attrs_a[0][1] += c
2659 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer handler for the state right after the closing quote of a quoted
# attribute value. Consumes one character from txt (post-incrementing cur) and
# switches state accordingly.
# NOTE(review): most `when` labels and any returns are not visible in this
# excerpt.
2660 tok_state_after_attribute_value_quoted = ->
2661 switch c = txt.charAt(cur++)
2662 when "\t", "\n", "\u000c", ' '
2663 tok_state = tok_state_before_attribute_name
2665 tok_state = tok_state_self_closing_start_tag
2667 tok_state = tok_state_data
2673 tok_state = tok_state_data
# fallback: treat the character as the start of the next attribute by
# un-consuming it and letting the before-attribute-name state handle it
2676 tok_state = tok_state_before_attribute_name
2677 cur -= 1 # we didn't handle that char
2680 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
2681 # Don't set this as a state, just call it
2682 # returns a string (NOT a text node)
# Attempts to consume a character reference starting at txt[cur] (the caller
# has already consumed the '&').
#   allowed_char - extra character that, like whitespace, '<' and '&', makes
#                  this "not a character reference" (callers pass the
#                  attribute's quote character or '>')
#   in_attr      - true when called from an attribute value state; presumably
#                  this enables the '=' / alphanumeric exceptions near the end
#                  -- the guarding condition is not visible here, confirm
# Returns a string (not a token), per the header comment above this function.
# NOTE(review): many lines of this function (branch labels, returns, variable
# initializations such as start/i/charset/prefix/max) are not visible in this
# excerpt; the comments below describe only the visible statements.
2683 parse_character_reference = (allowed_char = null, in_attr = false) ->
2684 if cur >= txt.length
# peek (cur is NOT advanced here) at the character following the '&'
2686 switch c = txt.charAt(cur)
2687 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
2688 # explicitly not a parse error
2691 # there has to be "one or more" alnums between & and ; to be a parse error
2694 if cur + 1 >= txt.length
# a following 'x' or 'X' is detected here (presumably selecting hexadecimal)
2696 if txt.charAt(cur + 1).toLowerCase() is 'x'
# scan forward while the characters belong to `charset`
2705 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
2709 if txt.charAt(start + i) is ';'
2711 # FIXME This is supposed to generate parse errors for some chars
2712 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
2719 if alnum.indexOf(txt.charAt(cur + i)) is -1
2722 # exit early, because parse_error() below needs at least one alnum
2724 if txt.charAt(cur + i) is ';'
2725 i += 1 # include ';' terminator in value
2726 decoded = decode_named_char_ref txt.substr(cur, i)
2733 # no ';' terminator (only legacy char refs)
# try successively longer candidate names against the legacy_char_refs table
2735 for i in [2..max] # no prefix matches, so ok to check shortest first
2736 c = legacy_char_refs[txt.substr(cur, i)]
2739 if txt.charAt(cur + i) is '='
2740 # "because some legacy user agents will
2741 # misinterpret the markup in those cases"
2744 if alnum.indexOf(txt.charAt(cur + i)) > -1
2745 # this makes attributes forgiving about url args
2747 # ok, and besides the weird exceptions for attributes...
2748 # return the matching char
2749 cur += i # consume entity chars
2750 parse_error() # because no terminating ";"
2754 return # never reached
2756 # tree constructor initialization
2757 # see comments on TYPE_TAG/etc for the structure of this data
# Initial values for the parser state shared by the insertion-mode handlers
# and the tokenizer state handlers.
# `doc` is a tag Node named 'html' in the HTML namespace; the insertion-mode
# handlers pass it (with doc.children.length) to insert_comment as a position.
2758 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
2760 afe = [] # active formatting elements
2761 template_insertion_modes = []
# insertion_mode holds the handler function for the current tree-construction
# mode; handlers switch modes by reassigning it
2762 insertion_mode = ins_mode_initial
2763 original_insertion_mode = insertion_mode # TODO check spec
2764 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
2765 flag_frameset_ok = true
2767 flag_foster_parenting = false
2768 form_element_pointer = null
# temporary_buffer is reset to '' by the *_less_than_sign states and collects
# the raw characters of a candidate end tag name
2769 temporary_buffer = null
2770 pending_table_character_tokens = []
2771 head_element_pointer = null
2772 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
2774 # tokenizer initialization
# tok_state holds the handler function for the current tokenizer state
2775 tok_state = tok_state_data
2782 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a string from `els` by appending t.serialize(shallow, show_ids) for an
# element `t`; `shallow` and `show_ids` are passed through unchanged.
# NOTE(review): the loop over `els`, the initialization of `serialized`, any
# separators and the return are not visible in this excerpt.
2785 serialize_els = (els, shallow, show_ids) ->
2791 serialized += t.serialize shallow, show_ids
2794 # TODO export TYPE_*
# Public interface of this module: the parser entry point, two debug-log
# helpers, and four of the TYPE_* node type constants.
2795 module.exports.parse_html = parse_html
2796 module.exports.debug_log_reset = debug_log_reset
2797 module.exports.debug_log_each = debug_log_each
2798 module.exports.TYPE_TAG = TYPE_TAG
2799 module.exports.TYPE_TEXT = TYPE_TEXT
2800 module.exports.TYPE_COMMENT = TYPE_COMMENT
2801 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE