1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
# The spec uses many different words to indicate which ends of lists/stacks
# it is talking about (and relative movement within the lists/stacks). This
# section explains them. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
# Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
# The following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
95 @flags = args.flags ? {}
99 @id = "#{++prev_node_id}"
100 acknowledge_self_closing: ->
102 @token.flag 'did_self_close'
104 @flag 'did_self_close', true
105 flag: (key, value = null) ->
110 serialize: (shallow = false, show_ids = false) -> # for unit tests
115 ret += JSON.stringify @name
130 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
136 ret += c.serialize shallow, show_ids
140 ret += JSON.stringify @text
143 ret += JSON.stringify @text
145 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
148 when TYPE_AAA_BOOKMARK
149 ret += 'aaa_bookmark'
152 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
155 # helpers: (only take args that are normally known when parser creates nodes)
# Build a TYPE_START_TAG Node carrying the given tag name.
new_open_tag = (name) ->
  new Node TYPE_START_TAG, {name}
# Build a TYPE_END_TAG Node carrying the given tag name.
new_end_tag = (name) ->
  new Node TYPE_END_TAG, {name}
# Build a TYPE_TAG Node (a tree element) with the given tag name.
new_element = (name) ->
  new Node TYPE_TAG, {name}
# Wrap a string in a TYPE_TEXT Node.
new_text_node = (txt) ->
  new Node TYPE_TEXT, {text: txt}

# Character tokens are built by the very same constructor as text nodes.
new_character_token = new_text_node
# Build a TYPE_COMMENT Node whose text is the comment's contents.
new_comment_token = (txt) ->
  new Node TYPE_COMMENT, {text: txt}
# Build a TYPE_DOCTYPE Node with the given doctype name.
new_doctype_token = (name) ->
  new Node TYPE_DOCTYPE, {name}
170 return new Node TYPE_EOF
172 return new Node TYPE_AFE_MARKER
# Build a TYPE_AAA_BOOKMARK Node (no name, text or attributes are supplied).
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
# Character-class strings; membership is tested with indexOf
# (see is_uc_alpha / is_lc_alpha).
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
digits = "0123456789"
alnum = [lc_alpha, uc_alpha, digits].join ''
hex_chars = "#{digits}abcdefABCDEF"
# True only for a one-character string that appears in uc_alpha.
is_uc_alpha = (str) ->
  return false unless str.length is 1
  uc_alpha.indexOf(str) isnt -1
# True only for a one-character string that appears in lc_alpha.
is_lc_alpha = (str) ->
  return false unless str.length is 1
  lc_alpha.indexOf(str) isnt -1
# Tag names may contain a dash in addition to alphanumerics
# (some SVG element names have dashes in them).
tag_name_chars = "#{alnum}-"

# The five "space characters": tab, line feed, form feed, carriage return, space.
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\t\n\f\r "
193 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a TYPE_TEXT token whose text is exactly one character
# from space_chars.
is_space_tok = (t) ->
  return false unless t.type is TYPE_TEXT
  txt = t.text
  txt.length is 1 and space_chars.indexOf(txt) > -1
197 is_input_hidden_tok = (t) ->
198 return unless t.type is TYPE_START_TAG
201 if a[1].toLowerCase() is 'hidden'
206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
212 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists for void, raw text, and escapable raw text elements.
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
raw_text_elements = 'script style'.split ' '
escapable_raw_text_elements = 'textarea title'.split ' '
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
237 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
256 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262 'determinant', 'diff', 'divergence', 'divide', 'domain',
263 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283 'vectorproduct', 'xor'
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
290 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
291 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
292 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
293 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
294 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
295 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
296 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
297 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
298 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
299 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
300 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
301 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
302 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
303 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
304 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
305 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
306 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
307 wbr:NS_HTML, xmp:NS_HTML,
310 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
311 'annotation-xml':NS_MATHML,
314 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
317 formatting_elements = {
318 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
319 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
324 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
328 foster_parenting_targets = {
# True when special_elements maps the element's name to the element's own
# namespace.
el_is_special = (e) ->
  {name, namespace} = e
  special_elements[name] is namespace
# The three HTML elements (address, div, p) excluded by el_is_special_not_adp.
adp_els =
  address: NS_HTML
  div: NS_HTML
  p: NS_HTML

# Same lookup as el_is_special, except that elements listed in adp_els
# (matched by both name and namespace) are reported as false.
el_is_special_not_adp = (el) ->
  {name, namespace} = el
  return false if adp_els[name] is namespace
  special_elements[name] is namespace
358 svg_attribute_fixes = {
359 attributename: 'attributeName'
360 attributetype: 'attributeType'
361 basefrequency: 'baseFrequency'
362 baseprofile: 'baseProfile'
364 clippathunits: 'clipPathUnits'
365 contentscripttype: 'contentScriptType'
366 contentstyletype: 'contentStyleType'
367 diffuseconstant: 'diffuseConstant'
369 externalresourcesrequired: 'externalResourcesRequired'
370 filterres: 'filterRes'
371 filterunits: 'filterUnits'
373 gradienttransform: 'gradientTransform'
374 gradientunits: 'gradientUnits'
375 kernelmatrix: 'kernelMatrix'
376 kernelunitlength: 'kernelUnitLength'
377 keypoints: 'keyPoints'
378 keysplines: 'keySplines'
380 lengthadjust: 'lengthAdjust'
381 limitingconeangle: 'limitingConeAngle'
382 markerheight: 'markerHeight'
383 markerunits: 'markerUnits'
384 markerwidth: 'markerWidth'
385 maskcontentunits: 'maskContentUnits'
386 maskunits: 'maskUnits'
387 numoctaves: 'numOctaves'
388 pathlength: 'pathLength'
389 patterncontentunits: 'patternContentUnits'
390 patterntransform: 'patternTransform'
391 patternunits: 'patternUnits'
392 pointsatx: 'pointsAtX'
393 pointsaty: 'pointsAtY'
394 pointsatz: 'pointsAtZ'
395 preservealpha: 'preserveAlpha'
396 preserveaspectratio: 'preserveAspectRatio'
397 primitiveunits: 'primitiveUnits'
400 repeatcount: 'repeatCount'
401 repeatdur: 'repeatDur'
402 requiredextensions: 'requiredExtensions'
403 requiredfeatures: 'requiredFeatures'
404 specularconstant: 'specularConstant'
405 specularexponent: 'specularExponent'
406 spreadmethod: 'spreadMethod'
407 startoffset: 'startOffset'
408 stddeviation: 'stdDeviation'
409 stitchtiles: 'stitchTiles'
410 surfacescale: 'surfaceScale'
411 systemlanguage: 'systemLanguage'
412 tablevalues: 'tableValues'
415 textlength: 'textLength'
417 viewtarget: 'viewTarget'
418 xchannelselector: 'xChannelSelector'
419 ychannelselector: 'yChannelSelector'
420 zoomandpan: 'zoomAndPan'
422 adjust_mathml_attributes = (t) ->
424 if a[0] is 'definitionurl'
425 a[0] = 'definitionURL'
427 adjust_svg_attributes = (t) ->
429 if svg_attribute_fixes[a[0]]?
430 a[0] = svg_attribute_fixes[a[0]]
432 adjust_foreign_attributes = (t) ->
436 # decode_named_char_ref()
438 # The list of named character references is _huge_ so ask the browser to decode
439 # for us instead of wasting bandwidth/space on including the table here.
441 # Pass without the "&" but with the ";" examples:
442 # for "&" pass "amp;"
443 # for "′" pass "x2032;"
446 textarea: document.createElement('textarea')
448 # TODO test this in IE8
449 decode_named_char_ref = (txt) ->
451 decoded = g_dncr.cache[txt]
452 return decoded if decoded?
453 g_dncr.textarea.innerHTML = txt
454 decoded = g_dncr.textarea.value
455 return null if decoded is txt
456 return g_dncr.cache[txt] = decoded
458 parse_html = (txt, parse_error_cb = null) ->
459 cur = 0 # index of next char in txt to be parsed
460 # declare doc and tokenizer variables so they're in scope below
462 open_els = null # stack of open elements
463 afe = null # active formatting elements
464 template_ins_modes = null
466 original_ins_mode = null
468 tok_cur_tag = null # partially parsed tag
469 flag_scripting = null
470 flag_frameset_ok = null
472 flag_foster_parenting = null
473 form_element_pointer = null
474 temporary_buffer = null
475 pending_table_character_tokens = null
476 head_element_pointer = null
477 flag_fragment_parsing = null
478 context_element = null
487 console.log "Parse error at character #{cur} of #{txt.length}"
489 afe_push = (new_el) ->
492 if el.name is new_el.name and el.namespace is new_el.namespace
494 continue unless new_el.attrs[k] is v
495 for k, v of new_el.attrs
496 continue unless el.attrs[k] is v
503 afe.unshift new_afe_marker()
# The functions below implement the Tree Construction algorithm
506 # http://www.w3.org/TR/html5/syntax.html#tree-construction
508 # But first... the helpers
509 template_tag_is_open = ->
511 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
514 is_in_scope_x = (tag_name, scope, namespace) ->
516 if t.name is tag_name and (namespace is null or namespace is t.namespace)
518 if scope[t.name] is t.namespace
521 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
523 if t.name is tag_name and (namespace is null or namespace is t.namespace)
525 if scope[t.name] is t.namespace
527 if scope2[t.name] is t.namespace
531 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
532 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
533 template: NS_HTML, mi: NS_MATHML,
535 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
536 'annotation-xml': NS_MATHML,
538 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope tables (tag name -> namespace) passed to is_in_scope_x / is_in_scope_x_y
# by the is_in_*_scope helpers.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Delegates to is_in_scope_x with standard_scopers as the scope table.
# namespace is optional; null is passed through when omitted.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Delegates to is_in_scope_x_y with standard_scopers plus button_scopers as
# the two scope tables. namespace is optional; null is passed through when omitted.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Delegates to is_in_scope_x with table_scopers as the scope table.
# namespace is optional; null is passed through when omitted.
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Delegates to is_in_scope_x_y with standard_scopers plus li_scopers as the
# two scope tables. namespace is optional; null is passed through when omitted.
is_in_li_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
# http://www.w3.org/TR/html5/syntax.html#has-an-element-in-select-scope
#
# Walks open_els starting at the current node (index 0) looking for an element
# named tag_name, optionally restricted to the given namespace.
#
# Select scope is bounded by every element type EXCEPT <optgroup> and <option>
# in the HTML namespace, so the walk gives up (returns false) at the first
# non-matching element that is anything other than one of those two.
#
# Returns true if the target was found before hitting a boundary, else false.
is_in_select_scope = (tag_name, namespace = null) ->
  for t in open_els
    if t.name is tag_name and (namespace is null or namespace is t.namespace)
      return true
    # Nodes keep their namespace in .namespace; the previous test read the
    # nonexistent .ns property and joined the conditions with "and", so the
    # namespace was never actually taken into account. A foreign element that
    # happens to be named optgroup/option must still act as a boundary.
    if t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt 'option')
      return false
  return false
559 # this checks for a particular element, not by name
560 el_is_in_scope = (el) ->
564 if standard_scopers[t.name] is t.namespace
568 clear_to_table_stopers = {
573 clear_stack_to_table_context = ->
575 if clear_to_table_stopers[open_els[0].name]?
579 clear_to_table_body_stopers = {
586 clear_stack_to_table_body_context = ->
588 if clear_to_table_body_stopers[open_els[0].name]?
592 clear_to_table_row_stopers = {
597 clear_stack_to_table_row_context = ->
599 if clear_to_table_row_stopers[open_els[0].name]?
603 clear_afe_to_marker = ->
605 return unless afe.length > 0 # this happens in fragment case, ?spec error
607 if el.type is TYPE_AFE_MARKER
612 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
614 # 1. Let last be false.
616 # 2. Let node be the last node in the stack of open elements.
618 node = open_els[node_i]
619 # 3. Loop: If node is the first node in the stack of open elements,
620 # then set last to true, and, if the parser was originally created as
621 # part of the HTML fragment parsing algorithm (fragment case) set node
622 # to the context element.
624 if node_i is open_els.length - 1
626 # fixfull (fragment case)
628 # 4. If node is a select element, run these substeps:
629 if node.name is 'select'
630 # 1. If last is true, jump to the step below labeled done.
632 # 2. Let ancestor be node.
635 # 3. Loop: If ancestor is the first node in the stack of
636 # open elements, jump to the step below labeled done.
638 if ancestor_i is open_els.length - 1
640 # 4. Let ancestor be the node before ancestor in the stack
643 ancestor = open_els[ancestor_i]
644 # 5. If ancestor is a template node, jump to the step below
646 if ancestor.name is 'template'
648 # 6. If ancestor is a table node, switch the insertion mode
649 # to "in select in table" and abort these steps.
650 if ancestor.name is 'table'
651 ins_mode = ins_mode_in_select_in_table
653 # 7. Jump back to the step labeled loop.
654 # 8. Done: Switch the insertion mode to "in select" and abort
656 ins_mode = ins_mode_in_select
658 # 5. If node is a td or th element and last is false, then switch
659 # the insertion mode to "in cell" and abort these steps.
660 if (node.name is 'td' or node.name is 'th') and last is false
661 ins_mode = ins_mode_in_cell
663 # 6. If node is a tr element, then switch the insertion mode to "in
664 # row" and abort these steps.
666 ins_mode = ins_mode_in_row
668 # 7. If node is a tbody, thead, or tfoot element, then switch the
669 # insertion mode to "in table body" and abort these steps.
670 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
671 ins_mode = ins_mode_in_table_body
673 # 8. If node is a caption element, then switch the insertion mode
674 # to "in caption" and abort these steps.
675 if node.name is 'caption'
676 ins_mode = ins_mode_in_caption
678 # 9. If node is a colgroup element, then switch the insertion mode
679 # to "in column group" and abort these steps.
680 if node.name is 'colgroup'
681 ins_mode = ins_mode_in_column_group
683 # 10. If node is a table element, then switch the insertion mode to
684 # "in table" and abort these steps.
685 if node.name is 'table'
686 ins_mode = ins_mode_in_table
688 # 11. If node is a template element, then switch the insertion mode
689 # to the current template insertion mode and abort these steps.
690 # fixfull (template insertion mode stack)
692 # 12. If node is a head element and last is true, then switch the
693 # insertion mode to "in body" ("in body"! not "in head"!) and abort
694 # these steps. (fragment case)
695 if node.name is 'head' and last
696 ins_mode = ins_mode_in_body
698 # 13. If node is a head element and last is false, then switch the
699 # insertion mode to "in head" and abort these steps.
700 if node.name is 'head' and last is false
701 ins_mode = ins_mode_in_head
703 # 14. If node is a body element, then switch the insertion mode to
704 # "in body" and abort these steps.
705 if node.name is 'body'
706 ins_mode = ins_mode_in_body
708 # 15. If node is a frameset element, then switch the insertion mode
709 # to "in frameset" and abort these steps. (fragment case)
710 if node.name is 'frameset'
711 ins_mode = ins_mode_in_frameset
713 # 16. If node is an html element, run these substeps:
714 if node.name is 'html'
715 # 1. If the head element pointer is null, switch the insertion
716 # mode to "before head" and abort these steps. (fragment case)
717 if head_element_pointer is null
718 ins_mode = ins_mode_before_head
720 # 2. Otherwise, the head element pointer is not null,
721 # switch the insertion mode to "after head" and abort these
723 ins_mode = ins_mode_after_head
725 # 17. If last is true, then switch the insertion mode to "in body"
726 # and abort these steps. (fragment case)
728 ins_mode = ins_mode_in_body
730 # 18. Let node now be the node before node in the stack of open
733 node = open_els[node_i]
734 # 19. Return to the step labeled loop.
738 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
739 adjusted_current_node = ->
740 if open_els.length is 1 and flag_fragment_parsing
741 return context_element
744 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
745 # this implementation is structured (mostly) as described at the link above.
746 # capitalized comments are the "labels" described at the link above.
748 return if afe.length is 0
749 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
754 if i is afe.length - 1
757 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
762 el = insert_html_element afe[i].token
767 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
768 # adoption agency algorithm
770 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
771 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
772 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
773 adoption_agency = (subject) ->
774 debug_log "adoption_agency()"
775 debug_log "tree: #{serialize_els doc.children, false, true}"
776 debug_log "open_els: #{serialize_els open_els, true, true}"
777 debug_log "afe: #{serialize_els afe, true, true}"
778 if open_els[0].name is subject
781 # remove it from the list of active formatting elements (if found)
786 debug_log "aaa: starting off with subject on top of stack, exiting"
793 # 5. Let formatting element be the last element in the list of
794 # active formatting elements that: is between the end of the list
795 # and the last scope marker in the list, if any, or the start of
796 # the list otherwise, and has the tag name subject.
798 for t, fe_of_afe in afe
799 if t.type is TYPE_AFE_MARKER
804 # If there is no such element, then abort these steps and instead
805 # act as described in the "any other end tag" entry above.
807 debug_log "aaa: fe not found in afe"
808 in_body_any_other_end_tag subject
810 # 6. If formatting element is not in the stack of open elements,
811 # then this is a parse error; remove the element from the list, and
814 for t, fe_of_open_els in open_els
819 debug_log "aaa: fe not found in open_els"
821 # "remove it from the list" must mean afe, since it's not in open_els
822 afe.splice fe_of_afe, 1
824 # 7. If formatting element is in the stack of open elements, but
825 # the element is not in scope, then this is a parse error; abort
827 unless el_is_in_scope fe
828 debug_log "aaa: fe not in scope"
831 # 8. If formatting element is not the current node, this is a parse
832 # error. (But do not abort these steps.)
833 unless open_els[0] is fe
836 # 9. Let furthest block be the topmost node in the stack of open
837 # elements that is lower in the stack than formatting element, and
838 # is an element in the special category. There might not be one.
840 fb_of_open_els = null
847 # and continue, to see if there's one that's more "topmost"
848 # 10. If there is no furthest block, then the UA must first pop all
849 # the nodes from the bottom of the stack of open elements, from the
850 # current node up to and including formatting element, then remove
851 # formatting element from the list of active formatting elements,
852 # and finally abort these steps.
854 debug_log "aaa: no fb"
858 afe.splice fe_of_afe, 1
860 # 11. Let common ancestor be the element immediately above
861 # formatting element in the stack of open elements.
862 ca = open_els[fe_of_open_els + 1] # common ancestor
864 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
865 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
866 bookmark = new_aaa_bookmark()
869 afe.splice i, 0, bookmark
871 node = last_node = fb
875 # 3. Let node be the element immediately above node in the
876 # stack of open elements, or if node is no longer in the stack
877 # of open elements (e.g. because it got removed by this
878 # algorithm), the element that was immediately above node in
879 # the stack of open elements before node was removed.
883 node_next = open_els[i + 1]
885 node = node_next ? node_above
886 debug_log "inner loop #{inner}"
887 debug_log "tree: #{serialize_els doc.children, false, true}"
888 debug_log "open_els: #{serialize_els open_els, true, true}"
889 debug_log "afe: #{serialize_els afe, true, true}"
890 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
891 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
892 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
893 debug_log "node: #{node.serialize true, true}"
894 # TODO make sure node_above gets re-set if/when node is removed from open_els
896 # 4. If node is formatting element, then go to the next step in
897 # the overall algorithm.
901 # 5. If inner loop counter is greater than three and node is in
902 # the list of active formatting elements, then remove node from
903 # the list of active formatting elements.
909 debug_log "max out inner"
914 # 6. If node is not in the list of active formatting elements,
915 # then remove node from the stack of open elements and then go
916 # back to the step labeled inner loop.
918 debug_log "not in afe"
921 node_above = open_els[i + 1]
925 debug_log "the bones"
926 # 7. create an element for the token for which the element node
927 # was created, in the HTML namespace, with common ancestor as
928 # the intended parent; replace the entry for node in the list
929 # of active formatting elements with an entry for the new
930 # element, replace the entry for node in the stack of open
931 # elements with an entry for the new element, and let node be
933 new_node = token_to_element node.token, NS_HTML, ca
937 debug_log "replaced in afe"
941 node_above = open_els[i + 1]
942 open_els[i] = new_node
943 debug_log "replaced in open_els"
946 # 8. If last node is furthest block, then move the
947 # aforementioned bookmark to be immediately after the new node
948 # in the list of active formatting elements.
953 debug_log "removed bookmark"
957 # "after" means lower
958 afe.splice i, 0, bookmark # "after as <-
959 debug_log "placed bookmark after node"
960 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
962 # 9. Insert last node into node, first removing it from its
963 # previous parent node if any.
965 debug_log "last_node has parent"
966 for c, i in last_node.parent.children
968 debug_log "removing last_node from parent"
969 last_node.parent.children.splice i, 1
971 node.children.push last_node
972 last_node.parent = node
973 # 10. Let last node be node.
976 # 11. Return to the step labeled inner loop.
977 # 14. Insert whatever last node ended up being in the previous step
978 # at the appropriate place for inserting a node, but using common
979 # ancestor as the override target.
981 # In the case where fe is immediately followed by fb:
982 # * inner loop exits out early (node==fe)
984 # * last_node is still in the tree (not a duplicate)
986 debug_log "FEFIRST? last_node has parent"
987 for c, i in last_node.parent.children
989 debug_log "removing last_node from parent"
990 last_node.parent.children.splice i, 1
993 debug_log "after aaa inner loop"
994 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
995 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
996 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
997 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
998 debug_log "tree: #{serialize_els doc.children, false, true}"
1003 # can't use standard insert token thing, because it's already in
# open_els and must stay at its current position in open_els
1005 dest = adjusted_insertion_location ca
1006 dest[0].children.splice dest[1], 0, last_node
1007 last_node.parent = dest[0]
1010 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1011 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1012 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1013 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1014 debug_log "tree: #{serialize_els doc.children, false, true}"
1016 # 15. Create an element for the token for which formatting element
1017 # was created, in the HTML namespace, with furthest block as the
1019 new_element = token_to_element fe.token, NS_HTML, fb
1020 # 16. Take all of the child nodes of furthest block and append them
1021 # to the element created in the last step.
1022 while fb.children.length
1023 t = fb.children.shift()
1024 t.parent = new_element
1025 new_element.children.push t
1026 # 17. Append that new element to furthest block.
1027 new_element.parent = fb
1028 fb.children.push new_element
1029 # 18. Remove formatting element from the list of active formatting
1030 # elements, and insert the new element into the list of active
1031 # formatting elements at the position of the aforementioned
1039 afe[i] = new_element
1041 # 19. Remove formatting element from the stack of open elements,
1042 # and insert the new element into the stack of open elements
1043 # immediately below the position of furthest block in that stack.
1044 for t, i in open_els
1046 open_els.splice i, 1
1048 for t, i in open_els
1050 open_els.splice i, 0, new_element
1052 # 20. Jump back to the step labeled outer loop.
1053 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1054 debug_log "tree: #{serialize_els doc.children, false, true}"
1055 debug_log "open_els: #{serialize_els open_els, true, true}"
1056 debug_log "afe: #{serialize_els afe, true, true}"
1057 debug_log "AAA DONE"
1059 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Implements the spec's "close a p element" steps. Takes no arguments and
# works directly on the shared open_els stack (index 0 is the current node).
1060 close_p_element = ->
1061 generate_implied_end_tags 'p' # arg is exception
# after implied end tags are generated the current node should be the p
1062 if open_els[0].name isnt 'p'
# pop from the front of open_els; the "> 1" guard never removes the last
# (topmost) entry of the stack
1064 while open_els.length > 1 # just in case
1065 el = open_els.shift()
# Guard used by the in-body rules before inserting block-level elements:
# tests whether a 'p' element is in button scope.
# NOTE(review): only the scope test is visible here; presumably the guarded
# action is close_p_element() -- confirm.
1068 close_p_if_in_button_scope = ->
1069 if is_in_button_scope 'p'
1072 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1073 # aka insert_a_character = (t) ->
# Inserts character token `t` at the adjusted insertion location.
#   t: the token to insert (it is spliced into the destination's children).
1074 insert_character = (t) ->
# dest is [parent_node, index_within_children]
1075 dest = adjusted_insertion_location()
1076 # fixfull check for Document node
# prev is the sibling immediately before the insertion point
# NOTE(review): when dest[1] is 0 this lookup yields undefined, and reading
# prev.type below would throw unless a guard sits between these lines -- confirm.
1078 prev = dest[0].children[dest[1] - 1]
1079 if prev.type is TYPE_TEXT
# insert t into the parent's children at the computed index
1082 dest[0].children.splice dest[1], 0, t
1085 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1086 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the spec's "appropriate place for inserting a node".
#   override_target: optional node to use as the target instead of the
#                    current node (open_els[0]); defaults to null.
#   returns: a two-element array [target, target_i] -- the parent node and
#            the index within target.children where the new node belongs.
# Reads open_els and flag_foster_parenting; does not modify the tree itself.
1087 adjusted_insertion_location = (override_target = null) ->
1088 # 1. If there was an override target specified, then let target be the
1091 target = override_target
1092 else # Otherwise, let target be the current node.
1093 target = open_els[0]
1094 # 2. Determine the adjusted insertion location using the first matching
1095 # steps from the following list:
1097 # If foster parenting is enabled and target is a table, tbody, tfoot,
1098 # thead, or tr element Foster parenting happens when content is
1099 # misnested in tables.
1100 if flag_foster_parenting and foster_parenting_targets[target.name]
1101 loop # once. this is here so we can ``break`` to "abort these substeps"
1102 # 1. Let last template be the last template element in the
1103 # stack of open elements, if any.
1104 last_template = null
1105 last_template_i = null
# open_els is scanned starting at index 0 (the current node)
1106 for el, i in open_els
1107 if el.name is 'template'
1111 # 2. Let last table be the last table element in the stack of
1112 # open elements, if any.
1115 for el, i in open_els
1116 if el.name is 'table'
1120 # 3. If there is a last template and either there is no last
1121 # table, or there is one, but last template is lower (more
1122 # recently added) than last table in the stack of open
1123 # elements, then: let adjusted insertion location be inside
1124 # last template's template contents, after its last child (if
1125 # any), and abort these substeps.
# a smaller index in open_els means "lower" / more recently added
1126 if last_template and (last_table is null or last_template_i < last_table_i)
1127 target = last_template # fixfull should be its contents
1128 target_i = target.children.length
1130 # 4. If there is no last table, then let adjusted insertion
1131 # location be inside the first element in the stack of open
1132 # elements (the html element), after its last child (if any),
1133 # and abort these substeps. (fragment case)
1134 if last_table is null
# the "first" element of the stack lives at the highest index
1136 target = open_els[open_els.length - 1]
1137 target_i = target.children.length
1138 # 5. If last table has a parent element, then let adjusted
1139 # insertion location be inside last table's parent element,
1140 # immediately before last table, and abort these substeps.
1141 if last_table.parent?
# walk the parent's children to find where last_table sits
1142 for c, i in last_table.parent.children
1144 target = last_table.parent
1148 # 6. Let previous element be the element immediately above last
1149 # table in the stack of open elements.
1151 # huh? how could it not have a parent?
# "above" in the stack is the next higher index
1152 previous_element = open_els[last_table_i + 1]
1153 # 7. Let adjusted insertion location be inside previous
1154 # element, after its last child (if any).
1155 target = previous_element
1156 target_i = target.children.length
1157 # Note: These steps are involved in part because it's possible
1158 # for elements, the table element in this case in particular,
1159 # to have been moved by a script around in the DOM, or indeed
1160 # removed from the DOM entirely, after the element was inserted
1162 break # don't really loop
1164 # Otherwise Let adjusted insertion location be inside target, after
1165 # its last child (if any).
1166 target_i = target.children.length
1168 # 3. If the adjusted insertion location is inside a template element,
1169 # let it instead be inside the template element's template contents,
1170 # after its last child (if any).
1171 # fixfull (template)
1173 # 4. Return the adjusted insertion location.
1174 return [target, target_i]
1176 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1177 # aka create_an_element_for_token
# Builds a new element Node from a tag token.
#   t:               the tag token; its name becomes the element name and the
#                    token itself is stored on the node (token: t).
#   namespace:       namespace to record on the new node.
#   intended_parent: accepted for parity with the spec; not read by any of
#                    the lines visible here.
1178 token_to_element = (t, namespace, intended_parent) ->
1179 # convert attributes into a hash
# each attribute `a` is a [name, value] pair; a later duplicate overwrites
# an earlier one
1182 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1183 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1185 # TODO 2. If the newly created element has an xmlns attribute in the
1186 # XMLNS namespace whose value is not exactly the same as the element's
1187 # namespace, that is a parse error. Similarly, if the newly created
1188 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1189 # value is not the XLink Namespace, that is a parse error.
1191 # fixfull: the spec says stuff about form pointers and ownerDocument
1195 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the tree
# at the adjusted insertion location.
#   token:     tag token to turn into an element (via token_to_element).
#   namespace: namespace for the new element.
# NOTE(review): callers use the result as the new element
# (e.g. `el = insert_html_element t`), so presumably `el` is returned -- confirm.
1196 insert_foreign_element = (token, namespace) ->
# ail is [parent_node, index]; ail_el / ail_i below are its two parts
1197 ail = adjusted_insertion_location()
1200 el = token_to_element token, namespace, ail_el
1201 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1203 ail_el.children.splice ail_i, 0, el
# http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Insert an element for `token` in the HTML namespace. This is a thin wrapper
# around insert_foreign_element and hands that function's result straight
# back to the caller.
insert_html_element = (token) ->
	return insert_foreign_element(token, NS_HTML)
1210 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1211 # position should be [node, index_within_children]
# Inserts comment token `t` into the tree.
#   t:        the comment token to insert.
#   position: optional [node, index] pair; when omitted (null) the adjusted
#             insertion location is used instead.
1212 insert_comment = (t, position = null) ->
1213 position ?= adjusted_insertion_location()
1214 position[0].children.splice position[1], 0, t
# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: insert an HTML element for start tag
# token `t`, put the tokenizer into the RAWTEXT state, remember the current
# insertion mode and continue in the "text" insertion mode.
parse_generic_raw_text = (t) ->
	insert_html_element(t)
	# save the mode we came from so the text mode can switch back to it
	original_ins_mode = ins_mode
	tok_state = tok_state_rawtext
	ins_mode = ins_mode_text
# Generic RCDATA element parsing: insert an HTML element for start tag token
# `t`, put the tokenizer into the RCDATA state, remember the current
# insertion mode and continue in the "text" insertion mode.
parse_generic_rcdata_text = (t) ->
	insert_html_element(t)
	# save the mode we came from so the text mode can switch back to it
	original_ins_mode = ins_mode
	tok_state = tok_state_rcdata
	ins_mode = ins_mode_text
1229 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1230 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) has a name listed in
# end_tag_implied and that name is not the excluded one.
#   except: optional tag name that stops the loop (null = no exception).
# NOTE(review): the condition reads open_els[0].name, so it assumes open_els
# is never empty while this runs -- confirm.
1231 generate_implied_end_tags = (except = null) ->
1232 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1235 # 8.2.5.4 The rules for parsing tokens in HTML content
1236 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1238 # 8.2.5.4.1 The "initial" insertion mode
1239 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handles token `t` while the parser is in the "initial" insertion mode.
# Comment and DOCTYPE tokens get their own branches; anything else switches
# to "before html" and reprocesses the same token there.
1240 ins_mode_initial = (t) ->
1243 if t.type is TYPE_COMMENT
1247 if t.type is TYPE_DOCTYPE
1248 # FIXME check identifiers, set quirks, etc
1251 ins_mode = ins_mode_before_html
# anything else
1254 #fixfull (iframe, quirks)
1255 ins_mode = ins_mode_before_html
1256 ins_mode t # reprocess the token
1259 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handles token `t` in the "before html" insertion mode. An explicit <html>
# start tag, or any token reaching the final section, creates the root html
# element as a child of doc and moves on to "before head".
1260 ins_mode_before_html = (t) ->
1261 if t.type is TYPE_DOCTYPE
1264 if t.type is TYPE_COMMENT
1269 if t.type is TYPE_START_TAG and t.name is 'html'
# build the root element from the token and make it the current node
1270 el = token_to_element t, NS_HTML, doc
1271 doc.children.push el
1272 open_els.unshift(el)
1273 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1274 ins_mode = ins_mode_before_head
1276 if t.type is TYPE_END_TAG
1277 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1278 # fall through to "anything else"
# anything else: synthesize an <html> start tag and create the root from it
1283 html_tok = new_open_tag 'html'
1284 el = token_to_element html_tok, NS_HTML, doc
1285 doc.children.push el
1287 # ?fixfull browsing context
1288 ins_mode = ins_mode_before_head
# 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handles token `t` in the "before head" insertion mode.
#   * whitespace is ignored
#   * comments are inserted
#   * DOCTYPE is a parse error and ignored
#   * <html> is processed with the "in body" rules
#   * <head> inserts the head element, records it in head_element_pointer
#     and switches to "in head"
#   * end tags other than head/body/html/br are a parse error and ignored
#   * anything else inserts an implied head element, switches to "in head"
#     and reprocesses the token
ins_mode_before_head = (t) ->
	if is_space_tok t
		return
	if t.type is TYPE_COMMENT
		insert_comment t
		return
	if t.type is TYPE_DOCTYPE
		parse_error()
		return
	if t.type is TYPE_START_TAG and t.name is 'html'
		ins_mode_in_body t
		return
	if t.type is TYPE_START_TAG and t.name is 'head'
		el = insert_html_element t
		head_element_pointer = el
		ins_mode = ins_mode_in_head
		# the token is fully handled here; without this return it would fall
		# into "anything else" below and a second head element would be made
		return
	if t.type is TYPE_END_TAG
		# only </head>, </body>, </html> and </br> fall through to
		# "anything else"; every other end tag is an ignored parse error
		unless t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
			parse_error()
			return
	# Anything else
	head_tok = new_open_tag 'head'
	el = insert_html_element head_tok
	head_element_pointer = el
	ins_mode = ins_mode_in_head
	ins_mode t # reprocess current token
1322 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode: pops the current
# node and switches to "after head".
#   t: the token being processed.
# NOTE(review): the spec reprocesses the token after the mode switch;
# presumably `ins_mode t` follows -- confirm.
1323 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1324 open_els.shift() # spec says this will be a 'head' node
1325 ins_mode = ins_mode_after_head
# Handles token `t` in the "in head" insertion mode. Each `if` below matches
# one group of tokens from the spec; unmatched tokens end up in
# ins_mode_in_head_else.
1327 ins_mode_in_head = (t) ->
# whitespace character tokens
# NOTE(review): "\u000d" (CR) is not in this list although
# ins_mode_in_head_noscript tests for it -- confirm whether CR can reach here.
1328 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1331 if t.type is TYPE_COMMENT
1334 if t.type is TYPE_DOCTYPE
1337 if t.type is TYPE_START_TAG and t.name is 'html'
# void head elements: inserted, then the self-closing flag is acknowledged
1340 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1341 el = insert_html_element t
1343 t.acknowledge_self_closing()
1345 if t.type is TYPE_START_TAG and t.name is 'meta'
1346 el = insert_html_element t
1348 t.acknowledge_self_closing()
1349 # fixfull encoding stuff
1351 if t.type is TYPE_START_TAG and t.name is 'title'
1352 parse_generic_rcdata_text t
# noscript is parsed as raw text only when scripting is enabled
1354 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1355 parse_generic_raw_text t
1357 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1358 insert_html_element t
1359 ins_mode = ins_mode_in_head_noscript
1361 if t.type is TYPE_START_TAG and t.name is 'script'
# script elements are created and placed by hand instead of through
# insert_html_element
1362 ail = adjusted_insertion_location()
# NOTE(review): the third argument is the whole [node, index] pair, whereas
# insert_foreign_element passes just the node -- confirm intent.
1363 el = token_to_element t, NS_HTML, ail
1364 el.flag 'parser-inserted', true
1365 # fixfull fragment case
1366 ail[0].children.splice ail[1], 0, el
1368 tok_state = tok_state_script_data
1369 original_ins_mode = ins_mode # make sure orig... is defined
1370 ins_mode = ins_mode_text
1372 if t.type is TYPE_END_TAG and t.name is 'head'
1373 open_els.shift() # will be a head element... spec says so
1374 ins_mode = ins_mode_after_head
1376 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1377 ins_mode_in_head_else t
1379 if t.type is TYPE_START_TAG and t.name is 'template'
1380 insert_html_element t
1382 flag_frameset_ok = false
1383 ins_mode = ins_mode_in_template
1384 template_ins_modes.unshift ins_mode_in_template
1386 if t.type is TYPE_END_TAG and t.name is 'template'
1387 if template_tag_is_open()
# NOTE(review): this is a bare reference, not a call -- CoffeeScript only
# invokes a function when it is given arguments or parentheses, so no implied
# end tags are generated here. Looks like it should be
# `generate_implied_end_tags()`.
1388 generate_implied_end_tags
1389 if open_els[0].name isnt 'template'
# pop elements until a template element has been popped
1392 el = open_els.shift()
1393 if el.name is 'template'
1395 clear_afe_to_marker()
1396 template_ins_modes.shift()
1401 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
# anything else
1404 ins_mode_in_head_else t
1406 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" insertion mode; switches
# the parser back to the "in head" insertion mode.
#   t: the token being processed.
1407 ins_mode_in_head_noscript_else = (t) ->
1410 ins_mode = ins_mode_in_head
# Handles token `t` in the "in head noscript" insertion mode
# (http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript).
#   * DOCTYPE is a parse error and ignored
#   * <html> is processed with the "in body" rules
#   * </noscript> pops the noscript element and returns to "in head"
#   * whitespace, comments and the listed head-level start tags are
#     processed with the "in head" rules
#   * </br> is treated as "anything else"
#   * <head>, <noscript> and any other end tag are parse errors and ignored
#   * anything else goes to ins_mode_in_head_noscript_else
ins_mode_in_head_noscript = (t) ->
	if t.type is TYPE_DOCTYPE
		parse_error()
		return
	# only the html start tag is routed to "in body"; testing the token type
	# alone would swallow every start tag and make the start-tag branches
	# further down unreachable
	if t.type is TYPE_START_TAG and t.name is 'html'
		ins_mode_in_body t
		return
	if t.type is TYPE_END_TAG and t.name is 'noscript'
		open_els.shift()
		ins_mode = ins_mode_in_head
		return
	if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
		ins_mode_in_head t
		return
	if t.type is TYPE_END_TAG and t.name is 'br'
		ins_mode_in_head_noscript_else t
		return
	if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
		parse_error()
		return
	# Anything else
	ins_mode_in_head_noscript_else t
	return
# 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode: insert an
# implied body element, switch to "in body" and let that mode handle `t`.
ins_mode_after_head_else = (t) ->
	insert_html_element(new_open_tag('body'))
	ins_mode = ins_mode_in_body
	ins_mode(t) # reprocess token
# Handles token `t` in the "after head" insertion mode
# (http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode).
#   * whitespace is inserted, comments are inserted
#   * DOCTYPE is a parse error and ignored
#   * <html> is processed with the "in body" rules
#   * <body> / <frameset> insert the element and switch modes
#   * head-level start tags are a parse error; they are processed with the
#     "in head" rules while the head element is temporarily back on the stack
#   * </template> is processed with the "in head" rules
#   * </body>, </html>, </br> and all remaining tokens go to
#     ins_mode_after_head_else
#   * <head> and any other end tag are parse errors and ignored
ins_mode_after_head = (t) ->
	if is_space_tok t
		insert_character t
		return
	if t.type is TYPE_COMMENT
		insert_comment t
		return
	if t.type is TYPE_DOCTYPE
		parse_error()
		return
	if t.type is TYPE_START_TAG and t.name is 'html'
		ins_mode_in_body t
		return
	if t.type is TYPE_START_TAG and t.name is 'body'
		insert_html_element t
		flag_frameset_ok = false
		ins_mode = ins_mode_in_body
		return
	if t.type is TYPE_START_TAG and t.name is 'frameset'
		insert_html_element t
		ins_mode = ins_mode_in_frameset
		return
	if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
		parse_error()
		# put the head element back on the stack while the token is handled
		open_els.unshift head_element_pointer
		ins_mode_in_head t
		# ...then take it out again. `in` walks the array's elements; `of`
		# would walk its keys, so `el` could never equal the head element
		# and it would stay on the stack.
		for el, i in open_els
			if el is head_element_pointer
				open_els.splice i, 1
				return
		console.log "warning: 23904 couldn't find head element in open_els"
		return
	if t.type is TYPE_END_TAG and t.name is 'template'
		ins_mode_in_head t
		return
	if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
		ins_mode_after_head_else t
		return
	if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
		parse_error()
		return
	# Anything else
	ins_mode_after_head_else t
1489 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" insertion mode.
#   name: tag name of the end tag being processed.
# Walks open_els from the current node (index 0) upward, looking for an HTML
# element called `name`; an element listed in special_elements under its own
# namespace is tested for separately.
1490 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1491 for el, i in open_els
1492 if el.namespace is NS_HTML and el.name is name
1493 generate_implied_end_tags name # arg is exception
# NOTE(review): `i` was captured before the call above; if that call removed
# entries from open_els, `i is 0` no longer tells whether el is the current
# node -- confirm (comparing el with open_els[0] would avoid this).
1494 parse_error() unless i is 0
1499 if special_elements[el.name] is el.namespace
# Handles token `t` in the "in body" insertion mode. The body is a long
# sequence of `if` tests, one per token group from the spec section
# http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody ; the final
# two tests catch "any other start tag" and "any other end tag".
1503 ins_mode_in_body = (t) ->
# ---- character tokens, comments, DOCTYPE ----
1504 if t.type is TYPE_TEXT and t.text is "\u0000"
1511 if t.type is TYPE_TEXT
1514 flag_frameset_ok = false
1516 if t.type is TYPE_COMMENT
1519 if t.type is TYPE_DOCTYPE
# ---- <html>: copy attributes the root element does not already have ----
1522 if t.type is TYPE_START_TAG and t.name is 'html'
1524 return if template_tag_is_open()
# the root element is the last entry of open_els
1525 root_attrs = open_els[open_els.length - 1].attrs
1527 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1530 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
# ---- <body>: merge attributes into the existing body element ----
1533 if t.type is TYPE_START_TAG and t.name is 'body'
1535 return if open_els.length < 2
# "second" is the second element on the stack, counted from the root end
1536 second = open_els[open_els.length - 2]
# NOTE(review): elsewhere nodes are created with and tested by `.namespace`
# (see token_to_element); `.ns` looks like a different, possibly undefined
# property, which would make this guard always return -- confirm.
1537 return unless second.ns is NS_HTML
1538 return unless second.name is 'body'
1539 return if template_tag_is_open()
# NOTE(review): every other branch writes `flag_frameset_ok`; this assigns a
# differently named variable, so the real flag is presumably left unchanged
# -- confirm.
1540 frameset_ok_flag = false
1542 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
# ---- <frameset>: replace the body element with a frameset ----
1544 if t.type is TYPE_START_TAG and t.name is 'frameset'
1546 return if open_els.length < 2
1547 second_i = open_els.length - 2
1548 second = open_els[second_i]
# NOTE(review): same `.ns` vs `.namespace` concern as in the <body> branch.
1549 return unless second.ns is NS_HTML
1550 return unless second.name is 'body'
1551 flag_frameset_ok = false
# detach the body element from its parent and from the stack
1553 for el, i in second.parent.children
1555 second.parent.children.splice i, 1
1557 open_els.splice second_i, 1
1558 # pop everything except the "root html element"
1559 while open_els.length > 1
1561 insert_html_element t
1562 ins_mode = ins_mode_in_frameset
# ---- end of file ----
1564 if t.type is TYPE_EOF
# ok_tags maps tag name -> namespace for elements allowed to be left open
1566 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1567 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1568 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
# NOTE(review): the lookup key is t.name (the token) while the comparison is
# against el.namespace (an open element); presumably el.name was meant --
# confirm. The same pattern recurs in the </body> and </html> branches.
1571 unless ok_tags[t.name] is el.namespace
1574 if template_ins_modes.length > 0
1575 ins_mode_in_template t
# ---- </body> ----
1579 if t.type is TYPE_END_TAG and t.name is 'body'
1580 unless is_in_scope 'body'
1584 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1585 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1586 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1587 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1591 unless ok_tags[t.name] is el.namespace
1594 ins_mode = ins_mode_after_body
# ---- </html> ----
1596 if t.type is TYPE_END_TAG and t.name is 'html'
1597 unless is_in_scope 'body'
1601 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1602 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1603 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1604 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1608 unless ok_tags[t.name] is el.namespace
1611 ins_mode = ins_mode_after_body
# ---- block-level start tags ----
1614 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1615 close_p_if_in_button_scope()
1616 insert_html_element t
# ---- heading start tags (names listed in h_tags) ----
1618 if t.type is TYPE_START_TAG and h_tags[t.name]?
1619 close_p_if_in_button_scope()
# NOTE(review): h_tags is keyed by tag name everywhere else (h_tags[t.name],
# h_tags[el.name]); here it is indexed with the node object itself, so the
# test is presumably never true. Looks like `open_els[0].name` was meant.
1620 if h_tags[open_els[0]] is NS_HTML
1623 insert_html_element t
1625 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1626 close_p_if_in_button_scope()
1627 insert_html_element t
1628 # spec: If the next token is a "LF" (U+000A) character token, then
1629 # ignore that token and move on to the next one. (Newlines at the
1630 # start of pre blocks are ignored as an authoring convenience.)
# peeks at the raw input (txt at position cur) rather than at the next token
1631 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1633 flag_frameset_ok = false
# ---- <form> ----
1635 if t.type is TYPE_START_TAG and t.name is 'form'
1636 unless form_element_pointer is null or template_tag_is_open()
1639 close_p_if_in_button_scope()
1640 el = insert_html_element t
1641 unless template_tag_is_open()
1642 form_element_pointer = el
# ---- <li> ----
1644 if t.type is TYPE_START_TAG and t.name is 'li'
1645 flag_frameset_ok = false
# walk the stack from the current node upward
1646 for node in open_els
1647 if node.name is 'li' and node.namespace is NS_HTML
1648 generate_implied_end_tags 'li' # arg is exception
1649 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1652 el = open_els.shift()
1653 if el.name is 'li' and el.namespace is NS_HTML
1656 if el_is_special_not_adp node
1658 close_p_if_in_button_scope()
1659 insert_html_element t
# ---- <dd> / <dt> ----
1661 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1662 flag_frameset_ok = false
1663 for node in open_els
1664 if node.name is 'dd' and node.namespace is NS_HTML
1665 generate_implied_end_tags 'dd' # arg is exception
1666 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1669 el = open_els.shift()
1670 if el.name is 'dd' and el.namespace is NS_HTML
1673 if node.name is 'dt' and node.namespace is NS_HTML
1674 generate_implied_end_tags 'dt' # arg is exception
1675 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1678 el = open_els.shift()
1679 if el.name is 'dt' and el.namespace is NS_HTML
1682 if el_is_special_not_adp node
1684 close_p_if_in_button_scope()
1685 insert_html_element t
# ---- <plaintext> ----
1687 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1688 close_p_if_in_button_scope()
1689 insert_html_element t
1690 tok_state = tok_state_plaintext
# ---- <button> ----
1692 if t.type is TYPE_START_TAG and t.name is 'button'
1693 if is_in_scope 'button', NS_HTML
1695 generate_implied_end_tags()
1697 el = open_els.shift()
1698 if el.name is 'button' and el.namespace is NS_HTML
1701 insert_html_element t
1702 flag_frameset_ok = false
# ---- block-level end tags ----
1704 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1705 unless is_in_scope t.name, NS_HTML
1708 generate_implied_end_tags()
1709 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1712 el = open_els.shift()
1713 if el.name is t.name and el.namespace is NS_HTML
# ---- </form> ----
1716 if t.type is TYPE_END_TAG and t.name is 'form'
1717 unless template_tag_is_open()
# no template on the stack: work from the form element pointer
1718 node = form_element_pointer
1719 form_element_pointer = null
1720 if node is null or not el_is_in_scope node
1723 generate_implied_end_tags()
1724 if open_els[0] isnt node
1726 for el, i in open_els
1728 open_els.splice i, 1
1731 unless is_in_scope 'form', NS_HTML
1734 generate_implied_end_tags()
1735 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1738 el = open_els.shift()
1739 if el.name is 'form' and el.namespace is NS_HTML
# ---- </p> ----
1742 if t.type is TYPE_END_TAG and t.name is 'p'
1743 unless is_in_button_scope 'p', NS_HTML
# no p in button scope: an empty p element is inserted first
1745 insert_html_element new_open_tag 'p'
# ---- </li> ----
1748 if t.type is TYPE_END_TAG and t.name is 'li'
1749 unless is_in_li_scope 'li', NS_HTML
1752 generate_implied_end_tags 'li' # arg is exception
1753 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1756 el = open_els.shift()
1757 if el.name is 'li' and el.namespace is NS_HTML
# ---- </dd> / </dt> ----
1760 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1761 unless is_in_scope t.name, NS_HTML
1764 generate_implied_end_tags t.name # arg is exception
1765 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1768 el = open_els.shift()
1769 if el.name is t.name and el.namespace is NS_HTML
# ---- heading end tags ----
1772 if t.type is TYPE_END_TAG and h_tags[t.name]?
1775 if h_tags[el.name] is el.namespace
1778 if standard_scopers[el.name] is el.namespace
1783 generate_implied_end_tags()
1784 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1787 el = open_els.shift()
1788 if h_tags[el.name] is el.namespace
# ---- <a> ----
1792 if t.type is TYPE_START_TAG and t.name is 'a'
1793 # If the list of active formatting elements contains an a element
1794 # between the end of the list and the last marker on the list (or
1795 # the start of the list if there is no marker on the list), then
1796 # this is a parse error; run the adoption agency algorithm for the
1797 # tag name "a", then remove that element from the list of active
1798 # formatting elements and the stack of open elements if the
1799 # adoption agency algorithm didn't already remove it (it might not
1800 # have if the element is not in table scope).
1803 if el.type is TYPE_AFE_MARKER
1805 if el.name is 'a' and el.namespace is NS_HTML
1813 for el, i in open_els
1815 open_els.splice i, 1
1817 el = insert_html_element t
# ---- formatting start tags ----
1820 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1822 el = insert_html_element t
1825 if t.type is TYPE_START_TAG and t.name is 'nobr'
1827 el = insert_html_element t
# ---- formatting end tags: handled by the adoption agency algorithm ----
1830 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1831 adoption_agency t.name
# ---- <applet>, <marquee>, <object> ----
1833 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1835 insert_html_element t
1837 flag_frameset_ok = false
1839 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1840 unless is_in_scope t.name, NS_HTML
1843 generate_implied_end_tags()
1844 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1847 el = open_els.shift()
1848 if el.name is t.name and el.namespace is NS_HTML
1850 clear_afe_to_marker()
# ---- <table> ----
1852 if t.type is TYPE_START_TAG and t.name is 'table'
1853 close_p_if_in_button_scope() # fixfull quirksmode thing
1854 insert_html_element t
1855 flag_frameset_ok = false
1856 ins_mode = ins_mode_in_table
# ---- </br>: the spec treats it as a <br> start tag ----
1858 if t.type is TYPE_END_TAG and t.name is 'br'
# NOTE(review): `is` is a comparison, so this statement has no effect and the
# token keeps its end-tag type; presumably `t.type = TYPE_START_TAG` was
# meant so that the next test handles it as a br start tag -- confirm.
1860 t.type is TYPE_START_TAG
1862 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1864 insert_html_element t
1866 t.acknowledge_self_closing()
1867 flag_frameset_ok = false
# ---- <input> ----
1869 if t.type is TYPE_START_TAG and t.name is 'input'
1871 insert_html_element t
1873 t.acknowledge_self_closing()
# a hidden input leaves the frameset-ok flag untouched
1874 unless is_input_hidden_tok t
1875 flag_frameset_ok = false
1877 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1878 insert_html_element t
1880 t.acknowledge_self_closing()
1882 if t.type is TYPE_START_TAG and t.name is 'hr'
1883 close_p_if_in_button_scope()
1884 insert_html_element t
1886 t.acknowledge_self_closing()
1887 flag_frameset_ok = false
1889 if t.type is TYPE_START_TAG and t.name is 'image'
# ---- <isindex>: expanded into form / hr / label / input / hr ----
1894 if t.type is TYPE_START_TAG and t.name is 'isindex'
1896 if template_tag_is_open() is false and form_element_pointer isnt null
1898 t.acknowledge_self_closing()
1899 flag_frameset_ok = false
1900 close_p_if_in_button_scope()
1901 el = insert_html_element new_open_tag 'form'
1902 unless template_tag_is_open()
1903 form_element_pointer = el
# `a` is a [name, value] attribute pair taken from the isindex token
1906 el.attrs['action'] = a[1]
1908 insert_html_element new_open_tag 'hr'
1911 insert_html_element new_open_tag 'label'
1912 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
1913 input_el = new_open_tag 'input'
# name, action and prompt are not copied onto the generated input
1918 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
1919 input_el.attrs_a.push [a[0], a[1]]
1920 input_el.attrs_a.push ['name', 'isindex']
1921 # fixfull this next bit is in english... internationalize?
1922 prompt ?= "This is a searchable index. Enter search keywords: "
# NOTE(review): insert_character is handed a plain string here, whereas its
# other callers pass character tokens -- confirm this inserts a usable node.
1923 insert_character prompt # fixfull split
1924 # TODO submit typo "balue" in spec
1925 insert_html_element input_el
1927 # insert_character '' # you can put chars here if prompt attr missing
1929 insert_html_element new_open_tag 'hr'
1932 unless template_tag_is_open()
1933 form_element_pointer = null
# ---- <textarea> ----
1935 if t.type is TYPE_START_TAG and t.name is 'textarea'
1936 insert_html_element t
1937 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1939 tok_state = tok_state_rcdata
1940 original_ins_mode = ins_mode
1941 flag_frameset_ok = false
1942 ins_mode = ins_mode_text
# ---- raw-text elements ----
1944 if t.type is TYPE_START_TAG and t.name is 'xmp'
1945 close_p_if_in_button_scope()
1947 flag_frameset_ok = false
1948 parse_generic_raw_text t
1950 if t.type is TYPE_START_TAG and t.name is 'iframe'
1951 flag_frameset_ok = false
1952 parse_generic_raw_text t
1954 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
1955 parse_generic_raw_text t
# ---- <select> ----
1957 if t.type is TYPE_START_TAG and t.name is 'select'
1959 insert_html_element t
1960 flag_frameset_ok = false
# pick the select mode according to whether we are inside table content
1961 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
1962 ins_mode = ins_mode_in_select_in_table
1964 ins_mode = ins_mode_in_select
1966 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
1967 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
1970 insert_html_element t
# ---- ruby annotation elements ----
1972 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
1973 if is_in_scope 'ruby', NS_HTML
1974 generate_implied_end_tags()
1975 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
1977 insert_html_element t
1979 if t.type is TYPE_START_TAG and t.name is 'rt'
1980 if is_in_scope 'ruby', NS_HTML
1981 generate_implied_end_tags 'rtc' # arg is exception
1982 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
1984 insert_html_element t
# ---- foreign content roots ----
1986 if t.type is TYPE_START_TAG and t.name is 'math'
1988 adjust_mathml_attributes t
1989 adjust_foreign_attributes t
1990 insert_foreign_element t, NS_MATHML
1991 if t.flag 'self-closing'
1993 t.acknowledge_self_closing()
1995 if t.type is TYPE_START_TAG and t.name is 'svg'
1997 adjust_svg_attributes t
1998 adjust_foreign_attributes t
1999 insert_foreign_element t, NS_SVG
2000 if t.flag 'self-closing'
2002 t.acknowledge_self_closing()
# ---- table-structure start tags that are misplaced in body content ----
2004 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2007 if t.type is TYPE_START_TAG # any other start tag
2009 insert_html_element t
2011 if t.type is TYPE_END_TAG # any other end tag
2012 in_body_any_other_end_tag t.name
# "Anything else" branch of the "in table" insertion mode.
#   t: the token being processed.
# Foster parenting is switched on at the start and off again at the end, so
# it is only in effect for whatever runs between the two assignments.
2016 ins_mode_in_table_else = (t) ->
2018 flag_foster_parenting = true # FIXME
2020 flag_foster_parenting = false
# Lookup table keyed by tag name; ins_mode_in_table consults it as
# can_in_table[t.name].
2021 can_in_table = { # FIXME do this inline like everywhere else
2029 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Handles token `t` in the "text" insertion mode (used while inside raw text,
# RCDATA and script elements). The EOF and end-tag branches restore the
# insertion mode saved in original_ins_mode.
2030 ins_mode_text = (t) ->
2031 if t.type is TYPE_TEXT
2034 if t.type is TYPE_EOF
# a script element left open at EOF is flagged as 'already started'
2036 if open_els[0].name is 'script'
2037 open_els[0].flag 'already started', true
2039 ins_mode = original_ins_mode
2042 if t.type is TYPE_END_TAG and t.name is 'script'
2044 ins_mode = original_ins_mode
2045 # fixfull the spec seems to assume that I'm going to run the script
2046 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
# any other end tag
2048 if t.type is TYPE_END_TAG
2050 ins_mode = original_ins_mode
# reaching this point means none of the branches above handled the token
2052 console.log 'warning: end of ins_mode_text reached'
2054 # the functions below implement the tokenizer states described here:
2055 # http://www.w3.org/TR/html5/syntax.html#tokenization
2057 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Handles token `t` in the "in table" insertion mode. Tokens with no
# table-specific handling are passed to ins_mode_in_table_else.
2058 ins_mode_in_table = (t) ->
# NOTE(review): if this branch handles character tokens, the spec keys the
# decision on the current node (open_els[0].name), not on the token's name
# -- confirm that t.name is the intended lookup key.
2061 if can_in_table[t.name]
2062 original_ins_mode = ins_mode
2063 ins_mode = ins_mode_in_table_text
2066 ins_mode_in_table_else t
# start tags: each case clears the stack back to a table context, inserts
# the (real or implied) element and switches insertion mode
2074 clear_stack_to_table_context()
2076 insert_html_element t
2077 ins_mode = ins_mode_in_caption
2079 clear_stack_to_table_context()
2080 insert_html_element t
2081 ins_mode = ins_mode_in_column_group
2083 clear_stack_to_table_context()
# an implied colgroup is inserted here instead of the token itself
2084 insert_html_element new_open_tag 'colgroup'
2085 ins_mode = ins_mode_in_column_group
2087 when 'tbody', 'tfoot', 'thead'
2088 clear_stack_to_table_context()
2089 insert_html_element t
2090 ins_mode = ins_mode_in_table_body
2091 when 'td', 'th', 'tr'
2092 clear_stack_to_table_context()
# an implied tbody is inserted here instead of the token itself
2093 insert_html_element new_open_tag 'tbody'
2094 ins_mode = ins_mode_in_table_body
2098 if is_in_table_scope 'table'
# pop elements until a table element has been popped
2100 el = open_els.shift()
2101 if el.name is 'table'
2105 when 'style', 'script', 'template'
# only hidden inputs are handled directly; others go to the else branch
2108 if is_input_hidden_tok t
2109 ins_mode_in_table_else t
2112 el = insert_html_element t
2114 t.acknowledge_self_closing()
2117 if form_element_pointer?
2119 if template_tag_is_open()
2121 form_element_pointer = insert_html_element t
2124 ins_mode_in_table_else t
2128 if is_in_table_scope 'table'
2130 el = open_els.shift()
2131 if el.name is 'table'
2136 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2141 ins_mode_in_table_else t
2145 ins_mode_in_table_else t
2148 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2149 ins_mode_in_table_text = (t) ->
2150 if t.type is TYPE_TEXT and t.text is "\u0000"
2151 # huh? I thought the tokenizer didn't emit these
2154 if t.type is TYPE_TEXT
2155 pending_table_character_tokens.push t
2159 for old in pending_table_character_tokens
2160 unless is_space_tok old
2164 for old in pending_table_character_tokens
2165 insert_character old
2167 for old in pending_table_character_tokens
2168 ins_mode_in_table_else old # same "anything else" handler ins_mode_in_table calls
2169 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2170 ins_mode = original_ins_mode
2173 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2174 ins_mode_in_caption = (t) ->
2175 if t.type is TYPE_END_TAG and t.name is 'caption'
2176 if is_in_table_scope 'caption'
2177 generate_implied_end_tags()
2178 if open_els[0].name isnt 'caption'
2181 el = open_els.shift()
2182 if el.name is 'caption'
2184 clear_afe_to_marker()
2185 ins_mode = ins_mode_in_table
2190 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2192 if is_in_table_scope 'caption'
2194 el = open_els.shift()
2195 if el.name is 'caption'
2197 clear_afe_to_marker()
2198 ins_mode = ins_mode_in_table
2200 # else fragment case
2202 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2208 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2209 ins_mode_in_column_group = (t) ->
2213 if t.type is TYPE_COMMENT
2216 if t.type is TYPE_DOCTYPE
2219 if t.type is TYPE_START_TAG and t.name is 'html'
2222 if t.type is TYPE_START_TAG and t.name is 'col'
2223 el = insert_html_element t
2225 t.acknowledge_self_closing()
2227 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2228 if open_els[0].name is 'colgroup'
2230 ins_mode = ins_mode_in_table
2234 if t.type is TYPE_END_TAG and t.name is 'col'
2237 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2240 if t.type is TYPE_EOF
2244 if open_els[0].name isnt 'colgroup'
2248 ins_mode = ins_mode_in_table
2252 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2253 ins_mode_in_table_body = (t) ->
2254 if t.type is TYPE_START_TAG and t.name is 'tr'
2255 clear_stack_to_table_body_context()
2256 insert_html_element t
2257 ins_mode = ins_mode_in_row
2259 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2261 clear_stack_to_table_body_context()
2262 insert_html_element new_open_tag 'tr'
2263 ins_mode = ins_mode_in_row
2266 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2267 unless is_in_table_scope t.name # fixfull check namespace
2270 clear_stack_to_table_body_context()
2272 ins_mode = ins_mode_in_table
2274 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2277 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2280 if table_scopers[el.name]
2285 clear_stack_to_table_body_context()
2287 ins_mode = ins_mode_in_table
2290 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2296 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2297 ins_mode_in_row = (t) ->
2298 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2299 clear_stack_to_table_row_context()
2300 insert_html_element t
2301 ins_mode = ins_mode_in_cell
2304 if t.type is TYPE_END_TAG and t.name is 'tr'
2305 if is_in_table_scope 'tr'
2306 clear_stack_to_table_row_context()
2308 ins_mode = ins_mode_in_table_body
2312 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2313 if is_in_table_scope 'tr'
2314 clear_stack_to_table_row_context()
2316 ins_mode = ins_mode_in_table_body
2321 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2322 if is_in_table_scope t.name # fixfull namespace
2323 if is_in_table_scope 'tr'
2324 clear_stack_to_table_row_context()
2326 ins_mode = ins_mode_in_table_body
2331 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2337 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2339 generate_implied_end_tags()
2340 unless open_els[0].name is 'td' or open_els[0].name is 'th' # compare tag names, not the element itself
2343 el = open_els.shift()
2344 if el.name is 'td' or el.name is 'th'
2346 clear_afe_to_marker()
2347 ins_mode = ins_mode_in_row
2349 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2350 ins_mode_in_cell = (t) ->
2351 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2352 if is_in_table_scope t.name
2353 generate_implied_end_tags()
2354 if open_els[0].name isnt t.name
2357 el = open_els.shift()
2358 if el.name is t.name
2360 clear_afe_to_marker()
2361 ins_mode = ins_mode_in_row
2365 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2368 if el.name is 'td' or el.name is 'th'
2371 if table_scopers[el.name]
2379 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2382 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2383 if is_in_table_scope t.name # fixfull namespace
2392 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2393 ins_mode_in_select = (t) ->
2394 if t.type is TYPE_TEXT and t.text is "\u0000"
2397 if t.type is TYPE_TEXT
2400 if t.type is TYPE_COMMENT
2403 if t.type is TYPE_DOCTYPE
2406 if t.type is TYPE_START_TAG and t.name is 'html'
2409 if t.type is TYPE_START_TAG and t.name is 'option'
2410 if open_els[0].name is 'option'
2412 insert_html_element t
2414 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2415 if open_els[0].name is 'option'
2417 if open_els[0].name is 'optgroup'
2419 insert_html_element t
2421 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2422 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2424 if open_els[0].name is 'optgroup'
2429 if t.type is TYPE_END_TAG and t.name is 'option'
2430 if open_els[0].name is 'option'
2435 if t.type is TYPE_END_TAG and t.name is 'select'
2436 if is_in_select_scope 'select'
2438 el = open_els.shift()
2439 if el.name is 'select'
2445 if t.type is TYPE_START_TAG and t.name is 'select'
2448 el = open_els.shift()
2449 if el.name is 'select'
2452 # spec says that this is the same as </select> but it doesn't say
2453 # to check scope first
2455 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2457 if is_in_select_scope 'select'
2460 el = open_els.shift()
2461 if el.name is 'select'
2466 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2469 if t.type is TYPE_EOF
2476 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2477 ins_mode_in_select_in_table = (t) ->
2478 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2481 el = open_els.shift()
2482 if el.name is 'select'
2487 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2489 unless is_in_table_scope t.name, NS_HTML
2492 el = open_els.shift()
2493 if el.name is 'select'
2499 ins_mode_in_select t
2502 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2503 ins_mode_in_template = (t) ->
2504 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2507 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2510 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2511 template_ins_modes.shift()
2512 template_ins_modes.unshift ins_mode_in_table
2513 ins_mode = ins_mode_in_table
2516 if t.type is TYPE_START_TAG and t.name is 'col'
2517 template_ins_modes.shift()
2518 template_ins_modes.unshift ins_mode_in_column_group
2519 ins_mode = ins_mode_in_column_group
2522 if t.type is TYPE_START_TAG and t.name is 'tr'
2523 template_ins_modes.shift()
2524 template_ins_modes.unshift ins_mode_in_table_body
2525 ins_mode = ins_mode_in_table_body
2528 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2529 template_ins_modes.shift()
2530 template_ins_modes.unshift ins_mode_in_row
2531 ins_mode = ins_mode_in_row
2534 if t.type is TYPE_START_TAG
2535 template_ins_modes.shift()
2536 template_ins_modes.unshift ins_mode_in_body
2537 ins_mode = ins_mode_in_body
2540 if t.type is TYPE_END_TAG
2543 if t.type is TYPE_EOF
2544 unless template_tag_is_open()
2549 el = open_els.shift()
2550 if el.name is 'template' # fixfull check namespace
2552 clear_afe_to_marker()
2553 template_ins_modes.shift()
2557 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2558 ins_mode_after_body = (t) ->
2562 if t.type is TYPE_COMMENT
2563 insert_comment t, [open_els[0], open_els[0].children.length]
2565 if t.type is TYPE_DOCTYPE
2568 if t.type is TYPE_START_TAG and t.name is 'html'
2571 if t.type is TYPE_END_TAG and t.name is 'html'
2572 # fixfull fragment case
2573 ins_mode = ins_mode_after_after_body
2575 if t.type is TYPE_EOF
2580 ins_mode = ins_mode_in_body
2583 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2584 ins_mode_in_frameset = (t) ->
2588 if t.type is TYPE_COMMENT
2591 if t.type is TYPE_DOCTYPE
2594 if t.type is TYPE_START_TAG and t.name is 'html'
2597 if t.type is TYPE_START_TAG and t.name is 'frameset'
2598 insert_html_element t
2600 if t.type is TYPE_END_TAG and t.name is 'frameset'
2601 # TODO ?correct for: "if the current node is the root html element"
2602 if open_els.length is 1
2604 return # fragment case
2606 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2607 ins_mode = ins_mode_after_frameset
2609 if t.type is TYPE_START_TAG and t.name is 'frame'
2610 insert_html_element t
2612 t.acknowledge_self_closing()
2614 if t.type is TYPE_START_TAG and t.name is 'noframes'
2617 if t.type is TYPE_EOF
2618 # TODO ?correct for: "if the current node is not the root html element"
2619 if open_els.length isnt 1
2627 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2628 ins_mode_after_frameset = (t) ->
2632 if t.type is TYPE_COMMENT
2635 if t.type is TYPE_DOCTYPE
2638 if t.type is TYPE_START_TAG and t.name is 'html'
2641 if t.type is TYPE_END_TAG and t.name is 'html'
2642 ins_mode = ins_mode_after_after_frameset # ins_mode is the parser's insertion-mode variable
2644 if t.type is TYPE_START_TAG and t.name is 'noframes'
2647 if t.type is TYPE_EOF
2654 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2655 ins_mode_after_after_body = (t) ->
2656 if t.type is TYPE_COMMENT
2657 insert_comment t, [doc, doc.children.length]
2659 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2662 if t.type is TYPE_EOF
2667 ins_mode = ins_mode_in_body
2670 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2671 ins_mode_after_after_frameset = (t) ->
2672 if t.type is TYPE_COMMENT
2673 insert_comment t, [doc, doc.children.length]
2675 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2678 if t.type is TYPE_EOF
2681 if t.type is TYPE_START_TAG and t.name is 'noframes'
2692 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2694 switch c = txt.charAt(cur++)
2696 return new_text_node parse_character_reference()
2698 tok_state = tok_state_tag_open
2701 return new_text_node c
2703 return new_eof_token()
2705 return new_text_node c
2708 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2709 # not needed: tok_state_character_reference_in_data = ->
2710 # just call parse_character_reference()
2712 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2713 tok_state_rcdata = ->
2714 switch c = txt.charAt(cur++)
2716 return new_text_node parse_character_reference()
2718 tok_state = tok_state_rcdata_less_than_sign
2721 return new_character_token "\ufffd"
2723 return new_eof_token()
2725 return new_character_token c
2728 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2729 # not needed: tok_state_character_reference_in_rcdata = ->
2730 # just call parse_character_reference()
2732 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2733 tok_state_rawtext = ->
2734 switch c = txt.charAt(cur++)
2736 tok_state = tok_state_rawtext_less_than_sign
2739 return new_character_token "\ufffd"
2741 return new_eof_token()
2743 return new_character_token c
2746 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2747 tok_state_script_data = ->
2748 switch c = txt.charAt(cur++)
2750 tok_state = tok_state_script_data_less_than_sign
2753 return new_character_token "\ufffd"
2755 return new_eof_token()
2757 return new_character_token c
2760 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2761 tok_state_plaintext = ->
2762 switch c = txt.charAt(cur++)
2765 return new_character_token "\ufffd"
2767 return new_eof_token()
2769 return new_character_token c
2773 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2774 tok_state_tag_open = ->
2775 switch c = txt.charAt(cur++)
2777 tok_state = tok_state_markup_declaration_open
2779 tok_state = tok_state_end_tag_open
2782 tok_cur_tag = new_comment_token '?'
2783 tok_state = tok_state_bogus_comment
2786 tok_cur_tag = new_open_tag c
2787 tok_state = tok_state_tag_name
2788 else if is_uc_alpha(c)
2789 tok_cur_tag = new_open_tag c.toLowerCase()
2790 tok_state = tok_state_tag_name
2793 tok_state = tok_state_data
2794 cur -= 1 # we didn't parse/handle the char after <
2795 return new_text_node '<'
2798 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2799 tok_state_end_tag_open = ->
2800 switch c = txt.charAt(cur++)
2803 tok_state = tok_state_data
2806 tok_state = tok_state_data
2807 return new_text_node '</'
2810 tok_cur_tag = new_end_tag c.toLowerCase()
2811 tok_state = tok_state_tag_name
2812 else if is_lc_alpha(c)
2813 tok_cur_tag = new_end_tag c
2814 tok_state = tok_state_tag_name
2817 tok_cur_tag = new_comment_token '/'
2818 tok_state = tok_state_bogus_comment
2821 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2822 tok_state_tag_name = ->
2823 switch c = txt.charAt(cur++)
2824 when "\t", "\n", "\u000c", ' '
2825 tok_state = tok_state_before_attribute_name
2827 tok_state = tok_state_self_closing_start_tag
2829 tok_state = tok_state_data
2835 tok_cur_tag.name += "\ufffd"
2838 tok_state = tok_state_data
2841 tok_cur_tag.name += c.toLowerCase()
2843 tok_cur_tag.name += c
2846 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2847 tok_state_rcdata_less_than_sign = ->
2848 c = txt.charAt(cur++)
2850 temporary_buffer = ''
2851 tok_state = tok_state_rcdata_end_tag_open
2854 tok_state = tok_state_rcdata
2855 cur -= 1 # reconsume the input character
2856 return new_character_token '<'
2858 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2859 tok_state_rcdata_end_tag_open = ->
2860 c = txt.charAt(cur++)
2862 tok_cur_tag = new_end_tag c.toLowerCase()
2863 temporary_buffer += c
2864 tok_state = tok_state_rcdata_end_tag_name
2867 tok_cur_tag = new_end_tag c
2868 temporary_buffer += c
2869 tok_state = tok_state_rcdata_end_tag_name
2872 tok_state = tok_state_rcdata
2873 cur -= 1 # reconsume the input character
2874 return new_character_token "</" # fixfull separate these
2876 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2877 is_appropriate_end_tag = (t) ->
2878 # spec says to check against "the tag name of the last start tag to
2879 # have been emitted from this tokenizer", but this is only called from
2880 # the various "raw" states, which I'm pretty sure all push the start
2881 # token onto open_els. TODO: verify this after the script data states
2883 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2884 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2886 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2887 tok_state_rcdata_end_tag_name = ->
2888 c = txt.charAt(cur++)
2889 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2890 if is_appropriate_end_tag tok_cur_tag
2891 tok_state = tok_state_before_attribute_name
2893 # else fall through to "Anything else"
2895 if is_appropriate_end_tag tok_cur_tag
2896 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2898 # else fall through to "Anything else"
2900 if is_appropriate_end_tag tok_cur_tag
2901 tok_state = tok_state_data
2903 # else fall through to "Anything else"
2905 tok_cur_tag.name += c.toLowerCase()
2906 temporary_buffer += c
2909 tok_cur_tag.name += c
2910 temporary_buffer += c
2913 tok_state = tok_state_rcdata
2914 cur -= 1 # reconsume the input character
2915 return new_character_token '</' + temporary_buffer # fixfull separate these
2917 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2918 tok_state_rawtext_less_than_sign = ->
2919 c = txt.charAt(cur++)
2921 temporary_buffer = ''
2922 tok_state = tok_state_rawtext_end_tag_open
2925 tok_state = tok_state_rawtext
2926 cur -= 1 # reconsume the input character
2927 return new_character_token '<'
2929 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2930 tok_state_rawtext_end_tag_open = ->
2931 c = txt.charAt(cur++)
2933 tok_cur_tag = new_end_tag c.toLowerCase()
2934 temporary_buffer += c
2935 tok_state = tok_state_rawtext_end_tag_name
2938 tok_cur_tag = new_end_tag c
2939 temporary_buffer += c
2940 tok_state = tok_state_rawtext_end_tag_name
2943 tok_state = tok_state_rawtext
2944 cur -= 1 # reconsume the input character
2945 return new_character_token "</" # fixfull separate these
2947 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2948 tok_state_rawtext_end_tag_name = ->
2949 c = txt.charAt(cur++)
2950 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2951 if is_appropriate_end_tag tok_cur_tag
2952 tok_state = tok_state_before_attribute_name
2954 # else fall through to "Anything else"
2956 if is_appropriate_end_tag tok_cur_tag
2957 tok_state = tok_state_self_closing_start_tag
2959 # else fall through to "Anything else"
2961 if is_appropriate_end_tag tok_cur_tag
2962 tok_state = tok_state_data
2964 # else fall through to "Anything else"
2966 tok_cur_tag.name += c.toLowerCase()
2967 temporary_buffer += c
2970 tok_cur_tag.name += c
2971 temporary_buffer += c
2974 tok_state = tok_state_rawtext
2975 cur -= 1 # reconsume the input character
2976 return new_character_token '</' + temporary_buffer # fixfull separate these
2978 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
2979 tok_state_script_data_less_than_sign = ->
2980 c = txt.charAt(cur++)
2982 temporary_buffer = ''
2983 tok_state = tok_state_script_data_end_tag_open
2986 tok_state = tok_state_script_data_escape_start
2987 return new_character_token '<!' # fixfull split
2989 tok_state = tok_state_script_data
2990 cur -= 1 # Reconsume
2991 return new_character_token '<'
2993 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2994 tok_state_script_data_end_tag_open = ->
2995 c = txt.charAt(cur++)
2997 tok_cur_tag = new_end_tag c.toLowerCase()
2998 temporary_buffer += c
2999 tok_state = tok_state_script_data_end_tag_name
3002 tok_cur_tag = new_end_tag c
3003 temporary_buffer += c
3004 tok_state = tok_state_script_data_end_tag_name
3007 tok_state = tok_state_script_data
3008 cur -= 1 # Reconsume
3009 return new_character_token '</'
3011 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3012 tok_state_script_data_end_tag_name = ->
3013 c = txt.charAt(cur++)
3014 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3015 if is_appropriate_end_tag tok_cur_tag
3016 tok_state = tok_state_before_attribute_name
3020 if is_appropriate_end_tag tok_cur_tag
3021 tok_state = tok_state_self_closing_start_tag
3025 tok_cur_tag.name += c.toLowerCase()
3026 temporary_buffer += c
3029 tok_cur_tag.name += c
3030 temporary_buffer += c
3033 tok_state = tok_state_script_data
3034 cur -= 1 # Reconsume
3035 return new_character_token "</#{temporary_buffer}" # fixfull split
3037 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3038 tok_state_script_data_escape_start = ->
3039 c = txt.charAt(cur++)
3041 tok_state = tok_state_script_data_escape_start_dash
3042 return new_character_token '-'
3044 tok_state = tok_state_script_data
3045 cur -= 1 # Reconsume
3048 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3049 tok_state_script_data_escape_start_dash = ->
3050 c = txt.charAt(cur++)
3052 tok_state = tok_state_script_data_escaped_dash_dash
3053 return new_character_token '-'
3055 tok_state = tok_state_script_data
3056 cur -= 1 # Reconsume
3059 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3060 tok_state_script_data_escaped = ->
3061 c = txt.charAt(cur++)
3063 tok_state = tok_state_script_data_escaped_dash
3064 return new_character_token '-'
3066 tok_state = tok_state_script_data_escaped_less_than_sign
3070 return new_character_token "\ufffd"
3072 tok_state = tok_state_data
3074 cur -= 1 # Reconsume
3077 return new_character_token c
3079 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3080 tok_state_script_data_escaped_dash = ->
3081 c = txt.charAt(cur++)
3083 tok_state = tok_state_script_data_escaped_dash_dash
3084 return new_character_token '-'
3086 tok_state = tok_state_script_data_escaped_less_than_sign
3090 tok_state = tok_state_script_data_escaped
3091 return new_character_token "\ufffd"
3093 tok_state = tok_state_data
3095 cur -= 1 # Reconsume
3098 tok_state = tok_state_script_data_escaped
3099 return new_character_token c
3101 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3102 tok_state_script_data_escaped_dash_dash = ->
3103 c = txt.charAt(cur++)
3105 return new_character_token '-'
3107 tok_state = tok_state_script_data_escaped_less_than_sign
3110 tok_state = tok_state_script_data
3111 return new_character_token '>'
3114 tok_state = tok_state_script_data_escaped
3115 return new_character_token "\ufffd"
3118 tok_state = tok_state_data
3119 cur -= 1 # Reconsume
3122 tok_state = tok_state_script_data_escaped
3123 return new_character_token c
3125 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3126 tok_state_script_data_escaped_less_than_sign = ->
3127 c = txt.charAt(cur++)
3129 temporary_buffer = ''
3130 tok_state = tok_state_script_data_escaped_end_tag_open
3133 temporary_buffer = c.toLowerCase() # yes, really
3134 tok_state = tok_state_script_data_double_escape_start
3135 return new_character_token "<#{c}" # fixfull split
3137 temporary_buffer = c
3138 tok_state = tok_state_script_data_double_escape_start
3139 return new_character_token "<#{c}" # fixfull split
3141 tok_state = tok_state_script_data_escaped
3142 cur -= 1 # Reconsume
3143 return new_character_token '<' # emit the pending '<'; c was un-consumed above and will be read again
3145 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3146 tok_state_script_data_escaped_end_tag_open = ->
3147 c = txt.charAt(cur++)
3149 tok_cur_tag = new_end_tag c.toLowerCase()
3150 temporary_buffer += c
3151 tok_state = tok_state_script_data_escaped_end_tag_name
3154 tok_cur_tag = new_end_tag c
3155 temporary_buffer += c
3156 tok_state = tok_state_script_data_escaped_end_tag_name
3159 tok_state = tok_state_script_data_escaped
3160 cur -= 1 # Reconsume
3161 return new_character_token '</' # fixfull split
3163 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3164 tok_state_script_data_escaped_end_tag_name = ->
3165 c = txt.charAt(cur++)
3166 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3167 if is_appropriate_end_tag tok_cur_tag
3168 tok_state = tok_state_before_attribute_name
3172 if is_appropriate_end_tag tok_cur_tag
3173 tok_state = tok_state_self_closing_start_tag
3177 tok_cur_tag.name += c.toLowerCase()
3178 temporary_buffer += c # keep original case: this buffer is re-emitted as text if the end tag doesn't match
3181 tok_cur_tag.name += c
3182 temporary_buffer += c
3185 tok_state = tok_state_script_data_escaped
3186 cur -= 1 # Reconsume
3187 return new_character_token "</#{temporary_buffer}" # fixfull split
3189 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3190 tok_state_script_data_double_escape_start = ->
3191 c = txt.charAt(cur++)
3192 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3193 if temporary_buffer is 'script'
3194 tok_state = tok_state_script_data_double_escaped
3196 tok_state = tok_state_script_data_escaped
3197 return new_character_token c
3199 temporary_buffer += c.toLowerCase() # yes, really lowercase
3200 return new_character_token c
3202 temporary_buffer += c
3203 return new_character_token c
3205 tok_state = tok_state_script_data_escaped
3206 cur -= 1 # Reconsume
3209 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3210 tok_state_script_data_double_escaped = ->
3211 c = txt.charAt(cur++)
3213 tok_state = tok_state_script_data_double_escaped_dash
3214 return new_character_token '-'
3216 tok_state = tok_state_script_data_double_escaped_less_than_sign
3217 return new_character_token '<'
3220 return new_character_token "\ufffd"
3223 tok_state = tok_state_data
3224 cur -= 1 # Reconsume
3227 return new_character_token c
3229 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3230 tok_state_script_data_double_escaped_dash = ->
3231 c = txt.charAt(cur++)
3233 tok_state = tok_state_script_data_double_escaped_dash_dash
3234 return new_character_token '-'
3236 tok_state = tok_state_script_data_double_escaped_less_than_sign
3237 return new_character_token '<'
3240 tok_state = tok_state_script_data_double_escaped
3241 return new_character_token "\ufffd"
3244 tok_state = tok_state_data
3245 cur -= 1 # Reconsume
3248 tok_state = tok_state_script_data_double_escaped
3249 return new_character_token c
3251 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3252 tok_state_script_data_double_escaped_dash_dash = ->
3253 c = txt.charAt(cur++)
3255 return new_character_token '-'
3257 tok_state = tok_state_script_data_double_escaped_less_than_sign
3258 return new_character_token '<'
3260 tok_state = tok_state_script_data
3261 return new_character_token '>'
3264 tok_state = tok_state_script_data_double_escaped
3265 return new_character_token "\ufffd"
3268 tok_state = tok_state_data
3269 cur -= 1 # Reconsume
3272 tok_state = tok_state_script_data_double_escaped
3273 return new_character_token c
3275 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3276 tok_state_script_data_double_escaped_less_than_sign = ->
3277 c = txt.charAt(cur++)
3279 temporary_buffer = ''
3280 tok_state = tok_state_script_data_double_escape_end
3281 return new_character_token '/'
3283 tok_state = tok_state_script_data_double_escaped
3284 cur -= 1 # Reconsume
3287 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3288 tok_state_script_data_double_escape_end = ->
3289 c = txt.charAt(cur++)
3290 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3291 if temporary_buffer is 'script'
3292 tok_state = tok_state_script_data_escaped
3294 tok_state = tok_state_script_data_double_escaped
3295 return new_character_token c
3297 temporary_buffer += c.toLowerCase() # yes, really lowercase
3298 return new_character_token c
3300 temporary_buffer += c
3301 return new_character_token c
3303 tok_state = tok_state_script_data_double_escaped
3304 cur -= 1 # Reconsume
3307 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3308 tok_state_before_attribute_name = ->
3310 switch c = txt.charAt(cur++)
3311 when "\t", "\n", "\u000c", ' '
3314 tok_state = tok_state_self_closing_start_tag
3317 tok_state = tok_state_data
3323 attr_name = "\ufffd"
3324 when '"', "'", '<', '='
3329 tok_state = tok_state_data
3332 attr_name = c.toLowerCase()
3336 tok_cur_tag.attrs_a.unshift [attr_name, '']
3337 tok_state = tok_state_attribute_name
3340 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3341 tok_state_attribute_name = ->
3342 switch c = txt.charAt(cur++)
3343 when "\t", "\n", "\u000c", ' '
3344 tok_state = tok_state_after_attribute_name
3346 tok_state = tok_state_self_closing_start_tag
3348 tok_state = tok_state_before_attribute_value
3350 tok_state = tok_state_data
3356 tok_cur_tag.attrs_a[0][0] += "\ufffd" # append (not overwrite) to the name collected so far
3359 tok_cur_tag.attrs_a[0][0] += c
3362 tok_state = tok_state_data
3365 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3367 tok_cur_tag.attrs_a[0][0] += c
3370 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3371 tok_state_after_attribute_name = ->
3372 c = txt.charAt(cur++)
3373 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3376 tok_state = tok_state_self_closing_start_tag
3379 tok_state = tok_state_before_attribute_value
3382 tok_state = tok_state_data
3385 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3386 tok_state = tok_state_attribute_name
3390 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3391 tok_state = tok_state_attribute_name
3395 tok_state = tok_state_data
3396 cur -= 1 # reconsume
3398 if c is '"' or c is "'" or c is '<'
3400 # fall through to Anything else
3402 tok_cur_tag.attrs_a.unshift [c, '']
3403 tok_state = tok_state_attribute_name
3405 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer state before an attribute value. Consumes one character of txt
# (advancing cur) and selects the double-quoted, single-quoted or unquoted
# value state, or returns to the data state. Two branches also append to the
# current attribute's value, tok_cur_tag.attrs_a[0][1], before switching to
# the unquoted state.
3406 tok_state_before_attribute_value = ->
3407 switch c = txt.charAt(cur++)
3408 when "\t", "\n", "\u000c", ' '
3411 tok_state = tok_state_attribute_value_double_quoted
3413 tok_state = tok_state_attribute_value_unquoted
3416 tok_state = tok_state_attribute_value_single_quoted
# value starts with U+FFFD (replacement character)
3419 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3420 tok_state = tok_state_attribute_value_unquoted
3423 tok_state = tok_state_data
3429 tok_state = tok_state_data
# value starts with the character as-is
3431 tok_cur_tag.attrs_a[0][1] += c
3432 tok_state = tok_state_attribute_value_unquoted
3435 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer state inside a double-quoted attribute value. Consumes one
# character of txt per call (advancing cur) and appends to the current
# attribute's value, tok_cur_tag.attrs_a[0][1], until a branch switches
# tok_state.
3436 tok_state_attribute_value_double_quoted = ->
3437 switch c = txt.charAt(cur++)
3439 tok_state = tok_state_after_attribute_value_quoted
# character reference: '"' is passed as allowed_char and in_attr is true
3441 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3444 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3447 tok_state = tok_state_data
3449 tok_cur_tag.attrs_a[0][1] += c
3452 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer state inside a single-quoted attribute value. Consumes one
# character of txt per call (advancing cur) and appends to the current
# attribute's value, tok_cur_tag.attrs_a[0][1], until a branch switches
# tok_state.
3453 tok_state_attribute_value_single_quoted = ->
3454 switch c = txt.charAt(cur++)
3456 tok_state = tok_state_after_attribute_value_quoted
# character reference: "'" is passed as allowed_char and in_attr is true
3458 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3461 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3464 tok_state = tok_state_data
3466 tok_cur_tag.attrs_a[0][1] += c
3469 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer state inside an unquoted attribute value. Consumes one character
# of txt per call (advancing cur). Whitespace ends the value and goes back to
# the before attribute name state; other branches append to
# tok_cur_tag.attrs_a[0][1] or return to the data state.
3470 tok_state_attribute_value_unquoted = ->
3471 switch c = txt.charAt(cur++)
3472 when "\t", "\n", "\u000c", ' '
3473 tok_state = tok_state_before_attribute_name
# character reference: '>' is passed as allowed_char and in_attr is true
3475 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3477 tok_state = tok_state_data
3482 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3485 tok_state = tok_state_data
3487 # Parse Error if ', <, = or ` (backtick)
3488 tok_cur_tag.attrs_a[0][1] += c
3491 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state that follows a quoted attribute value (both quoted-value
# states switch here). Consumes one character of txt (advancing cur) and
# switches tok_state accordingly; the last branch steps cur back so the
# character is handled again by the before attribute name state.
3492 tok_state_after_attribute_value_quoted = ->
3493 switch c = txt.charAt(cur++)
3494 when "\t", "\n", "\u000c", ' '
3495 tok_state = tok_state_before_attribute_name
3497 tok_state = tok_state_self_closing_start_tag
3499 tok_state = tok_state_data
3505 tok_state = tok_state_data
3508 tok_state = tok_state_before_attribute_name
3509 cur -= 1 # we didn't handle that char
3512 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer state for a possible self-closing start tag. Consumes one
# character of txt (advancing cur). One branch calls
# tok_cur_tag.flag 'self-closing' and returns to the data state; the other two
# step cur back so the character is reconsumed by the state they switch to.
3513 tok_state_self_closing_start_tag = ->
3514 c = txt.charAt(cur++)
# NOTE(review): the other flag calls in this file pass a value
# (flag 'force-quirks', true) -- confirm the one-argument form sets the flag
# rather than reading it.
3516 tok_cur_tag.flag 'self-closing'
3517 tok_state = tok_state_data
3521 tok_state = tok_state_data
3522 cur -= 1 # Reconsume
3526 tok_state = tok_state_before_attribute_name
3527 cur -= 1 # Reconsume
3530 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3531 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer state for bogus comments. Takes either the rest of txt or the
# text from cur up to the next '>', replaces NUL characters with U+FFFD,
# appends the result to tok_cur_tag.text and returns to the data state.
# tok_cur_tag must already hold a comment token (see the warning above).
3532 tok_state_bogus_comment = ->
3533 next_gt = txt.indexOf '>', cur
3535 val = txt.substr cur
3538 val = txt.substr cur, (next_gt - cur)
# a string pattern would replace only the first NUL; the global regex
# replaces every one
3540 val = val.replace /\u0000/g, "\ufffd"
3541 tok_cur_tag.text += val
3542 tok_state = tok_state_data
3545 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer state that inspects the text at cur to decide what a markup
# declaration is: '--' starts a comment, 'doctype' (any letter case) starts a
# doctype, '[CDATA[' starts a CDATA section when the adjusted current node
# exists and is not in the HTML namespace, and anything else becomes a bogus
# comment.
3546 tok_state_markup_declaration_open = ->
3547 if txt.substr(cur, 2) is '--'
3549 tok_cur_tag = new_comment_token ''
3550 tok_state = tok_state_comment_start
3552 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3554 tok_state = tok_state_doctype
3556 acn = adjusted_current_node()
# the CDATA marker is matched case-sensitively
3557 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3559 tok_state = tok_state_cdata_section
# NOTE(review): the HTML5 spec says the next character consumed is the first
# character of the bogus comment, which suggests the initial text should be ''
# rather than '!' -- confirm against the tokenizer tests.
3563 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3564 tok_state = tok_state_bogus_comment
3567 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state at the start of a comment's text. Consumes one character of
# txt (advancing cur); branches switch to the comment start dash state,
# return to the data state, or append the character to tok_cur_tag.text.
3568 tok_state_comment_start = ->
3569 switch c = txt.charAt(cur++)
3571 tok_state = tok_state_comment_start_dash
# NOTE(review): this returns a separate character token. The other comment
# states append "\ufffd" to tok_cur_tag.text and switch to tok_state_comment
# instead -- confirm which is intended here.
3574 return new_character_token "\ufffd"
3577 tok_state = tok_state_data
3581 tok_state = tok_state_data
3582 cur -= 1 # Reconsume
3585 tok_cur_tag.text += c
3588 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer state after a dash at the start of a comment's text. Consumes one
# character of txt (advancing cur). Branches that continue the comment append
# the pending '-' together with the new text to tok_cur_tag.text and switch to
# the comment state.
3589 tok_state_comment_start_dash = ->
3590 switch c = txt.charAt(cur++)
3592 tok_state = tok_state_comment_end
# pending dash plus U+FFFD (replacement character)
3595 tok_cur_tag.text += "-\ufffd"
3596 tok_state = tok_state_comment
3599 tok_state = tok_state_data
3603 tok_state = tok_state_data
3604 cur -= 1 # Reconsume
# pending dash plus the character just read
3607 tok_cur_tag.text += "-#{c}"
3608 tok_state = tok_state_comment
3611 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer state inside a comment. Consumes one character of txt per call
# (advancing cur) and appends to tok_cur_tag.text, unless a branch switches to
# the comment end dash state or back to the data state.
3612 tok_state_comment = ->
3613 switch c = txt.charAt(cur++)
3615 tok_state = tok_state_comment_end_dash
3618 tok_cur_tag.text += "\ufffd"
3621 tok_state = tok_state_data
3622 cur -= 1 # Reconsume
3625 tok_cur_tag.text += c
3628 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer state after a single dash inside a comment. Consumes one character
# of txt (advancing cur). Branches that stay in the comment append the pending
# '-' together with the new text to tok_cur_tag.text and switch back to the
# comment state.
3629 tok_state_comment_end_dash = ->
3630 switch c = txt.charAt(cur++)
3632 tok_state = tok_state_comment_end
# pending dash plus U+FFFD (replacement character)
3635 tok_cur_tag.text += "-\ufffd"
3636 tok_state = tok_state_comment
3639 tok_state = tok_state_data
3640 cur -= 1 # Reconsume
# pending dash plus the character just read
3643 tok_cur_tag.text += "-#{c}"
3644 tok_state = tok_state_comment
3647 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer state after two dashes inside a comment. Consumes one character of
# txt (advancing cur). Branches either return to the data state, switch to the
# comment end bang state, or put text back into tok_cur_tag.text; the ones
# that resume the comment prepend the two pending dashes.
3648 tok_state_comment_end = ->
3649 switch c = txt.charAt(cur++)
3651 tok_state = tok_state_data
# two pending dashes plus U+FFFD (replacement character)
3655 tok_cur_tag.text += "--\ufffd"
3656 tok_state = tok_state_comment
3659 tok_state = tok_state_comment_end_bang
# appends a single '-' and leaves tok_state unchanged
3662 tok_cur_tag.text += '-'
3665 tok_state = tok_state_data
3666 cur -= 1 # Reconsume
# two pending dashes plus the character just read
3670 tok_cur_tag.text += "--#{c}"
3671 tok_state = tok_state_comment
3674 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer state after '--!' inside a comment. Consumes one character of txt
# (advancing cur). Branches either return to the data state or put the
# pending '--!' back into tok_cur_tag.text and resume the comment.
3675 tok_state_comment_end_bang = ->
3676 switch c = txt.charAt(cur++)
# Only the pending '--!' is appended before switching to the comment end dash
# state: the character that leads there is a dash that may begin the closing
# '--', and that state adds it to the text later if the comment continues.
# Appending c here as well would duplicate it.
3678 tok_cur_tag.text += "--!"
3679 tok_state = tok_state_comment_end_dash
3681 tok_state = tok_state_data
# pending '--!' plus U+FFFD (replacement character)
3685 tok_cur_tag.text += "--!\ufffd"
3686 tok_state = tok_state_comment
3689 tok_state = tok_state_data
3690 cur -= 1 # Reconsume
# pending '--!' plus the character just read
3693 tok_cur_tag.text += "--!#{c}"
3694 tok_state = tok_state_comment
3697 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer state entered once 'doctype' has been matched. Consumes one
# character of txt (advancing cur) and moves on to
# tok_state_before_doctype_name; one branch instead returns to the data state
# with a new empty-named doctype token flagged 'force-quirks'.
3698 tok_state_doctype = ->
3699 switch c = txt.charAt(cur++)
3700 when "\t", "\u000a", "\u000c", ' '
3701 tok_state = tok_state_before_doctype_name
3704 tok_state = tok_state_data
3705 el = new_doctype_token ''
3706 el.flag 'force-quirks', true
3707 cur -= 1 # Reconsume
# any other character is handed to the next state unconsumed
3711 tok_state = tok_state_before_doctype_name
3712 cur -= 1 # Reconsume
3715 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer state before a doctype's name. Consumes one character of txt
# (advancing cur). Branches that start the name create the doctype token in
# tok_cur_tag from that first character and switch to tok_state_doctype_name;
# two branches instead create an empty-named token flagged 'force-quirks' and
# return to the data state.
3716 tok_state_before_doctype_name = ->
3717 c = txt.charAt(cur++)
3718 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# name starts with the lowercased character
3721 tok_cur_tag = new_doctype_token c.toLowerCase()
3722 tok_state = tok_state_doctype_name
# name starts with U+FFFD (replacement character)
3726 tok_cur_tag = new_doctype_token "\ufffd"
3727 tok_state = tok_state_doctype_name
3731 el = new_doctype_token ''
3732 el.flag 'force-quirks', true
3733 tok_state = tok_state_data
3737 tok_state = tok_state_data
3738 el = new_doctype_token ''
3739 el.flag 'force-quirks', true
3740 cur -= 1 # Reconsume
# name starts with the character as-is
3743 tok_cur_tag = new_doctype_token c
3744 tok_state = tok_state_doctype_name
3747 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer state inside a doctype's name. Consumes one character of txt per
# call (advancing cur) and appends to tok_cur_tag.name. Whitespace switches to
# tok_state_after_doctype_name; other branches return to the data state.
3748 tok_state_doctype_name = ->
3749 c = txt.charAt(cur++)
3750 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3751 tok_state = tok_state_after_doctype_name
3754 tok_state = tok_state_data
3757 tok_cur_tag.name += c.toLowerCase()
3761 tok_cur_tag.name += "\ufffd"
3765 tok_state = tok_state_data
3766 tok_cur_tag.flag 'force-quirks', true
3767 cur -= 1 # Reconsume
3770 tok_cur_tag.name += c
3773 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer state after a doctype's name. Consumes one character of txt
# (advancing cur), then checks whether the six characters starting at the one
# just consumed spell 'public' or 'system' (any letter case) and switches to
# the matching keyword state. Otherwise the token is flagged 'force-quirks'
# and the bogus doctype state takes over.
3774 tok_state_after_doctype_name = ->
3775 c = txt.charAt(cur++)
3776 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3779 tok_state = tok_state_data
3783 tok_state = tok_state_data
3784 tok_cur_tag.flag 'force-quirks', true
3785 cur -= 1 # Reconsume
# cur was already advanced, so cur - 1 is the character held in c
3788 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3790 tok_state = tok_state_after_doctype_public_keyword
3792 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3794 tok_state = tok_state_after_doctype_system_keyword
3797 tok_cur_tag.flag 'force-quirks', true
3798 tok_state = tok_state_bogus_doctype
3801 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer state after the 'public' keyword of a doctype. Consumes one
# character of txt (advancing cur). Branches that open the public identifier
# reset tok_cur_tag.public_identifier to '' and switch to the double- or
# single-quoted identifier state; the failure branches flag the token
# 'force-quirks'.
3802 tok_state_after_doctype_public_keyword = ->
3803 c = txt.charAt(cur++)
3804 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3805 tok_state = tok_state_before_doctype_public_identifier
3809 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3810 tok_state = tok_state_doctype_public_identifier_double_quoted
3814 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3815 tok_state = tok_state_doctype_public_identifier_single_quoted
3819 tok_cur_tag.flag 'force-quirks', true
3820 tok_state = tok_state_data
3824 tok_state = tok_state_data
3825 tok_cur_tag.flag 'force-quirks', true
3826 cur -= 1 # Reconsume
3830 tok_cur_tag.flag 'force-quirks', true
3831 tok_state = tok_state_bogus_doctype
3834 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer state before a doctype's public identifier. Consumes one character
# of txt (advancing cur). Branches that open the identifier reset
# tok_cur_tag.public_identifier to '' and switch to the double- or
# single-quoted identifier state; the failure branches flag the token
# 'force-quirks'.
3835 tok_state_before_doctype_public_identifier = ->
3836 c = txt.charAt(cur++)
3837 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3841 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3842 tok_state = tok_state_doctype_public_identifier_double_quoted
3846 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3847 tok_state = tok_state_doctype_public_identifier_single_quoted
3851 tok_cur_tag.flag 'force-quirks', true
3852 tok_state = tok_state_data
3856 tok_state = tok_state_data
3857 tok_cur_tag.flag 'force-quirks', true
3858 cur -= 1 # Reconsume
3862 tok_cur_tag.flag 'force-quirks', true
3863 tok_state = tok_state_bogus_doctype
3867 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer state inside a double-quoted doctype public identifier. Consumes
# one character of txt per call (advancing cur) and appends to
# tok_cur_tag.public_identifier until a branch switches tok_state; branches
# that abandon the doctype flag it 'force-quirks' and return to the data
# state.
3868 tok_state_doctype_public_identifier_double_quoted = ->
3869 c = txt.charAt(cur++)
3871 tok_state = tok_state_after_doctype_public_identifier
3875 tok_cur_tag.public_identifier += "\ufffd"
3879 tok_cur_tag.flag 'force-quirks', true
3880 tok_state = tok_state_data
3884 tok_state = tok_state_data
3885 tok_cur_tag.flag 'force-quirks', true
3886 cur -= 1 # Reconsume
3889 tok_cur_tag.public_identifier += c
3892 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer state inside a single-quoted doctype public identifier. Consumes
# one character of txt per call (advancing cur) and appends to
# tok_cur_tag.public_identifier until a branch switches tok_state; branches
# that abandon the doctype flag it 'force-quirks' and return to the data
# state.
3893 tok_state_doctype_public_identifier_single_quoted = ->
3894 c = txt.charAt(cur++)
3896 tok_state = tok_state_after_doctype_public_identifier
3900 tok_cur_tag.public_identifier += "\ufffd"
3904 tok_cur_tag.flag 'force-quirks', true
3905 tok_state = tok_state_data
3909 tok_state = tok_state_data
3910 tok_cur_tag.flag 'force-quirks', true
3911 cur -= 1 # Reconsume
3914 tok_cur_tag.public_identifier += c
3917 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer state after a doctype's public identifier. Consumes one character
# of txt (advancing cur). Whitespace switches to the between-identifiers
# state; branches that open a system identifier reset
# tok_cur_tag.system_identifier to '' and switch to the matching quoted state;
# the failure branches flag the token 'force-quirks'.
3918 tok_state_after_doctype_public_identifier = ->
3919 c = txt.charAt(cur++)
3920 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3921 tok_state = tok_state_between_doctype_public_and_system_identifiers
3924 tok_state = tok_state_data
3928 tok_cur_tag.system_identifier = ''
3929 tok_state = tok_state_doctype_system_identifier_double_quoted
3933 tok_cur_tag.system_identifier = ''
3934 tok_state = tok_state_doctype_system_identifier_single_quoted
3938 tok_state = tok_state_data
3939 tok_cur_tag.flag 'force-quirks', true
3940 cur -= 1 # Reconsume
3944 tok_cur_tag.flag 'force-quirks', true
3945 tok_state = tok_state_bogus_doctype
3948 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer state between a doctype's public and system identifiers. Consumes
# one character of txt (advancing cur). Branches that open a system identifier
# reset tok_cur_tag.system_identifier to '' and switch to the matching quoted
# state; the failure branches flag the token 'force-quirks'.
3949 tok_state_between_doctype_public_and_system_identifiers = ->
3950 c = txt.charAt(cur++)
3951 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3954 tok_state = tok_state_data
3958 tok_cur_tag.system_identifier = ''
3959 tok_state = tok_state_doctype_system_identifier_double_quoted
3963 tok_cur_tag.system_identifier = ''
3964 tok_state = tok_state_doctype_system_identifier_single_quoted
3968 tok_state = tok_state_data
3969 tok_cur_tag.flag 'force-quirks', true
3970 cur -= 1 # Reconsume
3974 tok_cur_tag.flag 'force-quirks', true
3975 tok_state = tok_state_bogus_doctype
3978 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer state after the 'system' keyword of a doctype. Consumes one
# character of txt (advancing cur). Whitespace switches to the before system
# identifier state; branches that open the identifier reset
# tok_cur_tag.system_identifier to '' and switch to the matching quoted state;
# the failure branches flag the token 'force-quirks'.
3979 tok_state_after_doctype_system_keyword = ->
3980 c = txt.charAt(cur++)
3981 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3982 tok_state = tok_state_before_doctype_system_identifier
3986 tok_cur_tag.system_identifier = ''
3987 tok_state = tok_state_doctype_system_identifier_double_quoted
3991 tok_cur_tag.system_identifier = ''
3992 tok_state = tok_state_doctype_system_identifier_single_quoted
3996 tok_cur_tag.flag 'force-quirks', true
3997 tok_state = tok_state_data
4001 tok_state = tok_state_data
4002 tok_cur_tag.flag 'force-quirks', true
4003 cur -= 1 # Reconsume
4007 tok_cur_tag.flag 'force-quirks', true
4008 tok_state = tok_state_bogus_doctype
4011 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer state before a doctype's system identifier. Consumes one character
# of txt (advancing cur). Branches that open the identifier reset
# tok_cur_tag.system_identifier to '' and switch to the matching quoted state;
# the failure branches flag the token 'force-quirks'.
4012 tok_state_before_doctype_system_identifier = ->
4013 c = txt.charAt(cur++)
4014 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4017 tok_cur_tag.system_identifier = ''
4018 tok_state = tok_state_doctype_system_identifier_double_quoted
4021 tok_cur_tag.system_identifier = ''
4022 tok_state = tok_state_doctype_system_identifier_single_quoted
4026 tok_cur_tag.flag 'force-quirks', true
4027 tok_state = tok_state_data
4031 tok_state = tok_state_data
4032 tok_cur_tag.flag 'force-quirks', true
4033 cur -= 1 # Reconsume
4037 tok_cur_tag.flag 'force-quirks', true
4038 tok_state = tok_state_bogus_doctype
4041 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state inside a double-quoted doctype system identifier. Consumes
# one character of txt per call (advancing cur) and appends to
# tok_cur_tag.system_identifier until a branch switches tok_state; branches
# that abandon the doctype flag it 'force-quirks' and return to the data
# state.
4042 tok_state_doctype_system_identifier_double_quoted = ->
4043 c = txt.charAt(cur++)
4045 tok_state = tok_state_after_doctype_system_identifier
4049 tok_cur_tag.system_identifier += "\ufffd"
4053 tok_cur_tag.flag 'force-quirks', true
4054 tok_state = tok_state_data
4058 tok_state = tok_state_data
4059 tok_cur_tag.flag 'force-quirks', true
4060 cur -= 1 # Reconsume
4063 tok_cur_tag.system_identifier += c
4066 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state inside a single-quoted doctype system identifier. Consumes
# one character of txt per call (advancing cur) and appends to
# tok_cur_tag.system_identifier until a branch switches tok_state; branches
# that abandon the doctype flag it 'force-quirks' and return to the data
# state.
4067 tok_state_doctype_system_identifier_single_quoted = ->
4068 c = txt.charAt(cur++)
4070 tok_state = tok_state_after_doctype_system_identifier
4074 tok_cur_tag.system_identifier += "\ufffd"
4078 tok_cur_tag.flag 'force-quirks', true
4079 tok_state = tok_state_data
4083 tok_state = tok_state_data
4084 tok_cur_tag.flag 'force-quirks', true
4085 cur -= 1 # Reconsume
4088 tok_cur_tag.system_identifier += c
4091 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state after a doctype's system identifier. Consumes one character
# of txt (advancing cur) and either returns to the data state or hands over to
# the bogus doctype state. Unlike the earlier doctype states, the bogus
# doctype branch here deliberately does not set 'force-quirks'.
4092 tok_state_after_doctype_system_identifier = ->
4093 c = txt.charAt(cur++)
4094 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4097 tok_state = tok_state_data
4101 tok_state = tok_state_data
4102 tok_cur_tag.flag 'force-quirks', true
4103 cur -= 1 # Reconsume
4107 # do _not_ tok_cur_tag.flag 'force-quirks', true
4108 tok_state = tok_state_bogus_doctype
4111 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state for the remainder of a malformed doctype. Consumes one
# character of txt per call (advancing cur); the branches shown return to the
# data state, one of them stepping cur back so the character is reconsumed
# there.
4112 tok_state_bogus_doctype = ->
4113 c = txt.charAt(cur++)
4115 tok_state = tok_state_data
4118 tok_state = tok_state_data
4119 cur -= 1 # Reconsume
4124 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer state for a CDATA section. Switches back to the data state right
# away, then takes either the rest of txt or the text from cur up to the next
# ']]>', replaces NUL characters with U+FFFD and returns the result as a
# single character token.
4125 tok_state_cdata_section = ->
4126 tok_state = tok_state_data
4127 next_gt = txt.indexOf ']]>', cur
4129 val = txt.substr cur
4132 val = txt.substr cur, (next_gt - cur)
# a string pattern would replace only the first NUL; the global regex
# replaces every one
4134 val = val.replace /\u0000/g, "\ufffd" # fixfull spec doesn't say this
4135 return new_character_token val # fixfull split
4137 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4138 # Don't set this as a state, just call it
4139 # returns a string (NOT a text node)
# Decodes the character reference that starts at cur (presumably called right
# after an '&' has been consumed -- confirm against the callers).
# allowed_char: an extra character that, like whitespace, '<', '&' and end of
#   input, is checked first at cur; the attribute value states pass their
#   closing delimiter ('"', "'" or '>'). Defaults to null.
# in_attr: true when called from an attribute value state; presumably what
#   enables the '=' / alphanumeric exceptions for legacy references below --
#   confirm. Defaults to false.
# Returns a string.
4140 parse_character_reference = (allowed_char = null, in_attr = false) ->
4141 if cur >= txt.length
# peeks at the character without advancing cur
4143 switch c = txt.charAt(cur)
4144 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4145 # explicitly not a parse error
4148 # there has to be "one or more" alnums between & and ; to be a parse error
4151 if cur + 1 >= txt.length
# an 'x' or 'X' follows the current character
4153 if txt.charAt(cur + 1).toLowerCase() is 'x'
# advance i over every character that belongs to charset
4162 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4166 if txt.charAt(start + i) is ';'
4168 # FIXME This is supposed to generate parse errors for some chars
# the prefix and the lowercased run are decoded by decode_named_char_ref
4169 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4176 if alnum.indexOf(txt.charAt(cur + i)) is -1
4179 # exit early, because parse_error() below needs at least one alnum
4181 if txt.charAt(cur + i) is ';'
4182 i += 1 # include ';' terminator in value
4183 decoded = decode_named_char_ref txt.substr(cur, i)
4190 # no ';' terminator (only legacy char refs)
4192 for i in [2..max] # no prefix matches, so ok to check shortest first
# look up the first i characters in the legacy_char_refs table
4193 c = legacy_char_refs[txt.substr(cur, i)]
4196 if txt.charAt(cur + i) is '='
4197 # "because some legacy user agents will
4198 # misinterpret the markup in those cases"
4201 if alnum.indexOf(txt.charAt(cur + i)) > -1
4202 # this makes attributes forgiving about url args
4204 # ok, and besides the weird exceptions for attributes...
4205 # return the matching char
4206 cur += i # consume entity chars
4207 parse_error() # because no terminating ";"
4211 return # never reached
4213 # tree constructor initialization
4214 # see comments on TYPE_TAG/etc for the structure of this data
# Initial parser state. doc is an 'html' tag node in the HTML namespace.
4215 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4217 afe = [] # active formatting elements
4218 template_ins_modes = []
# current insertion mode; original_ins_mode starts out as the same value
4219 ins_mode = ins_mode_initial
4220 original_ins_mode = ins_mode # TODO check spec
4221 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
4222 flag_frameset_ok = true
4224 flag_foster_parenting = false
4225 form_element_pointer = null
4226 temporary_buffer = null
4227 pending_table_character_tokens = []
4228 head_element_pointer = null
4229 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4230 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4232 # tokenizer initialization
# the tokenizer starts in the data state
4233 tok_state = tok_state_data
4240 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a serialized string from els by appending the result of each item's
# serialize method; shallow and show_ids are passed straight through to it.
4243 serialize_els = (els, shallow, show_ids) ->
4249 serialized += t.serialize shallow, show_ids
4252 # TODO export TYPE_*
# Public interface of the module: the parser entry point, the two debug log
# helpers, and the node type and namespace constants.
4253 module.exports.parse_html = parse_html
4254 module.exports.debug_log_reset = debug_log_reset
4255 module.exports.debug_log_each = debug_log_each
4256 module.exports.TYPE_TAG = TYPE_TAG
4257 module.exports.TYPE_TEXT = TYPE_TEXT
4258 module.exports.TYPE_COMMENT = TYPE_COMMENT
4259 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4260 module.exports.NS_HTML = NS_HTML
4261 module.exports.NS_MATHML = NS_MATHML
4262 module.exports.NS_SVG = NS_SVG