1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
95 @flags = args.flags ? {}
99 @id = "#{++prev_node_id}"
100 acknowledge_self_closing: ->
102 @token.flag 'did_self_close'
104 @flag 'did_self_close', true
105 flag: (key, value = null) ->
110 serialize: (shallow = false, show_ids = false) -> # for unit tests
115 ret += JSON.stringify @name
130 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
136 ret += c.serialize shallow, show_ids
140 ret += JSON.stringify @text
143 ret += JSON.stringify @text
145 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
148 when TYPE_AAA_BOOKMARK
149 ret += 'aaa_bookmark'
152 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
155 # helpers: (only take args that are normally known when parser creates nodes)
# Build a TYPE_START_TAG token carrying the given tag name.
# Everything else on the Node keeps its constructor default.
new_open_tag = (name) ->
	new Node TYPE_START_TAG, {name}
# Build a TYPE_END_TAG token carrying the given tag name.
new_end_tag = (name) ->
	opts = name: name
	new Node TYPE_END_TAG, opts
# Build a TYPE_TAG node (a real tree element) with the given tag name.
# Attributes and children start out as the Node constructor defaults.
new_element = (name) ->
	new Node TYPE_TAG, {name}
# Wrap a string of character data in a TYPE_TEXT node.
new_text_node = (txt) ->
	new Node TYPE_TEXT, {text: txt}
# Character tokens are built by the very same function (an alias, not a copy).
new_character_token = new_text_node
# Build a TYPE_COMMENT token whose text is the comment's contents.
new_comment_token = (txt) ->
	opts = text: txt
	new Node TYPE_COMMENT, opts
# Build a TYPE_DOCTYPE token; the doctype's name is stored in the node's
# `name` field.
new_doctype_token = (name) ->
	new Node TYPE_DOCTYPE, {name}
170 return new Node TYPE_EOF
172 return new Node TYPE_AFE_MARKER
# Build a TYPE_AAA_BOOKMARK node: a placeholder with no name or text, used by
# the adoption agency algorithm to mark a position in a list.
new_aaa_bookmark = ->
	new Node TYPE_AAA_BOOKMARK
# Character-class lookup strings. Membership is tested with indexOf.
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
alnum = [lc_alpha, uc_alpha, digits].join ''
hex_chars = "#{digits}abcdefABCDEF"

# True only for a single upper-case ASCII letter.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) >= 0

# True only for a single lower-case ASCII letter.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) >= 0

# some SVG elements have dashes in them
tag_name_chars = "#{alnum}-"
190 # http://www.w3.org/TR/html5/infrastructure.html#space-character
191 space_chars = "\u0009\u000a\u000c\u000d\u0020"
193 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is exactly one character
# and that character is listed in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) >= 0
197 is_input_hidden_tok = (t) ->
198 return unless t.type is TYPE_START_TAG
201 if a[1].toLowerCase() is 'hidden'
206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
212 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
232 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
233 raw_text_elements = ['script', 'style']
234 escapable_raw_text_elements = ['textarea', 'title']
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
237 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
256 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262 'determinant', 'diff', 'divergence', 'divide', 'domain',
263 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283 'vectorproduct', 'xor'
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
290 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
291 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
292 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
293 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
294 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
295 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
296 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
297 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
298 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
299 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
300 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
301 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
302 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
303 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
304 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
305 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
306 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
307 wbr:NS_HTML, xmp:NS_HTML,
310 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
311 'annotation-xml':NS_MATHML,
314 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
317 formatting_elements = {
318 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
319 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
323 mathml_text_integration = {
324 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# Reports whether element `el` is a MathML text integration point, i.e. its
# name is a key of mathml_text_integration AND its namespace matches the
# namespace recorded for that name.
#
# el: a Node (reads el.name and el.namespace)
# returns: boolean
is_mathml_text_integration_point = (el) ->
	# This must be a comparison (`is`), not an assignment (`=`). Assigning
	# would write el.namespace into the lookup table under el.name on every
	# call and hand back the namespace (truthy) instead of a boolean.
	return mathml_text_integration[el.name] is el.namespace
328 is_html_integration = (el) -> # DON'T PASS A TOKEN
329 if el.namespace is NS_MATHML and el.name is 'annotation-xml'
330 if el.attrs.encoding?
331 if el.attrs.encoding.toLowerCase() is 'text/html'
333 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
336 if el.namespace is NS_SVG
337 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
342 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
346 foster_parenting_targets = {
# True when `e` is listed in special_elements under its own namespace
# (the table maps tag name -> namespace).
el_is_special = (e) ->
	listed_ns = special_elements[e.name]
	listed_ns is e.namespace
# The three "special" elements that el_is_special_not_adp excludes:
# address, div and p (adp for short).
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML

# True when `el` is listed in special_elements under its own namespace and is
# NOT one of the adp_els entries for that namespace.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
378 altglyphdef: 'altGlyphDef'
379 altglyphitem: 'altGlyphItem'
380 animatecolor: 'animateColor'
381 animatemotion: 'animateMotion'
382 animatetransform: 'animateTransform'
385 fecolormatrix: 'feColorMatrix'
386 fecomponenttransfer: 'feComponentTransfer'
387 fecomposite: 'feComposite'
388 feconvolvematrix: 'feConvolveMatrix'
389 fediffuselighting: 'feDiffuseLighting'
390 fedisplacementmap: 'feDisplacementMap'
391 fedistantlight: 'feDistantLight'
392 fedropshadow: 'feDropShadow'
398 fegaussianblur: 'feGaussianBlur'
401 femergenode: 'feMergeNode'
402 femorphology: 'feMorphology'
404 fepointlight: 'fePointLight'
405 fespecularlighting: 'feSpecularLighting'
406 fespotlight: 'feSpotLight'
408 feturbulence: 'feTurbulence'
409 foreignobject: 'foreignObject'
411 lineargradient: 'linearGradient'
412 radialgradient: 'radialGradient'
415 svg_attribute_fixes = {
416 attributename: 'attributeName'
417 attributetype: 'attributeType'
418 basefrequency: 'baseFrequency'
419 baseprofile: 'baseProfile'
421 clippathunits: 'clipPathUnits'
422 contentscripttype: 'contentScriptType'
423 contentstyletype: 'contentStyleType'
424 diffuseconstant: 'diffuseConstant'
426 externalresourcesrequired: 'externalResourcesRequired'
427 filterres: 'filterRes'
428 filterunits: 'filterUnits'
430 gradienttransform: 'gradientTransform'
431 gradientunits: 'gradientUnits'
432 kernelmatrix: 'kernelMatrix'
433 kernelunitlength: 'kernelUnitLength'
434 keypoints: 'keyPoints'
435 keysplines: 'keySplines'
437 lengthadjust: 'lengthAdjust'
438 limitingconeangle: 'limitingConeAngle'
439 markerheight: 'markerHeight'
440 markerunits: 'markerUnits'
441 markerwidth: 'markerWidth'
442 maskcontentunits: 'maskContentUnits'
443 maskunits: 'maskUnits'
444 numoctaves: 'numOctaves'
445 pathlength: 'pathLength'
446 patterncontentunits: 'patternContentUnits'
447 patterntransform: 'patternTransform'
448 patternunits: 'patternUnits'
449 pointsatx: 'pointsAtX'
450 pointsaty: 'pointsAtY'
451 pointsatz: 'pointsAtZ'
452 preservealpha: 'preserveAlpha'
453 preserveaspectratio: 'preserveAspectRatio'
454 primitiveunits: 'primitiveUnits'
457 repeatcount: 'repeatCount'
458 repeatdur: 'repeatDur'
459 requiredextensions: 'requiredExtensions'
460 requiredfeatures: 'requiredFeatures'
461 specularconstant: 'specularConstant'
462 specularexponent: 'specularExponent'
463 spreadmethod: 'spreadMethod'
464 startoffset: 'startOffset'
465 stddeviation: 'stdDeviation'
466 stitchtiles: 'stitchTiles'
467 surfacescale: 'surfaceScale'
468 systemlanguage: 'systemLanguage'
469 tablevalues: 'tableValues'
472 textlength: 'textLength'
474 viewtarget: 'viewTarget'
475 xchannelselector: 'xChannelSelector'
476 ychannelselector: 'yChannelSelector'
477 zoomandpan: 'zoomAndPan'
479 adjust_mathml_attributes = (t) ->
481 if a[0] is 'definitionurl'
482 a[0] = 'definitionURL'
484 adjust_svg_attributes = (t) ->
486 if svg_attribute_fixes[a[0]]?
487 a[0] = svg_attribute_fixes[a[0]]
489 adjust_foreign_attributes = (t) ->
493 # decode_named_char_ref()
495 # The list of named character references is _huge_ so ask the browser to decode
496 # for us instead of wasting bandwidth/space on including the table here.
498 # Pass without the "&" but with the ";" examples:
499 # for "&" pass "amp;"
500 # for "′" pass "x2032;"
503 textarea: document.createElement('textarea')
505 # TODO test this in IE8
506 decode_named_char_ref = (txt) ->
508 decoded = g_dncr.cache[txt]
509 return decoded if decoded?
510 g_dncr.textarea.innerHTML = txt
511 decoded = g_dncr.textarea.value
512 return null if decoded is txt
513 return g_dncr.cache[txt] = decoded
515 parse_html = (args) ->
517 cur = null # index of next char in txt to be parsed
518 # declare doc and tokenizer variables so they're in scope below
520 open_els = null # stack of open elements
521 afe = null # active formatting elements
522 template_ins_modes = null
524 original_ins_mode = null
526 tok_cur_tag = null # partially parsed tag
527 flag_scripting = null
528 flag_frameset_ok = null
530 flag_foster_parenting = null
531 form_element_pointer = null
532 temporary_buffer = null
533 pending_table_character_tokens = null
534 head_element_pointer = null
535 flag_fragment_parsing = null
536 context_element = null
545 console.log "Parse error at character #{cur} of #{txt.length}"
547 afe_push = (new_el) ->
550 if el.name is new_el.name and el.namespace is new_el.namespace
552 continue unless new_el.attrs[k] is v
553 for k, v of new_el.attrs
554 continue unless el.attrs[k] is v
561 afe.unshift new_afe_marker()
563 # the functions below implement the Tree Construction algorithm
564 # http://www.w3.org/TR/html5/syntax.html#tree-construction
566 # But first... the helpers
567 template_tag_is_open = ->
569 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
572 is_in_scope_x = (tag_name, scope, namespace) ->
574 if t.name is tag_name and (namespace is null or namespace is t.namespace)
576 if scope[t.name] is t.namespace
579 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
581 if t.name is tag_name and (namespace is null or namespace is t.namespace)
583 if scope[t.name] is t.namespace
585 if scope2[t.name] is t.namespace
589 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
590 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
591 template: NS_HTML, mi: NS_MATHML,
593 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
594 'annotation-xml': NS_MATHML,
596 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-boundary tables (tag name -> namespace). The first two are
# passed alongside standard_scopers; table_scopers is used on its own.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check using standard_scopers as the boundary table.
# tag_name: element name to look for
# namespace: restrict the match to this namespace; null matches any namespace
# Delegates to is_in_scope_x and returns its result.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, standard_scopers, namespace
# Scope check whose boundary tables are standard_scopers plus button_scopers.
# tag_name: element name to look for
# namespace: restrict the match to this namespace; null matches any namespace
# Delegates to is_in_scope_x_y and returns its result.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
# Scope check using only table_scopers (html, table, template) as boundaries.
# tag_name: element name to look for
# namespace: restrict the match to this namespace; null matches any namespace
# Delegates to is_in_scope_x and returns its result.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, table_scopers, namespace
# aka is_in_list_item_scope
# Scope check whose boundary tables are standard_scopers plus li_scopers
# (ol, ul).
# tag_name: element name to look for
# namespace: restrict the match to this namespace; null matches any namespace
# Delegates to is_in_scope_x_y and returns its result.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
610 is_in_select_scope = (tag_name, namespace = null) ->
612 if t.name is tag_name and (namespace is null or namespace is t.namespace)
614 if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
617 # this checks for a particular element, not by name
618 el_is_in_scope = (el) ->
622 if standard_scopers[t.name] is t.namespace
626 clear_to_table_stopers = {
631 clear_stack_to_table_context = ->
633 if clear_to_table_stopers[open_els[0].name]?
637 clear_to_table_body_stopers = {
644 clear_stack_to_table_body_context = ->
646 if clear_to_table_body_stopers[open_els[0].name]?
650 clear_to_table_row_stopers = {
655 clear_stack_to_table_row_context = ->
657 if clear_to_table_row_stopers[open_els[0].name]?
661 clear_afe_to_marker = ->
663 return unless afe.length > 0 # this happens in fragment case, ?spec error
665 if el.type is TYPE_AFE_MARKER
670 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
672 # 1. Let last be false.
674 # 2. Let node be the last node in the stack of open elements.
676 node = open_els[node_i]
677 # 3. Loop: If node is the first node in the stack of open elements,
678 # then set last to true, and, if the parser was originally created as
679 # part of the HTML fragment parsing algorithm (fragment case) set node
680 # to the context element.
682 if node_i is open_els.length - 1
684 # fixfull (fragment case)
686 # 4. If node is a select element, run these substeps:
687 if node.name is 'select'
688 # 1. If last is true, jump to the step below labeled done.
690 # 2. Let ancestor be node.
693 # 3. Loop: If ancestor is the first node in the stack of
694 # open elements, jump to the step below labeled done.
696 if ancestor_i is open_els.length - 1
698 # 4. Let ancestor be the node before ancestor in the stack
701 ancestor = open_els[ancestor_i]
702 # 5. If ancestor is a template node, jump to the step below
704 if ancestor.name is 'template'
706 # 6. If ancestor is a table node, switch the insertion mode
707 # to "in select in table" and abort these steps.
708 if ancestor.name is 'table'
709 ins_mode = ins_mode_in_select_in_table
711 # 7. Jump back to the step labeled loop.
712 # 8. Done: Switch the insertion mode to "in select" and abort
714 ins_mode = ins_mode_in_select
716 # 5. If node is a td or th element and last is false, then switch
717 # the insertion mode to "in cell" and abort these steps.
718 if (node.name is 'td' or node.name is 'th') and last is false
719 ins_mode = ins_mode_in_cell
721 # 6. If node is a tr element, then switch the insertion mode to "in
722 # row" and abort these steps.
724 ins_mode = ins_mode_in_row
726 # 7. If node is a tbody, thead, or tfoot element, then switch the
727 # insertion mode to "in table body" and abort these steps.
728 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
729 ins_mode = ins_mode_in_table_body
731 # 8. If node is a caption element, then switch the insertion mode
732 # to "in caption" and abort these steps.
733 if node.name is 'caption'
734 ins_mode = ins_mode_in_caption
736 # 9. If node is a colgroup element, then switch the insertion mode
737 # to "in column group" and abort these steps.
738 if node.name is 'colgroup'
739 ins_mode = ins_mode_in_column_group
741 # 10. If node is a table element, then switch the insertion mode to
742 # "in table" and abort these steps.
743 if node.name is 'table'
744 ins_mode = ins_mode_in_table
746 # 11. If node is a template element, then switch the insertion mode
747 # to the current template insertion mode and abort these steps.
748 # fixfull (template insertion mode stack)
750 # 12. If node is a head element and last is true, then switch the
751 # insertion mode to "in body" ("in body"! not "in head"!) and abort
752 # these steps. (fragment case)
753 if node.name is 'head' and last
754 ins_mode = ins_mode_in_body
756 # 13. If node is a head element and last is false, then switch the
757 # insertion mode to "in head" and abort these steps.
758 if node.name is 'head' and last is false
759 ins_mode = ins_mode_in_head
761 # 14. If node is a body element, then switch the insertion mode to
762 # "in body" and abort these steps.
763 if node.name is 'body'
764 ins_mode = ins_mode_in_body
766 # 15. If node is a frameset element, then switch the insertion mode
767 # to "in frameset" and abort these steps. (fragment case)
768 if node.name is 'frameset'
769 ins_mode = ins_mode_in_frameset
771 # 16. If node is an html element, run these substeps:
772 if node.name is 'html'
773 # 1. If the head element pointer is null, switch the insertion
774 # mode to "before head" and abort these steps. (fragment case)
775 if head_element_pointer is null
776 ins_mode = ins_mode_before_head
778 # 2. Otherwise, the head element pointer is not null,
779 # switch the insertion mode to "after head" and abort these
781 ins_mode = ins_mode_after_head
783 # 17. If last is true, then switch the insertion mode to "in body"
784 # and abort these steps. (fragment case)
786 ins_mode = ins_mode_in_body
788 # 18. Let node now be the node before node in the stack of open
791 node = open_els[node_i]
792 # 19. Return to the step labeled loop.
796 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
797 adjusted_current_node = ->
798 if open_els.length is 1 and flag_fragment_parsing
799 return context_element
802 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
803 # this implementation is structured (mostly) as described at the link above.
804 # capitalized comments are the "labels" described at the link above.
806 return if afe.length is 0
807 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
812 if i is afe.length - 1
815 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
820 el = insert_html_element afe[i].token
825 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
826 # adoption agency algorithm
828 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
829 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
830 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
831 adoption_agency = (subject) ->
832 debug_log "adoption_agency()"
833 debug_log "tree: #{serialize_els doc.children, false, true}"
834 debug_log "open_els: #{serialize_els open_els, true, true}"
835 debug_log "afe: #{serialize_els afe, true, true}"
836 if open_els[0].name is subject
839 # remove it from the list of active formatting elements (if found)
844 debug_log "aaa: starting off with subject on top of stack, exiting"
851 # 5. Let formatting element be the last element in the list of
852 # active formatting elements that: is between the end of the list
853 # and the last scope marker in the list, if any, or the start of
854 # the list otherwise, and has the tag name subject.
856 for t, fe_of_afe in afe
857 if t.type is TYPE_AFE_MARKER
862 # If there is no such element, then abort these steps and instead
863 # act as described in the "any other end tag" entry above.
865 debug_log "aaa: fe not found in afe"
866 in_body_any_other_end_tag subject
868 # 6. If formatting element is not in the stack of open elements,
869 # then this is a parse error; remove the element from the list, and
872 for t, fe_of_open_els in open_els
877 debug_log "aaa: fe not found in open_els"
879 # "remove it from the list" must mean afe, since it's not in open_els
880 afe.splice fe_of_afe, 1
882 # 7. If formatting element is in the stack of open elements, but
883 # the element is not in scope, then this is a parse error; abort
885 unless el_is_in_scope fe
886 debug_log "aaa: fe not in scope"
889 # 8. If formatting element is not the current node, this is a parse
890 # error. (But do not abort these steps.)
891 unless open_els[0] is fe
894 # 9. Let furthest block be the topmost node in the stack of open
895 # elements that is lower in the stack than formatting element, and
896 # is an element in the special category. There might not be one.
898 fb_of_open_els = null
905 # and continue, to see if there's one that's more "topmost"
906 # 10. If there is no furthest block, then the UA must first pop all
907 # the nodes from the bottom of the stack of open elements, from the
908 # current node up to and including formatting element, then remove
909 # formatting element from the list of active formatting elements,
910 # and finally abort these steps.
912 debug_log "aaa: no fb"
916 afe.splice fe_of_afe, 1
918 # 11. Let common ancestor be the element immediately above
919 # formatting element in the stack of open elements.
920 ca = open_els[fe_of_open_els + 1] # common ancestor
922 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
923 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
924 bookmark = new_aaa_bookmark()
927 afe.splice i, 0, bookmark
929 node = last_node = fb
933 # 3. Let node be the element immediately above node in the
934 # stack of open elements, or if node is no longer in the stack
935 # of open elements (e.g. because it got removed by this
936 # algorithm), the element that was immediately above node in
937 # the stack of open elements before node was removed.
941 node_next = open_els[i + 1]
943 node = node_next ? node_above
944 debug_log "inner loop #{inner}"
945 debug_log "tree: #{serialize_els doc.children, false, true}"
946 debug_log "open_els: #{serialize_els open_els, true, true}"
947 debug_log "afe: #{serialize_els afe, true, true}"
948 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
949 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
950 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
951 debug_log "node: #{node.serialize true, true}"
952 # TODO make sure node_above gets re-set if/when node is removed from open_els
954 # 4. If node is formatting element, then go to the next step in
955 # the overall algorithm.
959 # 5. If inner loop counter is greater than three and node is in
960 # the list of active formatting elements, then remove node from
961 # the list of active formatting elements.
967 debug_log "max out inner"
972 # 6. If node is not in the list of active formatting elements,
973 # then remove node from the stack of open elements and then go
974 # back to the step labeled inner loop.
976 debug_log "not in afe"
979 node_above = open_els[i + 1]
983 debug_log "the bones"
984 # 7. create an element for the token for which the element node
985 # was created, in the HTML namespace, with common ancestor as
986 # the intended parent; replace the entry for node in the list
987 # of active formatting elements with an entry for the new
988 # element, replace the entry for node in the stack of open
989 # elements with an entry for the new element, and let node be
991 new_node = token_to_element node.token, NS_HTML, ca
995 debug_log "replaced in afe"
999 node_above = open_els[i + 1]
1000 open_els[i] = new_node
1001 debug_log "replaced in open_els"
1004 # 8. If last node is furthest block, then move the
1005 # aforementioned bookmark to be immediately after the new node
1006 # in the list of active formatting elements.
1011 debug_log "removed bookmark"
1015 # "after" means lower
1016 afe.splice i, 0, bookmark # "after as <-
1017 debug_log "placed bookmark after node"
1018 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1020 # 9. Insert last node into node, first removing it from its
1021 # previous parent node if any.
1022 if last_node.parent?
1023 debug_log "last_node has parent"
1024 for c, i in last_node.parent.children
1026 debug_log "removing last_node from parent"
1027 last_node.parent.children.splice i, 1
1029 node.children.push last_node
1030 last_node.parent = node
1031 # 10. Let last node be node.
1034 # 11. Return to the step labeled inner loop.
1035 # 14. Insert whatever last node ended up being in the previous step
1036 # at the appropriate place for inserting a node, but using common
1037 # ancestor as the override target.
1039 # In the case where fe is immediately followed by fb:
1040 # * inner loop exits out early (node==fe)
1042 # * last_node is still in the tree (not a duplicate)
1043 if last_node.parent?
1044 debug_log "FEFIRST? last_node has parent"
1045 for c, i in last_node.parent.children
1047 debug_log "removing last_node from parent"
1048 last_node.parent.children.splice i, 1
1051 debug_log "after aaa inner loop"
1052 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1053 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1054 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1055 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1056 debug_log "tree: #{serialize_els doc.children, false, true}"
1061 # can't use standard insert token thing, because it's already in
1062 # open_els and must stay at its current position in open_els
1063 dest = adjusted_insertion_location ca
1064 dest[0].children.splice dest[1], 0, last_node
1065 last_node.parent = dest[0]
1068 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1069 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1070 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1071 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1072 debug_log "tree: #{serialize_els doc.children, false, true}"
1074 # 15. Create an element for the token for which formatting element
1075 # was created, in the HTML namespace, with furthest block as the
1077 new_element = token_to_element fe.token, NS_HTML, fb
1078 # 16. Take all of the child nodes of furthest block and append them
1079 # to the element created in the last step.
1080 while fb.children.length
1081 t = fb.children.shift()
1082 t.parent = new_element
1083 new_element.children.push t
1084 # 17. Append that new element to furthest block.
1085 new_element.parent = fb
1086 fb.children.push new_element
1087 # 18. Remove formatting element from the list of active formatting
1088 # elements, and insert the new element into the list of active
1089 # formatting elements at the position of the aforementioned
1097 afe[i] = new_element
1099 # 19. Remove formatting element from the stack of open elements,
1100 # and insert the new element into the stack of open elements
1101 # immediately below the position of furthest block in that stack.
1102 for t, i in open_els
1104 open_els.splice i, 1
1106 for t, i in open_els
1108 open_els.splice i, 0, new_element
1110 # 20. Jump back to the step labeled outer loop.
1111 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1112 debug_log "tree: #{serialize_els doc.children, false, true}"
1113 debug_log "open_els: #{serialize_els open_els, true, true}"
1114 debug_log "afe: #{serialize_els afe, true, true}"
1115 debug_log "AAA DONE"
1117 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Close a p element: first generate implied end tags (the 'p' argument makes
# generate_implied_end_tags stop at a p instead of popping it), then test
# whether the current node (open_els[0]) is a p, then pop from open_els while
# more than one element remains.
# NOTE(review): the body of the `isnt 'p'` test and the loop's exit test are
# missing from this copy (embedded line numbers jump 1120 -> 1122 and
# 1123 -> 1126); presumably a parse error and a "stop once a p was popped"
# check -- confirm against the full source.
1118 close_p_element = ->
1119 generate_implied_end_tags 'p' # arg is exception
1120 if open_els[0].name isnt 'p'
1122 while open_els.length > 1 # just in case
1123 el = open_els.shift()
# Guarded helper: tests whether a 'p' is in button scope.
# NOTE(review): the action taken when the test passes is missing from this
# copy (embedded line 1128); presumably a call to close_p_element -- confirm.
1126 close_p_if_in_button_scope = ->
1127 if is_in_button_scope 'p'
1130 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1131 # aka insert_a_character = (t) ->
# Insert character token t at the adjusted insertion location.
# dest is the pair returned by adjusted_insertion_location():
# dest[0] is the parent node, dest[1] the index within dest[0].children.
1132 insert_character = (t) ->
1133 dest = adjusted_insertion_location()
1134 # fixfull check for Document node
# prev is the sibling immediately before the insertion point.
# NOTE(review): when dest[1] is 0 there is no such sibling and prev.type
# would throw; a guard may live on the missing embedded line 1135 -- confirm.
1136 prev = dest[0].children[dest[1] - 1]
1137 if prev.type is TYPE_TEXT
# (what happens for an adjacent text node is missing here: lines 1138-1139)
# otherwise the token itself is spliced in as a new child
1140 dest[0].children.splice dest[1], 0, t
1143 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for token t. It inspects the adjusted current
# node (acn) and the token type; the only call that survives in this copy is
# the final in_foreign_content t. The bodies of all the `if` tests below are
# missing (see the jumps in the embedded line numbers).
1144 process_token = (t) ->
1145 acn = adjusted_current_node()
1149 if acn.namespace is NS_HTML
1152 if is_mathml_text_integration_point(acn)
# NOTE(review): per the HTML5 tree-construction dispatcher, a start tag at a
# MathML text integration point is handled by the current insertion mode when
# its name is NEITHER mglyph NOR malignmark; this test matches exactly those
# two names. Confirm against the (not shown) branch body whether the
# condition is inverted.
1153 if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1156 if t.type is TYPE_TEXT
1159 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1162 if is_html_integration acn
1163 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1166 if t.type is TYPE_EOF
# nothing above applied: hand the token to the foreign-content rules
1169 in_foreign_content t
1173 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1174 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Compute the "appropriate place for inserting a node".
#
# override_target: optional node to use instead of the current node
#                  (open_els[0]) as the starting target.
# Returns a two-element array [target, target_i]: the parent node and the
# index inside target.children at which the new node should be spliced.
#
# When flag_foster_parenting is set and the target's name is listed in
# foster_parenting_targets, the location is redirected by the foster-parenting
# substeps below; otherwise it is simply "after the last child of target".
# NOTE(review): several short lines (assignments inside the search loops,
# `break`s, the `if override_target?` header) are missing from this copy.
1175 adjusted_insertion_location = (override_target = null) ->
1176 # 1. If there was an override target specified, then let target be the
1179 target = override_target
1180 else # Otherwise, let target be the current node.
1181 target = open_els[0]
1182 # 2. Determine the adjusted insertion location using the first matching
1183 # steps from the following list:
1185 # If foster parenting is enabled and target is a table, tbody, tfoot,
1186 # thead, or tr element Foster parenting happens when content is
1187 # misnested in tables.
1188 if flag_foster_parenting and foster_parenting_targets[target.name]
1189 loop # once. this is here so we can ``break`` to "abort these substeps"
1190 # 1. Let last template be the last template element in the
1191 # stack of open elements, if any.
1192 last_template = null
1193 last_template_i = null
1194 for el, i in open_els
1195 if el.name is 'template' and el.namespace is NS_HTML
1199 # 2. Let last table be the last table element in the stack of
1200 # open elements, if any.
1203 for el, i in open_els
1204 if el.name is 'table' and el.namespace is NS_HTML
1208 # 3. If there is a last template and either there is no last
1209 # table, or there is one, but last template is lower (more
1210 # recently added) than last table in the stack of open
1211 # elements, then: let adjusted insertion location be inside
1212 # last template's template contents, after its last child (if
1213 # any), and abort these substeps.
# a smaller index in open_els means "lower" / more recently added
1214 if last_template and (last_table is null or last_template_i < last_table_i)
1215 target = last_template # fixfull should be its contents
1216 target_i = target.children.length
1218 # 4. If there is no last table, then let adjusted insertion
1219 # location be inside the first element in the stack of open
1220 # elements (the html element), after its last child (if any),
1221 # and abort these substeps. (fragment case)
1222 if last_table is null
# the "first" element of the stack is the one at the highest index
1224 target = open_els[open_els.length - 1]
1225 target_i = target.children.length
1227 # 5. If last table has a parent element, then let adjusted
1228 # insertion location be inside last table's parent element,
1229 # immediately before last table, and abort these substeps.
1230 if last_table.parent?
1231 for c, i in last_table.parent.children
1233 target = last_table.parent
1237 # 6. Let previous element be the element immediately above last
1238 # table in the stack of open elements.
1240 # huh? how could it not have a parent?
# "above" means the next higher index in open_els
1241 previous_element = open_els[last_table_i + 1]
1242 # 7. Let adjusted insertion location be inside previous
1243 # element, after its last child (if any).
1244 target = previous_element
1245 target_i = target.children.length
1246 # Note: These steps are involved in part because it's possible
1247 # for elements, the table element in this case in particular,
1248 # to have been moved by a script around in the DOM, or indeed
1249 # removed from the DOM entirely, after the element was inserted
1251 break # don't really loop
1253 # Otherwise Let adjusted insertion location be inside target, after
1254 # its last child (if any).
1255 target_i = target.children.length
1257 # 3. If the adjusted insertion location is inside a template element,
1258 # let it instead be inside the template element's template contents,
1259 # after its last child (if any).
1260 # fixfull (template)
1262 # 4. Return the adjusted insertion location.
1263 return [target, target_i]
1265 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1266 # aka create_an_element_for_token
# Create an element Node for tag token t.
#
# t:               tag token; its name becomes the element name and the token
#                  itself is kept on the node as `token`.
# namespace:       namespace stored on the new node.
# intended_parent: accepted for parity with the spec; not referenced in the
#                  lines visible in this copy.
# NOTE(review): the loop header that walks the token's attributes, the
# initialisation of `attrs` and the final return are missing from this copy
# (embedded lines 1269-1270 and 1281-1283).
1267 token_to_element = (t, namespace, intended_parent) ->
1268 # convert attributes into a hash
# as written, a later attribute with the same name overwrites an earlier one
1271 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1272 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1274 # TODO 2. If the newly created element has an xmlns attribute in the
1275 # XMLNS namespace whose value is not exactly the same as the element's
1276 # namespace, that is a parse error. Similarly, if the newly created
1277 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1278 # value is not the XLink Namespace, that is a parse error.
1280 # fixfull: the spec says stuff about form pointers and ownerDocument
1284 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Create an element for `token` in `namespace` and splice it into the tree at
# the adjusted insertion location.
# NOTE(review): the lines that unpack `ail` into ail_el / ail_i, and whatever
# follows the splice (callers use the return value as the new element), are
# missing from this copy (embedded lines 1287-1288, 1291, 1293-1294).
1285 insert_foreign_element = (token, namespace) ->
1286 ail = adjusted_insertion_location()
1289 el = token_to_element token, namespace, ail_el
1290 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1292 ail_el.children.splice ail_i, 0, el
1295 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Insert an element for `token` in the HTML namespace.
# Pure convenience wrapper: forwards to insert_foreign_element with NS_HTML
# and hands back whatever that call returns.
insert_html_element = (token) ->
	return insert_foreign_element(token, NS_HTML)
1299 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1300 # position should be [node, index_within_children]
# Splice comment node t into the tree.
#
# position: optional [node, index_within_children] pair; when absent the
#           adjusted insertion location is used instead.
insert_comment = (t, position = null) ->
	position = adjusted_insertion_location() unless position?
	position[0].children.splice(position[1], 0, t)
1306 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: insert the element for start tag t,
# put the tokenizer into the RAWTEXT state, remember the insertion mode that
# was active, and continue in the "text" insertion mode.
parse_generic_raw_text = (t) ->
	insert_html_element(t)
	# must be saved before ins_mode is overwritten below
	original_ins_mode = ins_mode
	tok_state = tok_state_rawtext
	return ins_mode = ins_mode_text
# Generic RCDATA element parsing: same sequence as the raw text variant, but
# the tokenizer is switched to the RCDATA state.
parse_generic_rcdata_text = (t) ->
	insert_html_element(t)
	# must be saved before ins_mode is overwritten below
	original_ins_mode = ins_mode
	tok_state = tok_state_rcdata
	return ins_mode = ins_mode_text
1318 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1319 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Generate implied end tags: keep going while the current node's name is
# listed in end_tag_implied and is not the `except` name.
#
# except: optional tag name that must be left on the stack.
# NOTE(review): the loop body is missing from this copy (embedded line 1322);
# presumably it pops the current node off open_els -- confirm.
# NOTE(review): open_els[0] is read without a length check, so this assumes
# the stack is never empty here -- confirm.
1320 generate_implied_end_tags = (except = null) ->
1321 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1324 # 8.2.5.4 The rules for parsing tokens in HTML content
1325 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1327 # 8.2.5.4.1 The "initial" insertion mode
1328 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# "initial" insertion mode. Visible behaviour: comment and DOCTYPE tokens are
# tested for, and both the DOCTYPE branch and the trailing "anything else"
# path switch ins_mode to ins_mode_before_html.
# NOTE(review): the branch bodies other than the ins_mode assignments are
# missing from this copy (see the jumps in the embedded line numbers).
1329 ins_mode_initial = (t) ->
1332 if t.type is TYPE_COMMENT
1336 if t.type is TYPE_DOCTYPE
1337 # FIXME check identifiers, set quirks, etc
1340 ins_mode = ins_mode_before_html
1343 #fixfull (iframe, quirks)
1344 ins_mode = ins_mode_before_html
1348 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# "before html" insertion mode.
# An html start tag becomes the root: an element is created from the token,
# appended to doc.children, pushed onto open_els, and the mode moves on to
# ins_mode_before_head. The "anything else" path at the bottom does the same
# with a synthesized html start tag (new_open_tag 'html').
# NOTE(review): the bodies of the DOCTYPE / comment / end-tag tests and a few
# short statements are missing from this copy (see the embedded line numbers).
1349 ins_mode_before_html = (t) ->
1350 if t.type is TYPE_DOCTYPE
1353 if t.type is TYPE_COMMENT
1358 if t.type is TYPE_START_TAG and t.name is 'html'
1359 el = token_to_element t, NS_HTML, doc
1360 doc.children.push el
# index 0 of open_els is the current node
1361 open_els.unshift(el)
1362 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1363 ins_mode = ins_mode_before_head
1365 if t.type is TYPE_END_TAG
1366 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1367 # fall through to "anything else"
# anything else: synthesize the root html element
1372 html_tok = new_open_tag 'html'
1373 el = token_to_element html_tok, NS_HTML, doc
1374 doc.children.push el
1376 # ?fixfull browsing context
1377 ins_mode = ins_mode_before_head
1381 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# "before head" insertion mode.
# A head start tag inserts the head element, records it in
# head_element_pointer and switches to ins_mode_in_head. The "anything else"
# path at the bottom does the same with a synthesized head start tag.
# NOTE(review): the bodies of the comment / DOCTYPE / html tests and several
# short statements are missing from this copy (see the embedded line numbers).
1382 ins_mode_before_head = (t) ->
1385 if t.type is TYPE_COMMENT
1388 if t.type is TYPE_DOCTYPE
1391 if t.type is TYPE_START_TAG and t.name is 'html'
1394 if t.type is TYPE_START_TAG and t.name is 'head'
1395 el = insert_html_element t
1396 head_element_pointer = el
1397 ins_mode = ins_mode_in_head
# FIX (belongs to the 'head' start-tag branch above): embedded lines
# 1394-1398 are consecutive, so there was no return here and a head start tag
# fell through to "anything else" below, inserting a second head element.
return
1398 if t.type is TYPE_END_TAG
1399 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1400 # fall through to Anything else below
# anything else: synthesize the head element
1405 head_tok = new_open_tag 'head'
1406 el = insert_html_element head_tok
1407 head_element_pointer = el
1408 ins_mode = ins_mode_in_head
1411 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" clause of the "in head" insertion mode: pop the current
# node (expected to be the head element) and switch to ins_mode_after_head.
# NOTE(review): a trailing line is missing from this copy (embedded line
# 1415); presumably the token is reprocessed in the new mode -- confirm.
1412 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1413 open_els.shift() # spec says this will be a 'head' node
1414 ins_mode = ins_mode_after_head
# "in head" insertion mode: handles token t while inside the head element.
# Each `if` below corresponds to one clause of the spec; anything not matched
# ends up in ins_mode_in_head_else.
# Changes in this revision:
#   * generate_implied_end_tags is now actually invoked in the template
#     end-tag branch (a bare name is only a reference in CoffeeScript).
#   * the script branch passes the insertion-location node, not the whole
#     [node, index] pair, as the intended parent.
# NOTE(review): many short lines (parse_error / return / open_els pushes) are
# missing from this copy; see the jumps in the embedded line numbers.
1416 ins_mode_in_head = (t) ->
1417 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1420 if t.type is TYPE_COMMENT
1423 if t.type is TYPE_DOCTYPE
1426 if t.type is TYPE_START_TAG and t.name is 'html'
1429 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1430 el = insert_html_element t
1432 t.acknowledge_self_closing()
1434 if t.type is TYPE_START_TAG and t.name is 'meta'
1435 el = insert_html_element t
1437 t.acknowledge_self_closing()
1438 # fixfull encoding stuff
1440 if t.type is TYPE_START_TAG and t.name is 'title'
1441 parse_generic_rcdata_text t
1443 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1444 parse_generic_raw_text t
1446 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1447 insert_html_element t
1448 ins_mode = ins_mode_in_head_noscript
1450 if t.type is TYPE_START_TAG and t.name is 'script'
# ail is a [node, index] pair
1451 ail = adjusted_insertion_location()
# FIX: the intended parent is the node ail[0], not the pair itself
1452 el = token_to_element t, NS_HTML, ail[0]
1453 el.flag 'parser-inserted', true
1454 # fixfull fragment case
1455 ail[0].children.splice ail[1], 0, el
1457 tok_state = tok_state_script_data
1458 original_ins_mode = ins_mode # make sure orig... is defined
1459 ins_mode = ins_mode_text
1461 if t.type is TYPE_END_TAG and t.name is 'head'
1462 open_els.shift() # will be a head element... spec says so
1463 ins_mode = ins_mode_after_head
1465 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1466 ins_mode_in_head_else t
1468 if t.type is TYPE_START_TAG and t.name is 'template'
1469 insert_html_element t
1471 flag_frameset_ok = false
1472 ins_mode = ins_mode_in_template
1473 template_ins_modes.unshift ins_mode_in_template
1475 if t.type is TYPE_END_TAG and t.name is 'template'
1476 if template_tag_is_open()
# FIX: without the parentheses this was a no-op reference, not a call
1477 generate_implied_end_tags()
1478 if open_els[0].name isnt 'template'
1481 el = open_els.shift()
1482 if el.name is 'template'
1484 clear_afe_to_marker()
1485 template_ins_modes.shift()
1490 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1493 ins_mode_in_head_else t
1495 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" clause of the "in head noscript" insertion mode; the only
# statement visible in this copy switches ins_mode back to ins_mode_in_head.
# NOTE(review): embedded lines 1497-1498 and 1500 are missing; presumably a
# parse error, a pop of the noscript element and a reprocess of t -- confirm.
1496 ins_mode_in_head_noscript_else = (t) ->
1499 ins_mode = ins_mode_in_head
# "in head noscript" insertion mode.
# Change in this revision: the second clause now matches only the html start
# tag. It used to match every start tag, which shadowed the later clauses
# that test for specific start tags (basefont/bgsound/link/meta/noframes/
# style and head/noscript) and does not correspond to any rule in the spec.
# NOTE(review): the branch bodies are mostly missing from this copy (see the
# jumps in the embedded line numbers).
1501 ins_mode_in_head_noscript = (t) ->
1502 if t.type is TYPE_DOCTYPE
1505 if t.type is TYPE_START_TAG and t.name is 'html'
1508 if t.type is TYPE_END_TAG and t.name is 'noscript'
1510 ins_mode = ins_mode_in_head
1512 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1515 if t.type is TYPE_END_TAG and t.name is 'br'
1516 ins_mode_in_head_noscript_else t
1518 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1522 ins_mode_in_head_noscript_else t
1527 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" clause of the "after head" insertion mode: insert a
# synthesized body element and switch to ins_mode_in_body.
# NOTE(review): trailing lines are missing from this copy (embedded lines
# 1532-1533); presumably the token is then reprocessed -- confirm.
1528 ins_mode_after_head_else = (t) ->
1529 body_tok = new_open_tag 'body'
1530 insert_html_element body_tok
1531 ins_mode = ins_mode_in_body
# "after head" insertion mode.
# Change in this revision: the search for head_element_pointer in open_els
# now uses `for el, i in` (array iteration). With `of`, el was the index key
# (a string) and i the element, so the comparison never matched, the head
# element was never removed from the stack, and the warning always fired.
# NOTE(review): many short lines (parse_error / return / the in-head
# delegation) are missing from this copy; see the embedded line numbers.
1534 ins_mode_after_head = (t) ->
1538 if t.type is TYPE_COMMENT
1541 if t.type is TYPE_DOCTYPE
1544 if t.type is TYPE_START_TAG and t.name is 'html'
1547 if t.type is TYPE_START_TAG and t.name is 'body'
1548 insert_html_element t
1549 flag_frameset_ok = false
1550 ins_mode = ins_mode_in_body
1552 if t.type is TYPE_START_TAG and t.name is 'frameset'
1553 insert_html_element t
1554 ins_mode = ins_mode_in_frameset
1556 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# temporarily put the head element back on the stack
1558 open_els.unshift head_element_pointer
# FIX: `in` iterates (element, index); `of` iterated (key, value)
1560 for el, i in open_els
1561 if el is head_element_pointer
1562 open_els.splice i, 1
1564 console.log "warning: 23904 couldn't find head element in open_els"
1566 if t.type is TYPE_END_TAG and t.name is 'template'
1569 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1570 ins_mode_after_head_else t
1572 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1576 ins_mode_after_head_else t
1578 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" insertion mode.
#
# name: tag name of the end tag being processed.
# Walks open_els from the current node upward looking for an HTML element
# with that name; a special element encountered first stops the search.
# Change in this revision: the "node is not the current node" parse-error
# check now looks at the stack after generate_implied_end_tags has run. The
# old check used the index captured before the pops, so it reported an error
# whenever implied end tags had been generated, even if the match had become
# the current node.
# NOTE(review): the pop loop and the returns are missing from this copy
# (embedded lines 1584-1587, 1589-1591).
1579 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1580 for el, i in open_els
1581 if el.namespace is NS_HTML and el.name is name
1582 generate_implied_end_tags name # arg is exception
1583 parse_error() unless open_els[0] is el
1588 if special_elements[el.name] is el.namespace
# "in body" insertion mode: the main token handler for body content. Each
# `if` below corresponds to one clause of the spec section linked above this
# definition; the last two clauses catch any other start / end tag.
#
# Changes in this revision (each marked FIX at the line concerned):
#   * body / frameset start tags: read `second.namespace` (Node has no `ns`
#     property, so the guard used to return unconditionally).
#   * body start tag: clear the real flag, flag_frameset_ok, instead of
#     creating a stray `frameset_ok_flag` variable.
#   * frameset start tag: ignore the token when the frameset-ok flag is
#     already "not ok" instead of overwriting the flag.
#   * EOF / body end tag / html end tag: the stack check indexes ok_tags by
#     the stack element's name, not by the token's name.
#   * heading start tags: index h_tags by the current node's name, not by the
#     node object.
#   * br end tag: actually convert the token to a start tag (`=`, not `is`).
# NOTE(review): many short lines (parse_error / return / else / loop headers)
# are missing from this copy; see the jumps in the embedded line numbers.
1592 ins_mode_in_body = (t) ->
1593 if t.type is TYPE_TEXT and t.text is "\u0000"
1600 if t.type is TYPE_TEXT
1603 flag_frameset_ok = false
1605 if t.type is TYPE_COMMENT
1608 if t.type is TYPE_DOCTYPE
1611 if t.type is TYPE_START_TAG and t.name is 'html'
1613 return if template_tag_is_open()
1614 root_attrs = open_els[open_els.length - 1].attrs
1616 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1619 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1622 if t.type is TYPE_START_TAG and t.name is 'body'
1624 return if open_els.length < 2
1625 second = open_els[open_els.length - 2]
# FIX: nodes store their namespace in `.namespace`
1626 return unless second.namespace is NS_HTML
1627 return unless second.name is 'body'
1628 return if template_tag_is_open()
# FIX: was `frameset_ok_flag`, a name used nowhere else
1629 flag_frameset_ok = false
1631 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1633 if t.type is TYPE_START_TAG and t.name is 'frameset'
1635 return if open_els.length < 2
1636 second_i = open_els.length - 2
1637 second = open_els[second_i]
# FIX: nodes store their namespace in `.namespace`
1638 return unless second.namespace is NS_HTML
1639 return unless second.name is 'body'
# FIX: spec says "if the frameset-ok flag is 'not ok', ignore the token";
# this line used to assign false to the flag instead of testing it
1640 return if flag_frameset_ok is false
1642 for el, i in second.parent.children
1644 second.parent.children.splice i, 1
1646 open_els.splice second_i, 1
1647 # pop everything except the "root html element"
1648 while open_els.length > 1
1650 insert_html_element t
1651 ins_mode = ins_mode_in_frameset
1653 if t.type is TYPE_EOF
1655 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1656 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1657 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
# FIX: look up the stack element's own name (the EOF token has no tag name)
1660 unless ok_tags[el.name] is el.namespace
1663 if template_ins_modes.length > 0
1664 ins_mode_in_template t
1668 if t.type is TYPE_END_TAG and t.name is 'body'
1669 unless is_in_scope 'body'
1673 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1674 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1675 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1676 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
# FIX: look up the stack element's own name, not the token's ('body')
1680 unless ok_tags[el.name] is el.namespace
1683 ins_mode = ins_mode_after_body
1685 if t.type is TYPE_END_TAG and t.name is 'html'
1686 unless is_in_scope 'body'
1690 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1691 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1692 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1693 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
# FIX: look up the stack element's own name, not the token's ('html')
1697 unless ok_tags[el.name] is el.namespace
1700 ins_mode = ins_mode_after_body
1703 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1704 close_p_if_in_button_scope()
1705 insert_html_element t
1707 if t.type is TYPE_START_TAG and h_tags[t.name]?
1708 close_p_if_in_button_scope()
# FIX: index by the current node's name; indexing by the node object never
# matched, so a heading opened inside another heading was not detected
1709 if h_tags[open_els[0].name] is open_els[0].namespace
1712 insert_html_element t
1714 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1715 close_p_if_in_button_scope()
1716 insert_html_element t
1717 # spec: If the next token is a "LF" (U+000A) character token, then
1718 # ignore that token and move on to the next one. (Newlines at the
1719 # start of pre blocks are ignored as an authoring convenience.)
1720 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1722 flag_frameset_ok = false
1724 if t.type is TYPE_START_TAG and t.name is 'form'
1725 unless form_element_pointer is null or template_tag_is_open()
1728 close_p_if_in_button_scope()
1729 el = insert_html_element t
1730 unless template_tag_is_open()
1731 form_element_pointer = el
1733 if t.type is TYPE_START_TAG and t.name is 'li'
1734 flag_frameset_ok = false
1735 for node in open_els
1736 if node.name is 'li' and node.namespace is NS_HTML
1737 generate_implied_end_tags 'li' # arg is exception
1738 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1741 el = open_els.shift()
1742 if el.name is 'li' and el.namespace is NS_HTML
1745 if el_is_special_not_adp node
1747 close_p_if_in_button_scope()
1748 insert_html_element t
1750 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1751 flag_frameset_ok = false
1752 for node in open_els
1753 if node.name is 'dd' and node.namespace is NS_HTML
1754 generate_implied_end_tags 'dd' # arg is exception
1755 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1758 el = open_els.shift()
1759 if el.name is 'dd' and el.namespace is NS_HTML
1762 if node.name is 'dt' and node.namespace is NS_HTML
1763 generate_implied_end_tags 'dt' # arg is exception
1764 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1767 el = open_els.shift()
1768 if el.name is 'dt' and el.namespace is NS_HTML
1771 if el_is_special_not_adp node
1773 close_p_if_in_button_scope()
1774 insert_html_element t
1776 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1777 close_p_if_in_button_scope()
1778 insert_html_element t
1779 tok_state = tok_state_plaintext
1781 if t.type is TYPE_START_TAG and t.name is 'button'
1782 if is_in_scope 'button', NS_HTML
1784 generate_implied_end_tags()
1786 el = open_els.shift()
1787 if el.name is 'button' and el.namespace is NS_HTML
1790 insert_html_element t
1791 flag_frameset_ok = false
1793 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1794 unless is_in_scope t.name, NS_HTML
1797 generate_implied_end_tags()
1798 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1801 el = open_els.shift()
1802 if el.name is t.name and el.namespace is NS_HTML
1805 if t.type is TYPE_END_TAG and t.name is 'form'
1806 unless template_tag_is_open()
1807 node = form_element_pointer
1808 form_element_pointer = null
1809 if node is null or not el_is_in_scope node
1812 generate_implied_end_tags()
1813 if open_els[0] isnt node
1815 for el, i in open_els
1817 open_els.splice i, 1
1820 unless is_in_scope 'form', NS_HTML
1823 generate_implied_end_tags()
1824 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1827 el = open_els.shift()
1828 if el.name is 'form' and el.namespace is NS_HTML
1831 if t.type is TYPE_END_TAG and t.name is 'p'
1832 unless is_in_button_scope 'p', NS_HTML
1834 insert_html_element new_open_tag 'p'
1837 if t.type is TYPE_END_TAG and t.name is 'li'
1838 unless is_in_li_scope 'li', NS_HTML
1841 generate_implied_end_tags 'li' # arg is exception
1842 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1845 el = open_els.shift()
1846 if el.name is 'li' and el.namespace is NS_HTML
1849 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1850 unless is_in_scope t.name, NS_HTML
1853 generate_implied_end_tags t.name # arg is exception
1854 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1857 el = open_els.shift()
1858 if el.name is t.name and el.namespace is NS_HTML
1861 if t.type is TYPE_END_TAG and h_tags[t.name]?
1864 if h_tags[el.name] is el.namespace
1867 if standard_scopers[el.name] is el.namespace
1872 generate_implied_end_tags()
1873 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1876 el = open_els.shift()
1877 if h_tags[el.name] is el.namespace
1881 if t.type is TYPE_START_TAG and t.name is 'a'
1882 # If the list of active formatting elements contains an a element
1883 # between the end of the list and the last marker on the list (or
1884 # the start of the list if there is no marker on the list), then
1885 # this is a parse error; run the adoption agency algorithm for the
1886 # tag name "a", then remove that element from the list of active
1887 # formatting elements and the stack of open elements if the
1888 # adoption agency algorithm didn't already remove it (it might not
1889 # have if the element is not in table scope).
1892 if el.type is TYPE_AFE_MARKER
1894 if el.name is 'a' and el.namespace is NS_HTML
1902 for el, i in open_els
1904 open_els.splice i, 1
1906 el = insert_html_element t
1909 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1911 el = insert_html_element t
1914 if t.type is TYPE_START_TAG and t.name is 'nobr'
1916 el = insert_html_element t
1919 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1920 adoption_agency t.name
1922 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1924 insert_html_element t
1926 flag_frameset_ok = false
1928 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1929 unless is_in_scope t.name, NS_HTML
1932 generate_implied_end_tags()
1933 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1936 el = open_els.shift()
1937 if el.name is t.name and el.namespace is NS_HTML
1939 clear_afe_to_marker()
1941 if t.type is TYPE_START_TAG and t.name is 'table'
1942 close_p_if_in_button_scope() # fixfull quirksmode thing
1943 insert_html_element t
1944 flag_frameset_ok = false
1945 ins_mode = ins_mode_in_table
1947 if t.type is TYPE_END_TAG and t.name is 'br'
# FIX: `is` only compared and discarded the result; the spec says to treat
# the token as a br start tag, so the type must be assigned for the next
# clause to pick it up
1949 t.type = TYPE_START_TAG
1951 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1953 insert_html_element t
1955 t.acknowledge_self_closing()
1956 flag_frameset_ok = false
1958 if t.type is TYPE_START_TAG and t.name is 'input'
1960 insert_html_element t
1962 t.acknowledge_self_closing()
1963 unless is_input_hidden_tok t
1964 flag_frameset_ok = false
1966 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1967 insert_html_element t
1969 t.acknowledge_self_closing()
1971 if t.type is TYPE_START_TAG and t.name is 'hr'
1972 close_p_if_in_button_scope()
1973 insert_html_element t
1975 t.acknowledge_self_closing()
1976 flag_frameset_ok = false
1978 if t.type is TYPE_START_TAG and t.name is 'image'
1983 if t.type is TYPE_START_TAG and t.name is 'isindex'
1985 if template_tag_is_open() is false and form_element_pointer isnt null
1987 t.acknowledge_self_closing()
1988 flag_frameset_ok = false
1989 close_p_if_in_button_scope()
1990 el = insert_html_element new_open_tag 'form'
1991 unless template_tag_is_open()
1992 form_element_pointer = el
1995 el.attrs['action'] = a[1]
1997 insert_html_element new_open_tag 'hr'
2000 insert_html_element new_open_tag 'label'
2001 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2002 input_el = new_open_tag 'input'
2007 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2008 input_el.attrs_a.push [a[0], a[1]]
2009 input_el.attrs_a.push ['name', 'isindex']
2010 # fixfull this next bit is in english... internationalize?
2011 prompt ?= "This is a searchable index. Enter search keywords: "
2012 insert_character new_character_token prompt # fixfull split
2013 # TODO submit typo "balue" in spec
2014 insert_html_element input_el
2016 # insert_character '' # you can put chars here if prompt attr missing
2018 insert_html_element new_open_tag 'hr'
2021 unless template_tag_is_open()
2022 form_element_pointer = null
2024 if t.type is TYPE_START_TAG and t.name is 'textarea'
2025 insert_html_element t
2026 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2028 tok_state = tok_state_rcdata
2029 original_ins_mode = ins_mode
2030 flag_frameset_ok = false
2031 ins_mode = ins_mode_text
2033 if t.type is TYPE_START_TAG and t.name is 'xmp'
2034 close_p_if_in_button_scope()
2036 flag_frameset_ok = false
2037 parse_generic_raw_text t
2039 if t.type is TYPE_START_TAG and t.name is 'iframe'
2040 flag_frameset_ok = false
2041 parse_generic_raw_text t
2043 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2044 parse_generic_raw_text t
2046 if t.type is TYPE_START_TAG and t.name is 'select'
2048 insert_html_element t
2049 flag_frameset_ok = false
2050 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2051 ins_mode = ins_mode_in_select_in_table
2053 ins_mode = ins_mode_in_select
2055 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2056 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2059 insert_html_element t
2061 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2062 if is_in_scope 'ruby', NS_HTML
2063 generate_implied_end_tags()
2064 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2066 insert_html_element t
2068 if t.type is TYPE_START_TAG and t.name is 'rt'
2069 if is_in_scope 'ruby', NS_HTML
2070 generate_implied_end_tags 'rtc' # arg is exception
2071 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2073 insert_html_element t
2075 if t.type is TYPE_START_TAG and t.name is 'math'
2077 adjust_mathml_attributes t
2078 adjust_foreign_attributes t
2079 insert_foreign_element t, NS_MATHML
2080 if t.flag 'self-closing'
2082 t.acknowledge_self_closing()
2084 if t.type is TYPE_START_TAG and t.name is 'svg'
2086 adjust_svg_attributes t
2087 adjust_foreign_attributes t
2088 insert_foreign_element t, NS_SVG
2089 if t.flag 'self-closing'
2091 t.acknowledge_self_closing()
2093 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2096 if t.type is TYPE_START_TAG # any other start tag
2098 insert_html_element t
2100 if t.type is TYPE_END_TAG # any other end tag
2101 in_body_any_other_end_tag t.name
2105 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Insertion-mode handler for the "text" mode. Takes one token `t` and
# dispatches on t.type (text, EOF, </script>, any other end tag).
# Every non-text branch restores ins_mode from original_ins_mode.
2106 ins_mode_text = (t) ->
2107 if t.type is TYPE_TEXT
2110 if t.type is TYPE_EOF
# EOF while the current node is a script element: flag it 'already started'
2112 if open_els[0].name is 'script'
2113 open_els[0].flag 'already started', true
2115 ins_mode = original_ins_mode
2118 if t.type is TYPE_END_TAG and t.name is 'script'
2120 ins_mode = original_ins_mode
2121 # fixfull the spec seems to assume that I'm going to run the script
2122 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2124 if t.type is TYPE_END_TAG
2126 ins_mode = original_ins_mode
# presumably every handled token type returns before this point, so this
# log marks an unexpected token type — confirm
2128 console.log 'warning: end of ins_mode_text reached'
2130 # the functions below implement the tokenizer states described here:
2131 # http://www.w3.org/TR/html5/syntax.html#tokenization
2133 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Shared fallback for the "in table" mode; ins_mode_in_table calls it for
# tokens it does not handle itself. It sets flag_foster_parenting to true and
# then back to false; presumably the token is processed in between with
# foster parenting enabled — confirm.
2134 ins_mode_in_table_else = (t) ->
2136 flag_foster_parenting = true
2138 flag_foster_parenting = false
# Lookup table keyed by name; ins_mode_in_table consults it as
# can_in_table[t.name] to decide whether to enter ins_mode_in_table_text.
2140 can_in_table = { # FIXME do this inline like everywhere else
# Insertion-mode handler for the "in table" mode. Takes one token `t`.
# Table-structure start tags clear the stack back to a table context, insert
# the element (or an implied colgroup/tbody) and switch to the matching table
# sub-mode. Everything not handled here is passed to ins_mode_in_table_else.
2147 ins_mode_in_table = (t) ->
# NOTE(review): the HTML5 spec keys this check on the current node (table,
# tbody, tfoot, thead, tr), not on the token — confirm t.name is intended.
2150 if can_in_table[t.name]
# remember the current mode so ins_mode_in_table_text can restore it
2151 original_ins_mode = ins_mode
2152 ins_mode = ins_mode_in_table_text
2155 ins_mode_in_table_else t
2163 clear_stack_to_table_context()
2165 insert_html_element t
2166 ins_mode = ins_mode_in_caption
2168 clear_stack_to_table_context()
2169 insert_html_element t
2170 ins_mode = ins_mode_in_column_group
2172 clear_stack_to_table_context()
# an implied <colgroup> is inserted instead of the token itself
2173 insert_html_element new_open_tag 'colgroup'
2174 ins_mode = ins_mode_in_column_group
2176 when 'tbody', 'tfoot', 'thead'
2177 clear_stack_to_table_context()
2178 insert_html_element t
2179 ins_mode = ins_mode_in_table_body
2180 when 'td', 'th', 'tr'
2181 clear_stack_to_table_context()
# an implied <tbody> is inserted instead of the token itself
2182 insert_html_element new_open_tag 'tbody'
2183 ins_mode = ins_mode_in_table_body
2187 if is_in_table_scope 'table'
2189 el = open_els.shift()
2190 if el.name is 'table'
2194 when 'style', 'script', 'template'
# only hidden inputs are handled here; other inputs go to the fallback
2197 unless is_input_hidden_tok t
2198 ins_mode_in_table_else t
2201 el = insert_html_element t
2203 t.acknowledge_self_closing()
2206 if form_element_pointer?
2208 if template_tag_is_open()
2210 form_element_pointer = insert_html_element t
2213 ins_mode_in_table_else t
2217 if is_in_table_scope 'table'
2219 el = open_els.shift()
2220 if el.name is 'table'
2225 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2230 ins_mode_in_table_else t
2234 ins_mode_in_table_else t
2237 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Insertion-mode handler for the "in table text" mode. Text tokens are queued
# in pending_table_character_tokens. The queued tokens are later scanned with
# is_space_tok and either inserted with insert_character or handed one by one
# to the "in table" fallback; then the queue is emptied and ins_mode is
# restored from original_ins_mode.
2238 ins_mode_in_table_text = (t) ->
2239 if t.type is TYPE_TEXT and t.text is "\u0000"
2240 # huh? I thought the tokenizer didn't emit these
2243 if t.type is TYPE_TEXT
2244 pending_table_character_tokens.push t
2248 for old in pending_table_character_tokens
2249 unless is_space_tok old
2253 for old in pending_table_character_tokens
2254 insert_character old
2256 for old in pending_table_character_tokens
# fix: the fallback is named ins_mode_in_table_else; the previous name
# (ins_mode_table_else) is not defined and would throw a ReferenceError
2257 ins_mode_in_table_else old
2258 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2259 ins_mode = original_ins_mode
2262 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Insertion-mode handler for the "in caption" mode. Takes one token `t`.
# Both </caption> and the table-structure start tags / </table> close the
# caption when one is in table scope: they pop from open_els (testing the
# popped element's name against 'caption'), call clear_afe_to_marker() and
# switch to ins_mode_in_table.
2263 ins_mode_in_caption = (t) ->
2264 if t.type is TYPE_END_TAG and t.name is 'caption'
2265 if is_in_table_scope 'caption'
2266 generate_implied_end_tags()
2267 if open_els[0].name isnt 'caption'
2270 el = open_els.shift()
2271 if el.name is 'caption'
2273 clear_afe_to_marker()
2274 ins_mode = ins_mode_in_table
# note: `and` binds tighter than `or`, so the trailing clause is
# (end tag AND name is 'table')
2279 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2281 if is_in_table_scope 'caption'
2283 el = open_els.shift()
2284 if el.name is 'caption'
2286 clear_afe_to_marker()
2287 ins_mode = ins_mode_in_table
2289 # else fragment case
2291 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2297 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Insertion-mode handler for the "in column group" mode. Takes one token `t`
# and dispatches on token type and tag name. <col> is inserted and its
# self-closing flag acknowledged; </colgroup> switches to ins_mode_in_table
# when the current node is a colgroup.
2298 ins_mode_in_column_group = (t) ->
2302 if t.type is TYPE_COMMENT
2305 if t.type is TYPE_DOCTYPE
2308 if t.type is TYPE_START_TAG and t.name is 'html'
2311 if t.type is TYPE_START_TAG and t.name is 'col'
2312 el = insert_html_element t
2314 t.acknowledge_self_closing()
2316 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2317 if open_els[0].name is 'colgroup'
2319 ins_mode = ins_mode_in_table
2323 if t.type is TYPE_END_TAG and t.name is 'col'
2326 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2329 if t.type is TYPE_EOF
# fallback for any other token: checks the current node against 'colgroup'
# before switching to the "in table" mode
2333 if open_els[0].name isnt 'colgroup'
2337 ins_mode = ins_mode_in_table
2341 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Insertion-mode handler for the "in table body" mode. Takes one token `t`.
# <tr> (or an implied <tr> for <th>/<td>) moves to ins_mode_in_row; section
# end tags and table-structure tokens clear the stack back to a table body
# context and return to ins_mode_in_table.
2342 ins_mode_in_table_body = (t) ->
2343 if t.type is TYPE_START_TAG and t.name is 'tr'
2344 clear_stack_to_table_body_context()
2345 insert_html_element t
2346 ins_mode = ins_mode_in_row
2348 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2350 clear_stack_to_table_body_context()
# a cell without a row: insert an implied <tr> rather than the token
2351 insert_html_element new_open_tag 'tr'
2352 ins_mode = ins_mode_in_row
2355 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2356 unless is_in_table_scope t.name # fixfull check namespace
2359 clear_stack_to_table_body_context()
2361 ins_mode = ins_mode_in_table
2363 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# inspects elements `el` for a table section (tbody/tfoot/thead) or a
# table_scopers entry
2366 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2369 if table_scopers[el.name]
2374 clear_stack_to_table_body_context()
2376 ins_mode = ins_mode_in_table
2379 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2385 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Insertion-mode handler for the "in row" mode. Takes one token `t`.
# <th>/<td> are inserted and switch to ins_mode_in_cell. </tr>, the
# table-structure tokens and section end tags clear the stack back to a table
# row context and return to ins_mode_in_table_body when a tr is in table
# scope.
2386 ins_mode_in_row = (t) ->
2387 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2388 clear_stack_to_table_row_context()
2389 insert_html_element t
2390 ins_mode = ins_mode_in_cell
2393 if t.type is TYPE_END_TAG and t.name is 'tr'
2394 if is_in_table_scope 'tr'
2395 clear_stack_to_table_row_context()
2397 ins_mode = ins_mode_in_table_body
# note: `and` binds tighter than `or`, so the trailing clause is
# (end tag AND name is 'table')
2401 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2402 if is_in_table_scope 'tr'
2403 clear_stack_to_table_row_context()
2405 ins_mode = ins_mode_in_table_body
2410 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# both the named section and a tr must be in table scope
2411 if is_in_table_scope t.name # fixfull namespace
2412 if is_in_table_scope 'tr'
2413 clear_stack_to_table_row_context()
2415 ins_mode = ins_mode_in_table_body
2420 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2426 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell": generates implied end tags, checks that the current node
# is a td or th, pops from open_els (testing the popped element's name
# against 'td'/'th'), calls clear_afe_to_marker() and switches to
# ins_mode_in_row.
2428 generate_implied_end_tags()
# fix: compare the element's *name* with 'th'. The previous test compared
# the element object itself to a string, which is never true, so a th
# current node was always treated as a mismatch.
2429 unless open_els[0].name is 'td' or open_els[0].name is 'th'
2432 el = open_els.shift()
2433 if el.name is 'td' or el.name is 'th'
2435 clear_afe_to_marker()
2436 ins_mode = ins_mode_in_row
2438 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Insertion-mode handler for the "in cell" mode. Takes one token `t`.
# A </td> or </th> matching a cell in table scope generates implied end tags,
# pops from open_els (testing the popped element's name against t.name),
# calls clear_afe_to_marker() and switches to ins_mode_in_row.
2439 ins_mode_in_cell = (t) ->
2440 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2441 if is_in_table_scope t.name
2442 generate_implied_end_tags()
2443 if open_els[0].name isnt t.name
2446 el = open_els.shift()
2447 if el.name is t.name
2449 clear_afe_to_marker()
2450 ins_mode = ins_mode_in_row
2454 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# inspects elements `el` for an open cell (td/th) or a table_scopers entry
2457 if el.name is 'td' or el.name is 'th'
2460 if table_scopers[el.name]
2468 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2471 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2472 if is_in_table_scope t.name # fixfull namespace
2481 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Insertion-mode handler for the "in select" mode. Takes one token `t` and
# dispatches on token type and tag name. The option/optgroup branches look at
# the current node (open_els[0]); the select-closing branches pop from
# open_els and test the popped element's name against 'select'.
2482 ins_mode_in_select = (t) ->
2483 if t.type is TYPE_TEXT and t.text is "\u0000"
2486 if t.type is TYPE_TEXT
2489 if t.type is TYPE_COMMENT
2492 if t.type is TYPE_DOCTYPE
2495 if t.type is TYPE_START_TAG and t.name is 'html'
2498 if t.type is TYPE_START_TAG and t.name is 'option'
2499 if open_els[0].name is 'option'
2501 insert_html_element t
2503 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2504 if open_els[0].name is 'option'
2506 if open_els[0].name is 'optgroup'
2508 insert_html_element t
2510 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# current node is an option directly inside an optgroup (open_els[1])
2511 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2513 if open_els[0].name is 'optgroup'
2518 if t.type is TYPE_END_TAG and t.name is 'option'
2519 if open_els[0].name is 'option'
2524 if t.type is TYPE_END_TAG and t.name is 'select'
2525 if is_in_select_scope 'select'
2527 el = open_els.shift()
2528 if el.name is 'select'
2534 if t.type is TYPE_START_TAG and t.name is 'select'
2537 el = open_els.shift()
2538 if el.name is 'select'
2541 # spec says that this is the same as </select> but it doesn't say
2542 # to check scope first
2544 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2546 if is_in_select_scope 'select'
2549 el = open_els.shift()
2550 if el.name is 'select'
2555 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2558 if t.type is TYPE_EOF
2565 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Insertion-mode handler for the "in select in table" mode. Table-related
# start and end tags pop from open_els (testing the popped element's name
# against 'select'); end tags do so only after a table-scope check on t.name.
# Other tokens are delegated to ins_mode_in_select.
2566 ins_mode_in_select_in_table = (t) ->
2567 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2570 el = open_els.shift()
2571 if el.name is 'select'
2576 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2578 unless is_in_table_scope t.name, NS_HTML
2581 el = open_els.shift()
2582 if el.name is 'select'
2588 ins_mode_in_select t
2591 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Insertion-mode handler for the "in template" mode. For start tags it picks
# a new mode from the tag name, replaces the entry at the front of
# template_ins_modes with it (shift, then unshift) and sets ins_mode to the
# same value. On EOF with a template open it pops from open_els (testing for
# 'template'), calls clear_afe_to_marker() and drops the front
# template_ins_modes entry.
2592 ins_mode_in_template = (t) ->
2593 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2596 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2599 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2600 template_ins_modes.shift()
2601 template_ins_modes.unshift ins_mode_in_table
2602 ins_mode = ins_mode_in_table
2605 if t.type is TYPE_START_TAG and t.name is 'col'
2606 template_ins_modes.shift()
2607 template_ins_modes.unshift ins_mode_in_column_group
2608 ins_mode = ins_mode_in_column_group
2611 if t.type is TYPE_START_TAG and t.name is 'tr'
2612 template_ins_modes.shift()
2613 template_ins_modes.unshift ins_mode_in_table_body
2614 ins_mode = ins_mode_in_table_body
2617 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2618 template_ins_modes.shift()
2619 template_ins_modes.unshift ins_mode_in_row
2620 ins_mode = ins_mode_in_row
# any other start tag: continue in the "in body" mode
2623 if t.type is TYPE_START_TAG
2624 template_ins_modes.shift()
2625 template_ins_modes.unshift ins_mode_in_body
2626 ins_mode = ins_mode_in_body
2629 if t.type is TYPE_END_TAG
2632 if t.type is TYPE_EOF
2633 unless template_tag_is_open()
2638 el = open_els.shift()
2639 if el.name is 'template' # fixfull check namespace
2641 clear_afe_to_marker()
2642 template_ins_modes.shift()
2646 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Insertion-mode handler for the "after body" mode. Takes one token `t`.
# </html> switches to ins_mode_after_after_body; the final fallback switches
# back to ins_mode_in_body.
2647 ins_mode_after_body = (t) ->
2651 if t.type is TYPE_COMMENT
# the comment is inserted at index children.length of open_els[0],
# i.e. after its existing children
2652 insert_comment t, [open_els[0], open_els[0].children.length]
2654 if t.type is TYPE_DOCTYPE
2657 if t.type is TYPE_START_TAG and t.name is 'html'
2660 if t.type is TYPE_END_TAG and t.name is 'html'
2661 # fixfull fragment case
2662 ins_mode = ins_mode_after_after_body
2664 if t.type is TYPE_EOF
2669 ins_mode = ins_mode_in_body
2672 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Insertion-mode handler for the "in frameset" mode. Takes one token `t`.
# <frameset> and <frame> are inserted (the latter also acknowledges its
# self-closing flag). </frameset> may switch to ins_mode_after_frameset.
2673 ins_mode_in_frameset = (t) ->
2677 if t.type is TYPE_COMMENT
2680 if t.type is TYPE_DOCTYPE
2683 if t.type is TYPE_START_TAG and t.name is 'html'
2686 if t.type is TYPE_START_TAG and t.name is 'frameset'
2687 insert_html_element t
2689 if t.type is TYPE_END_TAG and t.name is 'frameset'
2690 # TODO ?correct for: "if the current node is the root html element"
# a stack of exactly one element is used as the stand-in for "current node
# is the root html element"
2691 if open_els.length is 1
2693 return # fragment case
# only outside fragment parsing, and only once the current node is no
# longer a frameset
2695 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2696 ins_mode = ins_mode_after_frameset
2698 if t.type is TYPE_START_TAG and t.name is 'frame'
2699 insert_html_element t
2701 t.acknowledge_self_closing()
2703 if t.type is TYPE_START_TAG and t.name is 'noframes'
2706 if t.type is TYPE_EOF
2707 # TODO ?correct for: "if the current node is not the root html element"
2708 if open_els.length isnt 1
2716 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion-mode handler for the "after frameset" mode. Takes one token `t`
# and dispatches on token type and tag name. </html> switches the parser to
# ins_mode_after_after_frameset.
2717 ins_mode_after_frameset = (t) ->
2721 if t.type is TYPE_COMMENT
2724 if t.type is TYPE_DOCTYPE
2727 if t.type is TYPE_START_TAG and t.name is 'html'
2730 if t.type is TYPE_END_TAG and t.name is 'html'
# fix: the parser's mode variable is `ins_mode`. Assigning to `insert_mode`
# only created an unused variable, so the mode switch never took effect.
2731 ins_mode = ins_mode_after_after_frameset
2733 if t.type is TYPE_START_TAG and t.name is 'noframes'
2736 if t.type is TYPE_EOF
2743 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Insertion-mode handler for the "after after body" mode. Comments are
# inserted at index doc.children.length of `doc`; doctype, whitespace and
# <html> share one branch; the final fallback switches to ins_mode_in_body.
2744 ins_mode_after_after_body = (t) ->
2745 if t.type is TYPE_COMMENT
2746 insert_comment t, [doc, doc.children.length]
2748 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2751 if t.type is TYPE_EOF
2756 ins_mode = ins_mode_in_body
2759 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Insertion-mode handler for the "after after frameset" mode. Comments are
# inserted at index doc.children.length of `doc`; doctype, whitespace and
# <html> share one branch; EOF and <noframes> have their own branches.
2760 ins_mode_after_after_frameset = (t) ->
2761 if t.type is TYPE_COMMENT
2762 insert_comment t, [doc, doc.children.length]
2764 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2767 if t.type is TYPE_EOF
2770 if t.type is TYPE_START_TAG and t.name is 'noframes'
2777 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Helper for in_foreign_content, which calls it for <font> start tags.
# Tests entries `a` for a first element (a[0]) equal to 'color', 'face' or
# 'size' — presumably a[0] is an attribute name of token `t`; confirm.
2778 has_color_face_or_size = (t) ->
2780 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Shared handling for the end of a script element in foreign content. Called
# by in_foreign_content_other_start for self-closing script start tags and by
# in_foreign_content for </script> when the current node is an SVG script.
2783 in_foreign_content_end_script = ->
# Handles "any other start tag" in foreign content. Adjusts the token's
# attributes (and, for SVG, its tag name via svg_name_fixes) according to the
# namespace of the adjusted current node, then inserts it as a foreign
# element in that namespace.
2787 in_foreign_content_other_start = (t) ->
2788 acn = adjusted_current_node()
2789 if acn.namespace is NS_MATHML
2790 adjust_mathml_attributes t
# SVG tag names listed in svg_name_fixes are replaced by their fixed form
2791 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2792 t.name = svg_name_fixes[t.name]
2793 if acn.namespace is NS_SVG
2794 adjust_svg_attributes t
2795 adjust_foreign_attributes t
2796 insert_foreign_element t, acn.namespace
2797 if t.flag 'self-closing'
# a self-closing script is acknowledged and then run through the
# end-of-script handling
2798 if t.name is 'script'
2799 t.acknowledge_self_closing()
2800 in_foreign_content_end_script()
2803 t.acknowledge_self_closing()
# Token handler for foreign (MathML/SVG) content. Takes one token `t`.
# U+0000 text becomes a U+FFFD character; other text clears flag_frameset_ok.
# Start tags go to in_foreign_content_other_start unless they are in the
# HTML "breakout" list. End tags are matched against open_els
# case-insensitively.
2805 in_foreign_content = (t) ->
2806 if t.type is TYPE_TEXT and t.text is "\u0000"
2808 insert_character new_character_token "\ufffd"
2813 if t.type is TYPE_TEXT
2814 flag_frameset_ok = false
2817 if t.type is TYPE_COMMENT
2820 if t.type is TYPE_DOCTYPE
# HTML start tags that break out of foreign content (<font> only when it
# has a color, face or size attribute)
2823 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
# in the fragment case the tag is treated like any other foreign start tag
2825 if flag_fragment_parsing
2826 in_foreign_content_other_start t
2828 loop # is this safe?
# tests node `cn` for a MathML text integration point, an HTML integration
# point, or an element in the HTML namespace
2831 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2835 if t.type is TYPE_START_TAG
2836 in_foreign_content_other_start t
2838 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2839 in_foreign_content_end_script()
2841 if t.type is TYPE_END_TAG
# element names are lower-cased before comparing with the token's tag name
2842 if open_els[0].name.toLowerCase() isnt t.name
2844 for node in open_els
# open_els[open_els.length - 1] is the topmost (first) element on the stack
2845 if node is open_els[open_els.length - 1]
2847 if node.name.toLowerCase() is t.name
2849 el = open_els.shift()
2852 if node.namespace is NS_HTML
2854 ins_mode t # explicitly call HTML insertion mode
2857 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer "data" state: reads the character at txt[cur] and advances cur.
# Depending on the character it returns a text node (from
# parse_character_reference() or the character itself), switches to
# tok_state_tag_open, or returns an EOF token.
2859 switch c = txt.charAt(cur++)
2861 return new_text_node parse_character_reference()
2863 tok_state = tok_state_tag_open
2866 return new_text_node c
2868 return new_eof_token()
2870 return new_text_node c
2873 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2874 # not needed: tok_state_character_reference_in_data = ->
2875 # just call parse_character_reference()
2877 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer "RCDATA" state: reads one character (advancing cur). Depending on
# the character it returns a text node from parse_character_reference(),
# switches to tok_state_rcdata_less_than_sign, or returns a U+FFFD, EOF or
# plain character token.
2878 tok_state_rcdata = ->
2879 switch c = txt.charAt(cur++)
2881 return new_text_node parse_character_reference()
2883 tok_state = tok_state_rcdata_less_than_sign
2886 return new_character_token "\ufffd"
2888 return new_eof_token()
2890 return new_character_token c
2893 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2894 # not needed: tok_state_character_reference_in_rcdata = ->
2895 # just call parse_character_reference()
2897 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer "RAWTEXT" state: reads one character (advancing cur). Depending
# on the character it switches to tok_state_rawtext_less_than_sign or returns
# a U+FFFD, EOF or plain character token.
2898 tok_state_rawtext = ->
2899 switch c = txt.charAt(cur++)
2901 tok_state = tok_state_rawtext_less_than_sign
2904 return new_character_token "\ufffd"
2906 return new_eof_token()
2908 return new_character_token c
2911 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer "script data" state: reads one character (advancing cur).
# Depending on the character it switches to
# tok_state_script_data_less_than_sign or returns a U+FFFD, EOF or plain
# character token.
2912 tok_state_script_data = ->
2913 switch c = txt.charAt(cur++)
2915 tok_state = tok_state_script_data_less_than_sign
2918 return new_character_token "\ufffd"
2920 return new_eof_token()
2922 return new_character_token c
2925 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer "PLAINTEXT" state: reads one character (advancing cur) and
# returns a U+FFFD, EOF or plain character token. No visible branch changes
# tok_state.
2926 tok_state_plaintext = ->
2927 switch c = txt.charAt(cur++)
2930 return new_character_token "\ufffd"
2932 return new_eof_token()
2934 return new_character_token c
2938 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer "tag open" state (the character after '<'). Depending on the
# character it moves to the markup-declaration or end-tag-open state, starts
# a bogus comment, or starts a new open tag in tok_cur_tag. Otherwise it
# falls back to emitting '<' as text.
2939 tok_state_tag_open = ->
2940 switch c = txt.charAt(cur++)
2942 tok_state = tok_state_markup_declaration_open
2944 tok_state = tok_state_end_tag_open
2947 tok_cur_tag = new_comment_token '?'
2948 tok_state = tok_state_bogus_comment
2951 tok_cur_tag = new_open_tag c
2952 tok_state = tok_state_tag_name
2953 else if is_uc_alpha(c)
# tag names are stored lower-cased
2954 tok_cur_tag = new_open_tag c.toLowerCase()
2955 tok_state = tok_state_tag_name
2958 tok_state = tok_state_data
2959 cur -= 1 # we didn't parse/handle the char after <
2960 return new_text_node '<'
2963 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer "end tag open" state (the character after '</'). Depending on the
# character it returns to the data state (in one branch emitting '</' as
# text), starts a new end tag in tok_cur_tag with a lower-cased name, or
# starts a bogus comment.
2964 tok_state_end_tag_open = ->
2965 switch c = txt.charAt(cur++)
2968 tok_state = tok_state_data
2971 tok_state = tok_state_data
2972 return new_text_node '</'
2975 tok_cur_tag = new_end_tag c.toLowerCase()
2976 tok_state = tok_state_tag_name
2977 else if is_lc_alpha(c)
2978 tok_cur_tag = new_end_tag c
2979 tok_state = tok_state_tag_name
2982 tok_cur_tag = new_comment_token '/'
2983 tok_state = tok_state_bogus_comment
2986 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer "tag name" state: reads one character (advancing cur). Whitespace
# moves to tok_state_before_attribute_name; other branches move to the
# self-closing or data state, or append the character (lower-cased, as-is, or
# as U+FFFD) to tok_cur_tag.name.
2987 tok_state_tag_name = ->
2988 switch c = txt.charAt(cur++)
2989 when "\t", "\n", "\u000c", ' '
2990 tok_state = tok_state_before_attribute_name
2992 tok_state = tok_state_self_closing_start_tag
2994 tok_state = tok_state_data
3000 tok_cur_tag.name += "\ufffd"
3003 tok_state = tok_state_data
3006 tok_cur_tag.name += c.toLowerCase()
3008 tok_cur_tag.name += c
3011 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer "RCDATA less-than sign" state. One branch resets temporary_buffer
# and moves to tok_state_rcdata_end_tag_open; the fallback returns to
# tok_state_rcdata, un-reads the character and emits '<'.
3012 tok_state_rcdata_less_than_sign = ->
3013 c = txt.charAt(cur++)
3015 temporary_buffer = ''
3016 tok_state = tok_state_rcdata_end_tag_open
3019 tok_state = tok_state_rcdata
3020 cur -= 1 # reconsume the input character
3021 return new_character_token '<'
3023 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer "RCDATA end tag open" state. Two branches start an end tag in
# tok_cur_tag (name lower-cased in one of them), append the raw character to
# temporary_buffer and move to tok_state_rcdata_end_tag_name. The fallback
# returns to tok_state_rcdata, un-reads the character and emits "</".
3024 tok_state_rcdata_end_tag_open = ->
3025 c = txt.charAt(cur++)
3027 tok_cur_tag = new_end_tag c.toLowerCase()
3028 temporary_buffer += c
3029 tok_state = tok_state_rcdata_end_tag_name
3032 tok_cur_tag = new_end_tag c
3033 temporary_buffer += c
3034 tok_state = tok_state_rcdata_end_tag_name
3037 tok_state = tok_state_rcdata
3038 cur -= 1 # reconsume the input character
3039 return new_character_token "</" # fixfull separate these
3041 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of
# the current node (open_els[0]); also writes a debug_log line describing
# the token and the open elements.
3042 is_appropriate_end_tag = (t) ->
3043 # spec says to check against "the tag name of the last start tag to
3044 # have been emitted from this tokenizer", but this is only called from
3045 # the various "raw" states, so it's hopefully ok to assume that
3046 # open_els[0].name will work instead TODO: verify this after the script
3047 # data states are implemented
3048 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3049 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3051 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer "RCDATA end tag name" state. Three branches change state only
# when tok_cur_tag is an appropriate end tag: whitespace goes to
# tok_state_before_attribute_name, the other two to the self-closing and data
# states. Two further branches extend the tag name and temporary_buffer.
# The fallback returns to tok_state_rcdata and emits the buffered "</…" as
# text.
3052 tok_state_rcdata_end_tag_name = ->
3053 c = txt.charAt(cur++)
3054 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3055 if is_appropriate_end_tag tok_cur_tag
3056 tok_state = tok_state_before_attribute_name
3058 # else fall through to "Anything else"
3060 if is_appropriate_end_tag tok_cur_tag
3061 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3063 # else fall through to "Anything else"
3065 if is_appropriate_end_tag tok_cur_tag
3066 tok_state = tok_state_data
3068 # else fall through to "Anything else"
# the tag name is lower-cased; temporary_buffer keeps the original case
3070 tok_cur_tag.name += c.toLowerCase()
3071 temporary_buffer += c
3074 tok_cur_tag.name += c
3075 temporary_buffer += c
3078 tok_state = tok_state_rcdata
3079 cur -= 1 # reconsume the input character
3080 return new_character_token '</' + temporary_buffer # fixfull separate these
3082 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer "RAWTEXT less-than sign" state. One branch resets
# temporary_buffer and moves to tok_state_rawtext_end_tag_open; the fallback
# returns to tok_state_rawtext, un-reads the character and emits '<'.
3083 tok_state_rawtext_less_than_sign = ->
3084 c = txt.charAt(cur++)
3086 temporary_buffer = ''
3087 tok_state = tok_state_rawtext_end_tag_open
3090 tok_state = tok_state_rawtext
3091 cur -= 1 # reconsume the input character
3092 return new_character_token '<'
3094 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer "RAWTEXT end tag open" state. Two branches start an end tag in
# tok_cur_tag (name lower-cased in one of them), append the raw character to
# temporary_buffer and move to tok_state_rawtext_end_tag_name. The fallback
# returns to tok_state_rawtext, un-reads the character and emits "</".
3095 tok_state_rawtext_end_tag_open = ->
3096 c = txt.charAt(cur++)
3098 tok_cur_tag = new_end_tag c.toLowerCase()
3099 temporary_buffer += c
3100 tok_state = tok_state_rawtext_end_tag_name
3103 tok_cur_tag = new_end_tag c
3104 temporary_buffer += c
3105 tok_state = tok_state_rawtext_end_tag_name
3108 tok_state = tok_state_rawtext
3109 cur -= 1 # reconsume the input character
3110 return new_character_token "</" # fixfull separate these
3112 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer "RAWTEXT end tag name" state. Three branches change state only
# when tok_cur_tag is an appropriate end tag: whitespace goes to
# tok_state_before_attribute_name, the other two to the self-closing and data
# states. Two further branches extend the tag name and temporary_buffer.
# The fallback returns to tok_state_rawtext and emits the buffered "</…" as
# text.
3113 tok_state_rawtext_end_tag_name = ->
3114 c = txt.charAt(cur++)
3115 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3116 if is_appropriate_end_tag tok_cur_tag
3117 tok_state = tok_state_before_attribute_name
3119 # else fall through to "Anything else"
3121 if is_appropriate_end_tag tok_cur_tag
3122 tok_state = tok_state_self_closing_start_tag
3124 # else fall through to "Anything else"
3126 if is_appropriate_end_tag tok_cur_tag
3127 tok_state = tok_state_data
3129 # else fall through to "Anything else"
# the tag name is lower-cased; temporary_buffer keeps the original case
3131 tok_cur_tag.name += c.toLowerCase()
3132 temporary_buffer += c
3135 tok_cur_tag.name += c
3136 temporary_buffer += c
3139 tok_state = tok_state_rawtext
3140 cur -= 1 # reconsume the input character
3141 return new_character_token '</' + temporary_buffer # fixfull separate these
3143 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer "script data less-than sign" state. One branch resets
# temporary_buffer and moves to tok_state_script_data_end_tag_open; another
# moves to tok_state_script_data_escape_start and emits '<!'. The fallback
# returns to tok_state_script_data, un-reads the character and emits '<'.
3144 tok_state_script_data_less_than_sign = ->
3145 c = txt.charAt(cur++)
3147 temporary_buffer = ''
3148 tok_state = tok_state_script_data_end_tag_open
3151 tok_state = tok_state_script_data_escape_start
3152 return new_character_token '<!' # fixfull split
3154 tok_state = tok_state_script_data
3155 cur -= 1 # Reconsume
3156 return new_character_token '<'
3158 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Tokenizer "script data end tag open" state. Two branches start an end tag
# in tok_cur_tag (name lower-cased in one of them), append the raw character
# to temporary_buffer and move to tok_state_script_data_end_tag_name. The
# fallback returns to tok_state_script_data, un-reads the character and emits
# '</'.
3159 tok_state_script_data_end_tag_open = ->
3160 c = txt.charAt(cur++)
3162 tok_cur_tag = new_end_tag c.toLowerCase()
3163 temporary_buffer += c
3164 tok_state = tok_state_script_data_end_tag_name
3167 tok_cur_tag = new_end_tag c
3168 temporary_buffer += c
3169 tok_state = tok_state_script_data_end_tag_name
3172 tok_state = tok_state_script_data
3173 cur -= 1 # Reconsume
3174 return new_character_token '</'
3176 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Tokenizer "script data end tag name" state. Three branches change state
# only when tok_cur_tag is an appropriate end tag: whitespace goes to
# tok_state_before_attribute_name, the other two to the self-closing and data
# states. Two further branches extend the tag name and temporary_buffer.
# The fallback returns to tok_state_script_data and emits the buffered "</…"
# as text.
3177 tok_state_script_data_end_tag_name = ->
3178 c = txt.charAt(cur++)
3179 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3180 if is_appropriate_end_tag tok_cur_tag
3181 tok_state = tok_state_before_attribute_name
3185 if is_appropriate_end_tag tok_cur_tag
3186 tok_state = tok_state_self_closing_start_tag
3190 if is_appropriate_end_tag tok_cur_tag
3191 tok_state = tok_state_data
# the tag name is lower-cased; temporary_buffer keeps the original case
3195 tok_cur_tag.name += c.toLowerCase()
3196 temporary_buffer += c
3199 tok_cur_tag.name += c
3200 temporary_buffer += c
3203 tok_state = tok_state_script_data
3204 cur -= 1 # Reconsume
3205 return new_character_token "</#{temporary_buffer}" # fixfull split
3207 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Tokenizer "script data escape start" state. One branch moves to
# tok_state_script_data_escape_start_dash and emits '-'; the fallback returns
# to tok_state_script_data and un-reads the character.
3208 tok_state_script_data_escape_start = ->
3209 c = txt.charAt(cur++)
3211 tok_state = tok_state_script_data_escape_start_dash
3212 return new_character_token '-'
3214 tok_state = tok_state_script_data
3215 cur -= 1 # Reconsume
3218 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Tokenizer "script data escape start dash" state. One branch moves to
# tok_state_script_data_escaped_dash_dash and emits '-'; the fallback returns
# to tok_state_script_data and un-reads the character.
3219 tok_state_script_data_escape_start_dash = ->
3220 c = txt.charAt(cur++)
3222 tok_state = tok_state_script_data_escaped_dash_dash
3223 return new_character_token '-'
3225 tok_state = tok_state_script_data
3226 cur -= 1 # Reconsume
3229 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Tokenizer "script data escaped" state: reads one character (advancing cur).
# Branches move to the escaped-dash state (emitting '-'), move to the escaped
# less-than-sign state, emit U+FFFD, or drop back to tok_state_data with the
# character un-read. Otherwise the character is emitted as-is.
3230 tok_state_script_data_escaped = ->
3231 c = txt.charAt(cur++)
3233 tok_state = tok_state_script_data_escaped_dash
3234 return new_character_token '-'
3236 tok_state = tok_state_script_data_escaped_less_than_sign
3240 return new_character_token "\ufffd"
3242 tok_state = tok_state_data
3244 cur -= 1 # Reconsume
3247 return new_character_token c
3249 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Tokenizer "script data escaped dash" state: reads one character. Branches
# move to the escaped-dash-dash state (emitting '-'), move to the escaped
# less-than-sign state, or drop back to tok_state_data with the character
# un-read. Other branches return to tok_state_script_data_escaped and emit
# either U+FFFD or the character.
3250 tok_state_script_data_escaped_dash = ->
3251 c = txt.charAt(cur++)
3253 tok_state = tok_state_script_data_escaped_dash_dash
3254 return new_character_token '-'
3256 tok_state = tok_state_script_data_escaped_less_than_sign
3260 tok_state = tok_state_script_data_escaped
3261 return new_character_token "\ufffd"
3263 tok_state = tok_state_data
3265 cur -= 1 # Reconsume
3268 tok_state = tok_state_script_data_escaped
3269 return new_character_token c
3271 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Tokenizer "script data escaped dash dash" state: reads one character.
# Branches emit '-' without a state change, move to the escaped
# less-than-sign state, leave escaping by switching to tok_state_script_data
# and emitting '>', or drop back to tok_state_data with the character
# un-read. Other branches return to tok_state_script_data_escaped and emit
# U+FFFD or the character.
3272 tok_state_script_data_escaped_dash_dash = ->
3273 c = txt.charAt(cur++)
3275 return new_character_token '-'
3277 tok_state = tok_state_script_data_escaped_less_than_sign
3280 tok_state = tok_state_script_data
3281 return new_character_token '>'
3284 tok_state = tok_state_script_data_escaped
3285 return new_character_token "\ufffd"
3288 tok_state = tok_state_data
3289 cur -= 1 # Reconsume
3292 tok_state = tok_state_script_data_escaped
3293 return new_character_token c
3295 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer "script data escaped less-than sign" state (a '<' was just seen
# inside escaped script data). One branch resets temporary_buffer and moves
# to the escaped end-tag-open state. Two branches seed temporary_buffer with
# the character, move to the double-escape-start state and emit '<' plus the
# character. The fallback returns to tok_state_script_data_escaped, un-reads
# the character and emits the pending '<'.
3296 tok_state_script_data_escaped_less_than_sign = ->
3297 c = txt.charAt(cur++)
3299 temporary_buffer = ''
3300 tok_state = tok_state_script_data_escaped_end_tag_open
3303 temporary_buffer = c.toLowerCase() # yes, really
3304 tok_state = tok_state_script_data_double_escape_start
3305 return new_character_token "<#{c}" # fixfull split
3307 temporary_buffer = c
3308 tok_state = tok_state_script_data_double_escape_start
3309 return new_character_token "<#{c}" # fixfull split
3311 tok_state = tok_state_script_data_escaped
3312 cur -= 1 # Reconsume
# fix: emit the '<' that led into this state. The character `c` has just
# been un-read and will be emitted by the next state; returning `c` here
# dropped the '<' and emitted `c` twice.
3313 return new_character_token '<'
3315 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Tokenizer "script data escaped end tag open" state. Two branches start an
# end tag in tok_cur_tag (name lower-cased in one of them), append the raw
# character to temporary_buffer and move to the escaped end-tag-name state.
# The fallback returns to tok_state_script_data_escaped, un-reads the
# character and emits '</'.
3316 tok_state_script_data_escaped_end_tag_open = ->
3317 c = txt.charAt(cur++)
3319 tok_cur_tag = new_end_tag c.toLowerCase()
3320 temporary_buffer += c
3321 tok_state = tok_state_script_data_escaped_end_tag_name
3324 tok_cur_tag = new_end_tag c
3325 temporary_buffer += c
3326 tok_state = tok_state_script_data_escaped_end_tag_name
3329 tok_state = tok_state_script_data_escaped
3330 cur -= 1 # Reconsume
3331 return new_character_token '</' # fixfull split
3333 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer "script data escaped end tag name" state. Three branches change
# state only when tok_cur_tag is an appropriate end tag: whitespace goes to
# tok_state_before_attribute_name, the other two to the self-closing and data
# states. Two further branches extend the tag name and temporary_buffer.
# The fallback returns to tok_state_script_data_escaped and emits the
# buffered "</…" as text.
3334 tok_state_script_data_escaped_end_tag_name = ->
3335 c = txt.charAt(cur++)
3336 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3337 if is_appropriate_end_tag tok_cur_tag
3338 tok_state = tok_state_before_attribute_name
3342 if is_appropriate_end_tag tok_cur_tag
3343 tok_state = tok_state_self_closing_start_tag
3347 if is_appropriate_end_tag tok_cur_tag
3348 tok_state = tok_state_data
3352 tok_cur_tag.name += c.toLowerCase()
# fix: temporary_buffer must keep the character exactly as read, because the
# fallback below re-emits it as script text. Only the tag name is
# lower-cased. This matches the other *_end_tag_name states in this file.
3353 temporary_buffer += c
3356 tok_cur_tag.name += c
3357 temporary_buffer += c
3360 tok_state = tok_state_script_data_escaped
3361 cur -= 1 # Reconsume
3362 return new_character_token "</#{temporary_buffer}" # fixfull split
3364 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Tokenizer "script data double escape start" state. On whitespace, '/' or
# '>' it checks whether temporary_buffer spells 'script' to choose between
# the double-escaped and escaped states, then emits the character. Two
# branches append the character to temporary_buffer and emit it. The fallback
# returns to tok_state_script_data_escaped and un-reads the character.
3365 tok_state_script_data_double_escape_start = ->
3366 c = txt.charAt(cur++)
3367 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3368 if temporary_buffer is 'script'
3369 tok_state = tok_state_script_data_double_escaped
3371 tok_state = tok_state_script_data_escaped
3372 return new_character_token c
# the buffer is lower-cased so the 'script' comparison is case-insensitive;
# the emitted token keeps the original case
3374 temporary_buffer += c.toLowerCase() # yes, really lowercase
3375 return new_character_token c
3377 temporary_buffer += c
3378 return new_character_token c
3380 tok_state = tok_state_script_data_escaped
3381 cur -= 1 # Reconsume
3384 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer "script data double escaped" state: reads one character. Branches
# move to the double-escaped dash state (emitting '-'), move to the
# double-escaped less-than-sign state (emitting '<'), emit U+FFFD, or drop
# back to tok_state_data with the character un-read. Otherwise the character
# is emitted as-is.
3385 tok_state_script_data_double_escaped = ->
3386 c = txt.charAt(cur++)
3388 tok_state = tok_state_script_data_double_escaped_dash
3389 return new_character_token '-'
3391 tok_state = tok_state_script_data_double_escaped_less_than_sign
3392 return new_character_token '<'
3395 return new_character_token "\ufffd"
3398 tok_state = tok_state_data
3399 cur -= 1 # Reconsume
3402 return new_character_token c
3404 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer state 8.2.4.30 (one '-' seen inside double-escaped script data).
# Reads one character from txt at cur. Visible outcomes, in order:
#   - switch to the dash-dash state and emit '-'
#   - switch to the double-escaped-less-than-sign state and emit '<'
#   - return to the double-escaped state and emit U+FFFD
#   - switch to the data state and back cur up so the character is reconsumed
#   - otherwise return to the double-escaped state and emit the character
3405 tok_state_script_data_double_escaped_dash = ->
3406 c = txt.charAt(cur++)
3408 tok_state = tok_state_script_data_double_escaped_dash_dash
3409 return new_character_token '-'
3411 tok_state = tok_state_script_data_double_escaped_less_than_sign
3412 return new_character_token '<'
3415 tok_state = tok_state_script_data_double_escaped
3416 return new_character_token "\ufffd"
3419 tok_state = tok_state_data
3420 cur -= 1 # Reconsume
3423 tok_state = tok_state_script_data_double_escaped
3424 return new_character_token c
3426 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer state 8.2.4.31 (two '-' seen inside double-escaped script data).
# Reads one character from txt at cur. Visible outcomes, in order:
#   - emit '-' without changing state
#   - switch to the double-escaped-less-than-sign state and emit '<'
#   - switch to the plain script data state and emit '>'
#   - return to the double-escaped state and emit U+FFFD
#   - switch to the data state and back cur up so the character is reconsumed
#   - otherwise return to the double-escaped state and emit the character
3427 tok_state_script_data_double_escaped_dash_dash = ->
3428 c = txt.charAt(cur++)
3430 return new_character_token '-'
3432 tok_state = tok_state_script_data_double_escaped_less_than_sign
3433 return new_character_token '<'
3435 tok_state = tok_state_script_data
3436 return new_character_token '>'
3439 tok_state = tok_state_script_data_double_escaped
3440 return new_character_token "\ufffd"
3443 tok_state = tok_state_data
3444 cur -= 1 # Reconsume
3447 tok_state = tok_state_script_data_double_escaped
3448 return new_character_token c
3450 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer state 8.2.4.32 ('<' seen inside double-escaped script data).
# Reads one character from txt at cur. One branch clears temporary_buffer,
# moves to the double-escape-end state and emits '/'. The other returns to
# the double-escaped state and backs cur up so the character is reconsumed.
3451 tok_state_script_data_double_escaped_less_than_sign = ->
3452 c = txt.charAt(cur++)
# start collecting a fresh tag name for the double-escape-end comparison
3454 temporary_buffer = ''
3455 tok_state = tok_state_script_data_double_escape_end
3456 return new_character_token '/'
3458 tok_state = tok_state_script_data_double_escaped
3459 cur -= 1 # Reconsume
3462 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer state 8.2.4.33. Reads one character from txt at cur.
# On whitespace, '/' or '>' it compares temporary_buffer with 'script': a
# match drops back to the (single) escaped state, anything else stays double
# escaped; the character is emitted either way. Two further branches append
# the character to temporary_buffer (one of them lower-cased) and emit it.
# The last visible branch returns to the double-escaped state and backs cur
# up by one so the character is reconsumed.
3463 tok_state_script_data_double_escape_end = ->
3464 c = txt.charAt(cur++)
3465 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3466 if temporary_buffer is 'script'
3467 tok_state = tok_state_script_data_escaped
3469 tok_state = tok_state_script_data_double_escaped
3470 return new_character_token c
3472 temporary_buffer += c.toLowerCase() # yes, really lowercase
# the emitted token keeps the character's original case
3473 return new_character_token c
3475 temporary_buffer += c
3476 return new_character_token c
3478 tok_state = tok_state_script_data_double_escaped
3479 cur -= 1 # Reconsume
3482 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer state 8.2.4.34. Switches on one character read from txt at cur.
# Branches either change state (self-closing start tag, data) or pick an
# initial attr_name: U+FFFD in one branch, the lower-cased character in
# another. The closing two lines push a new [name, value] pair onto the
# FRONT of tok_cur_tag.attrs_a, so the attribute being built is always
# attrs_a[0], and move on to the attribute-name state.
3483 tok_state_before_attribute_name = ->
3485 switch c = txt.charAt(cur++)
3486 when "\t", "\n", "\u000c", ' '
3489 tok_state = tok_state_self_closing_start_tag
3492 tok_state = tok_state_data
3498 attr_name = "\ufffd"
3499 when '"', "'", '<', '='
3504 tok_state = tok_state_data
3507 attr_name = c.toLowerCase()
# unshift: newest attribute lives at index 0, value starts empty
3511 tok_cur_tag.attrs_a.unshift [attr_name, '']
3512 tok_state = tok_state_attribute_name
3515 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state 8.2.4.35. Switches on one character read from txt at cur.
# Whitespace moves to the after-attribute-name state. Other visible branches
# move to the self-closing-start-tag, before-attribute-value or data states.
# The remaining branches extend the name of the attribute under
# construction, tok_cur_tag.attrs_a[0][0], with U+FFFD, with the character
# as-is, or with its lower-cased form.
3516 tok_state_attribute_name = ->
3517 switch c = txt.charAt(cur++)
3518 when "\t", "\n", "\u000c", ' '
3519 tok_state = tok_state_after_attribute_name
3521 tok_state = tok_state_self_closing_start_tag
3523 tok_state = tok_state_before_attribute_value
3525 tok_state = tok_state_data
3531 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3534 tok_cur_tag.attrs_a[0][0] += c
3537 tok_state = tok_state_data
3540 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3542 tok_cur_tag.attrs_a[0][0] += c
3545 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer state 8.2.4.36. Reads one character from txt at cur.
# Visible branches move to the self-closing-start-tag, before-attribute-value
# or data states. Others start a new attribute by unshifting a
# [name, ''] pair onto tok_cur_tag.attrs_a (name = lower-cased character,
# U+FFFD, or the character itself) and go to the attribute-name state.
# One branch returns to the data state and backs cur up by one.
3546 tok_state_after_attribute_name = ->
3547 c = txt.charAt(cur++)
3548 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3551 tok_state = tok_state_self_closing_start_tag
3554 tok_state = tok_state_before_attribute_value
3557 tok_state = tok_state_data
3560 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3561 tok_state = tok_state_attribute_name
3565 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3566 tok_state = tok_state_attribute_name
3570 tok_state = tok_state_data
3571 cur -= 1 # reconsume
# quote and '<' characters get special treatment first, then are handled
# like any other character by the lines below
3573 if c is '"' or c is "'" or c is '<'
3575 # fall through to Anything else
3577 tok_cur_tag.attrs_a.unshift [c, '']
3578 tok_state = tok_state_attribute_name
3580 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer state 8.2.4.37. Switches on one character read from txt at cur.
# Visible branches pick the double-quoted, single-quoted or unquoted
# attribute-value state, or return to the data state. Two branches seed the
# value of the attribute under construction, tok_cur_tag.attrs_a[0][1]
# (with U+FFFD or with the character itself), before continuing in the
# unquoted state.
3581 tok_state_before_attribute_value = ->
3582 switch c = txt.charAt(cur++)
3583 when "\t", "\n", "\u000c", ' '
3586 tok_state = tok_state_attribute_value_double_quoted
3588 tok_state = tok_state_attribute_value_unquoted
3591 tok_state = tok_state_attribute_value_single_quoted
3594 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3595 tok_state = tok_state_attribute_value_unquoted
3598 tok_state = tok_state_data
3604 tok_state = tok_state_data
3606 tok_cur_tag.attrs_a[0][1] += c
3607 tok_state = tok_state_attribute_value_unquoted
3610 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer state 8.2.4.38. Switches on one character read from txt at cur.
# One branch moves to the after-attribute-value-(quoted) state, one returns
# to the data state. The others grow the current attribute's value,
# tok_cur_tag.attrs_a[0][1], with a decoded character reference, with U+FFFD,
# or with the character itself.
3611 tok_state_attribute_value_double_quoted = ->
3612 switch c = txt.charAt(cur++)
3614 tok_state = tok_state_after_attribute_value_quoted
# '"' is passed as the extra allowed character, true marks attribute context
3616 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3619 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3622 tok_state = tok_state_data
3624 tok_cur_tag.attrs_a[0][1] += c
3627 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer state 8.2.4.39. Switches on one character read from txt at cur.
# One branch moves to the after-attribute-value-(quoted) state, one returns
# to the data state. The others grow the current attribute's value,
# tok_cur_tag.attrs_a[0][1], with a decoded character reference, with U+FFFD,
# or with the character itself.
3628 tok_state_attribute_value_single_quoted = ->
3629 switch c = txt.charAt(cur++)
3631 tok_state = tok_state_after_attribute_value_quoted
# "'" is passed as the extra allowed character, true marks attribute context
3633 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3636 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3639 tok_state = tok_state_data
3641 tok_cur_tag.attrs_a[0][1] += c
3644 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer state 8.2.4.40. Switches on one character read from txt at cur.
# Whitespace ends the value and moves to the before-attribute-name state.
# Two branches return to the data state. The others grow the current
# attribute's value, tok_cur_tag.attrs_a[0][1], with a decoded character
# reference, with U+FFFD, or with the character itself.
3645 tok_state_attribute_value_unquoted = ->
3646 switch c = txt.charAt(cur++)
3647 when "\t", "\n", "\u000c", ' '
3648 tok_state = tok_state_before_attribute_name
# '>' is passed as the extra allowed character, true marks attribute context
3650 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3652 tok_state = tok_state_data
3657 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3660 tok_state = tok_state_data
3662 # Parse Error if ', <, = or ` (backtick)
3663 tok_cur_tag.attrs_a[0][1] += c
3666 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state 8.2.4.42. Switches on one character read from txt at cur.
# Whitespace moves to the before-attribute-name state. Other visible branches
# move to the self-closing-start-tag state or to the data state. The last
# branch goes to before-attribute-name and backs cur up by one so the
# unexpected character is handled there.
3667 tok_state_after_attribute_value_quoted = ->
3668 switch c = txt.charAt(cur++)
3669 when "\t", "\n", "\u000c", ' '
3670 tok_state = tok_state_before_attribute_name
3672 tok_state = tok_state_self_closing_start_tag
3674 tok_state = tok_state_data
3680 tok_state = tok_state_data
3683 tok_state = tok_state_before_attribute_name
3684 cur -= 1 # we didn't handle that char
3687 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer state 8.2.4.43. Reads one character from txt at cur.
# One branch sets the 'self-closing' flag on tok_cur_tag and returns to the
# data state. The other two branches back cur up by one so the character is
# reconsumed, either in the data state or in the before-attribute-name state.
3688 tok_state_self_closing_start_tag = ->
3689 c = txt.charAt(cur++)
3691 tok_cur_tag.flag 'self-closing'
3692 tok_state = tok_state_data
3696 tok_state = tok_state_data
3697 cur -= 1 # Reconsume
3701 tok_state = tok_state_before_attribute_name
3702 cur -= 1 # Reconsume
3705 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3706 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer state 8.2.4.44. Consumes the comment text in one go instead of
# character by character: it searches txt for the next '>' at or after cur
# and takes either everything up to it or the remainder of the input.
# Every NUL in that text is replaced by U+FFFD, the result is appended to
# tok_cur_tag.text, and the tokenizer returns to the data state.
# The caller must have stored a comment token in tok_cur_tag beforehand.
3707 tok_state_bogus_comment = ->
3708 next_gt = txt.indexOf '>', cur
3710 val = txt.substr cur
3713 val = txt.substr cur, (next_gt - cur)
# A plain-string pattern would replace only the FIRST NUL; use a global
# RegExp so all of them are replaced, as tok_state_cdata_section does.
3715 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3716 tok_cur_tag.text += val
3717 tok_state = tok_state_data
3720 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer state 8.2.4.45: decides what follows "<!". Looks ahead in txt
# from cur without a per-character loop:
#   - "--" starts a comment (fresh empty comment token, comment-start state)
#   - "doctype" in any letter case goes to the doctype state
#   - "[CDATA[" goes to the CDATA section state, but only when there is an
#     adjusted current node and it is outside the HTML namespace
#   - anything else becomes a bogus comment
3721 tok_state_markup_declaration_open = ->
3722 if txt.substr(cur, 2) is '--'
3724 tok_cur_tag = new_comment_token ''
3725 tok_state = tok_state_comment_start
3727 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3729 tok_state = tok_state_doctype
3731 acn = adjusted_current_node()
3732 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3734 tok_state = tok_state_cdata_section
# The bogus comment starts EMPTY: per the spec the next character consumed is
# the first character of the comment, so the "!" itself is not comment data
# (e.g. "<!foo>" yields the comment "foo").
3738 tok_cur_tag = new_comment_token ''
3739 tok_state = tok_state_bogus_comment
3742 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state 8.2.4.46: first character after "<!--". Switches on one
# character read from txt at cur. Visible outcomes, in order:
#   - move to the comment-start-dash state
#   - append U+FFFD to the comment and continue in the comment state
#   - return to the data state (twice; the second also reconsumes)
#   - otherwise append the character to the comment and continue in the
#     comment state
3743 tok_state_comment_start = ->
3744 switch c = txt.charAt(cur++)
3746 tok_state = tok_state_comment_start_dash
# The replacement character belongs to the comment's data; emitting it as a
# separate character token would leak text out of the comment.
3749 tok_cur_tag.text += "\ufffd"
tok_state = tok_state_comment
3752 tok_state = tok_state_data
3756 tok_state = tok_state_data
3757 cur -= 1 # Reconsume
3760 tok_cur_tag.text += c
# Once real comment data has been seen the comment-start special cases no
# longer apply (e.g. a later ">" must not end the comment early).
tok_state = tok_state_comment
3763 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer state 8.2.4.47 ("<!--" followed by one '-'). Switches on one
# character read from txt at cur. Visible outcomes, in order:
#   - move to the comment-end state
#   - append "-" plus U+FFFD to the comment, continue in the comment state
#   - return to the data state (twice; the second also reconsumes)
#   - otherwise append "-" plus the character, continue in the comment state
# The "-" prefix restores the dash that brought the tokenizer here.
3764 tok_state_comment_start_dash = ->
3765 switch c = txt.charAt(cur++)
3767 tok_state = tok_state_comment_end
3770 tok_cur_tag.text += "-\ufffd"
3771 tok_state = tok_state_comment
3774 tok_state = tok_state_data
3778 tok_state = tok_state_data
3779 cur -= 1 # Reconsume
3782 tok_cur_tag.text += "-#{c}"
3783 tok_state = tok_state_comment
3786 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer state 8.2.4.48 (inside comment data). Switches on one character
# read from txt at cur. Visible outcomes, in order:
#   - move to the comment-end-dash state
#   - append U+FFFD to tok_cur_tag.text
#   - return to the data state and reconsume the character
#   - otherwise append the character to tok_cur_tag.text
3787 tok_state_comment = ->
3788 switch c = txt.charAt(cur++)
3790 tok_state = tok_state_comment_end_dash
3793 tok_cur_tag.text += "\ufffd"
3796 tok_state = tok_state_data
3797 cur -= 1 # Reconsume
3800 tok_cur_tag.text += c
3803 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer state 8.2.4.49 (one '-' seen inside a comment). Switches on one
# character read from txt at cur. Visible outcomes, in order:
#   - move to the comment-end state
#   - append "-" plus U+FFFD, go back to the comment state
#   - return to the data state and reconsume the character
#   - otherwise append "-" plus the character, go back to the comment state
# The pending dash is only written into the comment text by this state.
3804 tok_state_comment_end_dash = ->
3805 switch c = txt.charAt(cur++)
3807 tok_state = tok_state_comment_end
3810 tok_cur_tag.text += "-\ufffd"
3811 tok_state = tok_state_comment
3814 tok_state = tok_state_data
3815 cur -= 1 # Reconsume
3818 tok_cur_tag.text += "-#{c}"
3819 tok_state = tok_state_comment
3822 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer state 8.2.4.50 ("--" seen inside a comment). Switches on one
# character read from txt at cur. Visible outcomes, in order:
#   - return to the data state
#   - append "--" plus U+FFFD, go back to the comment state
#   - move to the comment-end-bang state
#   - append a single "-" and stay in this state
#   - return to the data state and reconsume the character
#   - otherwise append "--" plus the character, go back to the comment state
3823 tok_state_comment_end = ->
3824 switch c = txt.charAt(cur++)
3826 tok_state = tok_state_data
3830 tok_cur_tag.text += "--\ufffd"
3831 tok_state = tok_state_comment
3834 tok_state = tok_state_comment_end_bang
3837 tok_cur_tag.text += '-'
3840 tok_state = tok_state_data
3841 cur -= 1 # Reconsume
3845 tok_cur_tag.text += "--#{c}"
3846 tok_state = tok_state_comment
3849 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer state 8.2.4.51 ("--!" seen inside a comment). Switches on one
# character read from txt at cur. Visible outcomes, in order:
#   - append "--!" and move to the comment-end-dash state
#   - return to the data state
#   - append "--!" plus U+FFFD, go back to the comment state
#   - return to the data state and reconsume the character
#   - otherwise append "--!" plus the character, go back to the comment state
3850 tok_state_comment_end_bang = ->
3851 switch c = txt.charAt(cur++)
# Only "--!" is appended here. The character that led to this branch is
# written by tok_state_comment_end_dash (it appends "-" itself), so adding
# "#{c}" as well would duplicate it: "<!--a--!-b-->" must give "a--!-b".
3853 tok_cur_tag.text += "--!"
3854 tok_state = tok_state_comment_end_dash
3856 tok_state = tok_state_data
3860 tok_cur_tag.text += "--!\ufffd"
3861 tok_state = tok_state_comment
3864 tok_state = tok_state_data
3865 cur -= 1 # Reconsume
3868 tok_cur_tag.text += "--!#{c}"
3869 tok_state = tok_state_comment
3872 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer state 8.2.4.52. Switches on one character read from txt at cur.
# Whitespace moves to the before-doctype-name state. One branch returns to
# the data state, builds a doctype token with an empty name, sets its
# 'force-quirks' flag and reconsumes the character. The last branch moves to
# before-doctype-name and reconsumes the character there.
3873 tok_state_doctype = ->
3874 switch c = txt.charAt(cur++)
3875 when "\t", "\u000a", "\u000c", ' '
3876 tok_state = tok_state_before_doctype_name
3879 tok_state = tok_state_data
# a local token is used here; tok_cur_tag is not assigned in this branch
3880 el = new_doctype_token ''
3881 el.flag 'force-quirks', true
3882 cur -= 1 # Reconsume
3886 tok_state = tok_state_before_doctype_name
3887 cur -= 1 # Reconsume
3890 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer state for the position just before a doctype's name. Reads one
# character from txt at cur. Three branches store a new doctype token in
# tok_cur_tag (named with the lower-cased character, U+FFFD, or the
# character itself) and move to the doctype-name state. Two branches build a
# local doctype token with an empty name and the 'force-quirks' flag and
# return to the data state; the second of them also reconsumes the character.
3891 tok_state_before_doctype_name = ->
3892 c = txt.charAt(cur++)
3893 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3896 tok_cur_tag = new_doctype_token c.toLowerCase()
3897 tok_state = tok_state_doctype_name
3901 tok_cur_tag = new_doctype_token "\ufffd"
3902 tok_state = tok_state_doctype_name
3906 el = new_doctype_token ''
3907 el.flag 'force-quirks', true
3908 tok_state = tok_state_data
3912 tok_state = tok_state_data
3913 el = new_doctype_token ''
3914 el.flag 'force-quirks', true
3915 cur -= 1 # Reconsume
3918 tok_cur_tag = new_doctype_token c
3919 tok_state = tok_state_doctype_name
3922 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer state 8.2.4.54. Reads one character from txt at cur.
# Whitespace moves to the after-doctype-name state. One branch returns to the
# data state. Three branches extend tok_cur_tag.name (lower-cased character,
# U+FFFD, or the character itself). One branch returns to the data state,
# sets 'force-quirks' on tok_cur_tag and reconsumes the character.
3923 tok_state_doctype_name = ->
3924 c = txt.charAt(cur++)
3925 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3926 tok_state = tok_state_after_doctype_name
3929 tok_state = tok_state_data
3932 tok_cur_tag.name += c.toLowerCase()
3936 tok_cur_tag.name += "\ufffd"
3940 tok_state = tok_state_data
3941 tok_cur_tag.flag 'force-quirks', true
3942 cur -= 1 # Reconsume
3945 tok_cur_tag.name += c
3948 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer state 8.2.4.55. Reads one character from txt at cur.
# Two branches return to the data state; the second sets 'force-quirks' and
# reconsumes the character. Otherwise the six characters starting at the one
# just read are compared, case-insensitively, with "public" and "system" to
# choose the matching keyword state. If neither matches, 'force-quirks' is
# set and the tokenizer enters the bogus-doctype state.
3949 tok_state_after_doctype_name = ->
3950 c = txt.charAt(cur++)
3951 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3954 tok_state = tok_state_data
3958 tok_state = tok_state_data
3959 tok_cur_tag.flag 'force-quirks', true
3960 cur -= 1 # Reconsume
# cur - 1 because c has already been consumed above
3963 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3965 tok_state = tok_state_after_doctype_public_keyword
3967 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3969 tok_state = tok_state_after_doctype_system_keyword
3972 tok_cur_tag.flag 'force-quirks', true
3973 tok_state = tok_state_bogus_doctype
3976 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer state 8.2.4.56. Reads one character from txt at cur.
# Whitespace moves to the before-doctype-public-identifier state. Two
# branches reset tok_cur_tag.public_identifier to '' and enter the double- or
# single-quoted public identifier state. The remaining branches all set
# 'force-quirks': two return to the data state (the second reconsumes the
# character) and the last enters the bogus-doctype state.
3977 tok_state_after_doctype_public_keyword = ->
3978 c = txt.charAt(cur++)
3979 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3980 tok_state = tok_state_before_doctype_public_identifier
3984 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3985 tok_state = tok_state_doctype_public_identifier_double_quoted
3989 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3990 tok_state = tok_state_doctype_public_identifier_single_quoted
3994 tok_cur_tag.flag 'force-quirks', true
3995 tok_state = tok_state_data
3999 tok_state = tok_state_data
4000 tok_cur_tag.flag 'force-quirks', true
4001 cur -= 1 # Reconsume
4005 tok_cur_tag.flag 'force-quirks', true
4006 tok_state = tok_state_bogus_doctype
4009 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer state 8.2.4.57. Reads one character from txt at cur.
# Two branches reset tok_cur_tag.public_identifier to '' and enter the
# double- or single-quoted public identifier state. The remaining branches
# all set 'force-quirks': two return to the data state (the second
# reconsumes the character) and the last enters the bogus-doctype state.
4010 tok_state_before_doctype_public_identifier = ->
4011 c = txt.charAt(cur++)
4012 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4016 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4017 tok_state = tok_state_doctype_public_identifier_double_quoted
4021 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4022 tok_state = tok_state_doctype_public_identifier_single_quoted
4026 tok_cur_tag.flag 'force-quirks', true
4027 tok_state = tok_state_data
4031 tok_state = tok_state_data
4032 tok_cur_tag.flag 'force-quirks', true
4033 cur -= 1 # Reconsume
4037 tok_cur_tag.flag 'force-quirks', true
4038 tok_state = tok_state_bogus_doctype
4042 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer state 8.2.4.58. Reads one character from txt at cur.
# One branch moves to the after-doctype-public-identifier state. Two
# branches grow tok_cur_tag.public_identifier (with U+FFFD or with the
# character itself). Two branches set 'force-quirks' and return to the data
# state; the second of them also reconsumes the character.
4043 tok_state_doctype_public_identifier_double_quoted = ->
4044 c = txt.charAt(cur++)
4046 tok_state = tok_state_after_doctype_public_identifier
4050 tok_cur_tag.public_identifier += "\ufffd"
4054 tok_cur_tag.flag 'force-quirks', true
4055 tok_state = tok_state_data
4059 tok_state = tok_state_data
4060 tok_cur_tag.flag 'force-quirks', true
4061 cur -= 1 # Reconsume
4064 tok_cur_tag.public_identifier += c
4067 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer state 8.2.4.59. Reads one character from txt at cur.
# One branch moves to the after-doctype-public-identifier state. Two
# branches grow tok_cur_tag.public_identifier (with U+FFFD or with the
# character itself). Two branches set 'force-quirks' and return to the data
# state; the second of them also reconsumes the character.
4068 tok_state_doctype_public_identifier_single_quoted = ->
4069 c = txt.charAt(cur++)
4071 tok_state = tok_state_after_doctype_public_identifier
4075 tok_cur_tag.public_identifier += "\ufffd"
4079 tok_cur_tag.flag 'force-quirks', true
4080 tok_state = tok_state_data
4084 tok_state = tok_state_data
4085 tok_cur_tag.flag 'force-quirks', true
4086 cur -= 1 # Reconsume
4089 tok_cur_tag.public_identifier += c
4092 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer state 8.2.4.60. Reads one character from txt at cur.
# Whitespace moves to the between-public-and-system-identifiers state. One
# branch returns to the data state. Two branches reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state. One branch returns to the data state with
# 'force-quirks' set and reconsumes the character. The last sets
# 'force-quirks' and enters the bogus-doctype state.
4093 tok_state_after_doctype_public_identifier = ->
4094 c = txt.charAt(cur++)
4095 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4096 tok_state = tok_state_between_doctype_public_and_system_identifiers
4099 tok_state = tok_state_data
4103 tok_cur_tag.system_identifier = ''
4104 tok_state = tok_state_doctype_system_identifier_double_quoted
4108 tok_cur_tag.system_identifier = ''
4109 tok_state = tok_state_doctype_system_identifier_single_quoted
4113 tok_state = tok_state_data
4114 tok_cur_tag.flag 'force-quirks', true
4115 cur -= 1 # Reconsume
4119 tok_cur_tag.flag 'force-quirks', true
4120 tok_state = tok_state_bogus_doctype
4123 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer state 8.2.4.61. Reads one character from txt at cur.
# One branch returns to the data state. Two branches reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state. One branch returns to the data state with
# 'force-quirks' set and reconsumes the character. The last sets
# 'force-quirks' and enters the bogus-doctype state.
4124 tok_state_between_doctype_public_and_system_identifiers = ->
4125 c = txt.charAt(cur++)
4126 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4129 tok_state = tok_state_data
4133 tok_cur_tag.system_identifier = ''
4134 tok_state = tok_state_doctype_system_identifier_double_quoted
4138 tok_cur_tag.system_identifier = ''
4139 tok_state = tok_state_doctype_system_identifier_single_quoted
4143 tok_state = tok_state_data
4144 tok_cur_tag.flag 'force-quirks', true
4145 cur -= 1 # Reconsume
4149 tok_cur_tag.flag 'force-quirks', true
4150 tok_state = tok_state_bogus_doctype
4153 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer state 8.2.4.62. Reads one character from txt at cur.
# Whitespace moves to the before-doctype-system-identifier state. Two
# branches reset tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state. The remaining branches all set
# 'force-quirks': two return to the data state (the second reconsumes the
# character) and the last enters the bogus-doctype state.
4154 tok_state_after_doctype_system_keyword = ->
4155 c = txt.charAt(cur++)
4156 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4157 tok_state = tok_state_before_doctype_system_identifier
4161 tok_cur_tag.system_identifier = ''
4162 tok_state = tok_state_doctype_system_identifier_double_quoted
4166 tok_cur_tag.system_identifier = ''
4167 tok_state = tok_state_doctype_system_identifier_single_quoted
4171 tok_cur_tag.flag 'force-quirks', true
4172 tok_state = tok_state_data
4176 tok_state = tok_state_data
4177 tok_cur_tag.flag 'force-quirks', true
4178 cur -= 1 # Reconsume
4182 tok_cur_tag.flag 'force-quirks', true
4183 tok_state = tok_state_bogus_doctype
4186 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer state 8.2.4.63. Reads one character from txt at cur.
# Two branches reset tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state. The remaining branches
# all set 'force-quirks': two return to the data state (the second
# reconsumes the character) and the last enters the bogus-doctype state.
4187 tok_state_before_doctype_system_identifier = ->
4188 c = txt.charAt(cur++)
4189 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4192 tok_cur_tag.system_identifier = ''
4193 tok_state = tok_state_doctype_system_identifier_double_quoted
4196 tok_cur_tag.system_identifier = ''
4197 tok_state = tok_state_doctype_system_identifier_single_quoted
4201 tok_cur_tag.flag 'force-quirks', true
4202 tok_state = tok_state_data
4206 tok_state = tok_state_data
4207 tok_cur_tag.flag 'force-quirks', true
4208 cur -= 1 # Reconsume
4212 tok_cur_tag.flag 'force-quirks', true
4213 tok_state = tok_state_bogus_doctype
4216 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state 8.2.4.64. Reads one character from txt at cur.
# One branch moves to the after-doctype-system-identifier state. Two
# branches grow tok_cur_tag.system_identifier (with U+FFFD or with the
# character itself). Two branches set 'force-quirks' and return to the data
# state; the second of them also reconsumes the character.
4217 tok_state_doctype_system_identifier_double_quoted = ->
4218 c = txt.charAt(cur++)
4220 tok_state = tok_state_after_doctype_system_identifier
4224 tok_cur_tag.system_identifier += "\ufffd"
4228 tok_cur_tag.flag 'force-quirks', true
4229 tok_state = tok_state_data
4233 tok_state = tok_state_data
4234 tok_cur_tag.flag 'force-quirks', true
4235 cur -= 1 # Reconsume
4238 tok_cur_tag.system_identifier += c
4241 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state 8.2.4.65. Reads one character from txt at cur.
# One branch moves to the after-doctype-system-identifier state. Two
# branches grow tok_cur_tag.system_identifier (with U+FFFD or with the
# character itself). Two branches set 'force-quirks' and return to the data
# state; the second of them also reconsumes the character.
4242 tok_state_doctype_system_identifier_single_quoted = ->
4243 c = txt.charAt(cur++)
4245 tok_state = tok_state_after_doctype_system_identifier
4249 tok_cur_tag.system_identifier += "\ufffd"
4253 tok_cur_tag.flag 'force-quirks', true
4254 tok_state = tok_state_data
4258 tok_state = tok_state_data
4259 tok_cur_tag.flag 'force-quirks', true
4260 cur -= 1 # Reconsume
4263 tok_cur_tag.system_identifier += c
4266 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state 8.2.4.66. Reads one character from txt at cur.
# One branch returns to the data state. Another returns to the data state
# with 'force-quirks' set and reconsumes the character. The last branch
# enters the bogus-doctype state and, unlike the sibling doctype states,
# deliberately leaves 'force-quirks' unset.
4267 tok_state_after_doctype_system_identifier = ->
4268 c = txt.charAt(cur++)
4269 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4272 tok_state = tok_state_data
4276 tok_state = tok_state_data
4277 tok_cur_tag.flag 'force-quirks', true
4278 cur -= 1 # Reconsume
4282 # do _not_ tok_cur_tag.flag 'force-quirks', true
4283 tok_state = tok_state_bogus_doctype
4286 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state 8.2.4.67. Reads one character from txt at cur.
# Both visible branches return to the data state; the second also backs cur
# up by one so the character is reconsumed there. No visible line changes
# tok_cur_tag.
4287 tok_state_bogus_doctype = ->
4288 c = txt.charAt(cur++)
4290 tok_state = tok_state_data
4293 tok_state = tok_state_data
4294 cur -= 1 # Reconsume
4299 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer state 8.2.4.68. Handles a whole CDATA section in one call:
# the state is set back to data immediately, then txt is searched for "]]>"
# at or after cur and the text is taken either up to that marker or to the
# end of the input. The text is normalised (every NUL becomes U+FFFD, CR LF
# and lone CR become LF) and returned as a single character token.
4300 tok_state_cdata_section = ->
4301 tok_state = tok_state_data
# despite its name, next_gt holds the index of the "]]>" terminator
4302 next_gt = txt.indexOf ']]>', cur
4304 val = txt.substr cur
4307 val = txt.substr cur, (next_gt - cur)
# global RegExps so that every occurrence is replaced, not just the first
4309 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
# CR LF must be collapsed before lone CR, or it would turn into two LFs
4310 val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4311 val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4312 return new_character_token val # fixfull split
4314 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4315 # Don't set this as a state, just call it
4316 # returns a string (NOT a text node)
# Attempts to decode a character reference at position cur in txt.
#   allowed_char - an extra character that, like whitespace, '<', '&' and end
#                  of input, means "this is not a character reference"
#                  (the attribute-value states pass their closing delimiter)
#   in_attr      - true when called from an attribute value state
# The character at cur is inspected WITHOUT being consumed. The visible code
# has three paths: a numeric path that checks for an 'x' prefix, scans
# characters belonging to charset and decodes via decode_named_char_ref; a
# named path that scans alphanumerics up to a ';'; and a legacy path that
# looks up ';'-less names in legacy_char_refs.
# NOTE(review): the conditions joining these paths and the return values are
# not all visible here -- confirm before relying on the details above.
4317 parse_character_reference = (allowed_char = null, in_attr = false) ->
4318 if cur >= txt.length
4320 switch c = txt.charAt(cur)
# with the default allowed_char (null) the last alternative never matches
4321 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4322 # explicitly not a parse error
4325 # there has to be "one or more" alnums between & and ; to be a parse error
4328 if cur + 1 >= txt.length
4330 if txt.charAt(cur + 1).toLowerCase() is 'x'
# advance i over every character that belongs to charset
4339 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4343 if txt.charAt(start + i) is ';'
4345 # FIXME This is supposed to generate parse errors for some chars
# the scanned characters are lower-cased and decoded through the same helper
# that handles named references
4346 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4353 if alnum.indexOf(txt.charAt(cur + i)) is -1
4356 # exit early, because parse_error() below needs at least one alnum
4358 if txt.charAt(cur + i) is ';'
4359 i += 1 # include ';' terminator in value
4360 decoded = decode_named_char_ref txt.substr(cur, i)
4367 # no ';' terminator (only legacy char refs)
4369 for i in [2..max] # no prefix matches, so ok to check shortest first
# note: c is reused here for the looked-up value, not the input character
4370 c = legacy_char_refs[txt.substr(cur, i)]
4373 if txt.charAt(cur + i) is '='
4374 # "because some legacy user agents will
4375 # misinterpret the markup in those cases"
4378 if alnum.indexOf(txt.charAt(cur + i)) > -1
4379 # this makes attributes forgiving about url args
4381 # ok, and besides the weird exceptions for attributes...
4382 # return the matching char
4383 cur += i # consume entity chars
4384 parse_error() # because no terminating ";"
4388 return # never reached
4390 # tree constructor initialization
4391 # see comments on TYPE_TAG/etc for the structure of this data
# Initial values for the tree-construction and tokenizer state used by the
# state functions in this file.
# root node that stands in for the <html> element
4394 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4396 afe = [] # active formatting elements
4397 template_ins_modes = []
4398 ins_mode = ins_mode_initial
4399 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to true when args.scripting is null or undefined
4400 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4401 flag_frameset_ok = true
4403 flag_foster_parenting = false
4404 form_element_pointer = null
# the script-data tokenizer states accumulate candidate tag names here
4405 temporary_buffer = null
4406 pending_table_character_tokens = []
4407 head_element_pointer = null
4408 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4409 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4411 # tokenizer initialization
# the tokenizer starts in the data state
4412 tok_state = tok_state_data
4415 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4420 # fixfull parse error if the token has the self-closing flag, but it wasn't acknowledged
# Builds one string by appending t.serialize(shallow, show_ids) for each t;
# shallow and show_ids are passed through unchanged.
# NOTE(review): t presumably iterates over els, and the string is presumably
# returned -- neither the loop nor the return is visible here, confirm.
4423 serialize_els = (els, shallow, show_ids) ->
4429 serialized += t.serialize shallow, show_ids
# Public API of the module: the parser entry point, two debug-log helpers,
# the node type constants and the namespace constants.
# NOTE(review): the TODO below looks stale -- the four TYPE_* constants are
# already exported a few lines further down.
4432 # TODO export TYPE_*
4433 module.exports.parse_html = parse_html
4434 module.exports.debug_log_reset = debug_log_reset
4435 module.exports.debug_log_each = debug_log_each
4436 module.exports.TYPE_TAG = TYPE_TAG
4437 module.exports.TYPE_TEXT = TYPE_TEXT
4438 module.exports.TYPE_COMMENT = TYPE_COMMENT
4439 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4440 module.exports.NS_HTML = NS_HTML
4441 module.exports.NS_MATHML = NS_MATHML
4442 module.exports.NS_SVG = NS_SVG