1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
# the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
# section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
# Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
95 @flags = args.flags ? {}
99 @id = "#{++prev_node_id}"
100 acknowledge_self_closing: ->
102 @token.flag 'did_self_close'
104 @flag 'did_self_close', true
105 flag: (key, value = null) ->
110 serialize: (shallow = false, show_ids = false) -> # for unit tests
115 ret += JSON.stringify @name
130 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
136 ret += c.serialize shallow, show_ids
140 ret += JSON.stringify @text
143 ret += JSON.stringify @text
145 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
148 when TYPE_AAA_BOOKMARK
149 ret += 'aaa_bookmark'
152 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
155 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) carrying only its tag name.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name})
# Build an end-tag token (TYPE_END_TAG) carrying only its tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name})
# Build an element node (TYPE_TAG) with the given tag name; every other Node
# field keeps the constructor's default.
# NOTE(review): adoption_agency() assigns to a variable that is also named
# new_element. If that code shares this scope, CoffeeScript's lexical scoping
# makes the assignment overwrite this helper -- confirm, and rename the local.
new_element = (name) ->
	new Node(TYPE_TAG, {name})
# Build a text node (TYPE_TEXT) whose contents are `txt`.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are created exactly like text nodes, so reuse the helper.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose text is `txt`.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Build a doctype token (TYPE_DOCTYPE); `name` is stored as the node's name.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name})
170 return new Node TYPE_EOF
172 return new Node TYPE_AFE_MARKER
# Build a placeholder node of type TYPE_AAA_BOOKMARK (used by the adoption
# agency algorithm to remember a position in a list).
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character classes, stored as strings so membership can be tested with
# indexOf(). The composite classes are built from the basic ones.
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"
# True when `str` is exactly one character long and that character appears
# in uc_alpha (A-Z).
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1
# True when `str` is exactly one character long and that character appears
# in lc_alpha (a-z).
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
187 # some SVG elements have dashes in them
188 tag_name_chars = alnum + "-"
190 # http://www.w3.org/TR/html5/infrastructure.html#space-character
191 space_chars = "\u0009\u000a\u000c\u000d\u0020"
193 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is a single character
# found in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) isnt -1
197 is_input_hidden_tok = (t) ->
198 return false unless t.type is TYPE_START_TAG
201 if a[1].toLowerCase() is 'hidden'
206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
212 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists, grouped as void, raw text, and escapable raw text elements.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
	'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script', 'style'
]
escapable_raw_text_elements = [
	'textarea', 'title'
]
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
237 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
256 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262 'determinant', 'diff', 'divergence', 'divide', 'domain',
263 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283 'vectorproduct', 'xor'
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
290 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
291 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
292 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
293 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
294 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
295 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
296 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
297 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
298 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
299 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
300 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
301 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
302 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
303 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
304 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
305 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
306 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
307 wbr:NS_HTML, xmp:NS_HTML,
310 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
311 'annotation-xml':NS_MATHML,
314 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
317 formatting_elements = {
318 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
319 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
323 mathml_text_integration = {
324 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when `el`'s name is listed in mathml_text_integration and the namespace
# recorded there equals el.namespace.
is_mathml_text_integration_point = (el) ->
	listed_ns = mathml_text_integration[el.name]
	listed_ns is el.namespace
328 is_html_integration = (el) -> # DON'T PASS A TOKEN
329 if el.namespace is NS_MATHML
330 if el.name is 'annotation-xml'
331 if el.attrs.encoding?
332 if el.attrs.encoding.toLowerCase() is 'text/html'
334 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
337 if el.namespace is NS_SVG
338 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
343 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
346 foster_parenting_targets = {
# True when `e` is listed in special_elements under its own namespace.
el_is_special = (e) ->
	listed_ns = special_elements[e.name]
	listed_ns is e.namespace
# The three special elements that el_is_special_not_adp() excludes.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
# True when `el` is listed in special_elements under its own namespace and is
# not an HTML address, div or p element.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
376 altglyphdef: 'altGlyphDef'
377 altglyphitem: 'altGlyphItem'
378 animatecolor: 'animateColor'
379 animatemotion: 'animateMotion'
380 animatetransform: 'animateTransform'
383 fecolormatrix: 'feColorMatrix'
384 fecomponenttransfer: 'feComponentTransfer'
385 fecomposite: 'feComposite'
386 feconvolvematrix: 'feConvolveMatrix'
387 fediffuselighting: 'feDiffuseLighting'
388 fedisplacementmap: 'feDisplacementMap'
389 fedistantlight: 'feDistantLight'
390 fedropshadow: 'feDropShadow'
396 fegaussianblur: 'feGaussianBlur'
399 femergenode: 'feMergeNode'
400 femorphology: 'feMorphology'
402 fepointlight: 'fePointLight'
403 fespecularlighting: 'feSpecularLighting'
404 fespotlight: 'feSpotLight'
406 feturbulence: 'feTurbulence'
407 foreignobject: 'foreignObject'
409 lineargradient: 'linearGradient'
410 radialgradient: 'radialGradient'
413 svg_attribute_fixes = {
414 attributename: 'attributeName'
415 attributetype: 'attributeType'
416 basefrequency: 'baseFrequency'
417 baseprofile: 'baseProfile'
419 clippathunits: 'clipPathUnits'
420 contentscripttype: 'contentScriptType'
421 contentstyletype: 'contentStyleType'
422 diffuseconstant: 'diffuseConstant'
424 externalresourcesrequired: 'externalResourcesRequired'
425 filterres: 'filterRes'
426 filterunits: 'filterUnits'
428 gradienttransform: 'gradientTransform'
429 gradientunits: 'gradientUnits'
430 kernelmatrix: 'kernelMatrix'
431 kernelunitlength: 'kernelUnitLength'
432 keypoints: 'keyPoints'
433 keysplines: 'keySplines'
435 lengthadjust: 'lengthAdjust'
436 limitingconeangle: 'limitingConeAngle'
437 markerheight: 'markerHeight'
438 markerunits: 'markerUnits'
439 markerwidth: 'markerWidth'
440 maskcontentunits: 'maskContentUnits'
441 maskunits: 'maskUnits'
442 numoctaves: 'numOctaves'
443 pathlength: 'pathLength'
444 patterncontentunits: 'patternContentUnits'
445 patterntransform: 'patternTransform'
446 patternunits: 'patternUnits'
447 pointsatx: 'pointsAtX'
448 pointsaty: 'pointsAtY'
449 pointsatz: 'pointsAtZ'
450 preservealpha: 'preserveAlpha'
451 preserveaspectratio: 'preserveAspectRatio'
452 primitiveunits: 'primitiveUnits'
455 repeatcount: 'repeatCount'
456 repeatdur: 'repeatDur'
457 requiredextensions: 'requiredExtensions'
458 requiredfeatures: 'requiredFeatures'
459 specularconstant: 'specularConstant'
460 specularexponent: 'specularExponent'
461 spreadmethod: 'spreadMethod'
462 startoffset: 'startOffset'
463 stddeviation: 'stdDeviation'
464 stitchtiles: 'stitchTiles'
465 surfacescale: 'surfaceScale'
466 systemlanguage: 'systemLanguage'
467 tablevalues: 'tableValues'
470 textlength: 'textLength'
472 viewtarget: 'viewTarget'
473 xchannelselector: 'xChannelSelector'
474 ychannelselector: 'yChannelSelector'
475 zoomandpan: 'zoomAndPan'
477 adjust_mathml_attributes = (t) ->
479 if a[0] is 'definitionurl'
480 a[0] = 'definitionURL'
482 adjust_svg_attributes = (t) ->
484 if svg_attribute_fixes[a[0]]?
485 a[0] = svg_attribute_fixes[a[0]]
487 adjust_foreign_attributes = (t) ->
491 # decode_named_char_ref()
493 # The list of named character references is _huge_ so ask the browser to decode
494 # for us instead of wasting bandwidth/space on including the table here.
496 # Pass without the "&" but with the ";" examples:
# for "&amp;" pass "amp;"
498 # for "′" pass "x2032;"
501 textarea: document.createElement('textarea')
503 # TODO test this in IE8
504 decode_named_char_ref = (txt) ->
506 decoded = g_dncr.cache[txt]
507 return decoded if decoded?
508 g_dncr.textarea.innerHTML = txt
509 decoded = g_dncr.textarea.value
510 return null if decoded is txt
511 return g_dncr.cache[txt] = decoded
513 parse_html = (args) ->
515 cur = null # index of next char in txt to be parsed
516 # declare doc and tokenizer variables so they're in scope below
518 open_els = null # stack of open elements
519 afe = null # active formatting elements
520 template_ins_modes = null
522 original_ins_mode = null
524 tok_cur_tag = null # partially parsed tag
525 flag_scripting = null
526 flag_frameset_ok = null
528 flag_foster_parenting = null
529 form_element_pointer = null
530 temporary_buffer = null
531 pending_table_character_tokens = null
532 head_element_pointer = null
533 flag_fragment_parsing = null
534 context_element = null
543 console.log "Parse error at character #{cur} of #{txt.length}"
545 afe_push = (new_el) ->
548 if el.name is new_el.name and el.namespace is new_el.namespace
550 continue unless new_el.attrs[k] is v
551 for k, v of new_el.attrs
552 continue unless el.attrs[k] is v
559 afe.unshift new_afe_marker()
# the functions below implement the Tree Construction algorithm
562 # http://www.w3.org/TR/html5/syntax.html#tree-construction
564 # But first... the helpers
565 template_tag_is_open = ->
567 if t.name is 'template' and t.namespace is NS_HTML
570 is_in_scope_x = (tag_name, scope, namespace) ->
572 if t.name is tag_name and (namespace is null or namespace is t.namespace)
574 if scope[t.name] is t.namespace
577 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
579 if t.name is tag_name and (namespace is null or namespace is t.namespace)
581 if scope[t.name] is t.namespace
583 if scope2[t.name] is t.namespace
587 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
588 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
589 template: NS_HTML, mi: NS_MATHML,
591 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
592 'annotation-xml': NS_MATHML,
594 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
596 button_scopers = button: NS_HTML
597 li_scopers = ol: NS_HTML, ul: NS_HTML
598 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Scope test using the standard_scopers table as the only set of boundary
# elements. `namespace` of null matches an element in any namespace.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope test bounded by standard_scopers plus button_scopers.
# `namespace` of null matches an element in any namespace.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope test bounded only by table_scopers (html, table, template).
# `namespace` of null matches an element in any namespace.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope test bounded by standard_scopers plus li_scopers (ol, ul).
# `namespace` of null matches an element in any namespace.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
# "Has an element in select scope": walk the stack of open elements from the
# current node (index 0) upward and report whether an element named
# `tag_name` is reached before any scope boundary.
#
# tag_name:  element name to look for
# namespace: when not null, the match must also be in this namespace
# returns:   true if found in select scope, false otherwise
#
# In select scope every element is a boundary EXCEPT HTML optgroup and option.
# NOTE(review): the loop header and return lines were not visible in the
# reviewed listing and are reconstructed here to mirror the other scope
# helpers -- confirm against the full file.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		# Nodes store their namespace in `namespace` (there is no `ns` field),
		# and a non-HTML element must end the scope whatever its name is.
		transparent = t.namespace is NS_HTML and (t.name is 'optgroup' or t.name is 'option')
		return false unless transparent
	return false
615 # this checks for a particular element, not by name
616 # this requires a namespace match
617 el_is_in_scope = (needle) ->
621 if standard_scopers[el.name] is el.namespace
625 clear_to_table_stopers = {
630 clear_stack_to_table_context = ->
632 if clear_to_table_stopers[open_els[0].name]?
636 clear_to_table_body_stopers = {
643 clear_stack_to_table_body_context = ->
645 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
649 clear_to_table_row_stopers = {
654 clear_stack_to_table_row_context = ->
656 if clear_to_table_row_stopers[open_els[0].name]?
660 clear_afe_to_marker = ->
662 return unless afe.length > 0 # this happens in fragment case, ?spec error
664 if el.type is TYPE_AFE_MARKER
669 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
671 # 1. Let last be false.
673 # 2. Let node be the last node in the stack of open elements.
675 node = open_els[node_i]
676 # 3. Loop: If node is the first node in the stack of open elements,
677 # then set last to true, and, if the parser was originally created as
678 # part of the HTML fragment parsing algorithm (fragment case) set node
679 # to the context element.
681 if node_i is open_els.length - 1
683 # fixfull (fragment case)
685 # 4. If node is a select element, run these substeps:
686 if node.name is 'select'
687 # 1. If last is true, jump to the step below labeled done.
689 # 2. Let ancestor be node.
692 # 3. Loop: If ancestor is the first node in the stack of
693 # open elements, jump to the step below labeled done.
695 if ancestor_i is open_els.length - 1
697 # 4. Let ancestor be the node before ancestor in the stack
700 ancestor = open_els[ancestor_i]
701 # 5. If ancestor is a template node, jump to the step below
703 if ancestor.name is 'template'
705 # 6. If ancestor is a table node, switch the insertion mode
706 # to "in select in table" and abort these steps.
707 if ancestor.name is 'table'
708 ins_mode = ins_mode_in_select_in_table
710 # 7. Jump back to the step labeled loop.
711 # 8. Done: Switch the insertion mode to "in select" and abort
713 ins_mode = ins_mode_in_select
715 # 5. If node is a td or th element and last is false, then switch
716 # the insertion mode to "in cell" and abort these steps.
717 if (node.name is 'td' or node.name is 'th') and last is false
718 ins_mode = ins_mode_in_cell
720 # 6. If node is a tr element, then switch the insertion mode to "in
721 # row" and abort these steps.
723 ins_mode = ins_mode_in_row
725 # 7. If node is a tbody, thead, or tfoot element, then switch the
726 # insertion mode to "in table body" and abort these steps.
727 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
728 ins_mode = ins_mode_in_table_body
730 # 8. If node is a caption element, then switch the insertion mode
731 # to "in caption" and abort these steps.
732 if node.name is 'caption'
733 ins_mode = ins_mode_in_caption
735 # 9. If node is a colgroup element, then switch the insertion mode
736 # to "in column group" and abort these steps.
737 if node.name is 'colgroup'
738 ins_mode = ins_mode_in_column_group
740 # 10. If node is a table element, then switch the insertion mode to
741 # "in table" and abort these steps.
742 if node.name is 'table'
743 ins_mode = ins_mode_in_table
745 # 11. If node is a template element, then switch the insertion mode
746 # to the current template insertion mode and abort these steps.
747 # fixfull (template insertion mode stack)
749 # 12. If node is a head element and last is true, then switch the
750 # insertion mode to "in body" ("in body"! not "in head"!) and abort
751 # these steps. (fragment case)
752 if node.name is 'head' and last
753 ins_mode = ins_mode_in_body
755 # 13. If node is a head element and last is false, then switch the
756 # insertion mode to "in head" and abort these steps.
757 if node.name is 'head' and last is false
758 ins_mode = ins_mode_in_head
760 # 14. If node is a body element, then switch the insertion mode to
761 # "in body" and abort these steps.
762 if node.name is 'body'
763 ins_mode = ins_mode_in_body
765 # 15. If node is a frameset element, then switch the insertion mode
766 # to "in frameset" and abort these steps. (fragment case)
767 if node.name is 'frameset'
768 ins_mode = ins_mode_in_frameset
770 # 16. If node is an html element, run these substeps:
771 if node.name is 'html'
772 # 1. If the head element pointer is null, switch the insertion
773 # mode to "before head" and abort these steps. (fragment case)
774 if head_element_pointer is null
775 ins_mode = ins_mode_before_head
777 # 2. Otherwise, the head element pointer is not null,
778 # switch the insertion mode to "after head" and abort these
780 ins_mode = ins_mode_after_head
782 # 17. If last is true, then switch the insertion mode to "in body"
783 # and abort these steps. (fragment case)
785 ins_mode = ins_mode_in_body
787 # 18. Let node now be the node before node in the stack of open
790 node = open_els[node_i]
791 # 19. Return to the step labeled loop.
795 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
796 adjusted_current_node = ->
797 if open_els.length is 1 and flag_fragment_parsing
798 return context_element
801 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
802 # this implementation is structured (mostly) as described at the link above.
803 # capitalized comments are the "labels" described at the link above.
805 return if afe.length is 0
806 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
811 if i is afe.length - 1
814 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
819 el = insert_html_element afe[i].token
824 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
825 # adoption agency algorithm
827 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
828 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
829 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
830 adoption_agency = (subject) ->
831 debug_log "adoption_agency()"
832 debug_log "tree: #{serialize_els doc.children, false, true}"
833 debug_log "open_els: #{serialize_els open_els, true, true}"
834 debug_log "afe: #{serialize_els afe, true, true}"
835 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
838 # remove it from the list of active formatting elements (if found)
843 debug_log "aaa: starting off with subject on top of stack, exiting"
850 # 5. Let formatting element be the last element in the list of
851 # active formatting elements that: is between the end of the list
852 # and the last scope marker in the list, if any, or the start of
853 # the list otherwise, and has the tag name subject.
855 for t, fe_of_afe in afe
856 if t.type is TYPE_AFE_MARKER
861 # If there is no such element, then abort these steps and instead
862 # act as described in the "any other end tag" entry above.
864 debug_log "aaa: fe not found in afe"
865 in_body_any_other_end_tag subject
867 # 6. If formatting element is not in the stack of open elements,
868 # then this is a parse error; remove the element from the list, and
871 for t, fe_of_open_els in open_els
876 debug_log "aaa: fe not found in open_els"
878 # "remove it from the list" must mean afe, since it's not in open_els
879 afe.splice fe_of_afe, 1
881 # 7. If formatting element is in the stack of open elements, but
882 # the element is not in scope, then this is a parse error; abort
884 unless el_is_in_scope fe
885 debug_log "aaa: fe not in scope"
888 # 8. If formatting element is not the current node, this is a parse
889 # error. (But do not abort these steps.)
890 unless open_els[0] is fe
893 # 9. Let furthest block be the topmost node in the stack of open
894 # elements that is lower in the stack than formatting element, and
895 # is an element in the special category. There might not be one.
897 fb_of_open_els = null
904 # and continue, to see if there's one that's more "topmost"
905 # 10. If there is no furthest block, then the UA must first pop all
906 # the nodes from the bottom of the stack of open elements, from the
907 # current node up to and including formatting element, then remove
908 # formatting element from the list of active formatting elements,
909 # and finally abort these steps.
911 debug_log "aaa: no fb"
915 afe.splice fe_of_afe, 1
917 # 11. Let common ancestor be the element immediately above
918 # formatting element in the stack of open elements.
919 ca = open_els[fe_of_open_els + 1] # common ancestor
921 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
922 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
923 bookmark = new_aaa_bookmark()
926 afe.splice i, 0, bookmark
928 node = last_node = fb
932 # 3. Let node be the element immediately above node in the
933 # stack of open elements, or if node is no longer in the stack
934 # of open elements (e.g. because it got removed by this
935 # algorithm), the element that was immediately above node in
936 # the stack of open elements before node was removed.
940 node_next = open_els[i + 1]
942 node = node_next ? node_above
943 debug_log "inner loop #{inner}"
944 debug_log "tree: #{serialize_els doc.children, false, true}"
945 debug_log "open_els: #{serialize_els open_els, true, true}"
946 debug_log "afe: #{serialize_els afe, true, true}"
947 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
948 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
949 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
950 debug_log "node: #{node.serialize true, true}"
951 # TODO make sure node_above gets re-set if/when node is removed from open_els
953 # 4. If node is formatting element, then go to the next step in
954 # the overall algorithm.
958 # 5. If inner loop counter is greater than three and node is in
959 # the list of active formatting elements, then remove node from
960 # the list of active formatting elements.
966 debug_log "max out inner"
971 # 6. If node is not in the list of active formatting elements,
972 # then remove node from the stack of open elements and then go
973 # back to the step labeled inner loop.
975 debug_log "not in afe"
978 node_above = open_els[i + 1]
982 debug_log "the bones"
983 # 7. create an element for the token for which the element node
984 # was created, in the HTML namespace, with common ancestor as
985 # the intended parent; replace the entry for node in the list
986 # of active formatting elements with an entry for the new
987 # element, replace the entry for node in the stack of open
988 # elements with an entry for the new element, and let node be
990 new_node = token_to_element node.token, NS_HTML, ca
994 debug_log "replaced in afe"
998 node_above = open_els[i + 1]
999 open_els[i] = new_node
1000 debug_log "replaced in open_els"
1003 # 8. If last node is furthest block, then move the
1004 # aforementioned bookmark to be immediately after the new node
1005 # in the list of active formatting elements.
1010 debug_log "removed bookmark"
1014 # "after" means lower
1015 afe.splice i, 0, bookmark # "after as <-
1016 debug_log "placed bookmark after node"
1017 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1019 # 9. Insert last node into node, first removing it from its
1020 # previous parent node if any.
1021 if last_node.parent?
1022 debug_log "last_node has parent"
1023 for c, i in last_node.parent.children
1025 debug_log "removing last_node from parent"
1026 last_node.parent.children.splice i, 1
1028 node.children.push last_node
1029 last_node.parent = node
1030 # 10. Let last node be node.
1033 # 11. Return to the step labeled inner loop.
1034 # 14. Insert whatever last node ended up being in the previous step
1035 # at the appropriate place for inserting a node, but using common
1036 # ancestor as the override target.
1038 # In the case where fe is immediately followed by fb:
1039 # * inner loop exits out early (node==fe)
1041 # * last_node is still in the tree (not a duplicate)
1042 if last_node.parent?
1043 debug_log "FEFIRST? last_node has parent"
1044 for c, i in last_node.parent.children
1046 debug_log "removing last_node from parent"
1047 last_node.parent.children.splice i, 1
1050 debug_log "after aaa inner loop"
1051 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1052 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1053 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1054 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1055 debug_log "tree: #{serialize_els doc.children, false, true}"
1060 # can't use standard insert token thing, because it's already in
1061 # open_els and must stay at its current position in open_els
1062 dest = adjusted_insertion_location ca
1063 dest[0].children.splice dest[1], 0, last_node
1064 last_node.parent = dest[0]
1067 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1068 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1069 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1070 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1071 debug_log "tree: #{serialize_els doc.children, false, true}"
1073 # 15. Create an element for the token for which formatting element
1074 # was created, in the HTML namespace, with furthest block as the
1076 new_element = token_to_element fe.token, NS_HTML, fb
1077 # 16. Take all of the child nodes of furthest block and append them
1078 # to the element created in the last step.
1079 while fb.children.length
1080 t = fb.children.shift()
1081 t.parent = new_element
1082 new_element.children.push t
1083 # 17. Append that new element to furthest block.
1084 new_element.parent = fb
1085 fb.children.push new_element
1086 # 18. Remove formatting element from the list of active formatting
1087 # elements, and insert the new element into the list of active
1088 # formatting elements at the position of the aforementioned
1096 afe[i] = new_element
1098 # 19. Remove formatting element from the stack of open elements,
1099 # and insert the new element into the stack of open elements
1100 # immediately below the position of furthest block in that stack.
1101 for t, i in open_els
1103 open_els.splice i, 1
1105 for t, i in open_els
1107 open_els.splice i, 0, new_element
1109 # 20. Jump back to the step labeled outer loop.
1110 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1111 debug_log "tree: #{serialize_els doc.children, false, true}"
1112 debug_log "open_els: #{serialize_els open_els, true, true}"
1113 debug_log "afe: #{serialize_els afe, true, true}"
1114 debug_log "AAA DONE"
1116 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes the open <p>: first generates implied end tags (leaving 'p' itself
# alone), then shifts entries off open_els (index 0 is the current node) and
# tests each removed entry for being an HTML 'p'.
# NOTE(review): this listing has lost its indentation and some lines (1120,
# 1124). Presumably 1120 reports a parse error and 1124 leaves the loop once
# the 'p' has been removed -- confirm against the original file.
1117 close_p_element = ->
1118 generate_implied_end_tags 'p' # arg is exception
1119 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1121 while open_els.length > 1 # just in case (never shifts the last remaining entry)
1122 el = open_els.shift()
1123 if el.name is 'p' and el.namespace is NS_HTML
# Tests whether an HTML 'p' element is in button scope.
# NOTE(review): the guarded statement (line 1127) is missing from this
# listing; presumably it calls close_p_element() -- confirm.
1125 close_p_if_in_button_scope = ->
1126 if is_in_button_scope 'p', NS_HTML
1129 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1130 # aka insert_a_character = (t) ->
# Inserts text token t at the adjusted insertion location. dest is the
# [parent, index] pair returned by adjusted_insertion_location(); prev is the
# sibling immediately before the insertion point.
# NOTE(review): lines 1134 and 1137-1138 are missing from this listing.
# Presumably 1134 guards against dest[1] being 0 (prev would be undefined) and
# 1137-1138 append t's text to an existing text node and return -- confirm.
1131 insert_character = (t) ->
1132 dest = adjusted_insertion_location()
1133 # fixfull check for Document node
1135 prev = dest[0].children[dest[1] - 1]
1136 if prev.type is TYPE_TEXT
1139 dest[0].children.splice dest[1], 0, t
1142 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher: looks at the adjusted current node (acn) and
# at the token to decide how the token is processed. The last visible
# statement hands the token to in_foreign_content.
# NOTE(review): only the tests are visible in this listing; the statements
# they guard (presumably "run the current insertion mode on t, then return")
# are on dropped lines -- confirm against the original file.
1143 process_token = (t) ->
1144 acn = adjusted_current_node()
1148 if acn.namespace is NS_HTML
1151 if is_mathml_text_integration_point(acn)
1152 if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1155 if t.type is TYPE_TEXT
1158 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1161 if is_html_integration acn
1162 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1165 if t.type is TYPE_EOF
1168 in_foreign_content t
1172 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1173 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes where the next node should be inserted.
#   override_target: node to use as the target instead of the current node
#                    (open_els[0]); defaults to null.
#   returns: a two-element array [target, target_i] -- the parent node and the
#            index within target.children at which to insert.
# When flag_foster_parenting is set and the target's name/namespace is listed
# in foster_parenting_targets, the location is redirected relative to the
# last template / last table found in open_els.
# NOTE(review): this listing has lost its indentation and several lines (for
# example the bodies of the two search loops and the declarations of
# last_table / last_table_i), so the nesting shown here cannot be verified.
1174 adjusted_insertion_location = (override_target = null) ->
1175 # 1. If there was an override target specified, then let target be the
1178 target = override_target
1179 else # Otherwise, let target be the current node.
1180 target = open_els[0]
1181 # 2. Determine the adjusted insertion location using the first matching
1182 # steps from the following list:
1184 # If foster parenting is enabled and target is a table, tbody, tfoot,
1185 # thead, or tr element Foster parenting happens when content is
1186 # misnested in tables.
1187 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1188 loop # once. this is here so we can ``break`` to "abort these substeps"
1189 # 1. Let last template be the last template element in the
1190 # stack of open elements, if any.
1191 last_template = null
1192 last_template_i = null
1193 for el, i in open_els
1194 if el.name is 'template' and el.namespace is NS_HTML
1198 # 2. Let last table be the last table element in the stack of
1199 # open elements, if any.
1202 for el, i in open_els
1203 if el.name is 'table' and el.namespace is NS_HTML
1207 # 3. If there is a last template and either there is no last
1208 # table, or there is one, but last template is lower (more
1209 # recently added) than last table in the stack of open
1210 # elements, then: let adjusted insertion location be inside
1211 # last template's template contents, after its last child (if
1212 # any), and abort these substeps.
1213 if last_template and (last_table is null or last_template_i < last_table_i)
1214 target = last_template # fixfull should be its contents
1215 target_i = target.children.length
1217 # 4. If there is no last table, then let adjusted insertion
1218 # location be inside the first element in the stack of open
1219 # elements (the html element), after its last child (if any),
1220 # and abort these substeps. (fragment case)
1221 if last_table is null
1223 target = open_els[open_els.length - 1]
1224 target_i = target.children.length
1226 # 5. If last table has a parent element, then let adjusted
1227 # insertion location be inside last table's parent element,
1228 # immediately before last table, and abort these substeps.
1229 if last_table.parent?
1230 for c, i in last_table.parent.children
1232 target = last_table.parent
1236 # 6. Let previous element be the element immediately above last
1237 # table in the stack of open elements.
1239 # huh? how could it not have a parent?
1240 previous_element = open_els[last_table_i + 1] # "above" means a higher index in open_els
1241 # 7. Let adjusted insertion location be inside previous
1242 # element, after its last child (if any).
1243 target = previous_element
1244 target_i = target.children.length
1245 # Note: These steps are involved in part because it's possible
1246 # for elements, the table element in this case in particular,
1247 # to have been moved by a script around in the DOM, or indeed
1248 # removed from the DOM entirely, after the element was inserted
1250 break # don't really loop
1252 # Otherwise Let adjusted insertion location be inside target, after
1253 # its last child (if any).
1254 target_i = target.children.length
1256 # 3. If the adjusted insertion location is inside a template element,
1257 # let it instead be inside the template element's template contents,
1258 # after its last child (if any).
1259 # fixfull (template)
1261 # 4. Return the adjusted insertion location.
1262 return [target, target_i]
1264 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1265 # aka create_an_element_for_token
# Builds a TYPE_TAG Node from tag token t.
#   t:               the tag token; its name becomes the element name and the
#                    token itself is kept on the node as `token`.
#   namespace:       namespace stored on the new node.
#   intended_parent: not referenced by any line visible in this listing.
# NOTE(review): lines 1268-1269 and 1280-1282 are missing here; presumably
# they initialise `attrs`, loop over the token's attribute pairs and return
# `el` -- confirm against the original file.
1266 token_to_element = (t, namespace, intended_parent) ->
1267 # convert attributes into a hash
1270 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs (as written, a later pair overwrites an earlier one with the same name)
1271 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1273 # TODO 2. If the newly created element has an xmlns attribute in the
1274 # XMLNS namespace whose value is not exactly the same as the element's
1275 # namespace, that is a parse error. Similarly, if the newly created
1276 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1277 # value is not the XLink Namespace, that is a parse error.
1279 # fixfull: the spec says stuff about form pointers and ownerDocument
1283 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the
# children of the adjusted insertion location.
# NOTE(review): lines 1286-1287 and 1290, 1292-1293 are missing from this
# listing. Presumably they unpack `ail` into ail_el / ail_i, set the new
# element's parent, push it onto open_els and return it (callers such as
# `el = insert_html_element t` use a return value) -- confirm.
1284 insert_foreign_element = (token, namespace) ->
1285 ail = adjusted_insertion_location()
1288 el = token_to_element token, namespace, ail_el
1289 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1291 ail_el.children.splice ail_i, 0, el
1294 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Convenience wrapper: inserts an element for `token` in the HTML namespace by
# delegating to insert_foreign_element, and yields whatever that call yields.
1295 insert_html_element = (token) ->
1296 insert_foreign_element token, NS_HTML
1298 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1299 # position should be [node, index_within_children]
# Splices comment token t into position[0].children at index position[1].
# When no position is given, the adjusted insertion location is used.
# NOTE(review): line 1303 is missing from this listing (presumably it sets
# the parent of t) -- confirm.
1300 insert_comment = (t, position = null) ->
1301 position ?= adjusted_insertion_location()
1302 position[0].children.splice position[1], 0, t
1305 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an HTML element for start tag t, switches the tokenizer to the
# RAWTEXT state, remembers the current insertion mode in original_ins_mode
# and switches the insertion mode to "text".
# NOTE(review): tok_state, original_ins_mode and ins_mode are assumed to be
# parser-wide variables declared in an enclosing scope -- confirm.
1306 parse_generic_raw_text = (t) ->
1307 insert_html_element t
1308 tok_state = tok_state_rawtext
1309 original_ins_mode = ins_mode
1310 ins_mode = ins_mode_text
# RCDATA counterpart of the generic raw-text algorithm: inserts an HTML
# element for start tag t, switches the tokenizer to the RCDATA state,
# remembers the current insertion mode in original_ins_mode and switches the
# insertion mode to "text".
# NOTE(review): tok_state, original_ins_mode and ins_mode are assumed to be
# parser-wide variables declared in an enclosing scope -- confirm.
1311 parse_generic_rcdata_text = (t) ->
1312 insert_html_element t
1313 tok_state = tok_state_rcdata
1314 original_ins_mode = ins_mode
1315 ins_mode = ins_mode_text
1317 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1318 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) is listed in end_tag_implied for
# its namespace and its name differs from `except`.
#   except: element name that must not be closed implicitly (default null).
# NOTE(review): the loop body (line 1321) is missing from this listing;
# presumably it is open_els.shift() -- confirm. The condition dereferences
# open_els[0], so it would throw if the stack were ever emptied.
1319 generate_implied_end_tags = (except = null) ->
1320 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1323 # 8.2.5.4 The rules for parsing tokens in HTML content
1324 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1326 # 8.2.5.4.1 The "initial" insertion mode
1327 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handles token t in the "initial" insertion mode. Both visible paths (the
# DOCTYPE branch and the final fall-through) switch to ins_mode_before_html.
# NOTE(review): the statements guarded by each test (whitespace handling,
# comment insertion, returns, reprocessing) are on lines missing from this
# listing -- confirm against the original file.
1328 ins_mode_initial = (t) ->
1331 if t.type is TYPE_COMMENT
1335 if t.type is TYPE_DOCTYPE
1336 # FIXME check identifiers, set quirks, etc
1339 ins_mode = ins_mode_before_html
1342 #fixfull (iframe, quirks)
1343 ins_mode = ins_mode_before_html
1347 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handles token t in the "before html" insertion mode. For an <html> start
# tag the element is created from the token; otherwise an element is created
# from a synthesized 'html' open tag. Either way it is appended to
# doc.children and the mode becomes ins_mode_before_head.
# NOTE(review): several lines are missing from this listing (e.g. 1374,
# presumably `open_els.unshift el` to mirror line 1360, and the returns /
# reprocessing calls) -- confirm against the original file.
1348 ins_mode_before_html = (t) ->
1349 if t.type is TYPE_DOCTYPE
1352 if t.type is TYPE_COMMENT
1357 if t.type is TYPE_START_TAG and t.name is 'html'
1358 el = token_to_element t, NS_HTML, doc
1359 doc.children.push el
1360 open_els.unshift(el)
1361 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1362 ins_mode = ins_mode_before_head
1364 if t.type is TYPE_END_TAG
1365 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1366 # fall through to "anything else"
1371 html_tok = new_open_tag 'html'
1372 el = token_to_element html_tok, NS_HTML, doc
1373 doc.children.push el
1375 # ?fixfull browsing context
1376 ins_mode = ins_mode_before_head
1380 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handles token t in the "before head" insertion mode. A <head> start tag (or,
# on the fall-through path, a synthesized 'head' open tag) is inserted, the
# resulting element is stored in head_element_pointer and the mode becomes
# ins_mode_in_head.
# NOTE(review): the statements guarded by the other tests, and the returns /
# reprocessing calls, are on lines missing from this listing -- confirm.
1381 ins_mode_before_head = (t) ->
1384 if t.type is TYPE_COMMENT
1387 if t.type is TYPE_DOCTYPE
1390 if t.type is TYPE_START_TAG and t.name is 'html'
1393 if t.type is TYPE_START_TAG and t.name is 'head'
1394 el = insert_html_element t
1395 head_element_pointer = el
1396 ins_mode = ins_mode_in_head
1398 if t.type is TYPE_END_TAG
1399 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1400 # fall through to Anything else below
1405 head_tok = new_open_tag 'head'
1406 el = insert_html_element head_tok
1407 head_element_pointer = el
1408 ins_mode = ins_mode_in_head
1411 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" handling for the "in head" mode: pops the current node off
# open_els and switches to ins_mode_after_head.
# NOTE(review): line 1415 is missing from this listing; presumably it
# reprocesses t in the new mode -- confirm.
1412 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1413 open_els.shift() # spec says this will be a 'head' node
1414 ins_mode = ins_mode_after_head
# Handles token t in the "in head" insertion mode: whitespace, comments,
# doctype, the head-level void/metadata elements, title (RCDATA), raw-text
# elements, noscript, script, template start/end and the "anything else"
# fall-through (ins_mode_in_head_else).
# Changes relative to the previous revision:
#   * line 1477 now actually CALLS generate_implied_end_tags; a bare
#     identifier is only a reference in CoffeeScript, so implied end tags
#     were never generated before a </template> was processed.
#   * line 1452 passes the insertion parent (ail[0]) as intended_parent
#     instead of the whole [node, index] pair, matching insert_foreign_element.
# NOTE(review): this listing has lost its indentation and some lines (returns,
# parse_error calls, loop headers); those gaps are left untouched.
1416 ins_mode_in_head = (t) ->
1417 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1420 if t.type is TYPE_COMMENT
1423 if t.type is TYPE_DOCTYPE
1426 if t.type is TYPE_START_TAG and t.name is 'html'
1429 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1430 el = insert_html_element t
1432 t.acknowledge_self_closing()
1434 if t.type is TYPE_START_TAG and t.name is 'meta'
1435 el = insert_html_element t
1437 t.acknowledge_self_closing()
1438 # fixfull encoding stuff
1440 if t.type is TYPE_START_TAG and t.name is 'title'
1441 parse_generic_rcdata_text t
1443 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1444 parse_generic_raw_text t
1446 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1447 insert_html_element t
1448 ins_mode = ins_mode_in_head_noscript
1450 if t.type is TYPE_START_TAG and t.name is 'script'
1451 ail = adjusted_insertion_location()
# ail is a [parent, index] pair; the intended parent is the node itself
1452 el = token_to_element t, NS_HTML, ail[0]
1453 el.flag 'parser-inserted', true
1454 # fixfull fragment case
1455 ail[0].children.splice ail[1], 0, el
1457 tok_state = tok_state_script_data
1458 original_ins_mode = ins_mode # make sure orig... is defined
1459 ins_mode = ins_mode_text
1461 if t.type is TYPE_END_TAG and t.name is 'head'
1462 open_els.shift() # will be a head element... spec says so
1463 ins_mode = ins_mode_after_head
1465 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1466 ins_mode_in_head_else t
1468 if t.type is TYPE_START_TAG and t.name is 'template'
1469 insert_html_element t
1471 flag_frameset_ok = false
1472 ins_mode = ins_mode_in_template
1473 template_ins_modes.unshift ins_mode_in_template
1475 if t.type is TYPE_END_TAG and t.name is 'template'
1476 if template_tag_is_open()
# explicit () is required: without arguments a bare name does not invoke
1477 generate_implied_end_tags()
1478 if open_els[0].name isnt 'template'
1481 el = open_els.shift()
1482 if el.name is 'template' and el.namespace is NS_HTML
1484 clear_afe_to_marker()
1485 template_ins_modes.shift()
1490 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1493 ins_mode_in_head_else t
1495 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" handling for the "in head noscript" mode; the only visible
# statement switches the insertion mode back to ins_mode_in_head.
# NOTE(review): lines 1497-1498 and 1500 are missing from this listing
# (presumably parse_error, popping the noscript element and reprocessing t)
# -- confirm against the original file.
1496 ins_mode_in_head_noscript_else = (t) ->
1499 ins_mode = ins_mode_in_head
# Handles token t in the "in head noscript" insertion mode. Visible effects:
# a </noscript> end tag switches back to ins_mode_in_head, and a </br> end tag
# or the final fall-through is delegated to ins_mode_in_head_noscript_else.
# NOTE(review): the statements guarded by the remaining tests (and the
# returns) are on lines missing from this listing -- confirm.
1501 ins_mode_in_head_noscript = (t) ->
1502 if t.type is TYPE_DOCTYPE
1505 if t.type is TYPE_START_TAG and t.name is 'html'
1508 if t.type is TYPE_END_TAG and t.name is 'noscript'
1510 ins_mode = ins_mode_in_head
1512 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1515 if t.type is TYPE_END_TAG and t.name is 'br'
1516 ins_mode_in_head_noscript_else t
1518 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1522 ins_mode_in_head_noscript_else t
1527 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" handling for the "after head" mode: inserts an element for a
# synthesized 'body' open tag and switches to ins_mode_in_body.
# NOTE(review): line 1532 is missing from this listing; presumably it
# reprocesses t in the new mode -- confirm.
1528 ins_mode_after_head_else = (t) ->
1529 body_tok = new_open_tag 'body'
1530 insert_html_element body_tok
1531 ins_mode = ins_mode_in_body
# Handles token t in the "after head" insertion mode: <body> and <frameset>
# start tags open the respective element and switch mode; head-level start
# tags are processed with the head element temporarily pushed back onto
# open_els; everything else falls through to ins_mode_after_head_else.
# Change relative to the previous revision: the search that removes
# head_element_pointer from open_els iterated with `for el, i of`, which
# walks KEYS (el was the index, i the element), so the comparison with
# head_element_pointer could never succeed and the head element stayed on the
# stack. It now iterates with `for el, i in` (element, index).
# NOTE(review): this listing has lost its indentation and some lines (returns,
# parse_error calls, the in-head reprocessing call); those gaps are untouched.
1534 ins_mode_after_head = (t) ->
1538 if t.type is TYPE_COMMENT
1541 if t.type is TYPE_DOCTYPE
1544 if t.type is TYPE_START_TAG and t.name is 'html'
1547 if t.type is TYPE_START_TAG and t.name is 'body'
1548 insert_html_element t
1549 flag_frameset_ok = false
1550 ins_mode = ins_mode_in_body
1552 if t.type is TYPE_START_TAG and t.name is 'frameset'
1553 insert_html_element t
1554 ins_mode = ins_mode_in_frameset
1556 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1558 open_els.unshift head_element_pointer
# `in` yields (element, index); the head element may no longer be at index 0
1560 for el, i in open_els
1561 if el is head_element_pointer
1562 open_els.splice i, 1
1564 console.log "warning: 23904 couldn't find head element in open_els"
1566 if t.type is TYPE_END_TAG and t.name is 'template'
1569 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1570 ins_mode_after_head_else t
1572 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1576 ins_mode_after_head_else t
1578 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the in-body mode. Walks open_els from the
# current node upward looking for an HTML element called `name`; when found,
# implied end tags are generated (except for `name`) and a parse error is
# reported if the match is not the current node. A special element met first
# stops the search.
# Change relative to the previous revision: the "is it the current node?"
# test used the loop index captured BEFORE generate_implied_end_tags ran.
# That call can remove entries from the front of open_els, so the stale index
# reported a parse error even when the match had become the current node. The
# test now compares against open_els[0] directly.
# NOTE(review): lines 1584-1587 and 1589-1590 are missing from this listing
# (presumably the pop-until-match loop and the parse_error/return for the
# special-element case) -- they are left untouched.
1579 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1580 for el, i in open_els
1581 if el.name is name and el.namespace is NS_HTML
1582 generate_implied_end_tags name # arg is exception
1583 parse_error() unless open_els[0] is el
1588 if special_elements[el.name] is el.namespace
# Handles token t in the "in body" insertion mode: one `if` per token class,
# in the order the HTML5 spec lists them, ending with the generic start-tag
# and end-tag handlers.
# Changes relative to the previous revision:
#   * <body>/<frameset> start tags: the namespace guard read `second.ns`, a
#     property nodes do not have (they are built with `namespace`), so both
#     branches always returned early. It now reads `second.namespace`.
#   * <body> start tag: assigned `frameset_ok_flag`, a name used nowhere
#     else; the parser flag is `flag_frameset_ok`.
#   * <frameset> start tag: the frameset-ok flag was assigned where the spec
#     says to ignore the token when it is "not ok"; it is now tested. This
#     matters because the namespace fix makes the branch reachable.
#   * EOF, </body>, </html>: the open-element sanity check looked up the
#     TOKEN name in ok_tags; it now looks up each element's own name.
#   * </br>: `t.type is TYPE_START_TAG` was a comparison with no effect; it
#     is now an assignment so the token is reprocessed as a <br> start tag.
# NOTE(review): this listing has lost its indentation and many lines (returns,
# parse_error calls, loop headers, `ok_tags = {` openers); those gaps are
# left untouched. `el` in the ok_tags checks is assumed to be the loop
# variable of a `for el in open_els` on a missing line -- confirm.
1592 ins_mode_in_body = (t) ->
1593 if t.type is TYPE_TEXT and t.text is "\u0000"
1600 if t.type is TYPE_TEXT
1603 flag_frameset_ok = false
1605 if t.type is TYPE_COMMENT
1608 if t.type is TYPE_DOCTYPE
1611 if t.type is TYPE_START_TAG and t.name is 'html'
1613 return if template_tag_is_open()
1614 root_attrs = open_els[open_els.length - 1].attrs
1616 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1619 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1622 if t.type is TYPE_START_TAG and t.name is 'body'
1624 return if open_els.length < 2
1625 second = open_els[open_els.length - 2]
# nodes carry `namespace`, not `ns`
1626 return unless second.namespace is NS_HTML
1627 return unless second.name is 'body'
1628 return if template_tag_is_open()
1629 flag_frameset_ok = false
1631 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1633 if t.type is TYPE_START_TAG and t.name is 'frameset'
1635 return if open_els.length < 2
1636 second_i = open_els.length - 2
1637 second = open_els[second_i]
# nodes carry `namespace`, not `ns`
1638 return unless second.namespace is NS_HTML
1639 return unless second.name is 'body'
# spec: if the frameset-ok flag is "not ok", ignore the token
1640 return if flag_frameset_ok is false
1642 for el, i in second.parent.children
1644 second.parent.children.splice i, 1
1646 open_els.splice second_i, 1
1647 # pop everything except the "root html element"
1648 while open_els.length > 1
1650 insert_html_element t
1651 ins_mode = ins_mode_in_frameset
1653 if t.type is TYPE_EOF
1655 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1656 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1657 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
# look up the open element's own name (an EOF token has no tag name)
1660 unless ok_tags[el.name] is el.namespace
1663 if template_ins_modes.length > 0
1664 ins_mode_in_template t
1668 if t.type is TYPE_END_TAG and t.name is 'body'
1669 unless is_in_scope 'body', NS_HTML
1673 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1674 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1675 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1676 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
# look up the open element's own name, not the end tag's
1680 unless ok_tags[el.name] is el.namespace
1683 ins_mode = ins_mode_after_body
1685 if t.type is TYPE_END_TAG and t.name is 'html'
1686 unless is_in_scope 'body', NS_HTML
1690 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1691 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1692 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1693 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
# look up the open element's own name, not the end tag's
1697 unless ok_tags[el.name] is el.namespace
1700 ins_mode = ins_mode_after_body
1703 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1704 close_p_if_in_button_scope()
1705 insert_html_element t
1707 if t.type is TYPE_START_TAG and h_tags[t.name]?
1708 close_p_if_in_button_scope()
1709 if h_tags[open_els[0].name] is open_els[0].namespace
1712 insert_html_element t
1714 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1715 close_p_if_in_button_scope()
1716 insert_html_element t
1717 # spec: If the next token is a "LF" (U+000A) character token, then
1718 # ignore that token and move on to the next one. (Newlines at the
1719 # start of pre blocks are ignored as an authoring convenience.)
1720 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1722 flag_frameset_ok = false
1724 if t.type is TYPE_START_TAG and t.name is 'form'
1725 unless form_element_pointer is null or template_tag_is_open()
1728 close_p_if_in_button_scope()
1729 el = insert_html_element t
1730 unless template_tag_is_open()
1731 form_element_pointer = el
1733 if t.type is TYPE_START_TAG and t.name is 'li'
1734 flag_frameset_ok = false
1735 for node in open_els
1736 if node.name is 'li' and node.namespace is NS_HTML
1737 generate_implied_end_tags 'li' # arg is exception
1738 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1741 el = open_els.shift()
1742 if el.name is 'li' and el.namespace is NS_HTML
1745 if el_is_special_not_adp node
1747 close_p_if_in_button_scope()
1748 insert_html_element t
1750 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1751 flag_frameset_ok = false
1752 for node in open_els
1753 if node.name is 'dd' and node.namespace is NS_HTML
1754 generate_implied_end_tags 'dd' # arg is exception
1755 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1758 el = open_els.shift()
1759 if el.name is 'dd' and el.namespace is NS_HTML
1762 if node.name is 'dt' and node.namespace is NS_HTML
1763 generate_implied_end_tags 'dt' # arg is exception
1764 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1767 el = open_els.shift()
1768 if el.name is 'dt' and el.namespace is NS_HTML
1771 if el_is_special_not_adp node
1773 close_p_if_in_button_scope()
1774 insert_html_element t
1776 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1777 close_p_if_in_button_scope()
1778 insert_html_element t
1779 tok_state = tok_state_plaintext
1781 if t.type is TYPE_START_TAG and t.name is 'button'
1782 if is_in_scope 'button', NS_HTML
1784 generate_implied_end_tags()
1786 el = open_els.shift()
1787 if el.name is 'button' and el.namespace is NS_HTML
1790 insert_html_element t
1791 flag_frameset_ok = false
1793 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1794 unless is_in_scope t.name, NS_HTML
1797 generate_implied_end_tags()
1798 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1801 el = open_els.shift()
1802 if el.name is t.name and el.namespace is NS_HTML
1805 if t.type is TYPE_END_TAG and t.name is 'form'
1806 unless template_tag_is_open()
1807 node = form_element_pointer
1808 form_element_pointer = null
1809 if node is null or not el_is_in_scope node
1812 generate_implied_end_tags()
1813 if open_els[0] isnt node
1815 for el, i in open_els
1817 open_els.splice i, 1
1820 unless is_in_scope 'form', NS_HTML
1823 generate_implied_end_tags()
1824 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1827 el = open_els.shift()
1828 if el.name is 'form' and el.namespace is NS_HTML
1831 if t.type is TYPE_END_TAG and t.name is 'p'
1832 unless is_in_button_scope 'p', NS_HTML
1834 insert_html_element new_open_tag 'p'
1837 if t.type is TYPE_END_TAG and t.name is 'li'
1838 unless is_in_li_scope 'li', NS_HTML
1841 generate_implied_end_tags 'li' # arg is exception
1842 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1845 el = open_els.shift()
1846 if el.name is 'li' and el.namespace is NS_HTML
1849 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1850 unless is_in_scope t.name, NS_HTML
1853 generate_implied_end_tags t.name # arg is exception
1854 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1857 el = open_els.shift()
1858 if el.name is t.name and el.namespace is NS_HTML
1861 if t.type is TYPE_END_TAG and h_tags[t.name]?
1864 if h_tags[el.name] is el.namespace
1867 if standard_scopers[el.name] is el.namespace
1872 generate_implied_end_tags()
1873 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1876 el = open_els.shift()
1877 if h_tags[el.name] is el.namespace
1881 if t.type is TYPE_START_TAG and t.name is 'a'
1882 # If the list of active formatting elements contains an a element
1883 # between the end of the list and the last marker on the list (or
1884 # the start of the list if there is no marker on the list), then
1885 # this is a parse error; run the adoption agency algorithm for the
1886 # tag name "a", then remove that element from the list of active
1887 # formatting elements and the stack of open elements if the
1888 # adoption agency algorithm didn't already remove it (it might not
1889 # have if the element is not in table scope).
1892 if el.type is TYPE_AFE_MARKER
1894 if el.name is 'a' and el.namespace is NS_HTML
1902 for el, i in open_els
1904 open_els.splice i, 1
1906 el = insert_html_element t
1909 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1911 el = insert_html_element t
1914 if t.type is TYPE_START_TAG and t.name is 'nobr'
1916 el = insert_html_element t
1919 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1920 adoption_agency t.name
1922 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1924 insert_html_element t
1926 flag_frameset_ok = false
1928 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1929 unless is_in_scope t.name, NS_HTML
1932 generate_implied_end_tags()
1933 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1936 el = open_els.shift()
1937 if el.name is t.name and el.namespace is NS_HTML
1939 clear_afe_to_marker()
1941 if t.type is TYPE_START_TAG and t.name is 'table'
1942 close_p_if_in_button_scope() # fixfull quirksmode thing
1943 insert_html_element t
1944 flag_frameset_ok = false
1945 ins_mode = ins_mode_in_table
1947 if t.type is TYPE_END_TAG and t.name is 'br'
# spec: treat </br> as a <br> start tag; the next test then picks it up
1949 t.type = TYPE_START_TAG
1951 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1953 insert_html_element t
1955 t.acknowledge_self_closing()
1956 flag_frameset_ok = false
1958 if t.type is TYPE_START_TAG and t.name is 'input'
1960 insert_html_element t
1962 t.acknowledge_self_closing()
1963 unless is_input_hidden_tok t
1964 flag_frameset_ok = false
1966 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1967 insert_html_element t
1969 t.acknowledge_self_closing()
1971 if t.type is TYPE_START_TAG and t.name is 'hr'
1972 close_p_if_in_button_scope()
1973 insert_html_element t
1975 t.acknowledge_self_closing()
1976 flag_frameset_ok = false
1978 if t.type is TYPE_START_TAG and t.name is 'image'
1983 if t.type is TYPE_START_TAG and t.name is 'isindex'
1985 if template_tag_is_open() is false and form_element_pointer isnt null
1987 t.acknowledge_self_closing()
1988 flag_frameset_ok = false
1989 close_p_if_in_button_scope()
1990 el = insert_html_element new_open_tag 'form'
1991 unless template_tag_is_open()
1992 form_element_pointer = el
1995 el.attrs['action'] = a[1]
1997 insert_html_element new_open_tag 'hr'
2000 insert_html_element new_open_tag 'label'
2001 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2002 input_el = new_open_tag 'input'
2007 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2008 input_el.attrs_a.push [a[0], a[1]]
2009 input_el.attrs_a.push ['name', 'isindex']
2010 # fixfull this next bit is in english... internationalize?
2011 prompt ?= "This is a searchable index. Enter search keywords: "
2012 insert_character new_character_token prompt # fixfull split
2013 # TODO submit typo "balue" in spec
2014 insert_html_element input_el
2016 # insert_character '' # you can put chars here if prompt attr missing
2018 insert_html_element new_open_tag 'hr'
2021 unless template_tag_is_open()
2022 form_element_pointer = null
2024 if t.type is TYPE_START_TAG and t.name is 'textarea'
2025 insert_html_element t
2026 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2028 tok_state = tok_state_rcdata
2029 original_ins_mode = ins_mode
2030 flag_frameset_ok = false
2031 ins_mode = ins_mode_text
2033 if t.type is TYPE_START_TAG and t.name is 'xmp'
2034 close_p_if_in_button_scope()
2036 flag_frameset_ok = false
2037 parse_generic_raw_text t
2039 if t.type is TYPE_START_TAG and t.name is 'iframe'
2040 flag_frameset_ok = false
2041 parse_generic_raw_text t
2043 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2044 parse_generic_raw_text t
2046 if t.type is TYPE_START_TAG and t.name is 'select'
2048 insert_html_element t
2049 flag_frameset_ok = false
2050 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2051 ins_mode = ins_mode_in_select_in_table
2053 ins_mode = ins_mode_in_select
2055 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2056 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2059 insert_html_element t
2061 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2062 if is_in_scope 'ruby', NS_HTML
2063 generate_implied_end_tags()
2064 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2066 insert_html_element t
2068 if t.type is TYPE_START_TAG and t.name is 'rt'
2069 if is_in_scope 'ruby', NS_HTML
2070 generate_implied_end_tags 'rtc' # arg is exception
2071 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2073 insert_html_element t
2075 if t.type is TYPE_START_TAG and t.name is 'math'
2077 adjust_mathml_attributes t
2078 adjust_foreign_attributes t
2079 insert_foreign_element t, NS_MATHML
2080 if t.flag 'self-closing'
2082 t.acknowledge_self_closing()
2084 if t.type is TYPE_START_TAG and t.name is 'svg'
2086 adjust_svg_attributes t
2087 adjust_foreign_attributes t
2088 insert_foreign_element t, NS_SVG
2089 if t.flag 'self-closing'
2091 t.acknowledge_self_closing()
2093 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2096 if t.type is TYPE_START_TAG # any other start tag
2098 insert_html_element t
2100 if t.type is TYPE_END_TAG # any other end tag
2101 in_body_any_other_end_tag t.name
2105 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Insertion-mode handler for "text"; t is the token to process.
# The visible branches test, in order: a text token, EOF, the </script> end
# tag, and finally any other end tag.  The EOF, </script> and other-end-tag
# branches each restore ins_mode from original_ins_mode.
2106 ins_mode_text = (t) ->
2107 if t.type is TYPE_TEXT
2110 if t.type is TYPE_EOF
2112 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
# EOF while an HTML <script> is the current node: set its 'already started' flag
2113 open_els[0].flag 'already started', true
2115 ins_mode = original_ins_mode
2118 if t.type is TYPE_END_TAG and t.name is 'script'
2120 ins_mode = original_ins_mode
2121 # fixfull the spec seems to assume that I'm going to run the script
2122 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2124 if t.type is TYPE_END_TAG
2126 ins_mode = original_ins_mode
# NOTE(review): presumably reached only when no branch above returned -- confirm
2128 console.log 'warning: end of ins_mode_text reached'
2130 # the functions below implement the tokenizer states described here:
2131 # http://www.w3.org/TR/html5/syntax.html#tokenization
2133 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Fallback handling used by the table-related insertion modes; t is the token.
# Sets flag_foster_parenting to true and afterwards back to false.
# NOTE(review): the work done while the flag is set is not visible here;
# presumably the token is processed with the in-body rules -- confirm.
2134 ins_mode_in_table_else = (t) ->
2136 flag_foster_parenting = true
2138 flag_foster_parenting = false
# Insertion-mode handler for "in table"; t is the token to process.
# Visible behaviour:
#  - one branch saves ins_mode in original_ins_mode and switches to
#    ins_mode_in_table_text
#  - several tag branches call clear_stack_to_table_context(), insert an
#    element (the token itself, or a synthesized 'colgroup' / 'tbody' open
#    tag) and switch to the caption / column-group / table-body mode
#  - two branches pop open_els while 'table' is in table scope, testing each
#    popped element for the HTML table element
#  - unhandled cases are passed to ins_mode_in_table_else
2140 ins_mode_in_table = (t) ->
# NOTE(review): this test guards the switch to ins_mode_in_table_text.  The
# spec's corresponding rule inspects the current node (open_els[0]), not the
# token -- confirm that testing t.name is intended here.
2143 if t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr'
2144 original_ins_mode = ins_mode
2145 ins_mode = ins_mode_in_table_text
2148 ins_mode_in_table_else t
2156 clear_stack_to_table_context()
2158 insert_html_element t
2159 ins_mode = ins_mode_in_caption
2161 clear_stack_to_table_context()
2162 insert_html_element t
2163 ins_mode = ins_mode_in_column_group
2165 clear_stack_to_table_context()
# a synthesized <colgroup> is inserted instead of the token itself
2166 insert_html_element new_open_tag 'colgroup'
2167 ins_mode = ins_mode_in_column_group
2169 when 'tbody', 'tfoot', 'thead'
2170 clear_stack_to_table_context()
2171 insert_html_element t
2172 ins_mode = ins_mode_in_table_body
2173 when 'td', 'th', 'tr'
2174 clear_stack_to_table_context()
# a synthesized <tbody> is inserted instead of the token itself
2175 insert_html_element new_open_tag 'tbody'
2176 ins_mode = ins_mode_in_table_body
2180 if is_in_table_scope 'table', NS_HTML
2182 el = open_els.shift()
2183 if el.name is 'table' and el.namespace is NS_HTML
2187 when 'style', 'script', 'template'
2190 unless is_input_hidden_tok t
2191 ins_mode_in_table_else t
2194 el = insert_html_element t
2196 t.acknowledge_self_closing()
2199 if form_element_pointer?
2201 if template_tag_is_open()
2203 form_element_pointer = insert_html_element t
2206 ins_mode_in_table_else t
2210 if is_in_table_scope 'table', NS_HTML
2212 el = open_els.shift()
2213 if el.name is 'table' and el.namespace is NS_HTML
2218 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2223 ins_mode_in_table_else t
2227 ins_mode_in_table_else t
2230 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Insertion-mode handler for "in table text"; t is the token to process.
# Text tokens are collected in pending_table_character_tokens.  For other
# tokens the pending list is scanned with is_space_tok, its entries are then
# either inserted with insert_character or passed to the in-table fallback,
# the list is emptied and ins_mode is restored from original_ins_mode.
2231 ins_mode_in_table_text = (t) ->
2232 if t.type is TYPE_TEXT and t.text is "\u0000"
2233 # huh? I thought the tokenizer didn't emit these
2236 if t.type is TYPE_TEXT
2237 pending_table_character_tokens.push t
2241 for old in pending_table_character_tokens
2242 unless is_space_tok old
2246 for old in pending_table_character_tokens
2247 insert_character old
2249 for old in pending_table_character_tokens
# the in-table fallback helper is named ins_mode_in_table_else
2250 ins_mode_in_table_else old
2251 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2252 ins_mode = original_ins_mode
2255 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Insertion-mode handler for "in caption"; t is the token to process.
# For </caption> with a caption in table scope: generates implied end tags,
# pops from open_els (testing each popped element for the HTML caption),
# clears the active formatting elements to the last marker and switches to
# ins_mode_in_table.  Table-structure start tags and </table> go through the
# same pop / clear / switch sequence.
2256 ins_mode_in_caption = (t) ->
2257 if t.type is TYPE_END_TAG and t.name is 'caption'
2258 if is_in_table_scope 'caption', NS_HTML
2259 generate_implied_end_tags()
2260 if open_els[0].name isnt 'caption'
2263 el = open_els.shift()
2264 if el.name is 'caption' and el.namespace is NS_HTML
2266 clear_afe_to_marker()
2267 ins_mode = ins_mode_in_table
2272 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2274 if is_in_table_scope 'caption', NS_HTML
2276 el = open_els.shift()
2277 if el.name is 'caption' and el.namespace is NS_HTML
2279 clear_afe_to_marker()
2280 ins_mode = ins_mode_in_table
2282 # else fragment case
2284 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2290 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Insertion-mode handler for "in column group"; t is the token to process.
# Visible branches cover comments, doctypes, <html>, <col> (inserted and
# acknowledged as self-closing), </colgroup>, </col>, <template> and
# </template>, and EOF; the trailing lines test the current node for
# 'colgroup' and switch ins_mode to ins_mode_in_table.
2291 ins_mode_in_column_group = (t) ->
2295 if t.type is TYPE_COMMENT
2298 if t.type is TYPE_DOCTYPE
2301 if t.type is TYPE_START_TAG and t.name is 'html'
2304 if t.type is TYPE_START_TAG and t.name is 'col'
2305 el = insert_html_element t
2307 t.acknowledge_self_closing()
2309 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# open_els is an array: the namespace has to be read from the current node
# (open_els[0]), not from the array itself
2310 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2312 ins_mode = ins_mode_in_table
2316 if t.type is TYPE_END_TAG and t.name is 'col'
2319 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2322 if t.type is TYPE_EOF
2326 if open_els[0].name isnt 'colgroup'
2330 ins_mode = ins_mode_in_table
2334 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Insertion-mode handler for "in table body"; t is the token to process.
# <tr> is inserted after clear_stack_to_table_body_context() and switches to
# ins_mode_in_row; <th>/<td> do the same but insert a synthesized 'tr' open
# tag.  </tbody>, </tfoot> and </thead> are checked against table scope; the
# visible lines of that branch and of the table-structure branch clear the
# stack to table body context and switch to ins_mode_in_table.
2335 ins_mode_in_table_body = (t) ->
2336 if t.type is TYPE_START_TAG and t.name is 'tr'
2337 clear_stack_to_table_body_context()
2338 insert_html_element t
2339 ins_mode = ins_mode_in_row
2341 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2343 clear_stack_to_table_body_context()
# a synthesized <tr> is inserted instead of the cell token itself
2344 insert_html_element new_open_tag 'tr'
2345 ins_mode = ins_mode_in_row
2348 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2349 unless is_in_table_scope t.name, NS_HTML
2352 clear_stack_to_table_body_context()
2354 ins_mode = ins_mode_in_table
2356 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# NOTE(review): el is not assigned on any visible line of this branch;
# presumably it iterates over open_els -- confirm
2359 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2362 if table_scopers[el.name] is el.namespace
2367 clear_stack_to_table_body_context()
2369 ins_mode = ins_mode_in_table
2372 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2378 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Insertion-mode handler for "in row"; t is the token to process.
# <th>/<td> are inserted after clear_stack_to_table_row_context() and switch
# to ins_mode_in_cell.  </tr>, the table-structure start tags, </table> and
# </tbody>/</tfoot>/</thead> all require 'tr' in table scope before clearing
# the stack to table row context and switching to ins_mode_in_table_body.
2379 ins_mode_in_row = (t) ->
2380 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2381 clear_stack_to_table_row_context()
2382 insert_html_element t
2383 ins_mode = ins_mode_in_cell
2386 if t.type is TYPE_END_TAG and t.name is 'tr'
2387 if is_in_table_scope 'tr', NS_HTML
2388 clear_stack_to_table_row_context()
2390 ins_mode = ins_mode_in_table_body
2394 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2395 if is_in_table_scope 'tr', NS_HTML
2396 clear_stack_to_table_row_context()
2398 ins_mode = ins_mode_in_table_body
2403 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# both the named section element and a 'tr' must be in table scope
2404 if is_in_table_scope t.name, NS_HTML
2405 if is_in_table_scope 'tr', NS_HTML
2406 clear_stack_to_table_row_context()
2408 ins_mode = ins_mode_in_table_body
2413 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2419 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps (the definition line is not visible in this listing).
# Generates implied end tags, checks that the current node is an HTML td/th,
# pops from open_els testing each popped element for an HTML td/th, clears
# the active formatting elements to the last marker and switches to
# ins_mode_in_row.
2421 generate_implied_end_tags()
# compare the current node's *name* with 'th'; open_els[0] itself is a node
# object and can never equal a string
2422 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2425 el = open_els.shift()
2426 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2428 clear_afe_to_marker()
2429 ins_mode = ins_mode_in_row
2431 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Insertion-mode handler for "in cell"; t is the token to process.
# For </td> or </th> with that element in table scope: generates implied end
# tags, checks the current node against t.name, pops from open_els testing
# for the matching HTML element, clears the active formatting elements to
# the last marker and switches to ins_mode_in_row.  The remaining branches
# cover table-structure start tags and two groups of end tags.
2432 ins_mode_in_cell = (t) ->
2433 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2434 if is_in_table_scope t.name, NS_HTML
2435 generate_implied_end_tags()
2436 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2439 el = open_els.shift()
2440 if el.name is t.name and el.namespace is NS_HTML
2442 clear_afe_to_marker()
2443 ins_mode = ins_mode_in_row
2447 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# NOTE(review): el is not assigned on any visible line of this branch;
# presumably it iterates over open_els -- confirm
2450 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2453 if table_scopers[el.name] is el.namespace
2461 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2464 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2465 if is_in_table_scope t.name, NS_HTML
2474 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Insertion-mode handler for "in select"; t is the token to process.
# Visible branches cover NUL text, other text, comments, doctypes, <html>,
# <option>, <optgroup>, </optgroup>, </option>, </select>, a nested <select>,
# <input>/<keygen>/<textarea>, <script>/<template> and EOF.  The </select>,
# nested <select> and input-like branches pop from open_els, testing each
# popped element for the HTML select element.
2475 ins_mode_in_select = (t) ->
2476 if t.type is TYPE_TEXT and t.text is "\u0000"
2479 if t.type is TYPE_TEXT
2482 if t.type is TYPE_COMMENT
2485 if t.type is TYPE_DOCTYPE
2488 if t.type is TYPE_START_TAG and t.name is 'html'
2491 if t.type is TYPE_START_TAG and t.name is 'option'
2492 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2494 insert_html_element t
2496 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2497 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2499 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2501 insert_html_element t
2503 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# namespace equality needs `is`; `in` would be a membership test
2504 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# name and namespace must both be read from the node just above the current
# node (open_els[1])
2505 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2507 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2512 if t.type is TYPE_END_TAG and t.name is 'option'
2513 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2518 if t.type is TYPE_END_TAG and t.name is 'select'
2519 if is_in_select_scope 'select', NS_HTML
2521 el = open_els.shift()
2522 if el.name is 'select' and el.namespace is NS_HTML
2528 if t.type is TYPE_START_TAG and t.name is 'select'
2531 el = open_els.shift()
2532 if el.name is 'select' and el.namespace is NS_HTML
2535 # spec says that this is the same as </select> but it doesn't say
2536 # to check scope first
2538 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2540 if is_in_select_scope 'select', NS_HTML
2543 el = open_els.shift()
2544 if el.name is 'select' and el.namespace is NS_HTML
2549 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2552 if t.type is TYPE_EOF
2559 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Insertion-mode handler for "in select in table"; t is the token to process.
# Table-related start tags pop from open_els, testing for the HTML select
# element.  The matching end tags do the same, but only after an
# is_in_table_scope check on t.name.  Everything else is delegated to
# ins_mode_in_select.
2560 ins_mode_in_select_in_table = (t) ->
2561 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2564 el = open_els.shift()
2565 if el.name is 'select' and el.namespace is NS_HTML
2570 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2572 unless is_in_table_scope t.name, NS_HTML
2575 el = open_els.shift()
2576 if el.name is 'select' and el.namespace is NS_HTML
2582 ins_mode_in_select t
2585 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Insertion-mode handler for "in template"; t is the token to process.
# For start tags, the entry at the front of template_ins_modes is replaced
# (shift then unshift) with the mode chosen from the tag name, and ins_mode
# is set to that same mode:
#   caption/colgroup/tbody/tfoot/thead -> ins_mode_in_table
#   col                                -> ins_mode_in_column_group
#   tr                                 -> ins_mode_in_table_body
#   td/th                              -> ins_mode_in_row
#   any other start tag                -> ins_mode_in_body
# At EOF with a template open, open_els is popped (testing for the HTML
# template element), the active formatting elements are cleared to the last
# marker and the front entry of template_ins_modes is removed.
2586 ins_mode_in_template = (t) ->
2587 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2590 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2593 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2594 template_ins_modes.shift()
2595 template_ins_modes.unshift ins_mode_in_table
2596 ins_mode = ins_mode_in_table
2599 if t.type is TYPE_START_TAG and t.name is 'col'
2600 template_ins_modes.shift()
2601 template_ins_modes.unshift ins_mode_in_column_group
2602 ins_mode = ins_mode_in_column_group
2605 if t.type is TYPE_START_TAG and t.name is 'tr'
2606 template_ins_modes.shift()
2607 template_ins_modes.unshift ins_mode_in_table_body
2608 ins_mode = ins_mode_in_table_body
2611 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2612 template_ins_modes.shift()
2613 template_ins_modes.unshift ins_mode_in_row
2614 ins_mode = ins_mode_in_row
2617 if t.type is TYPE_START_TAG
2618 template_ins_modes.shift()
2619 template_ins_modes.unshift ins_mode_in_body
2620 ins_mode = ins_mode_in_body
2623 if t.type is TYPE_END_TAG
2626 if t.type is TYPE_EOF
2627 unless template_tag_is_open()
2632 el = open_els.shift()
2633 if el.name is 'template' and el.namespace is NS_HTML
2635 clear_afe_to_marker()
2636 template_ins_modes.shift()
2640 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Insertion-mode handler for "after body"; t is the token to process.
# Comments are inserted as the last child of open_els[0].  </html> switches
# to ins_mode_after_after_body.  The final visible line switches back to
# ins_mode_in_body.
2641 ins_mode_after_body = (t) ->
2645 if t.type is TYPE_COMMENT
# second argument is an insertion position: [parent, index]
2646 insert_comment t, [open_els[0], open_els[0].children.length]
2648 if t.type is TYPE_DOCTYPE
2651 if t.type is TYPE_START_TAG and t.name is 'html'
2654 if t.type is TYPE_END_TAG and t.name is 'html'
2655 # fixfull fragment case
2656 ins_mode = ins_mode_after_after_body
2658 if t.type is TYPE_EOF
2663 ins_mode = ins_mode_in_body
2666 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Insertion-mode handler for "in frameset"; t is the token to process.
# <frameset> and <frame> are inserted (<frame> is also acknowledged as
# self-closing).  </frameset> returns early when open_els holds a single
# element (fragment case); otherwise, when not fragment-parsing and the
# current node is no longer a frameset, it switches to
# ins_mode_after_frameset.  At EOF, open_els.length is compared against 1.
2667 ins_mode_in_frameset = (t) ->
2671 if t.type is TYPE_COMMENT
2674 if t.type is TYPE_DOCTYPE
2677 if t.type is TYPE_START_TAG and t.name is 'html'
2680 if t.type is TYPE_START_TAG and t.name is 'frameset'
2681 insert_html_element t
2683 if t.type is TYPE_END_TAG and t.name is 'frameset'
2684 if open_els.length is 1
2686 return # fragment case
2688 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2689 ins_mode = ins_mode_after_frameset
2691 if t.type is TYPE_START_TAG and t.name is 'frame'
2692 insert_html_element t
2694 t.acknowledge_self_closing()
2696 if t.type is TYPE_START_TAG and t.name is 'noframes'
2699 if t.type is TYPE_EOF
2700 if open_els.length isnt 1
2708 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion-mode handler for "after frameset"; t is the token to process.
# Visible branches cover comments, doctypes, <html>, </html>, <noframes> and
# EOF.  </html> switches the parser to ins_mode_after_after_frameset.
2709 ins_mode_after_frameset = (t) ->
2713 if t.type is TYPE_COMMENT
2716 if t.type is TYPE_DOCTYPE
2719 if t.type is TYPE_START_TAG and t.name is 'html'
2722 if t.type is TYPE_END_TAG and t.name is 'html'
# the parser's current-mode variable is ins_mode (as in every other handler)
2723 ins_mode = ins_mode_after_after_frameset
2725 if t.type is TYPE_START_TAG and t.name is 'noframes'
2728 if t.type is TYPE_EOF
2735 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Insertion-mode handler for "after after body"; t is the token to process.
# Comments are inserted as the last child of doc.  Doctypes, whitespace
# tokens and <html> share one branch, EOF has its own, and the final visible
# line switches back to ins_mode_in_body.
2736 ins_mode_after_after_body = (t) ->
2737 if t.type is TYPE_COMMENT
2738 insert_comment t, [doc, doc.children.length]
2740 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2743 if t.type is TYPE_EOF
2748 ins_mode = ins_mode_in_body
2751 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Insertion-mode handler for "after after frameset"; t is the token.
# Comments are inserted as the last child of doc.  Doctypes, whitespace
# tokens and <html> share one branch; EOF and <noframes> each have their own.
2752 ins_mode_after_after_frameset = (t) ->
2753 if t.type is TYPE_COMMENT
2754 insert_comment t, [doc, doc.children.length]
2756 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2759 if t.type is TYPE_EOF
2762 if t.type is TYPE_START_TAG and t.name is 'noframes'
2769 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Helper for the foreign-content <font> check; t is a start-tag token.
# Tests whether the first element of an entry `a` is 'color', 'face' or
# 'size'.
# NOTE(review): `a` is presumably one [name, value] attribute pair of t;
# the loop that binds it is not visible here -- confirm.
2770 has_color_face_or_size = (t) ->
2772 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Hook called by the foreign-content rules when a script element ends (a
# self-closing <script> start tag, or a </script> end tag with an SVG script
# as current node).  The body is not visible in this listing.
2775 in_foreign_content_end_script = ->
# Handles a start tag t that stays in foreign (MathML/SVG) content.
# Uses the adjusted current node's namespace to decide which fix-ups apply:
# MathML attribute adjustment, or SVG tag-name and attribute adjustment.
# Then adjusts foreign attributes and inserts the element in that namespace.
# Self-closing tokens are acknowledged; a self-closing 'script' additionally
# triggers in_foreign_content_end_script().
2779 in_foreign_content_other_start = (t) ->
2780 acn = adjusted_current_node()
2781 if acn.namespace is NS_MATHML
2782 adjust_mathml_attributes t
# svg_name_fixes maps a token name to its replacement name
2783 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2784 t.name = svg_name_fixes[t.name]
2785 if acn.namespace is NS_SVG
2786 adjust_svg_attributes t
2787 adjust_foreign_attributes t
2788 insert_foreign_element t, acn.namespace
2789 if t.flag 'self-closing'
2790 if t.name is 'script'
2791 t.acknowledge_self_closing()
2792 in_foreign_content_end_script()
2795 t.acknowledge_self_closing()
# Tree-construction rules for tokens in foreign (MathML/SVG) content; t is
# the token to process.
#  - NUL text is replaced by a U+FFFD character token
#  - other text clears flag_frameset_ok
#  - the listed HTML start tags (and <font> with color/face/size) are either
#    treated as ordinary foreign start tags when fragment-parsing, or go
#    through a loop that tests the current node for an integration point or
#    the HTML namespace
#  - any other start tag goes to in_foreign_content_other_start
#  - </script> on an SVG script calls in_foreign_content_end_script
#  - other end tags walk open_els comparing lower-cased names, and hand the
#    token to the current HTML insertion mode once an HTML node is reached
2797 in_foreign_content = (t) ->
2798 if t.type is TYPE_TEXT and t.text is "\u0000"
2800 insert_character new_character_token "\ufffd"
2805 if t.type is TYPE_TEXT
2806 flag_frameset_ok = false
2809 if t.type is TYPE_COMMENT
2812 if t.type is TYPE_DOCTYPE
2815 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2817 if flag_fragment_parsing
2818 in_foreign_content_other_start t
2820 loop # is this safe?
2823 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2827 if t.type is TYPE_START_TAG
2828 in_foreign_content_other_start t
2830 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2831 in_foreign_content_end_script()
2833 if t.type is TYPE_END_TAG
# element names are lower-cased before being compared with the token name
2834 if open_els[0].name.toLowerCase() isnt t.name
2836 for node in open_els
# open_els[open_els.length - 1] is the topmost element of the stack
2837 if node is open_els[open_els.length - 1]
2839 if node.name.toLowerCase() is t.name
2841 el = open_els.shift()
2844 if node.namespace is NS_HTML
2846 ins_mode t # explicitly call HTML insertion mode
2849 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer "data" state (the definition line is not visible in this
# listing).  Reads one character from txt, advancing cur.  Visible outcomes:
# a text node built from parse_character_reference(), a switch to
# tok_state_tag_open, an EOF token, or the character itself as a text node.
2851 switch c = txt.charAt(cur++)
2853 return new_text_node parse_character_reference()
2855 tok_state = tok_state_tag_open
2858 return new_text_node c
2860 return new_eof_token()
2862 return new_text_node c
2865 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2866 # not needed: tok_state_character_reference_in_data = ->
2867 # just call parse_character_reference()
2869 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer "RCDATA" state.  Reads one character from txt, advancing cur.
# Visible outcomes: a text node built from parse_character_reference(), a
# switch to tok_state_rcdata_less_than_sign, a U+FFFD character token, an
# EOF token, or the character itself as a character token.
2870 tok_state_rcdata = ->
2871 switch c = txt.charAt(cur++)
2873 return new_text_node parse_character_reference()
2875 tok_state = tok_state_rcdata_less_than_sign
2878 return new_character_token "\ufffd"
2880 return new_eof_token()
2882 return new_character_token c
2885 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2886 # not needed: tok_state_character_reference_in_rcdata = ->
2887 # just call parse_character_reference()
2889 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer "RAWTEXT" state.  Reads one character from txt, advancing cur.
# Visible outcomes: a switch to tok_state_rawtext_less_than_sign, a U+FFFD
# character token, an EOF token, or the character itself as a character
# token.
2890 tok_state_rawtext = ->
2891 switch c = txt.charAt(cur++)
2893 tok_state = tok_state_rawtext_less_than_sign
2896 return new_character_token "\ufffd"
2898 return new_eof_token()
2900 return new_character_token c
2903 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer "script data" state.  Reads one character from txt, advancing
# cur.  Visible outcomes: a switch to tok_state_script_data_less_than_sign,
# a U+FFFD character token, an EOF token, or the character itself as a
# character token.
2904 tok_state_script_data = ->
2905 switch c = txt.charAt(cur++)
2907 tok_state = tok_state_script_data_less_than_sign
2910 return new_character_token "\ufffd"
2912 return new_eof_token()
2914 return new_character_token c
2917 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer "PLAINTEXT" state.  Reads one character from txt, advancing cur.
# No visible branch changes tok_state: the outcomes are a U+FFFD character
# token, an EOF token, or the character itself as a character token.
2918 tok_state_plaintext = ->
2919 switch c = txt.charAt(cur++)
2922 return new_character_token "\ufffd"
2924 return new_eof_token()
2926 return new_character_token c
2930 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer "tag open" state (entered after '<').  Reads one character and
# switches to the markup-declaration-open, end-tag-open, bogus-comment or
# tag-name state.  For letters, tok_cur_tag becomes a new open tag named
# after the (lower-cased) character.  Otherwise it returns to the data state,
# rewinds cur by one and returns '<' as a text node.
2931 tok_state_tag_open = ->
2932 switch c = txt.charAt(cur++)
2934 tok_state = tok_state_markup_declaration_open
2936 tok_state = tok_state_end_tag_open
2939 tok_cur_tag = new_comment_token '?'
2940 tok_state = tok_state_bogus_comment
2943 tok_cur_tag = new_open_tag c
2944 tok_state = tok_state_tag_name
2945 else if is_uc_alpha(c)
# tag names are stored lower-cased
2946 tok_cur_tag = new_open_tag c.toLowerCase()
2947 tok_state = tok_state_tag_name
2950 tok_state = tok_state_data
2951 cur -= 1 # we didn't parse/handle the char after <
2952 return new_text_node '<'
2955 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer "end tag open" state (entered after '</').  Reads one character.
# Visible outcomes: a return to the data state (one such branch returns '</'
# as a text node), a new end tag in tok_cur_tag followed by a switch to the
# tag-name state (letters, lower-cased where needed), or a bogus comment
# seeded with '/'.
2956 tok_state_end_tag_open = ->
2957 switch c = txt.charAt(cur++)
2960 tok_state = tok_state_data
2963 tok_state = tok_state_data
2964 return new_text_node '</'
2967 tok_cur_tag = new_end_tag c.toLowerCase()
2968 tok_state = tok_state_tag_name
2969 else if is_lc_alpha(c)
2970 tok_cur_tag = new_end_tag c
2971 tok_state = tok_state_tag_name
2974 tok_cur_tag = new_comment_token '/'
2975 tok_state = tok_state_bogus_comment
2978 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer "tag name" state.  Reads one character.  Whitespace switches to
# the before-attribute-name state; other visible branches switch to the
# self-closing-start-tag state or back to the data state, or append to
# tok_cur_tag.name (U+FFFD, the lower-cased character, or the character
# as-is).
2979 tok_state_tag_name = ->
2980 switch c = txt.charAt(cur++)
2981 when "\t", "\n", "\u000c", ' '
2982 tok_state = tok_state_before_attribute_name
2984 tok_state = tok_state_self_closing_start_tag
2986 tok_state = tok_state_data
2992 tok_cur_tag.name += "\ufffd"
2995 tok_state = tok_state_data
2998 tok_cur_tag.name += c.toLowerCase()
3000 tok_cur_tag.name += c
3003 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer "RCDATA less-than sign" state.  Reads one character.  One branch
# empties temporary_buffer and switches to the RCDATA end-tag-open state.
# The fallback returns to the RCDATA state, rewinds cur so the character is
# read again, and returns '<' as a character token.
3004 tok_state_rcdata_less_than_sign = ->
3005 c = txt.charAt(cur++)
3007 temporary_buffer = ''
3008 tok_state = tok_state_rcdata_end_tag_open
3011 tok_state = tok_state_rcdata
3012 cur -= 1 # reconsume the input character
3013 return new_character_token '<'
3015 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer "RCDATA end tag open" state.  Reads one character.  For letters
# it starts a new end tag in tok_cur_tag (name lower-cased where needed),
# appends the raw character to temporary_buffer and switches to the RCDATA
# end-tag-name state.  The fallback returns to RCDATA, rewinds cur and
# returns "</" as one character token.
3016 tok_state_rcdata_end_tag_open = ->
3017 c = txt.charAt(cur++)
3019 tok_cur_tag = new_end_tag c.toLowerCase()
3020 temporary_buffer += c
3021 tok_state = tok_state_rcdata_end_tag_name
3024 tok_cur_tag = new_end_tag c
3025 temporary_buffer += c
3026 tok_state = tok_state_rcdata_end_tag_name
3029 tok_state = tok_state_rcdata
3030 cur -= 1 # reconsume the input character
3031 return new_character_token "</" # fixfull separate these
3033 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token t is an end tag whose name equals the name of the
# current node (open_els[0]).  Also writes a debug line with the token's
# type and name and the serialized open_els.
3034 is_appropriate_end_tag = (t) ->
3035 # spec says to check against "the tag name of the last start tag to
3036 # have been emitted from this tokenizer", but this is only called from
3037 # the various "raw" states, so it's hopefully ok to assume that
3038 # open_els[0].name will work instead TODO: verify this after the script
3039 # data states are implemented
3040 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3041 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3043 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer "RCDATA end tag name" state.  Reads one character.  Three
# branches only act when tok_cur_tag is an appropriate end tag, switching to
# the before-attribute-name, self-closing-start-tag or data state.  Letter
# branches append to tok_cur_tag.name (lower-cased where needed) and append
# the raw character to temporary_buffer.  The fallback returns to RCDATA,
# rewinds cur and returns '</' plus temporary_buffer as one character token.
3044 tok_state_rcdata_end_tag_name = ->
3045 c = txt.charAt(cur++)
3046 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3047 if is_appropriate_end_tag tok_cur_tag
3048 tok_state = tok_state_before_attribute_name
3050 # else fall through to "Anything else"
3052 if is_appropriate_end_tag tok_cur_tag
3053 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3055 # else fall through to "Anything else"
3057 if is_appropriate_end_tag tok_cur_tag
3058 tok_state = tok_state_data
3060 # else fall through to "Anything else"
3062 tok_cur_tag.name += c.toLowerCase()
3063 temporary_buffer += c
3066 tok_cur_tag.name += c
3067 temporary_buffer += c
3070 tok_state = tok_state_rcdata
3071 cur -= 1 # reconsume the input character
3072 return new_character_token '</' + temporary_buffer # fixfull separate these
3074 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer "RAWTEXT less-than sign" state.  Reads one character.  One
# branch empties temporary_buffer and switches to the RAWTEXT end-tag-open
# state.  The fallback returns to RAWTEXT, rewinds cur and returns '<' as a
# character token.
3075 tok_state_rawtext_less_than_sign = ->
3076 c = txt.charAt(cur++)
3078 temporary_buffer = ''
3079 tok_state = tok_state_rawtext_end_tag_open
3082 tok_state = tok_state_rawtext
3083 cur -= 1 # reconsume the input character
3084 return new_character_token '<'
3086 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer "RAWTEXT end tag open" state.  Reads one character.  For letters
# it starts a new end tag in tok_cur_tag (name lower-cased where needed),
# appends the raw character to temporary_buffer and switches to the RAWTEXT
# end-tag-name state.  The fallback returns to RAWTEXT, rewinds cur and
# returns "</" as one character token.
3087 tok_state_rawtext_end_tag_open = ->
3088 c = txt.charAt(cur++)
3090 tok_cur_tag = new_end_tag c.toLowerCase()
3091 temporary_buffer += c
3092 tok_state = tok_state_rawtext_end_tag_name
3095 tok_cur_tag = new_end_tag c
3096 temporary_buffer += c
3097 tok_state = tok_state_rawtext_end_tag_name
3100 tok_state = tok_state_rawtext
3101 cur -= 1 # reconsume the input character
3102 return new_character_token "</" # fixfull separate these
3104 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer "RAWTEXT end tag name" state.  Reads one character.  Three
# branches only act when tok_cur_tag is an appropriate end tag, switching to
# the before-attribute-name, self-closing-start-tag or data state.  Letter
# branches append to tok_cur_tag.name (lower-cased where needed) and append
# the raw character to temporary_buffer.  The fallback returns to RAWTEXT,
# rewinds cur and returns '</' plus temporary_buffer as one character token.
3105 tok_state_rawtext_end_tag_name = ->
3106 c = txt.charAt(cur++)
3107 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3108 if is_appropriate_end_tag tok_cur_tag
3109 tok_state = tok_state_before_attribute_name
3111 # else fall through to "Anything else"
3113 if is_appropriate_end_tag tok_cur_tag
3114 tok_state = tok_state_self_closing_start_tag
3116 # else fall through to "Anything else"
3118 if is_appropriate_end_tag tok_cur_tag
3119 tok_state = tok_state_data
3121 # else fall through to "Anything else"
3123 tok_cur_tag.name += c.toLowerCase()
3124 temporary_buffer += c
3127 tok_cur_tag.name += c
3128 temporary_buffer += c
3131 tok_state = tok_state_rawtext
3132 cur -= 1 # reconsume the input character
3133 return new_character_token '</' + temporary_buffer # fixfull separate these
3135 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer "script data less-than sign" state.  Reads one character.  One
# branch empties temporary_buffer and switches to the script-data
# end-tag-open state; another switches to the escape-start state and returns
# '<!' as one character token.  The fallback returns to the script data
# state, rewinds cur and returns '<'.
3136 tok_state_script_data_less_than_sign = ->
3137 c = txt.charAt(cur++)
3139 temporary_buffer = ''
3140 tok_state = tok_state_script_data_end_tag_open
3143 tok_state = tok_state_script_data_escape_start
3144 return new_character_token '<!' # fixfull split
3146 tok_state = tok_state_script_data
3147 cur -= 1 # Reconsume
3148 return new_character_token '<'
3150 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Tokenizer "script data end tag open" state.  Reads one character.  For
# letters it starts a new end tag in tok_cur_tag (name lower-cased where
# needed), appends the raw character to temporary_buffer and switches to the
# script-data end-tag-name state.  The fallback returns to the script data
# state, rewinds cur and returns '</' as one character token.
3151 tok_state_script_data_end_tag_open = ->
3152 c = txt.charAt(cur++)
3154 tok_cur_tag = new_end_tag c.toLowerCase()
3155 temporary_buffer += c
3156 tok_state = tok_state_script_data_end_tag_name
3159 tok_cur_tag = new_end_tag c
3160 temporary_buffer += c
3161 tok_state = tok_state_script_data_end_tag_name
3164 tok_state = tok_state_script_data
3165 cur -= 1 # Reconsume
3166 return new_character_token '</'
3168 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Tokenizer "script data end tag name" state.  Reads one character.  Three
# branches only act when tok_cur_tag is an appropriate end tag, switching to
# the before-attribute-name, self-closing-start-tag or data state.  Letter
# branches append to tok_cur_tag.name (lower-cased where needed) and append
# the raw character to temporary_buffer.  The fallback returns to the script
# data state, rewinds cur and returns "</" plus temporary_buffer as one
# character token.
3169 tok_state_script_data_end_tag_name = ->
3170 c = txt.charAt(cur++)
3171 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3172 if is_appropriate_end_tag tok_cur_tag
3173 tok_state = tok_state_before_attribute_name
3177 if is_appropriate_end_tag tok_cur_tag
3178 tok_state = tok_state_self_closing_start_tag
3182 if is_appropriate_end_tag tok_cur_tag
3183 tok_state = tok_state_data
3187 tok_cur_tag.name += c.toLowerCase()
3188 temporary_buffer += c
3191 tok_cur_tag.name += c
3192 temporary_buffer += c
3195 tok_state = tok_state_script_data
3196 cur -= 1 # Reconsume
3197 return new_character_token "</#{temporary_buffer}" # fixfull split
3199 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Tokenizer "script data escape start" state.  Reads one character.  One
# branch switches to the escape-start-dash state and returns '-'.  The
# fallback returns to the script data state and rewinds cur so the character
# is read again.
3200 tok_state_script_data_escape_start = ->
3201 c = txt.charAt(cur++)
3203 tok_state = tok_state_script_data_escape_start_dash
3204 return new_character_token '-'
3206 tok_state = tok_state_script_data
3207 cur -= 1 # Reconsume
3210 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Tokenizer "script data escape start dash" state.  Reads one character.
# One branch switches to the escaped-dash-dash state and returns '-'.  The
# fallback returns to the script data state and rewinds cur so the character
# is read again.
3211 tok_state_script_data_escape_start_dash = ->
3212 c = txt.charAt(cur++)
3214 tok_state = tok_state_script_data_escaped_dash_dash
3215 return new_character_token '-'
3217 tok_state = tok_state_script_data
3218 cur -= 1 # Reconsume
3221 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Tokenizer "script data escaped" state.  Reads one character.  Visible
# outcomes: switch to the escaped-dash state returning '-', switch to the
# escaped-less-than-sign state, return a U+FFFD character token, return to
# the data state with cur rewound, or return the character itself.
3222 tok_state_script_data_escaped = ->
3223 c = txt.charAt(cur++)
3225 tok_state = tok_state_script_data_escaped_dash
3226 return new_character_token '-'
3228 tok_state = tok_state_script_data_escaped_less_than_sign
3232 return new_character_token "\ufffd"
3234 tok_state = tok_state_data
3236 cur -= 1 # Reconsume
3239 return new_character_token c
3241 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Tokenizer "script data escaped dash" state.  Reads one character.  Visible
# outcomes: switch to the escaped-dash-dash state returning '-', switch to
# the escaped-less-than-sign state, return to the escaped state with a
# U+FFFD token, return to the data state with cur rewound, or return to the
# escaped state with the character itself.
3242 tok_state_script_data_escaped_dash = ->
3243 c = txt.charAt(cur++)
3245 tok_state = tok_state_script_data_escaped_dash_dash
3246 return new_character_token '-'
3248 tok_state = tok_state_script_data_escaped_less_than_sign
3252 tok_state = tok_state_script_data_escaped
3253 return new_character_token "\ufffd"
3255 tok_state = tok_state_data
3257 cur -= 1 # Reconsume
3260 tok_state = tok_state_script_data_escaped
3261 return new_character_token c
3263 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Tokenizer "script data escaped dash dash" state.  Reads one character.
# Visible outcomes: return '-' without changing state, switch to the
# escaped-less-than-sign state, switch to the script data state returning
# '>', return to the escaped state with a U+FFFD token, return to the data
# state with cur rewound, or return to the escaped state with the character
# itself.
3264 tok_state_script_data_escaped_dash_dash = ->
3265 c = txt.charAt(cur++)
3267 return new_character_token '-'
3269 tok_state = tok_state_script_data_escaped_less_than_sign
3272 tok_state = tok_state_script_data
3273 return new_character_token '>'
3276 tok_state = tok_state_script_data_escaped
3277 return new_character_token "\ufffd"
3280 tok_state = tok_state_data
3281 cur -= 1 # Reconsume
3284 tok_state = tok_state_script_data_escaped
3285 return new_character_token c
3287 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer "script data escaped less-than sign" state.  Reads one
# character.  One branch empties temporary_buffer and switches to the
# escaped end-tag-open state.  Letter branches seed temporary_buffer with
# the (lower-cased) character, switch to the double-escape-start state and
# return '<' plus the character.  The fallback returns to the escaped state,
# rewinds cur and returns the pending '<'.
3288 tok_state_script_data_escaped_less_than_sign = ->
3289 c = txt.charAt(cur++)
3291 temporary_buffer = ''
3292 tok_state = tok_state_script_data_escaped_end_tag_open
3295 temporary_buffer = c.toLowerCase() # yes, really
3296 tok_state = tok_state_script_data_double_escape_start
3297 return new_character_token "<#{c}" # fixfull split
3299 temporary_buffer = c
3300 tok_state = tok_state_script_data_double_escape_start
3301 return new_character_token "<#{c}" # fixfull split
3303 tok_state = tok_state_script_data_escaped
3304 cur -= 1 # Reconsume
# c has just been handed back to the input, so the token to emit here is the
# '<' that led into this state; emitting c would output it twice and drop '<'
3305 return new_character_token '<'
3307 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Tokenizer "script data escaped end tag open" state.  Reads one character.
# For letters it starts a new end tag in tok_cur_tag (name lower-cased where
# needed), appends the raw character to temporary_buffer and switches to the
# escaped end-tag-name state.  The fallback returns to the escaped state,
# rewinds cur and returns '</' as one character token.
3308 tok_state_script_data_escaped_end_tag_open = ->
3309 c = txt.charAt(cur++)
3311 tok_cur_tag = new_end_tag c.toLowerCase()
3312 temporary_buffer += c
3313 tok_state = tok_state_script_data_escaped_end_tag_name
3316 tok_cur_tag = new_end_tag c
3317 temporary_buffer += c
3318 tok_state = tok_state_script_data_escaped_end_tag_name
3321 tok_state = tok_state_script_data_escaped
3322 cur -= 1 # Reconsume
3323 return new_character_token '</' # fixfull split
3325 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer "script data escaped end tag name" state.  Reads one character.
# Three branches only act when tok_cur_tag is an appropriate end tag,
# switching to the before-attribute-name, self-closing-start-tag or data
# state.  Letter branches extend tok_cur_tag.name and temporary_buffer.  The
# fallback returns to the escaped state, rewinds cur and returns "</" plus
# temporary_buffer as one character token.
3326 tok_state_script_data_escaped_end_tag_name = ->
3327 c = txt.charAt(cur++)
3328 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3329 if is_appropriate_end_tag tok_cur_tag
3330 tok_state = tok_state_before_attribute_name
3334 if is_appropriate_end_tag tok_cur_tag
3335 tok_state = tok_state_self_closing_start_tag
3339 if is_appropriate_end_tag tok_cur_tag
3340 tok_state = tok_state_data
3344 tok_cur_tag.name += c.toLowerCase()
# only the tag name is lower-cased; temporary_buffer keeps the character as
# written because the fallback below re-emits it as text
3345 temporary_buffer += c
3348 tok_cur_tag.name += c
3349 temporary_buffer += c.toLowerCase()
3352 tok_state = tok_state_script_data_escaped
3353 cur -= 1 # Reconsume
3354 return new_character_token "</#{temporary_buffer}" # fixfull split
3356 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Tokenizer handler for the "script data double escape start" state: collects
# letters into temporary_buffer and, on a delimiter, decides between the
# escaped and double-escaped states by comparing the buffer to 'script'.
# NOTE(review): some branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3357 tok_state_script_data_double_escape_start = ->
3358 c = txt.charAt(cur++) # read the next input character and advance cur
3359 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3360 if temporary_buffer is 'script'
3361 tok_state = tok_state_script_data_double_escaped
# presumably the else of the 'script' comparison
3363 tok_state = tok_state_script_data_escaped
3364 return new_character_token c
# presumably the uppercase-letter branch: buffer is lowercased, emitted token keeps c
3366 temporary_buffer += c.toLowerCase() # yes, really lowercase
3367 return new_character_token c
# presumably the lowercase-letter branch
3369 temporary_buffer += c
3370 return new_character_token c
# fallback: back to the escaped state, which re-reads c
3372 tok_state = tok_state_script_data_escaped
3373 cur -= 1 # Reconsume
3376 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer handler for the "script data double escaped" state: consumes one
# character and emits it as a character token, switching state on some inputs.
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3377 tok_state_script_data_double_escaped = ->
3378 c = txt.charAt(cur++) # read the next input character and advance cur
# presumably the '-' branch
3380 tok_state = tok_state_script_data_double_escaped_dash
3381 return new_character_token '-'
# presumably the '<' branch
3383 tok_state = tok_state_script_data_double_escaped_less_than_sign
3384 return new_character_token '<'
# presumably the NUL branch: emit U+FFFD in its place
3387 return new_character_token "\ufffd"
# presumably the end-of-input branch: hand over to the data state
3390 tok_state = tok_state_data
3391 cur -= 1 # Reconsume
# default: pass the character through unchanged
3394 return new_character_token c
3396 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer handler for the "script data double escaped dash" state (one "-"
# has just been seen inside double-escaped script data).
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3397 tok_state_script_data_double_escaped_dash = ->
3398 c = txt.charAt(cur++) # read the next input character and advance cur
# presumably the '-' branch
3400 tok_state = tok_state_script_data_double_escaped_dash_dash
3401 return new_character_token '-'
# presumably the '<' branch
3403 tok_state = tok_state_script_data_double_escaped_less_than_sign
3404 return new_character_token '<'
# presumably the NUL branch: emit U+FFFD in its place
3407 tok_state = tok_state_script_data_double_escaped
3408 return new_character_token "\ufffd"
# presumably the end-of-input branch: hand over to the data state
3411 tok_state = tok_state_data
3412 cur -= 1 # Reconsume
# default: emit c and return to the plain double-escaped state
3415 tok_state = tok_state_script_data_double_escaped
3416 return new_character_token c
3418 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer handler for the "script data double escaped dash dash" state (two
# "-" characters have just been seen inside double-escaped script data).
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3419 tok_state_script_data_double_escaped_dash_dash = ->
3420 c = txt.charAt(cur++) # read the next input character and advance cur
# presumably the '-' branch: stay in this state
3422 return new_character_token '-'
# presumably the '<' branch
3424 tok_state = tok_state_script_data_double_escaped_less_than_sign
3425 return new_character_token '<'
# '>' branch: leaves the escaped section and returns to plain script data
3427 tok_state = tok_state_script_data
3428 return new_character_token '>'
# presumably the NUL branch: emit U+FFFD in its place
3431 tok_state = tok_state_script_data_double_escaped
3432 return new_character_token "\ufffd"
# presumably the end-of-input branch: hand over to the data state
3435 tok_state = tok_state_data
3436 cur -= 1 # Reconsume
# default: emit c and return to the plain double-escaped state
3439 tok_state = tok_state_script_data_double_escaped
3440 return new_character_token c
3442 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer handler for the "script data double escaped less-than sign" state.
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3443 tok_state_script_data_double_escaped_less_than_sign = ->
3444 c = txt.charAt(cur++) # read the next input character and advance cur
# '/' branch: reset the buffer so the double-escape-end state can collect a tag name
3446 temporary_buffer = ''
3447 tok_state = tok_state_script_data_double_escape_end
3448 return new_character_token '/'
# fallback: back to the double-escaped state, which re-reads c
3450 tok_state = tok_state_script_data_double_escaped
3451 cur -= 1 # Reconsume
3454 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer handler for the "script data double escape end" state: collects
# letters into temporary_buffer and, on a delimiter, drops back to the escaped
# state when the buffer is 'script' (otherwise stays double-escaped).
# NOTE(review): some branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3455 tok_state_script_data_double_escape_end = ->
3456 c = txt.charAt(cur++) # read the next input character and advance cur
3457 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3458 if temporary_buffer is 'script'
3459 tok_state = tok_state_script_data_escaped
# presumably the else of the 'script' comparison
3461 tok_state = tok_state_script_data_double_escaped
3462 return new_character_token c
# presumably the uppercase-letter branch: buffer is lowercased, emitted token keeps c
3464 temporary_buffer += c.toLowerCase() # yes, really lowercase
3465 return new_character_token c
# presumably the lowercase-letter branch
3467 temporary_buffer += c
3468 return new_character_token c
# fallback: back to the double-escaped state, which re-reads c
3470 tok_state = tok_state_script_data_double_escaped
3471 cur -= 1 # Reconsume
3474 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer handler for the "before attribute name" state: skips whitespace,
# then either ends the tag or starts a new attribute on tok_cur_tag.
# A new attribute is stored as a [name, value] pair at the FRONT of
# tok_cur_tag.attrs_a, so index 0 is always the attribute being built.
# NOTE(review): several `when` lines are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3475 tok_state_before_attribute_name = ->
3477 switch c = txt.charAt(cur++)
3478 when "\t", "\n", "\u000c", ' '
3481 tok_state = tok_state_self_closing_start_tag # presumably the '/' case
3484 tok_state = tok_state_data # presumably the '>' case
3490 attr_name = "\ufffd" # presumably the NUL case
3491 when '"', "'", '<', '='
3496 tok_state = tok_state_data # presumably the end-of-input case
3499 attr_name = c.toLowerCase() # default: attribute names are stored lowercased
3503 tok_cur_tag.attrs_a.unshift [attr_name, ''] # new attribute with an empty value
3504 tok_state = tok_state_attribute_name
3507 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer handler for the "attribute name" state: appends characters to the
# name slot ([0][0]) of the attribute at the front of tok_cur_tag.attrs_a until
# a delimiter is seen.
# NOTE(review): most `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3508 tok_state_attribute_name = ->
3509 switch c = txt.charAt(cur++)
3510 when "\t", "\n", "\u000c", ' '
3511 tok_state = tok_state_after_attribute_name
3513 tok_state = tok_state_self_closing_start_tag # presumably the '/' case
3515 tok_state = tok_state_before_attribute_value # presumably the '=' case
3517 tok_state = tok_state_data # presumably the '>' case
3523 tok_cur_tag.attrs_a[0][0] += "\ufffd" # presumably the NUL case
3526 tok_cur_tag.attrs_a[0][0] += c # presumably the quote / '<' case: kept verbatim
3529 tok_state = tok_state_data # presumably the end-of-input case
3532 tok_cur_tag.attrs_a[0][0] += c.toLowerCase() # presumably the uppercase-letter case
3534 tok_cur_tag.attrs_a[0][0] += c # default: append as-is
3537 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer handler for the "after attribute name" state: after whitespace
# following an attribute name, either a value follows, the tag ends, or a new
# attribute starts (pushed at the front of tok_cur_tag.attrs_a with an empty value).
# NOTE(review): several branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3538 tok_state_after_attribute_name = ->
3539 c = txt.charAt(cur++) # read the next input character and advance cur
3540 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3543 tok_state = tok_state_self_closing_start_tag # presumably the '/' branch
3546 tok_state = tok_state_before_attribute_value # presumably the '=' branch
3549 tok_state = tok_state_data # presumably the '>' branch
# presumably the uppercase-letter branch: new attribute named by the lowercased letter
3552 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3553 tok_state = tok_state_attribute_name
# presumably the NUL branch: new attribute whose name starts with U+FFFD
3557 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3558 tok_state = tok_state_attribute_name
# presumably the end-of-input branch
3562 tok_state = tok_state_data
3563 cur -= 1 # reconsume
3565 if c is '"' or c is "'" or c is '<'
3567 # fall through to Anything else
# anything else: start a new attribute named by c
3569 tok_cur_tag.attrs_a.unshift [c, '']
3570 tok_state = tok_state_attribute_name
3572 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer handler for the "before attribute value" state: picks the quoting
# style of the value. Value characters are appended to the value slot ([0][1])
# of the attribute at the front of tok_cur_tag.attrs_a.
# NOTE(review): most `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3573 tok_state_before_attribute_value = ->
3574 switch c = txt.charAt(cur++)
3575 when "\t", "\n", "\u000c", ' '
3578 tok_state = tok_state_attribute_value_double_quoted # presumably the '"' case
3580 tok_state = tok_state_attribute_value_unquoted # presumably the '&' case
3583 tok_state = tok_state_attribute_value_single_quoted # presumably the "'" case
# presumably the NUL case: value starts with U+FFFD
3586 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3587 tok_state = tok_state_attribute_value_unquoted
3590 tok_state = tok_state_data # presumably the '>' case
3596 tok_state = tok_state_data # presumably the end-of-input case
# default: c is the first character of an unquoted value
3598 tok_cur_tag.attrs_a[0][1] += c
3599 tok_state = tok_state_attribute_value_unquoted
3602 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer handler for the "attribute value (double-quoted)" state: appends to
# the value slot ([0][1]) of the attribute at the front of tok_cur_tag.attrs_a.
# NOTE(review): the `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3603 tok_state_attribute_value_double_quoted = ->
3604 switch c = txt.charAt(cur++)
3606 tok_state = tok_state_after_attribute_value_quoted # presumably the closing '"' case
# presumably the '&' case; '"' is passed as allowed_char and true as in_attr
3608 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3611 tok_cur_tag.attrs_a[0][1] += "\ufffd" # presumably the NUL case
3614 tok_state = tok_state_data # presumably the end-of-input case
3616 tok_cur_tag.attrs_a[0][1] += c # default: append as-is
3619 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer handler for the "attribute value (single-quoted)" state: appends to
# the value slot ([0][1]) of the attribute at the front of tok_cur_tag.attrs_a.
# NOTE(review): the `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3620 tok_state_attribute_value_single_quoted = ->
3621 switch c = txt.charAt(cur++)
3623 tok_state = tok_state_after_attribute_value_quoted # presumably the closing "'" case
# presumably the '&' case; "'" is passed as allowed_char and true as in_attr
3625 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3628 tok_cur_tag.attrs_a[0][1] += "\ufffd" # presumably the NUL case
3631 tok_state = tok_state_data # presumably the end-of-input case
3633 tok_cur_tag.attrs_a[0][1] += c # default: append as-is
3636 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer handler for the "attribute value (unquoted)" state: appends to the
# value slot ([0][1]) of the attribute at the front of tok_cur_tag.attrs_a until
# whitespace or the end of the tag.
# NOTE(review): most `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3637 tok_state_attribute_value_unquoted = ->
3638 switch c = txt.charAt(cur++)
3639 when "\t", "\n", "\u000c", ' '
3640 tok_state = tok_state_before_attribute_name
# presumably the '&' case; '>' is passed as allowed_char and true as in_attr
3642 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3644 tok_state = tok_state_data # presumably the '>' case
3649 tok_cur_tag.attrs_a[0][1] += "\ufffd" # presumably the NUL case
3652 tok_state = tok_state_data # presumably the end-of-input case
3654 # Parse Error if ', <, = or ` (backtick)
3655 tok_cur_tag.attrs_a[0][1] += c
3658 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer handler for the "after attribute value (quoted)" state: decides
# what follows a closing quote.
# NOTE(review): most `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3659 tok_state_after_attribute_value_quoted = ->
3660 switch c = txt.charAt(cur++)
3661 when "\t", "\n", "\u000c", ' '
3662 tok_state = tok_state_before_attribute_name
3664 tok_state = tok_state_self_closing_start_tag # presumably the '/' case
3666 tok_state = tok_state_data # presumably the '>' case
3672 tok_state = tok_state_data # presumably the end-of-input case
# default: no separator after the quote; step back so the next state sees c
3675 tok_state = tok_state_before_attribute_name
3676 cur -= 1 # we didn't handle that char
3679 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer handler for the "self-closing start tag" state (a "/" was seen
# inside a tag).
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3680 tok_state_self_closing_start_tag = ->
3681 c = txt.charAt(cur++) # read the next input character and advance cur
# presumably the '>' branch.
# NOTE(review): every other flag() call in this section passes an explicit
# value (e.g. flag 'force-quirks', true); confirm that calling flag with only
# a name actually sets the flag rather than reading it.
3683 tok_cur_tag.flag 'self-closing'
3684 tok_state = tok_state_data
# presumably the end-of-input branch
3688 tok_state = tok_state_data
3689 cur -= 1 # Reconsume
# fallback: treat c as the start of an attribute
3693 tok_state = tok_state_before_attribute_name
3694 cur -= 1 # Reconsume
3697 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3698 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer handler for the "bogus comment" state: takes everything from cur up
# to the next '>' (or to the end of txt when there is none), appends it to the
# comment token in tok_cur_tag and returns to the data state.
3699 tok_state_bogus_comment = ->
3700 next_gt = txt.indexOf '>', cur
# presumably the "no '>' found" branch: the comment runs to the end of the input
3702 val = txt.substr cur
# otherwise: the comment is the text before the '>'
3705 val = txt.substr cur, (next_gt - cur)
# Replace EVERY NUL, not just the first: String::replace with a string pattern
# only substitutes the first match, so a global RegExp is required here.
3707 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3708 tok_cur_tag.text += val
3709 tok_state = tok_state_data
3712 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer handler for the "markup declaration open" state: looks ahead in txt
# at cur (without a per-character read) to tell comments, doctypes and CDATA
# sections apart; anything else becomes a bogus comment.
# NOTE(review): the lines that advance cur past the matched text are not
# visible in this listing.
3713 tok_state_markup_declaration_open = ->
3714 if txt.substr(cur, 2) is '--'
3716 tok_cur_tag = new_comment_token '' # comment text is filled in by the comment states
3717 tok_state = tok_state_comment_start
3719 if txt.substr(cur, 7).toLowerCase() is 'doctype' # case-insensitive match
3721 tok_state = tok_state_doctype
3723 acn = adjusted_current_node()
# CDATA is only recognised when the adjusted current node exists and is not in
# the HTML namespace; the match on '[CDATA[' is case-sensitive
3724 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3726 tok_state = tok_state_cdata_section
# fallback: the bogus comment state requires a comment token in tok_cur_tag
3730 tok_cur_tag = new_comment_token ''
3731 tok_state = tok_state_bogus_comment
3734 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer handler for the "comment start" state: consumes the first character
# after "<!--" and accumulates comment data in tok_cur_tag.text.
# NOTE(review): the `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3735 tok_state_comment_start = ->
3736 switch c = txt.charAt(cur++)
3738 tok_state = tok_state_comment_start_dash # presumably the '-' case
# presumably the NUL case: the replacement character belongs to the comment's
# data (as in the other comment states), so append it to the comment token
# instead of emitting a stray character token
3741 tok_cur_tag.text += "\ufffd"
3742 tok_state = tok_state_comment
3745 tok_state = tok_state_data # presumably the '>' case
# presumably the end-of-input case
3749 tok_state = tok_state_data
3750 cur -= 1 # Reconsume
# default: c is the first character of the comment text
3753 tok_cur_tag.text += c
3754 tok_state = tok_state_comment
3757 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer handler for the "comment start dash" state (one "-" seen right
# after "<!--"). The pending "-" is added to the comment text whenever the
# comment continues.
# NOTE(review): the `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3758 tok_state_comment_start_dash = ->
3759 switch c = txt.charAt(cur++)
3761 tok_state = tok_state_comment_end # presumably the '-' case
# presumably the NUL case: pending dash plus U+FFFD
3764 tok_cur_tag.text += "-\ufffd"
3765 tok_state = tok_state_comment
3768 tok_state = tok_state_data # presumably the '>' case
# presumably the end-of-input case
3772 tok_state = tok_state_data
3773 cur -= 1 # Reconsume
# default: pending dash plus c
3776 tok_cur_tag.text += "-#{c}"
3777 tok_state = tok_state_comment
3780 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer handler for the "comment" state: appends characters to
# tok_cur_tag.text until a "-" or the end of input is seen.
# NOTE(review): the `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3781 tok_state_comment = ->
3782 switch c = txt.charAt(cur++)
3784 tok_state = tok_state_comment_end_dash # presumably the '-' case
3787 tok_cur_tag.text += "\ufffd" # presumably the NUL case
# presumably the end-of-input case
3790 tok_state = tok_state_data
3791 cur -= 1 # Reconsume
3794 tok_cur_tag.text += c # default: append as-is
3797 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer handler for the "comment end dash" state (one "-" seen inside a
# comment). The pending "-" is added to the text when the comment continues.
# NOTE(review): the `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3798 tok_state_comment_end_dash = ->
3799 switch c = txt.charAt(cur++)
3801 tok_state = tok_state_comment_end # presumably the '-' case
# presumably the NUL case: pending dash plus U+FFFD
3804 tok_cur_tag.text += "-\ufffd"
3805 tok_state = tok_state_comment
# presumably the end-of-input case
3808 tok_state = tok_state_data
3809 cur -= 1 # Reconsume
# default: pending dash plus c
3812 tok_cur_tag.text += "-#{c}"
3813 tok_state = tok_state_comment
3816 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer handler for the "comment end" state (two "-" seen inside a
# comment). The pending "--" is added to the text when the comment continues.
# NOTE(review): the `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3817 tok_state_comment_end = ->
3818 switch c = txt.charAt(cur++)
3820 tok_state = tok_state_data # presumably the '>' case: comment is complete
# presumably the NUL case: pending dashes plus U+FFFD
3824 tok_cur_tag.text += "--\ufffd"
3825 tok_state = tok_state_comment
3828 tok_state = tok_state_comment_end_bang # presumably the '!' case
3831 tok_cur_tag.text += '-' # presumably the '-' case: one more dash, stay in this state
# presumably the end-of-input case
3834 tok_state = tok_state_data
3835 cur -= 1 # Reconsume
# default: pending dashes plus c
3839 tok_cur_tag.text += "--#{c}"
3840 tok_state = tok_state_comment
3843 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer handler for the "comment end bang" state ("--!" seen inside a
# comment). The pending "--!" is added to the text when the comment continues.
# NOTE(review): the `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3844 tok_state_comment_end_bang = ->
3845 switch c = txt.charAt(cur++)
# presumably the '-' case: append only the pending "--!". The dash just read
# must NOT be appended here -- the comment end dash state adds it itself
# ("-#{c}") if the comment continues, so appending it here would double it.
3847 tok_cur_tag.text += "--!"
3848 tok_state = tok_state_comment_end_dash
3850 tok_state = tok_state_data # presumably the '>' case: comment is complete
# presumably the NUL case: pending "--!" plus U+FFFD
3854 tok_cur_tag.text += "--!\ufffd"
3855 tok_state = tok_state_comment
# presumably the end-of-input case
3858 tok_state = tok_state_data
3859 cur -= 1 # Reconsume
# default: pending "--!" plus c
3862 tok_cur_tag.text += "--!#{c}"
3863 tok_state = tok_state_comment
3866 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer handler for the "DOCTYPE" state: expects whitespace after the
# "doctype" keyword before the name.
# NOTE(review): some `when` lines are not visible in this listing; the branch
# labels below follow the linked spec section -- confirm.
3867 tok_state_doctype = ->
3868 switch c = txt.charAt(cur++)
3869 when "\t", "\u000a", "\u000c", ' '
3870 tok_state = tok_state_before_doctype_name
# presumably the end-of-input case: build an empty-named doctype token flagged force-quirks
3873 tok_state = tok_state_data
3874 el = new_doctype_token ''
3875 el.flag 'force-quirks', true
3876 cur -= 1 # Reconsume
# default: no whitespace; let the next state re-read c
3880 tok_state = tok_state_before_doctype_name
3881 cur -= 1 # Reconsume
3884 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer handler for the "before DOCTYPE name" state: skips whitespace and
# creates the doctype token in tok_cur_tag from the first name character.
# NOTE(review): most branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3885 tok_state_before_doctype_name = ->
3886 c = txt.charAt(cur++) # read the next input character and advance cur
3887 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# presumably the uppercase-letter branch: name starts with the lowercased letter
3890 tok_cur_tag = new_doctype_token c.toLowerCase()
3891 tok_state = tok_state_doctype_name
# presumably the NUL branch: name starts with U+FFFD
3895 tok_cur_tag = new_doctype_token "\ufffd"
3896 tok_state = tok_state_doctype_name
# presumably the '>' branch: empty-named doctype flagged force-quirks
3900 el = new_doctype_token ''
3901 el.flag 'force-quirks', true
3902 tok_state = tok_state_data
# presumably the end-of-input branch: same token, and cur is stepped back
3906 tok_state = tok_state_data
3907 el = new_doctype_token ''
3908 el.flag 'force-quirks', true
3909 cur -= 1 # Reconsume
# default: name starts with c as-is
3912 tok_cur_tag = new_doctype_token c
3913 tok_state = tok_state_doctype_name
3916 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer handler for the "DOCTYPE name" state: appends characters to
# tok_cur_tag.name until whitespace or the end of the doctype.
# NOTE(review): most branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3917 tok_state_doctype_name = ->
3918 c = txt.charAt(cur++) # read the next input character and advance cur
3919 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3920 tok_state = tok_state_after_doctype_name
3923 tok_state = tok_state_data # presumably the '>' branch
3926 tok_cur_tag.name += c.toLowerCase() # presumably the uppercase-letter branch
3930 tok_cur_tag.name += "\ufffd" # presumably the NUL branch
# presumably the end-of-input branch
3934 tok_state = tok_state_data
3935 tok_cur_tag.flag 'force-quirks', true
3936 cur -= 1 # Reconsume
3939 tok_cur_tag.name += c # default: append as-is
3942 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer handler for the "after DOCTYPE name" state: looks for the PUBLIC
# or SYSTEM keyword (case-insensitive); anything else makes the doctype bogus.
# NOTE(review): some branch conditions and the lines that advance cur past the
# keyword are not visible in this listing.
3943 tok_state_after_doctype_name = ->
3944 c = txt.charAt(cur++) # read the next input character and advance cur
3945 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3948 tok_state = tok_state_data # presumably the '>' branch
# presumably the end-of-input branch
3952 tok_state = tok_state_data
3953 tok_cur_tag.flag 'force-quirks', true
3954 cur -= 1 # Reconsume
# c was already consumed, so the 6-character keyword window starts at cur - 1
3957 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3959 tok_state = tok_state_after_doctype_public_keyword
3961 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3963 tok_state = tok_state_after_doctype_system_keyword
# fallback: neither keyword matched
3966 tok_cur_tag.flag 'force-quirks', true
3967 tok_state = tok_state_bogus_doctype
3970 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer handler for the "after DOCTYPE public keyword" state: expects
# whitespace or an opening quote of the public identifier.
# NOTE(review): most branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
3971 tok_state_after_doctype_public_keyword = ->
3972 c = txt.charAt(cur++) # read the next input character and advance cur
3973 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3974 tok_state = tok_state_before_doctype_public_identifier
# presumably the '"' branch: identifier starts empty
3978 tok_cur_tag.public_identifier = ''
3979 tok_state = tok_state_doctype_public_identifier_double_quoted
# presumably the "'" branch: identifier starts empty
3983 tok_cur_tag.public_identifier = ''
3984 tok_state = tok_state_doctype_public_identifier_single_quoted
# presumably the '>' branch
3988 tok_cur_tag.flag 'force-quirks', true
3989 tok_state = tok_state_data
# presumably the end-of-input branch
3993 tok_state = tok_state_data
3994 tok_cur_tag.flag 'force-quirks', true
3995 cur -= 1 # Reconsume
# fallback: anything else makes the doctype bogus
3999 tok_cur_tag.flag 'force-quirks', true
4000 tok_state = tok_state_bogus_doctype
4003 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer handler for the "before DOCTYPE public identifier" state: skips
# whitespace and expects an opening quote of the public identifier.
# NOTE(review): most branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4004 tok_state_before_doctype_public_identifier = ->
4005 c = txt.charAt(cur++) # read the next input character and advance cur
4006 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# presumably the '"' branch: identifier starts empty
4010 tok_cur_tag.public_identifier = ''
4011 tok_state = tok_state_doctype_public_identifier_double_quoted
# presumably the "'" branch: identifier starts empty
4015 tok_cur_tag.public_identifier = ''
4016 tok_state = tok_state_doctype_public_identifier_single_quoted
# presumably the '>' branch
4020 tok_cur_tag.flag 'force-quirks', true
4021 tok_state = tok_state_data
# presumably the end-of-input branch
4025 tok_state = tok_state_data
4026 tok_cur_tag.flag 'force-quirks', true
4027 cur -= 1 # Reconsume
# fallback: anything else makes the doctype bogus
4031 tok_cur_tag.flag 'force-quirks', true
4032 tok_state = tok_state_bogus_doctype
4036 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer handler for the "DOCTYPE public identifier (double-quoted)" state:
# appends characters to tok_cur_tag.public_identifier until the closing quote.
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4037 tok_state_doctype_public_identifier_double_quoted = ->
4038 c = txt.charAt(cur++) # read the next input character and advance cur
4040 tok_state = tok_state_after_doctype_public_identifier # presumably the closing '"' branch
4044 tok_cur_tag.public_identifier += "\ufffd" # presumably the NUL branch
# presumably the '>' branch: identifier ended without its closing quote
4048 tok_cur_tag.flag 'force-quirks', true
4049 tok_state = tok_state_data
# presumably the end-of-input branch
4053 tok_state = tok_state_data
4054 tok_cur_tag.flag 'force-quirks', true
4055 cur -= 1 # Reconsume
4058 tok_cur_tag.public_identifier += c # default: append as-is
4061 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer handler for the "DOCTYPE public identifier (single-quoted)" state:
# appends characters to tok_cur_tag.public_identifier until the closing quote.
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4062 tok_state_doctype_public_identifier_single_quoted = ->
4063 c = txt.charAt(cur++) # read the next input character and advance cur
4065 tok_state = tok_state_after_doctype_public_identifier # presumably the closing "'" branch
4069 tok_cur_tag.public_identifier += "\ufffd" # presumably the NUL branch
# presumably the '>' branch: identifier ended without its closing quote
4073 tok_cur_tag.flag 'force-quirks', true
4074 tok_state = tok_state_data
# presumably the end-of-input branch
4078 tok_state = tok_state_data
4079 tok_cur_tag.flag 'force-quirks', true
4080 cur -= 1 # Reconsume
4083 tok_cur_tag.public_identifier += c # default: append as-is
4086 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer handler for the "after DOCTYPE public identifier" state: the
# doctype may end here, or a system identifier may follow.
# NOTE(review): most branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4087 tok_state_after_doctype_public_identifier = ->
4088 c = txt.charAt(cur++) # read the next input character and advance cur
4089 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4090 tok_state = tok_state_between_doctype_public_and_system_identifiers
4093 tok_state = tok_state_data # presumably the '>' branch: doctype is complete
# presumably the '"' branch: system identifier starts empty
4097 tok_cur_tag.system_identifier = ''
4098 tok_state = tok_state_doctype_system_identifier_double_quoted
# presumably the "'" branch: system identifier starts empty
4102 tok_cur_tag.system_identifier = ''
4103 tok_state = tok_state_doctype_system_identifier_single_quoted
# presumably the end-of-input branch
4107 tok_state = tok_state_data
4108 tok_cur_tag.flag 'force-quirks', true
4109 cur -= 1 # Reconsume
# fallback: anything else makes the doctype bogus
4113 tok_cur_tag.flag 'force-quirks', true
4114 tok_state = tok_state_bogus_doctype
4117 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer handler for the "between DOCTYPE public and system identifiers"
# state: skips whitespace, then the doctype ends or a system identifier starts.
# NOTE(review): most branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4118 tok_state_between_doctype_public_and_system_identifiers = ->
4119 c = txt.charAt(cur++) # read the next input character and advance cur
4120 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4123 tok_state = tok_state_data # presumably the '>' branch: doctype is complete
# presumably the '"' branch: system identifier starts empty
4127 tok_cur_tag.system_identifier = ''
4128 tok_state = tok_state_doctype_system_identifier_double_quoted
# presumably the "'" branch: system identifier starts empty
4132 tok_cur_tag.system_identifier = ''
4133 tok_state = tok_state_doctype_system_identifier_single_quoted
# presumably the end-of-input branch
4137 tok_state = tok_state_data
4138 tok_cur_tag.flag 'force-quirks', true
4139 cur -= 1 # Reconsume
# fallback: anything else makes the doctype bogus
4143 tok_cur_tag.flag 'force-quirks', true
4144 tok_state = tok_state_bogus_doctype
4147 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer handler for the "after DOCTYPE system keyword" state: expects
# whitespace or an opening quote of the system identifier.
# NOTE(review): most branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4148 tok_state_after_doctype_system_keyword = ->
4149 c = txt.charAt(cur++) # read the next input character and advance cur
4150 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4151 tok_state = tok_state_before_doctype_system_identifier
# presumably the '"' branch: identifier starts empty
4155 tok_cur_tag.system_identifier = ''
4156 tok_state = tok_state_doctype_system_identifier_double_quoted
# presumably the "'" branch: identifier starts empty
4160 tok_cur_tag.system_identifier = ''
4161 tok_state = tok_state_doctype_system_identifier_single_quoted
# presumably the '>' branch
4165 tok_cur_tag.flag 'force-quirks', true
4166 tok_state = tok_state_data
# presumably the end-of-input branch
4170 tok_state = tok_state_data
4171 tok_cur_tag.flag 'force-quirks', true
4172 cur -= 1 # Reconsume
# fallback: anything else makes the doctype bogus
4176 tok_cur_tag.flag 'force-quirks', true
4177 tok_state = tok_state_bogus_doctype
4180 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer handler for the "before DOCTYPE system identifier" state: skips
# whitespace and expects an opening quote of the system identifier.
# NOTE(review): most branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4181 tok_state_before_doctype_system_identifier = ->
4182 c = txt.charAt(cur++) # read the next input character and advance cur
4183 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
# presumably the '"' branch: identifier starts empty
4186 tok_cur_tag.system_identifier = ''
4187 tok_state = tok_state_doctype_system_identifier_double_quoted
# presumably the "'" branch: identifier starts empty
4190 tok_cur_tag.system_identifier = ''
4191 tok_state = tok_state_doctype_system_identifier_single_quoted
# presumably the '>' branch
4195 tok_cur_tag.flag 'force-quirks', true
4196 tok_state = tok_state_data
# presumably the end-of-input branch
4200 tok_state = tok_state_data
4201 tok_cur_tag.flag 'force-quirks', true
4202 cur -= 1 # Reconsume
# fallback: anything else makes the doctype bogus
4206 tok_cur_tag.flag 'force-quirks', true
4207 tok_state = tok_state_bogus_doctype
4210 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer handler for the "DOCTYPE system identifier (double-quoted)" state:
# appends characters to tok_cur_tag.system_identifier until the closing quote.
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4211 tok_state_doctype_system_identifier_double_quoted = ->
4212 c = txt.charAt(cur++) # read the next input character and advance cur
4214 tok_state = tok_state_after_doctype_system_identifier # presumably the closing '"' branch
4218 tok_cur_tag.system_identifier += "\ufffd" # presumably the NUL branch
# presumably the '>' branch: identifier ended without its closing quote
4222 tok_cur_tag.flag 'force-quirks', true
4223 tok_state = tok_state_data
# presumably the end-of-input branch
4227 tok_state = tok_state_data
4228 tok_cur_tag.flag 'force-quirks', true
4229 cur -= 1 # Reconsume
4232 tok_cur_tag.system_identifier += c # default: append as-is
4235 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer handler for the "DOCTYPE system identifier (single-quoted)" state:
# appends characters to tok_cur_tag.system_identifier until the closing quote.
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4236 tok_state_doctype_system_identifier_single_quoted = ->
4237 c = txt.charAt(cur++) # read the next input character and advance cur
4239 tok_state = tok_state_after_doctype_system_identifier # presumably the closing "'" branch
4243 tok_cur_tag.system_identifier += "\ufffd" # presumably the NUL branch
# presumably the '>' branch: identifier ended without its closing quote
4247 tok_cur_tag.flag 'force-quirks', true
4248 tok_state = tok_state_data
# presumably the end-of-input branch
4252 tok_state = tok_state_data
4253 tok_cur_tag.flag 'force-quirks', true
4254 cur -= 1 # Reconsume
4257 tok_cur_tag.system_identifier += c # default: append as-is
4260 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer handler for the "after DOCTYPE system identifier" state: skips
# whitespace and expects the end of the doctype.
# NOTE(review): some branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4261 tok_state_after_doctype_system_identifier = ->
4262 c = txt.charAt(cur++) # read the next input character and advance cur
4263 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4266 tok_state = tok_state_data # presumably the '>' branch: doctype is complete
# presumably the end-of-input branch
4270 tok_state = tok_state_data
4271 tok_cur_tag.flag 'force-quirks', true
4272 cur -= 1 # Reconsume
# fallback: trailing junk makes the rest bogus, but (unlike the other doctype
# states here) the force-quirks flag is deliberately left untouched
4276 # do _not_ tok_cur_tag.flag 'force-quirks', true
4277 tok_state = tok_state_bogus_doctype
4280 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer handler for the "bogus DOCTYPE" state: consumes characters until
# the doctype ends, then returns to the data state.
# NOTE(review): the branch conditions are not visible in this listing; the
# branch labels below follow the linked spec section -- confirm.
4281 tok_state_bogus_doctype = ->
4282 c = txt.charAt(cur++) # read the next input character and advance cur
4284 tok_state = tok_state_data # presumably the '>' branch
# presumably the end-of-input branch
4287 tok_state = tok_state_data
4288 cur -= 1 # Reconsume
4293 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer handler for the "CDATA section" state: takes everything from cur up
# to the next "]]>" (or to the end of txt when there is none), normalizes it
# and returns it as a single character token. The next state is always data.
# NOTE(review): the lines that advance cur are not visible in this listing.
4294 tok_state_cdata_section = ->
4295 tok_state = tok_state_data
4296 next_gt = txt.indexOf ']]>', cur
# presumably the "no terminator found" branch: CDATA runs to the end of the input
4298 val = txt.substr cur
# otherwise: the CDATA content is the text before "]]>"
4301 val = txt.substr cur, (next_gt - cur)
# global replacements: every NUL becomes U+FFFD, and CRLF / lone CR become LF
4303 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4304 val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4305 val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4306 return new_character_token val # fixfull split
4308 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4309 # Don't set this as a state, just call it
4310 # returns a string (NOT a text node)
# Parameters:
#   allowed_char - extra character that, when it is the next input character,
#                  is treated like the other "not a character reference" characters
#                  (callers in the attribute value states pass the closing delimiter)
#   in_attr      - true when called while parsing an attribute value
# Reads txt starting at cur (the character after the '&') and advances cur past
# whatever it consumes.
# NOTE(review): many lines of this function (returns, assignments of start /
# prefix / charset / max, else branches) are not visible in this listing; the
# notes below describe only the visible lines.
4311 parse_character_reference = (allowed_char = null, in_attr = false) ->
4312 if cur >= txt.length
4314 switch c = txt.charAt(cur) # peek only: cur is not advanced here
4315 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4316 # explicitly not a parse error
4319 # there has to be "one or more" alnums between & and ; to be a parse error
4322 if cur + 1 >= txt.length
4324 if txt.charAt(cur + 1).toLowerCase() is 'x' # hexadecimal marker, either case
# count how many characters after start belong to charset
4333 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4337 if txt.charAt(start + i) is ';'
4339 # FIXME This is supposed to generate parse errors for some chars
# the digits are lowercased and handed, with prefix, to the named-reference decoder
4340 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4347 if alnum.indexOf(txt.charAt(cur + i)) is -1
4350 # exit early, because parse_error() below needs at least one alnum
4352 if txt.charAt(cur + i) is ';'
4353 i += 1 # include ';' terminator in value
4354 decoded = decode_named_char_ref txt.substr(cur, i)
4361 # no ';' terminator (only legacy char refs)
4363 for i in [2..max] # no prefix matches, so ok to check shortest first
4364 c = legacy_char_refs[txt.substr(cur, i)] # look up the first i characters as a legacy name
4367 if txt.charAt(cur + i) is '='
4368 # "because some legacy user agents will
4369 # misinterpret the markup in those cases"
4372 if alnum.indexOf(txt.charAt(cur + i)) > -1
4373 # this makes attributes forgiving about url args
4375 # ok, and besides the weird exceptions for attributes...
4376 # return the matching char
4377 cur += i # consume entity chars
4378 parse_error() # because no terminating ";"
4382 return # never reached
4384 # tree constructor initialization
4385 # see comments on TYPE_TAG/etc for the structure of this data
# Sets the parser's starting state: the root node, the tree-construction
# stacks and flags, and finally the tokenizer's initial state.
# NOTE(review): some initialization lines are not visible in this listing.
4388 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4390 afe = [] # active formatting elements
4391 template_ins_modes = []
4392 ins_mode = ins_mode_initial
4393 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to true when args.scripting is null or undefined
4394 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4395 flag_frameset_ok = true
4397 flag_foster_parenting = false
4398 form_element_pointer = null
4399 temporary_buffer = null # assigned by the script-data tokenizer states before they read it
4400 pending_table_character_tokens = []
4401 head_element_pointer = null
4402 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4403 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4405 # tokenizer initialization
4406 tok_state = tok_state_data
# NOTE(review): special case keyed on one named test; its body is not visible
# in this listing -- looks like leftover debugging, confirm before keeping
4408 if args.name is "namespace-sensitivity.dat #1"
4411 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4416 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds one string from a list of nodes by appending each node's serialize()
# output; shallow and show_ids are passed through to serialize() unchanged.
# NOTE(review): the loop, separator handling and return are not visible in this
# listing.
4419 serialize_els = (els, shallow, show_ids) ->
4425 serialized += t.serialize shallow, show_ids
# Public interface of this module: the parser entry point, the debug-log
# helpers, and the node-type and namespace constants.
4428 module.exports.parse_html = parse_html
4429 module.exports.debug_log_reset = debug_log_reset
4430 module.exports.debug_log_each = debug_log_each
4431 module.exports.TYPE_TAG = TYPE_TAG
4432 module.exports.TYPE_TEXT = TYPE_TEXT
4433 module.exports.TYPE_COMMENT = TYPE_COMMENT
4434 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4435 module.exports.NS_HTML = NS_HTML
4436 module.exports.NS_MATHML = NS_MATHML
4437 module.exports.NS_SVG = NS_SVG