1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WTAG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
# The spec uses many different words to indicate which ends of lists/stacks
# they are talking about (and relative movement within the lists/stacks). This
# section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
# Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
92 debug_log_each = (cb) ->
93 for str in g_debug_log
98 constructor: (type, args = {}) ->
99 @type = type # one of the TYPE_* constants above
100 @name = args.name ? '' # tag name
101 @text = args.text ? '' # contents for text/comment nodes
102 @attrs = args.attrs ? {}
103 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104 @children = args.children ? []
105 @namespace = args.namespace ? NS_HTML
106 @parent = args.parent ? null
107 @token = args.token ? null
108 @flags = args.flags ? {}
112 @id = "#{++prev_node_id}"
113 acknowledge_self_closing: ->
115 @token.flag 'did_self_close', true
117 @flag 'did_self_close', true
118 flag: (key, value = null) ->
123 serialize: (shallow = false, show_ids = false) -> # for unit tests
128 ret += JSON.stringify @name
143 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149 ret += c.serialize shallow, show_ids
153 ret += JSON.stringify @text
156 ret += JSON.stringify @text
158 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
161 when TYPE_AAA_BOOKMARK
162 ret += 'aaa_bookmark'
165 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
168 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token: a Node of type TYPE_START_TAG that carries only
# the given tag name (every other field takes the Node constructor default).
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name})
# Build an end-tag token: a Node of type TYPE_END_TAG with the given tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name})
# Build an element node: a Node of type TYPE_TAG with the given tag name.
new_element = (name) ->
	new Node(TYPE_TAG, {name})
# Build a text node: a Node of type TYPE_TEXT whose text is `txt`.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are represented exactly like text nodes, so the same
# factory is exposed under both names.
new_character_token = new_text_node
# Build a comment token: a Node of type TYPE_COMMENT whose text is `txt`.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Build a doctype token: a Node of type TYPE_DOCTYPE with the given name.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name})
183 return new Node TYPE_EOF
185 return new Node TYPE_AFE_MARKER
# Build a placeholder Node of type TYPE_AAA_BOOKMARK (the bookmark used by
# the adoption agency algorithm); it takes no arguments and has no name/text.
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character classes used for membership tests via String::indexOf.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
# letters (both cases) followed by the decimal digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# decimal digits followed by the hex letters in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True only when `str` is exactly one character long and that character is
# one of the letters listed in uc_alpha.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) >= 0
# True only when `str` is exactly one character long and that character is
# one of the letters listed in lc_alpha.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) >= 0
200 # some SVG elements have dashes in them
201 tag_name_chars = alnum + "-"
203 # http://www.w3.org/TR/html5/infrastructure.html#space-character
204 space_chars = "\u0009\u000a\u000c\u000d\u0020"
206 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT node whose text is a single character
# found in space_chars; false for every other token.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) >= 0
210 is_input_hidden_tok = (t) ->
211 return false unless t.type is TYPE_START_TAG
214 if a[1].toLowerCase() is 'hidden'
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
255 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
275 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
276 raw_text_elements = ['script', 'style']
277 escapable_raw_text_elements = ['textarea', 'title']
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
280 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
299 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305 'determinant', 'diff', 'divergence', 'divide', 'domain',
306 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326 'vectorproduct', 'xor'
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
333 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
345 menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
347 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
348 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
349 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
350 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
351 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
352 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
353 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
356 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
357 'annotation-xml':NS_MATHML,
360 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
363 formatting_elements = {
364 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
365 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
369 mathml_text_integration = {
370 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when the mathml_text_integration table maps the element's name to the
# very namespace the element is in.
is_mathml_text_integration_point = (el) ->
	listed_namespace = mathml_text_integration[el.name]
	listed_namespace is el.namespace
374 is_html_integration = (el) -> # DON'T PASS A TOKEN
375 if el.namespace is NS_MATHML
376 if el.name is 'annotation-xml'
377 if el.attrs.encoding?
378 if el.attrs.encoding.toLowerCase() is 'text/html'
380 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
383 if el.namespace is NS_SVG
384 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
389 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
392 foster_parenting_targets = {
# True when the special_elements table maps the element's name to the very
# namespace the element is in.
el_is_special = (e) ->
	listed_namespace = special_elements[e.name]
	listed_namespace is e.namespace
# The three HTML elements (address, div, p) that el_is_special_not_adp
# excludes from the special_elements check.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
# True when `el` matches an entry in special_elements (same name and
# namespace) and does not match an entry in adp_els.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
422 altglyphdef: 'altGlyphDef'
423 altglyphitem: 'altGlyphItem'
424 animatecolor: 'animateColor'
425 animatemotion: 'animateMotion'
426 animatetransform: 'animateTransform'
429 fecolormatrix: 'feColorMatrix'
430 fecomponenttransfer: 'feComponentTransfer'
431 fecomposite: 'feComposite'
432 feconvolvematrix: 'feConvolveMatrix'
433 fediffuselighting: 'feDiffuseLighting'
434 fedisplacementmap: 'feDisplacementMap'
435 fedistantlight: 'feDistantLight'
436 fedropshadow: 'feDropShadow'
442 fegaussianblur: 'feGaussianBlur'
445 femergenode: 'feMergeNode'
446 femorphology: 'feMorphology'
448 fepointlight: 'fePointLight'
449 fespecularlighting: 'feSpecularLighting'
450 fespotlight: 'feSpotLight'
452 feturbulence: 'feTurbulence'
453 foreignobject: 'foreignObject'
455 lineargradient: 'linearGradient'
456 radialgradient: 'radialGradient'
459 svg_attribute_fixes = {
460 attributename: 'attributeName'
461 attributetype: 'attributeType'
462 basefrequency: 'baseFrequency'
463 baseprofile: 'baseProfile'
465 clippathunits: 'clipPathUnits'
466 contentscripttype: 'contentScriptType'
467 contentstyletype: 'contentStyleType'
468 diffuseconstant: 'diffuseConstant'
470 externalresourcesrequired: 'externalResourcesRequired'
471 # WTAG removes this: filterres: 'filterRes'
472 filterunits: 'filterUnits'
474 gradienttransform: 'gradientTransform'
475 gradientunits: 'gradientUnits'
476 kernelmatrix: 'kernelMatrix'
477 kernelunitlength: 'kernelUnitLength'
478 keypoints: 'keyPoints'
479 keysplines: 'keySplines'
481 lengthadjust: 'lengthAdjust'
482 limitingconeangle: 'limitingConeAngle'
483 markerheight: 'markerHeight'
484 markerunits: 'markerUnits'
485 markerwidth: 'markerWidth'
486 maskcontentunits: 'maskContentUnits'
487 maskunits: 'maskUnits'
488 numoctaves: 'numOctaves'
489 pathlength: 'pathLength'
490 patterncontentunits: 'patternContentUnits'
491 patterntransform: 'patternTransform'
492 patternunits: 'patternUnits'
493 pointsatx: 'pointsAtX'
494 pointsaty: 'pointsAtY'
495 pointsatz: 'pointsAtZ'
496 preservealpha: 'preserveAlpha'
497 preserveaspectratio: 'preserveAspectRatio'
498 primitiveunits: 'primitiveUnits'
501 repeatcount: 'repeatCount'
502 repeatdur: 'repeatDur'
503 requiredextensions: 'requiredExtensions'
504 requiredfeatures: 'requiredFeatures'
505 specularconstant: 'specularConstant'
506 specularexponent: 'specularExponent'
507 spreadmethod: 'spreadMethod'
508 startoffset: 'startOffset'
509 stddeviation: 'stdDeviation'
510 stitchtiles: 'stitchTiles'
511 surfacescale: 'surfaceScale'
512 systemlanguage: 'systemLanguage'
513 tablevalues: 'tableValues'
516 textlength: 'textLength'
518 viewtarget: 'viewTarget'
519 xchannelselector: 'xChannelSelector'
520 ychannelselector: 'yChannelSelector'
521 zoomandpan: 'zoomAndPan'
523 foreign_attr_fixes = {
524 'xlink:actuate': 'xlink actuate'
525 'xlink:arcrole': 'xlink arcrole'
526 'xlink:href': 'xlink href'
527 'xlink:role': 'xlink role'
528 'xlink:show': 'xlink show'
529 'xlink:title': 'xlink title'
530 'xlink:type': 'xlink type'
531 'xml:base': 'xml base'
532 'xml:lang': 'xml lang'
533 'xml:space': 'xml space'
535 'xmlns:xlink': 'xmlns xlink'
537 adjust_mathml_attributes = (t) ->
539 if a[0] is 'definitionurl'
540 a[0] = 'definitionURL'
542 adjust_svg_attributes = (t) ->
544 if svg_attribute_fixes[a[0]]?
545 a[0] = svg_attribute_fixes[a[0]]
547 adjust_foreign_attributes = (t) ->
550 if foreign_attr_fixes[a[0]]?
551 a[0] = foreign_attr_fixes[a[0]]
554 # decode_named_char_ref()
556 # The list of named character references is _huge_ so ask the browser to decode
557 # for us instead of wasting bandwidth/space on including the table here.
559 # Pass without the "&" but with the ";" examples:
560 # for "&" pass "amp;"
561 # for "′" pass "x2032;"
564 textarea: document.createElement('textarea')
566 # TODO test this in IE8
567 decode_named_char_ref = (txt) ->
569 decoded = g_dncr.cache[txt]
570 return decoded if decoded?
571 g_dncr.textarea.innerHTML = txt
572 decoded = g_dncr.textarea.value
573 return null if decoded is txt
574 return g_dncr.cache[txt] = decoded
576 parse_html = (args) ->
578 cur = null # index of next char in txt to be parsed
579 # declare doc and tokenizer variables so they're in scope below
581 open_els = null # stack of open elements
582 afe = null # active formatting elements
583 template_ins_modes = null
585 original_ins_mode = null
587 tok_cur_tag = null # partially parsed tag
588 flag_scripting = null
589 flag_frameset_ok = null
591 flag_foster_parenting = null
592 form_element_pointer = null
593 temporary_buffer = null
594 pending_table_character_tokens = null
595 head_element_pointer = null
596 flag_fragment_parsing = null
597 context_element = null
606 console.log "Parse error at character #{cur} of #{txt.length}"
608 afe_push = (new_el) ->
611 if el.name is new_el.name and el.namespace is new_el.namespace
613 continue unless new_el.attrs[k] is v
614 for k, v of new_el.attrs
615 continue unless el.attrs[k] is v
622 afe.unshift new_afe_marker()
# The functions below implement the Tree Construction algorithm
625 # http://www.w3.org/TR/html5/syntax.html#tree-construction
627 # But first... the helpers
628 template_tag_is_open = ->
630 if t.name is 'template' and t.namespace is NS_HTML
633 is_in_scope_x = (tag_name, scope, namespace) ->
635 if t.name is tag_name and (namespace is null or namespace is t.namespace)
637 if scope[t.name] is t.namespace
640 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
642 if t.name is tag_name and (namespace is null or namespace is t.namespace)
644 if scope[t.name] is t.namespace
646 if scope2[t.name] is t.namespace
650 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
651 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
654 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
655 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
657 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Name -> namespace tables handed to the is_in_scope_x / is_in_scope_x_y
# helpers by the scope predicates that follow.
# extra table for the button-scope check
button_scopers =
	button: NS_HTML
# extra table for the list-item-scope check
li_scopers =
	ol: NS_HTML
	ul: NS_HTML
# the only table consulted for the table-scope check
table_scopers =
	html: NS_HTML
	table: NS_HTML
	template: NS_HTML
# Scope check against the standard_scopers table; simply forwards to
# is_in_scope_x. `namespace` defaults to null, which is_in_scope_x treats
# as "any namespace" when comparing tag names.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check against standard_scopers plus button_scopers; simply forwards
# to is_in_scope_x_y. `namespace` defaults to null ("any namespace").
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check against table_scopers only; simply forwards to is_in_scope_x.
# `namespace` defaults to null ("any namespace").
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope check against standard_scopers plus li_scopers; simply forwards to
# is_in_scope_x_y. `namespace` defaults to null ("any namespace").
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
671 is_in_select_scope = (tag_name, namespace = null) ->
673 if t.name is tag_name and (namespace is null or namespace is t.namespace)
675 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
678 # this checks for a particular element, not by name
679 # this requires a namespace match
680 el_is_in_scope = (needle) ->
684 if standard_scopers[el.name] is el.namespace
688 clear_to_table_stopers = {
693 clear_stack_to_table_context = ->
695 if clear_to_table_stopers[open_els[0].name]?
699 clear_to_table_body_stopers = {
706 clear_stack_to_table_body_context = ->
708 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
712 clear_to_table_row_stopers = {
717 clear_stack_to_table_row_context = ->
719 if clear_to_table_row_stopers[open_els[0].name]?
723 clear_afe_to_marker = ->
725 return unless afe.length > 0 # this happens in fragment case, ?spec error
727 if el.type is TYPE_AFE_MARKER
732 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
734 # 1. Let last be false.
736 # 2. Let node be the last node in the stack of open elements.
738 node = open_els[node_i]
739 # 3. Loop: If node is the first node in the stack of open elements,
740 # then set last to true, and, if the parser was originally created as
741 # part of the HTML fragment parsing algorithm (fragment case) set node
742 # to the context element.
744 if node_i is open_els.length - 1
746 # fixfull (fragment case)
748 # 4. If node is a select element, run these substeps:
749 if node.name is 'select' and node.namespace is NS_HTML
750 # 1. If last is true, jump to the step below labeled done.
752 # 2. Let ancestor be node.
755 # 3. Loop: If ancestor is the first node in the stack of
756 # open elements, jump to the step below labeled done.
758 if ancestor_i is open_els.length - 1
760 # 4. Let ancestor be the node before ancestor in the stack
763 ancestor = open_els[ancestor_i]
764 # 5. If ancestor is a template node, jump to the step below
766 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
768 # 6. If ancestor is a table node, switch the insertion mode
769 # to "in select in table" and abort these steps.
770 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
771 ins_mode = ins_mode_in_select_in_table
773 # 7. Jump back to the step labeled loop.
774 # 8. Done: Switch the insertion mode to "in select" and abort
776 ins_mode = ins_mode_in_select
778 # 5. If node is a td or th element and last is false, then switch
779 # the insertion mode to "in cell" and abort these steps.
780 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
781 ins_mode = ins_mode_in_cell
783 # 6. If node is a tr element, then switch the insertion mode to "in
784 # row" and abort these steps.
785 if node.name is 'tr' and node.namespace is NS_HTML
786 ins_mode = ins_mode_in_row
788 # 7. If node is a tbody, thead, or tfoot element, then switch the
789 # insertion mode to "in table body" and abort these steps.
790 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
791 ins_mode = ins_mode_in_table_body
793 # 8. If node is a caption element, then switch the insertion mode
794 # to "in caption" and abort these steps.
795 if node.name is 'caption' and node.namespace is NS_HTML
796 ins_mode = ins_mode_in_caption
798 # 9. If node is a colgroup element, then switch the insertion mode
799 # to "in column group" and abort these steps.
800 if node.name is 'colgroup' and node.namespace is NS_HTML
801 ins_mode = ins_mode_in_column_group
803 # 10. If node is a table element, then switch the insertion mode to
804 # "in table" and abort these steps.
805 if node.name is 'table' and node.namespace is NS_HTML
806 ins_mode = ins_mode_in_table
808 # 11. If node is a template element, then switch the insertion mode
809 # to the current template insertion mode and abort these steps.
810 if node.name is 'template' and node.namespace is NS_HTML
811 ins_mode = template_ins_modes[0]
813 # 12. If node is a head element and last is true, then switch the
814 # insertion mode to "in body" ("in body"! not "in head"!) and abort
815 # these steps. (fragment case)
816 if node.name is 'head' and node.namespace is NS_HTML and last
817 ins_mode = ins_mode_in_body
819 # 13. If node is a head element and last is false, then switch the
820 # insertion mode to "in head" and abort these steps.
821 if node.name is 'head' and node.namespace is NS_HTML and last is false
822 ins_mode = ins_mode_in_head
824 # 14. If node is a body element, then switch the insertion mode to
825 # "in body" and abort these steps.
826 if node.name is 'body' and node.namespace is NS_HTML
827 ins_mode = ins_mode_in_body
829 # 15. If node is a frameset element, then switch the insertion mode
830 # to "in frameset" and abort these steps. (fragment case)
831 if node.name is 'frameset' and node.namespace is NS_HTML
832 ins_mode = ins_mode_in_frameset
834 # 16. If node is an html element, run these substeps:
835 if node.name is 'html' and node.namespace is NS_HTML
836 # 1. If the head element pointer is null, switch the insertion
837 # mode to "before head" and abort these steps. (fragment case)
838 if head_element_pointer is null
839 ins_mode = ins_mode_before_head
841 # 2. Otherwise, the head element pointer is not null,
842 # switch the insertion mode to "after head" and abort these
844 ins_mode = ins_mode_after_head
846 # 17. If last is true, then switch the insertion mode to "in body"
847 # and abort these steps. (fragment case)
849 ins_mode = ins_mode_in_body
851 # 18. Let node now be the node before node in the stack of open
854 node = open_els[node_i]
855 # 19. Return to the step labeled loop.
859 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
860 adjusted_current_node = ->
861 if open_els.length is 1 and flag_fragment_parsing
862 return context_element
865 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
866 # this implementation is structured (mostly) as described at the link above.
867 # capitalized comments are the "labels" described at the link above.
869 return if afe.length is 0
870 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
875 if i is afe.length - 1
878 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
883 el = insert_html_element afe[i].token
888 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
889 # adoption agency algorithm
891 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
892 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
893 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
894 adoption_agency = (subject) ->
895 debug_log "adoption_agency()"
896 debug_log "tree: #{serialize_els doc.children, false, true}"
897 debug_log "open_els: #{serialize_els open_els, true, true}"
898 debug_log "afe: #{serialize_els afe, true, true}"
899 # FIXME CONTINUE do WATWG thing here
900 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
901 el = open_els.shift()
902 # remove it from the list of active formatting elements (if found)
907 debug_log "aaa: starting off with subject on top of stack, exiting"
914 # 5. Let formatting element be the last element in the list of
915 # active formatting elements that: is between the end of the list
916 # and the last scope marker in the list, if any, or the start of
917 # the list otherwise, and has the tag name subject.
919 for t, fe_of_afe in afe
920 if t.type is TYPE_AFE_MARKER
925 # If there is no such element, then abort these steps and instead
926 # act as described in the "any other end tag" entry above.
928 debug_log "aaa: fe not found in afe"
929 in_body_any_other_end_tag subject
931 # 6. If formatting element is not in the stack of open elements,
932 # then this is a parse error; remove the element from the list, and
935 for t, fe_of_open_els in open_els
940 debug_log "aaa: fe not found in open_els"
942 # "remove it from the list" must mean afe, since it's not in open_els
943 afe.splice fe_of_afe, 1
945 # 7. If formatting element is in the stack of open elements, but
946 # the element is not in scope, then this is a parse error; abort
948 unless el_is_in_scope fe
949 debug_log "aaa: fe not in scope"
952 # 8. If formatting element is not the current node, this is a parse
953 # error. (But do not abort these steps.)
954 unless open_els[0] is fe
957 # 9. Let furthest block be the topmost node in the stack of open
958 # elements that is lower in the stack than formatting element, and
959 # is an element in the special category. There might not be one.
961 fb_of_open_els = null
968 # and continue, to see if there's one that's more "topmost"
969 # 10. If there is no furthest block, then the UA must first pop all
970 # the nodes from the bottom of the stack of open elements, from the
971 # current node up to and including formatting element, then remove
972 # formatting element from the list of active formatting elements,
973 # and finally abort these steps.
975 debug_log "aaa: no fb"
979 afe.splice fe_of_afe, 1
981 # 11. Let common ancestor be the element immediately above
982 # formatting element in the stack of open elements.
983 ca = open_els[fe_of_open_els + 1] # common ancestor
985 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
986 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
987 bookmark = new_aaa_bookmark()
990 afe.splice i, 0, bookmark
992 node = last_node = fb
996 # 3. Let node be the element immediately above node in the
997 # stack of open elements, or if node is no longer in the stack
998 # of open elements (e.g. because it got removed by this
999 # algorithm), the element that was immediately above node in
1000 # the stack of open elements before node was removed.
1002 for t, i in open_els
1004 node_next = open_els[i + 1]
1006 node = node_next ? node_above
1007 debug_log "inner loop #{inner}"
1008 debug_log "tree: #{serialize_els doc.children, false, true}"
1009 debug_log "open_els: #{serialize_els open_els, true, true}"
1010 debug_log "afe: #{serialize_els afe, true, true}"
1011 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1012 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1013 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1014 debug_log "node: #{node.serialize true, true}"
1015 # TODO make sure node_above gets re-set if/when node is removed from open_els
1017 # 4. If node is formatting element, then go to the next step in
1018 # the overall algorithm.
1021 debug_log "the meat"
1022 # 5. If inner loop counter is greater than three and node is in
1023 # the list of active formatting elements, then remove node from
1024 # the list of active formatting elements.
1030 debug_log "max out inner"
1035 # 6. If node is not in the list of active formatting elements,
1036 # then remove node from the stack of open elements and then go
1037 # back to the step labeled inner loop.
1039 debug_log "not in afe"
1040 for t, i in open_els
1042 node_above = open_els[i + 1]
1043 open_els.splice i, 1
1046 debug_log "the bones"
1047 # 7. create an element for the token for which the element node
1048 # was created, in the HTML namespace, with common ancestor as
1049 # the intended parent; replace the entry for node in the list
1050 # of active formatting elements with an entry for the new
1051 # element, replace the entry for node in the stack of open
1052 # elements with an entry for the new element, and let node be
1054 new_node = token_to_element node.token, NS_HTML, ca
1058 debug_log "replaced in afe"
1060 for t, i in open_els
1062 node_above = open_els[i + 1]
1063 open_els[i] = new_node
1064 debug_log "replaced in open_els"
1067 # 8. If last node is furthest block, then move the
1068 # aforementioned bookmark to be immediately after the new node
1069 # in the list of active formatting elements.
1074 debug_log "removed bookmark"
1078 # "after" means lower
1079 afe.splice i, 0, bookmark # "after as <-
1080 debug_log "placed bookmark after node"
1081 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1083 # 9. Insert last node into node, first removing it from its
1084 # previous parent node if any.
1085 if last_node.parent?
1086 debug_log "last_node has parent"
1087 for c, i in last_node.parent.children
1089 debug_log "removing last_node from parent"
1090 last_node.parent.children.splice i, 1
1092 node.children.push last_node
1093 last_node.parent = node
1094 # 10. Let last node be node.
1097 # 11. Return to the step labeled inner loop.
1098 # 14. Insert whatever last node ended up being in the previous step
1099 # at the appropriate place for inserting a node, but using common
1100 # ancestor as the override target.
1102 # In the case where fe is immediately followed by fb:
1103 # * inner loop exits out early (node==fe)
1105 # * last_node is still in the tree (not a duplicate)
1106 if last_node.parent?
1107 debug_log "FEFIRST? last_node has parent"
1108 for c, i in last_node.parent.children
1110 debug_log "removing last_node from parent"
1111 last_node.parent.children.splice i, 1
1114 debug_log "after aaa inner loop"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1119 debug_log "tree: #{serialize_els doc.children, false, true}"
1124 # can't use standard insert token thing, because it's already in
1125 # open_els and must stay at its current position in open_els
1126 dest = adjusted_insertion_location ca
1127 dest[0].children.splice dest[1], 0, last_node
1128 last_node.parent = dest[0]
1131 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1132 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1133 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1134 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1135 debug_log "tree: #{serialize_els doc.children, false, true}"
1137 # 15. Create an element for the token for which formatting element
1138 # was created, in the HTML namespace, with furthest block as the
1140 new_element = token_to_element fe.token, NS_HTML, fb
1141 # 16. Take all of the child nodes of furthest block and append them
1142 # to the element created in the last step.
1143 while fb.children.length
1144 t = fb.children.shift()
1145 t.parent = new_element
1146 new_element.children.push t
1147 # 17. Append that new element to furthest block.
1148 new_element.parent = fb
1149 fb.children.push new_element
1150 # 18. Remove formatting element from the list of active formatting
1151 # elements, and insert the new element into the list of active
1152 # formatting elements at the position of the aforementioned
1160 afe[i] = new_element
1162 # 19. Remove formatting element from the stack of open elements,
1163 # and insert the new element into the stack of open elements
1164 # immediately below the position of furthest block in that stack.
1165 for t, i in open_els
1167 open_els.splice i, 1
1169 for t, i in open_els
1171 open_els.splice i, 0, new_element
1173 # 20. Jump back to the step labeled outer loop.
1174 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1175 debug_log "tree: #{serialize_els doc.children, false, true}"
1176 debug_log "open_els: #{serialize_els open_els, true, true}"
1177 debug_log "afe: #{serialize_els afe, true, true}"
1178 debug_log "AAA DONE"
1180 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes the current p element. Implied end tags are generated first (p
# itself is exempt), then elements are shifted off the front of open_els
# (index 0 is the current node) and each one is tested for being an HTML p.
# The loop guard (length > 1) stops the shifting before the stack is emptied.
1181 close_p_element = ->
1182 generate_implied_end_tags 'p' # arg is exception
1183 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1185 while open_els.length > 1 # just in case
1186 el = open_els.shift()
1187 if el.name is 'p' and el.namespace is NS_HTML
# Convenience guard used by the in-body rules before inserting block-level
# elements: it only acts when an HTML p element is in button scope.
1189 close_p_if_in_button_scope = ->
1190 if is_in_button_scope 'p', NS_HTML
1193 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1194 # aka insert_a_character = (t) ->
# t is the character (text) token to insert. dest is the adjusted insertion
# location, as a [parent_node, index_within_children] pair.
1195 insert_character = (t) ->
1196 dest = adjusted_insertion_location()
1197 # fixfull check for Document node
# prev is the sibling immediately before the insertion point
# NOTE(review): prev is undefined when dest[1] is 0 -- confirm that case is guarded
1199 prev = dest[0].children[dest[1] - 1]
1200 if prev.type is TYPE_TEXT
# insert the token itself as a new child at the insertion index
1203 dest[0].children.splice dest[1], 0, t
1206 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree construction dispatcher; t is the token to process. The checks below
# look at the adjusted current node (acn) and the token type, following the
# spec's list of cases that are handled as HTML content. in_foreign_content
# receives the token otherwise (assumption: each case above exits early --
# TODO confirm).
1207 process_token = (t) ->
1208 acn = adjusted_current_node()
1212 if acn.namespace is NS_HTML
1215 if is_mathml_text_integration_point(acn)
# mglyph and malignmark start tags are the exceptions for this case
1216 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1219 if t.type is TYPE_TEXT
1222 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1225 if is_html_integration acn
1226 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1229 if t.type is TYPE_EOF
1232 in_foreign_content t
1236 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1237 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the "appropriate place for inserting a node".
#   override_target: optional node to use instead of the current node
#   returns: [target, target_i] -- the parent node and the index within its
#            children at which the new node should be spliced in
# Reminder: open_els index 0 is the current node; the last index is the
# topmost element (html).
1238 adjusted_insertion_location = (override_target = null) ->
1239 # 1. If there was an override target specified, then let target be the
1242 target = override_target
1243 else # Otherwise, let target be the current node.
1244 target = open_els[0]
1245 # 2. Determine the adjusted insertion location using the first matching
1246 # steps from the following list:
1248 # If foster parenting is enabled and target is a table, tbody, tfoot,
1249 # thead, or tr element Foster parenting happens when content is
1250 # misnested in tables.
# foster_parenting_targets maps element name -> namespace, so this one
# comparison checks both the name and the namespace of target
1251 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1252 loop # once. this is here so we can ``break`` to "abort these substeps"
1253 # 1. Let last template be the last template element in the
1254 # stack of open elements, if any.
1255 last_template = null
1256 last_template_i = null
1257 for el, i in open_els
1258 if el.name is 'template' and el.namespace is NS_HTML
1262 # 2. Let last table be the last table element in the stack of
1263 # open elements, if any.
1266 for el, i in open_els
1267 if el.name is 'table' and el.namespace is NS_HTML
1271 # 3. If there is a last template and either there is no last
1272 # table, or there is one, but last template is lower (more
1273 # recently added) than last table in the stack of open
1274 # elements, then: let adjusted insertion location be inside
1275 # last template's template contents, after its last child (if
1276 # any), and abort these substeps.
# a smaller index means lower in the stack (more recently added)
1277 if last_template and (last_table is null or last_template_i < last_table_i)
1278 target = last_template # fixfull should be its contents
1279 target_i = target.children.length
1281 # 4. If there is no last table, then let adjusted insertion
1282 # location be inside the first element in the stack of open
1283 # elements (the html element), after its last child (if any),
1284 # and abort these substeps. (fragment case)
1285 if last_table is null
1287 target = open_els[open_els.length - 1]
1288 target_i = target.children.length
1290 # 5. If last table has a parent element, then let adjusted
1291 # insertion location be inside last table's parent element,
1292 # immediately before last table, and abort these substeps.
1293 if last_table.parent?
1294 for c, i in last_table.parent.children
1296 target = last_table.parent
1300 # 6. Let previous element be the element immediately above last
1301 # table in the stack of open elements.
1303 # huh? how could it not have a parent?
# "above" means a higher index in open_els
1304 previous_element = open_els[last_table_i + 1]
1305 # 7. Let adjusted insertion location be inside previous
1306 # element, after its last child (if any).
1307 target = previous_element
1308 target_i = target.children.length
1309 # Note: These steps are involved in part because it's possible
1310 # for elements, the table element in this case in particular,
1311 # to have been moved by a script around in the DOM, or indeed
1312 # removed from the DOM entirely, after the element was inserted
1314 break # don't really loop
1316 # Otherwise Let adjusted insertion location be inside target, after
1317 # its last child (if any).
1318 target_i = target.children.length
1320 # 3. If the adjusted insertion location is inside a template element,
1321 # let it instead be inside the template element's template contents,
1322 # after its last child (if any).
1323 # fixfull (template)
1325 # 4. Return the adjusted insertion location.
1326 return [target, target_i]
1328 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1329 # aka create_an_element_for_token
# Builds a TYPE_TAG Node for a tag token.
#   t: the tag token (its name becomes the element name; the token itself
#      is kept on the node as .token)
#   namespace: namespace for the new element
#   intended_parent: the node the element is about to be inserted into
# Callers use the result as the new element.
1330 token_to_element = (t, namespace, intended_parent) ->
1331 # convert attributes into a hash
# each attribute a is a [name, value] pair; as written, a later duplicate
# overwrites an earlier one
1334 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1335 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1337 # TODO 2. If the newly created element has an xmlns attribute in the
1338 # XMLNS namespace whose value is not exactly the same as the element's
1339 # namespace, that is a parse error. Similarly, if the newly created
1340 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1341 # value is not the XLink Namespace, that is a parse error.
1343 # fixfull: the spec says stuff about form pointers and ownerDocument
1347 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for token in the given namespace and splices it into
# the tree at the adjusted insertion location.
#   token: start tag token to create the element from
#   namespace: namespace of the new element
# Callers (e.g. the head handling in ins_mode_before_head) use the result
# as the newly inserted element.
1348 insert_foreign_element = (token, namespace) ->
1349 ail = adjusted_insertion_location()
# ail_el / ail_i: presumably the node and index halves of ail -- TODO confirm
1352 el = token_to_element token, namespace, ail_el
1353 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1355 ail_el.children.splice ail_i, 0, el
1358 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Shorthand for insert_foreign_element with the HTML namespace; the result
# of that call is passed straight back to the caller.
1359 insert_html_element = (token) ->
1360 insert_foreign_element token, NS_HTML
1362 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1363 # position should be [node, index_within_children]
# t is the comment token; it is spliced directly into the children of
# position[0]. When position is omitted (null), the adjusted insertion
# location is used.
1364 insert_comment = (t, position = null) ->
1365 position ?= adjusted_insertion_location()
1366 position[0].children.splice position[1], 0, t
1369 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Raw text variant: inserts an HTML element for start tag token t, switches
# the tokenizer to the RAWTEXT state, remembers the current insertion mode
# in original_ins_mode, and then switches to the "text" insertion mode.
1370 parse_generic_raw_text = (t) ->
1371 insert_html_element t
1372 tok_state = tok_state_rawtext
1373 original_ins_mode = ins_mode
1374 ins_mode = ins_mode_text
# RCDATA variant of the generic text element parsing algorithm: same steps
# as parse_generic_raw_text, but the tokenizer is switched to the RCDATA
# state instead of RAWTEXT.
1375 parse_generic_rcdata_text = (t) ->
1376 insert_html_element t
1377 tok_state = tok_state_rcdata
1378 original_ins_mode = ins_mode
1379 ins_mode = ins_mode_text
1381 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1382 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# except: optional element name that must NOT be closed implicitly.
# The loop continues while the current node (open_els[0]) is listed in
# end_tag_implied (name -> namespace) and is not the excepted name.
1383 generate_implied_end_tags = (except = null) ->
1384 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1387 # 8.2.5.4 The rules for parsing tokens in HTML content
1388 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1390 # 8.2.5.4.1 The "initial" insertion mode
1391 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handler for the "initial" insertion mode; t is the token to process.
# Both the doctype case and the final case switch ins_mode to
# ins_mode_before_html.
1392 ins_mode_initial = (t) ->
1395 if t.type is TYPE_COMMENT
1399 if t.type is TYPE_DOCTYPE
1400 # FIXME check identifiers, set quirks, etc
1403 ins_mode = ins_mode_before_html
1406 #fixfull (iframe, quirks)
1407 ins_mode = ins_mode_before_html
1411 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handler for the "before html" insertion mode; t is the token to process.
# An html element is created either from an explicit <html> start tag or,
# in the "anything else" case, from a synthesized one, appended to the
# document, and the mode moves on to ins_mode_before_head.
1412 ins_mode_before_html = (t) ->
1413 if t.type is TYPE_DOCTYPE
1416 if t.type is TYPE_COMMENT
1421 if t.type is TYPE_START_TAG and t.name is 'html'
1422 el = token_to_element t, NS_HTML, doc
1423 doc.children.push el
# the html element becomes the current node (front of open_els)
1424 open_els.unshift(el)
1425 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1426 ins_mode = ins_mode_before_head
1428 if t.type is TYPE_END_TAG
1429 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1430 # fall through to "anything else"
# anything else: synthesize an html start tag and create the root from it
1435 html_tok = new_open_tag 'html'
1436 el = token_to_element html_tok, NS_HTML, doc
1437 doc.children.push el
1439 # ?fixfull browsing context
1440 ins_mode = ins_mode_before_head
1444 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handler for the "before head" insertion mode; t is the token to process.
# A head element is inserted either from an explicit <head> start tag or,
# in the "anything else" case, from a synthesized one. In both cases it is
# recorded in head_element_pointer and the mode becomes ins_mode_in_head.
1445 ins_mode_before_head = (t) ->
1448 if t.type is TYPE_COMMENT
1451 if t.type is TYPE_DOCTYPE
1454 if t.type is TYPE_START_TAG and t.name is 'html'
1457 if t.type is TYPE_START_TAG and t.name is 'head'
1458 el = insert_html_element t
1459 head_element_pointer = el
1460 ins_mode = ins_mode_in_head
1462 if t.type is TYPE_END_TAG
1463 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1464 # fall through to Anything else below
# anything else: synthesize a head start tag and insert it
1469 head_tok = new_open_tag 'head'
1470 el = insert_html_element head_tok
1471 head_element_pointer = el
1472 ins_mode = ins_mode_in_head
1475 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode: pops the current
# node and switches to ins_mode_after_head. t is the token that triggered
# the branch.
1476 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1477 open_els.shift() # spec says this will be a 'head' node
1478 ins_mode = ins_mode_after_head
# Handler for the "in head" insertion mode; t is the token to process.
# Covers whitespace, comments, doctype, and the start/end tags that are
# legal inside head (base/link/meta, title, noscript/noframes/style,
# script, template, ...). Tokens not covered here are handed to
# ins_mode_in_head_else.
1480 ins_mode_in_head = (t) ->
# NOTE(review): other modes use is_space_tok for the whitespace test --
# confirm whether this literal list is meant to match it
1481 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1484 if t.type is TYPE_COMMENT
1487 if t.type is TYPE_DOCTYPE
1490 if t.type is TYPE_START_TAG and t.name is 'html'
1493 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1494 el = insert_html_element t
1496 t.acknowledge_self_closing()
1498 if t.type is TYPE_START_TAG and t.name is 'meta'
1499 el = insert_html_element t
1501 t.acknowledge_self_closing()
1502 # fixfull encoding stuff
1504 if t.type is TYPE_START_TAG and t.name is 'title'
1505 parse_generic_rcdata_text t
1507 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1508 parse_generic_raw_text t
1510 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1511 insert_html_element t
1512 ins_mode = ins_mode_in_head_noscript
1514 if t.type is TYPE_START_TAG and t.name is 'script'
1515 ail = adjusted_insertion_location()
# ail is a [node, index] pair; the intended parent is the node half
1516 el = token_to_element t, NS_HTML, ail[0]
1517 el.flag 'parser-inserted', true
1518 # fixfull fragment case
1519 ail[0].children.splice ail[1], 0, el
1521 tok_state = tok_state_script_data
1522 original_ins_mode = ins_mode # make sure orig... is defined
1523 ins_mode = ins_mode_text
1525 if t.type is TYPE_END_TAG and t.name is 'head'
1526 open_els.shift() # will be a head element... spec says so
1527 ins_mode = ins_mode_after_head
1529 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1530 ins_mode_in_head_else t
1532 if t.type is TYPE_START_TAG and t.name is 'template'
1533 insert_html_element t
1535 flag_frameset_ok = false
1536 ins_mode = ins_mode_in_template
1537 template_ins_modes.unshift ins_mode_in_template
1539 if t.type is TYPE_END_TAG and t.name is 'template'
1540 if template_tag_is_open()
# parentheses are required: a bare name is only a reference in
# CoffeeScript and would not run the function
1541 generate_implied_end_tags()
1542 if open_els[0].name isnt 'template'
1545 el = open_els.shift()
1546 if el.name is 'template' and el.namespace is NS_HTML
1548 clear_afe_to_marker()
1549 template_ins_modes.shift()
1554 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1557 ins_mode_in_head_else t
1559 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" insertion mode; t is the
# token that triggered it. The insertion mode goes back to ins_mode_in_head.
1560 ins_mode_in_head_noscript_else = (t) ->
1563 ins_mode = ins_mode_in_head
# Handler for the "in head noscript" insertion mode; t is the token to
# process. A </noscript> end tag returns to ins_mode_in_head; </br> and any
# token not matched by the cases below go to ins_mode_in_head_noscript_else.
1565 ins_mode_in_head_noscript = (t) ->
1566 if t.type is TYPE_DOCTYPE
1569 if t.type is TYPE_START_TAG and t.name is 'html'
1572 if t.type is TYPE_END_TAG and t.name is 'noscript'
1574 ins_mode = ins_mode_in_head
# whitespace, comments and these start tags form one combined case
1576 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1579 if t.type is TYPE_END_TAG and t.name is 'br'
1580 ins_mode_in_head_noscript_else t
1582 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1586 ins_mode_in_head_noscript_else t
1591 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode; t is the token
# that triggered it. A body element is inserted from a synthesized start
# tag and the insertion mode becomes ins_mode_in_body.
1592 ins_mode_after_head_else = (t) ->
1593 body_tok = new_open_tag 'body'
1594 insert_html_element body_tok
1595 ins_mode = ins_mode_in_body
# Handler for the "after head" insertion mode; t is the token to process.
# <body> and <frameset> start tags insert their element and switch mode.
# Head-only start tags (base, link, meta, script, ...) temporarily put the
# head element back on the stack of open elements and remove it again
# afterwards. Tokens not matched below go to ins_mode_after_head_else.
1598 ins_mode_after_head = (t) ->
1602 if t.type is TYPE_COMMENT
1605 if t.type is TYPE_DOCTYPE
1608 if t.type is TYPE_START_TAG and t.name is 'html'
1611 if t.type is TYPE_START_TAG and t.name is 'body'
1612 insert_html_element t
1613 flag_frameset_ok = false
1614 ins_mode = ins_mode_in_body
1616 if t.type is TYPE_START_TAG and t.name is 'frameset'
1617 insert_html_element t
1618 ins_mode = ins_mode_in_frameset
1620 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1622 open_els.unshift head_element_pointer
# find the head element again and take it back off the stack. This must be
# "in" (array iteration: el is the element, i its index); "of" would yield
# key/value pairs and the comparison below could never match
1624 for el, i in open_els
1625 if el is head_element_pointer
1626 open_els.splice i, 1
1628 console.log "warning: 23904 couldn't find head element in open_els"
1630 if t.type is TYPE_END_TAG and t.name is 'template'
1633 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1634 ins_mode_after_head_else t
1636 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1640 ins_mode_after_head_else t
1642 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling for the "in body" insertion mode.
#   name: tag name of the end tag being processed
# Walks open_els starting at the current node (index 0). On the first HTML
# element with a matching name, implied end tags are generated (except for
# name itself) and a parse error is reported if that element was not the
# current node. The walk also tests each element against special_elements
# (name -> namespace).
1643 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1644 for el, i in open_els
1645 if el.name is name and el.namespace is NS_HTML
1646 generate_implied_end_tags name # arg is exception
1647 parse_error() unless i is 0
1652 if special_elements[el.name] is el.namespace
1656 ins_mode_in_body = (t) ->
1657 if t.type is TYPE_TEXT and t.text is "\u0000"
1664 if t.type is TYPE_TEXT
1667 flag_frameset_ok = false
1669 if t.type is TYPE_COMMENT
1672 if t.type is TYPE_DOCTYPE
1675 if t.type is TYPE_START_TAG and t.name is 'html'
1677 return if template_tag_is_open()
1678 root_attrs = open_els[open_els.length - 1].attrs
1680 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1683 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1686 if t.type is TYPE_START_TAG and t.name is 'body'
1688 return if open_els.length < 2
1689 second = open_els[open_els.length - 2]
1690 return unless second.namespace is NS_HTML
1691 return unless second.name is 'body'
1692 return if template_tag_is_open()
1693 flag_frameset_ok = false
1695 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1697 if t.type is TYPE_START_TAG and t.name is 'frameset'
1699 return if open_els.length < 2
1700 second_i = open_els.length - 2
1701 second = open_els[second_i]
1702 return unless second.namespace is NS_HTML
1703 return unless second.name is 'body'
1704 if flag_frameset_ok is false
1707 for el, i in second.parent.children
1709 second.parent.children.splice i, 1
1711 open_els.splice second_i, 1
1712 # pop everything except the "root html element"
1713 while open_els.length > 1
1715 insert_html_element t
1716 ins_mode = ins_mode_in_frameset
1718 if t.type is TYPE_EOF
1720 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1721 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1722 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1725 unless ok_tags[t.name] is el.namespace
1728 if template_ins_modes.length > 0
1729 ins_mode_in_template t
1733 if t.type is TYPE_END_TAG and t.name is 'body'
1734 unless is_in_scope 'body', NS_HTML
1738 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1739 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1740 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1741 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1745 unless ok_tags[t.name] is el.namespace
1748 ins_mode = ins_mode_after_body
1750 if t.type is TYPE_END_TAG and t.name is 'html'
1751 unless is_in_scope 'body', NS_HTML
1755 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1756 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1757 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1758 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1762 unless ok_tags[t.name] is el.namespace
1765 ins_mode = ins_mode_after_body
1768 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1769 close_p_if_in_button_scope()
1770 insert_html_element t
1772 if t.type is TYPE_START_TAG and h_tags[t.name]?
1773 close_p_if_in_button_scope()
1774 if h_tags[open_els[0].name] is open_els[0].namespace
1777 insert_html_element t
1779 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1780 close_p_if_in_button_scope()
1781 insert_html_element t
1782 # spec: If the next token is a "LF" (U+000A) character token, then
1783 # ignore that token and move on to the next one. (Newlines at the
1784 # start of pre blocks are ignored as an authoring convenience.)
1785 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1787 flag_frameset_ok = false
1789 if t.type is TYPE_START_TAG and t.name is 'form'
1790 unless form_element_pointer is null or template_tag_is_open()
1793 close_p_if_in_button_scope()
1794 el = insert_html_element t
1795 unless template_tag_is_open()
1796 form_element_pointer = el
1798 if t.type is TYPE_START_TAG and t.name is 'li'
1799 flag_frameset_ok = false
1800 for node in open_els
1801 if node.name is 'li' and node.namespace is NS_HTML
1802 generate_implied_end_tags 'li' # arg is exception
1803 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1806 el = open_els.shift()
1807 if el.name is 'li' and el.namespace is NS_HTML
1810 if el_is_special_not_adp node
1812 close_p_if_in_button_scope()
1813 insert_html_element t
1815 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1816 flag_frameset_ok = false
1817 for node in open_els
1818 if node.name is 'dd' and node.namespace is NS_HTML
1819 generate_implied_end_tags 'dd' # arg is exception
1820 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1823 el = open_els.shift()
1824 if el.name is 'dd' and el.namespace is NS_HTML
1827 if node.name is 'dt' and node.namespace is NS_HTML
1828 generate_implied_end_tags 'dt' # arg is exception
1829 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1832 el = open_els.shift()
1833 if el.name is 'dt' and el.namespace is NS_HTML
1836 if el_is_special_not_adp node
1838 close_p_if_in_button_scope()
1839 insert_html_element t
1841 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1842 close_p_if_in_button_scope()
1843 insert_html_element t
1844 tok_state = tok_state_plaintext
1846 if t.type is TYPE_START_TAG and t.name is 'button'
1847 if is_in_scope 'button', NS_HTML
1849 generate_implied_end_tags()
1851 el = open_els.shift()
1852 if el.name is 'button' and el.namespace is NS_HTML
1855 insert_html_element t
1856 flag_frameset_ok = false
1858 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1859 unless is_in_scope t.name, NS_HTML
1862 generate_implied_end_tags()
1863 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1866 el = open_els.shift()
1867 if el.name is t.name and el.namespace is NS_HTML
1870 if t.type is TYPE_END_TAG and t.name is 'form'
1871 unless template_tag_is_open()
1872 node = form_element_pointer
1873 form_element_pointer = null
1874 if node is null or not el_is_in_scope node
1877 generate_implied_end_tags()
1878 if open_els[0] isnt node
1880 for el, i in open_els
1882 open_els.splice i, 1
1885 unless is_in_scope 'form', NS_HTML
1888 generate_implied_end_tags()
1889 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1892 el = open_els.shift()
1893 if el.name is 'form' and el.namespace is NS_HTML
1896 if t.type is TYPE_END_TAG and t.name is 'p'
1897 unless is_in_button_scope 'p', NS_HTML
1899 insert_html_element new_open_tag 'p'
1902 if t.type is TYPE_END_TAG and t.name is 'li'
1903 unless is_in_li_scope 'li', NS_HTML
1906 generate_implied_end_tags 'li' # arg is exception
1907 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1910 el = open_els.shift()
1911 if el.name is 'li' and el.namespace is NS_HTML
1914 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1915 unless is_in_scope t.name, NS_HTML
1918 generate_implied_end_tags t.name # arg is exception
1919 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1922 el = open_els.shift()
1923 if el.name is t.name and el.namespace is NS_HTML
1926 if t.type is TYPE_END_TAG and h_tags[t.name]?
1929 if h_tags[el.name] is el.namespace
1932 if standard_scopers[el.name] is el.namespace
1937 generate_implied_end_tags()
1938 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1941 el = open_els.shift()
1942 if h_tags[el.name] is el.namespace
1946 if t.type is TYPE_START_TAG and t.name is 'a'
1947 # If the list of active formatting elements contains an a element
1948 # between the end of the list and the last marker on the list (or
1949 # the start of the list if there is no marker on the list), then
1950 # this is a parse error; run the adoption agency algorithm for the
1951 # tag name "a", then remove that element from the list of active
1952 # formatting elements and the stack of open elements if the
1953 # adoption agency algorithm didn't already remove it (it might not
1954 # have if the element is not in table scope).
1957 if el.type is TYPE_AFE_MARKER
1959 if el.name is 'a' and el.namespace is NS_HTML
1967 for el, i in open_els
1969 open_els.splice i, 1
1971 el = insert_html_element t
1974 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1976 el = insert_html_element t
1979 if t.type is TYPE_START_TAG and t.name is 'nobr'
1981 el = insert_html_element t
1984 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1985 adoption_agency t.name
1987 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1989 insert_html_element t
1991 flag_frameset_ok = false
1993 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1994 unless is_in_scope t.name, NS_HTML
1997 generate_implied_end_tags()
1998 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2001 el = open_els.shift()
2002 if el.name is t.name and el.namespace is NS_HTML
2004 clear_afe_to_marker()
2006 if t.type is TYPE_START_TAG and t.name is 'table'
2007 close_p_if_in_button_scope() # fixfull quirksmode thing
2008 insert_html_element t
2009 flag_frameset_ok = false
2010 ins_mode = ins_mode_in_table
2012 if t.type is TYPE_END_TAG and t.name is 'br'
2014 t.type is TYPE_START_TAG
2016 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2018 insert_html_element t
2020 t.acknowledge_self_closing()
2021 flag_frameset_ok = false
2023 if t.type is TYPE_START_TAG and t.name is 'input'
2025 insert_html_element t
2027 t.acknowledge_self_closing()
2028 unless is_input_hidden_tok t
2029 flag_frameset_ok = false
2031 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2032 insert_html_element t
2034 t.acknowledge_self_closing()
2036 if t.type is TYPE_START_TAG and t.name is 'hr'
2037 close_p_if_in_button_scope()
2038 insert_html_element t
2040 t.acknowledge_self_closing()
2041 flag_frameset_ok = false
2043 if t.type is TYPE_START_TAG and t.name is 'image'
2048 if t.type is TYPE_START_TAG and t.name is 'isindex'
2050 if template_tag_is_open() is false and form_element_pointer isnt null
2052 t.acknowledge_self_closing()
2053 flag_frameset_ok = false
2054 close_p_if_in_button_scope()
2055 el = insert_html_element new_open_tag 'form'
2056 unless template_tag_is_open()
2057 form_element_pointer = el
2060 el.attrs['action'] = a[1]
2062 insert_html_element new_open_tag 'hr'
2065 insert_html_element new_open_tag 'label'
2066 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2067 input_el = new_open_tag 'input'
2072 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2073 input_el.attrs_a.push [a[0], a[1]]
2074 input_el.attrs_a.push ['name', 'isindex']
2075 # fixfull this next bit is in english... internationalize?
2076 prompt ?= "This is a searchable index. Enter search keywords: "
2077 insert_character new_character_token prompt # fixfull split
2078 # TODO submit typo "balue" in spec
2079 insert_html_element input_el
2081 # insert_character '' # you can put chars here if prompt attr missing
2083 insert_html_element new_open_tag 'hr'
2086 unless template_tag_is_open()
2087 form_element_pointer = null
2089 if t.type is TYPE_START_TAG and t.name is 'textarea'
2090 insert_html_element t
2091 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2093 tok_state = tok_state_rcdata
2094 original_ins_mode = ins_mode
2095 flag_frameset_ok = false
2096 ins_mode = ins_mode_text
2098 if t.type is TYPE_START_TAG and t.name is 'xmp'
2099 close_p_if_in_button_scope()
2101 flag_frameset_ok = false
2102 parse_generic_raw_text t
2104 if t.type is TYPE_START_TAG and t.name is 'iframe'
2105 flag_frameset_ok = false
2106 parse_generic_raw_text t
2108 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2109 parse_generic_raw_text t
2111 if t.type is TYPE_START_TAG and t.name is 'select'
2113 insert_html_element t
2114 flag_frameset_ok = false
2115 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2116 ins_mode = ins_mode_in_select_in_table
2118 ins_mode = ins_mode_in_select
2120 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2121 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2124 insert_html_element t
2126 # this comment block implements the W3C spec
2127 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2128 # if is_in_scope 'ruby', NS_HTML
2129 # generate_implied_end_tags()
2130 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2132 # insert_html_element t
2134 # if t.type is TYPE_START_TAG and t.name is 'rt'
2135 # if is_in_scope 'ruby', NS_HTML
2136 # generate_implied_end_tags 'rtc' # arg is exception
2137 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2139 # insert_html_element t
2141 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2142 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2143 if is_in_scope 'ruby', NS_HTML
2144 generate_implied_end_tags()
2145 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2147 insert_html_element t
2149 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2150 if is_in_scope 'ruby', NS_HTML
2151 generate_implied_end_tags 'rtc'
2152 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2154 insert_html_element t
2157 if t.type is TYPE_START_TAG and t.name is 'math'
2159 adjust_mathml_attributes t
2160 adjust_foreign_attributes t
2161 insert_foreign_element t, NS_MATHML
2162 if t.flag 'self-closing'
2164 t.acknowledge_self_closing()
2166 if t.type is TYPE_START_TAG and t.name is 'svg'
2168 adjust_svg_attributes t
2169 adjust_foreign_attributes t
2170 insert_foreign_element t, NS_SVG
2171 if t.flag 'self-closing'
2173 t.acknowledge_self_closing()
2175 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2178 if t.type is TYPE_START_TAG # any other start tag
2180 insert_html_element t
2182 if t.type is TYPE_END_TAG # any other end tag
2183 in_body_any_other_end_tag t.name
2187 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# "text" insertion mode: dispatches on the type of token t. The EOF branch and
# both end-tag branches put ins_mode back to the value saved in
# original_ins_mode.
2188 ins_mode_text = (t) ->
2189 if t.type is TYPE_TEXT
2192 if t.type is TYPE_EOF
# an HTML <script> that is still the current node at EOF gets its
# 'already started' flag set
2194 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2195 open_els[0].flag 'already started', true
2197 ins_mode = original_ins_mode
2200 if t.type is TYPE_END_TAG and t.name is 'script'
2202 ins_mode = original_ins_mode
2203 # fixfull the spec seems to assume that I'm going to run the script
2204 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2206 if t.type is TYPE_END_TAG
2208 ins_mode = original_ins_mode
# diagnostic for a token that matched none of the cases above
2210 console.log 'warning: end of ins_mode_text reached'
2212 # the functions below implement the tokenizer states described here:
2213 # http://www.w3.org/TR/html5/syntax.html#tokenization
2215 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Fallback handler called by ins_mode_in_table and ins_mode_in_table_text for
# tokens they do not handle themselves. It raises flag_foster_parenting and
# lowers it again before finishing.
# NOTE(review): presumably t is processed between the two assignments (the
# spec uses the "in body" rules with foster parenting enabled) - confirm
2216 ins_mode_in_table_else = (t) ->
2218 flag_foster_parenting = true
2220 flag_foster_parenting = false
# "in table" insertion mode: handles token t while a table is the relevant
# context. Tokens without a dedicated case are handed to ins_mode_in_table_else.
2222 ins_mode_in_table = (t) ->
# when the current node is an HTML table/tbody/tfoot/thead/tr element: start
# with an empty pending-token buffer, remember the current mode and switch to
# ins_mode_in_table_text
2225 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2226 pending_table_character_tokens = []
2227 original_ins_mode = ins_mode
2228 ins_mode = ins_mode_in_table_text
2231 ins_mode_in_table_else t
2239 clear_stack_to_table_context()
2241 insert_html_element t
2242 ins_mode = ins_mode_in_caption
2244 clear_stack_to_table_context()
2245 insert_html_element t
2246 ins_mode = ins_mode_in_column_group
2248 clear_stack_to_table_context()
# a synthesized <colgroup> start tag is inserted here, not t itself
2249 insert_html_element new_open_tag 'colgroup'
2250 ins_mode = ins_mode_in_column_group
2252 when 'tbody', 'tfoot', 'thead'
2253 clear_stack_to_table_context()
2254 insert_html_element t
2255 ins_mode = ins_mode_in_table_body
2256 when 'td', 'th', 'tr'
2257 clear_stack_to_table_context()
# a synthesized <tbody> start tag is inserted here, not t itself
2258 insert_html_element new_open_tag 'tbody'
2259 ins_mode = ins_mode_in_table_body
2263 if is_in_table_scope 'table', NS_HTML
# elements are taken off the front of open_els, checking each for HTML <table>
2265 el = open_els.shift()
2266 if el.name is 'table' and el.namespace is NS_HTML
2270 when 'style', 'script', 'template'
# an input token that is_input_hidden_tok rejects goes to the fallback handler
2273 unless is_input_hidden_tok t
2274 ins_mode_in_table_else t
2277 el = insert_html_element t
2279 t.acknowledge_self_closing()
2282 if form_element_pointer?
2284 if template_tag_is_open()
# the element inserted for t becomes the form element pointer
2286 form_element_pointer = insert_html_element t
2289 ins_mode_in_table_else t
2293 if is_in_table_scope 'table', NS_HTML
2295 el = open_els.shift()
2296 if el.name is 'table' and el.namespace is NS_HTML
2301 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2306 ins_mode_in_table_else t
2310 ins_mode_in_table_else t
2313 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# "in table text" insertion mode: text tokens are appended to
# pending_table_character_tokens. The remaining code scans the buffered tokens
# with is_space_tok, replays them through insert_character or through
# ins_mode_in_table_else, empties the buffer and restores ins_mode from
# original_ins_mode.
# NOTE(review): presumably the ins_mode_in_table_else path is taken when any
# buffered token is not whitespace - confirm
2314 ins_mode_in_table_text = (t) ->
2315 if t.type is TYPE_TEXT and t.text is "\u0000"
2319 if t.type is TYPE_TEXT
2320 pending_table_character_tokens.push t
2324 for old in pending_table_character_tokens
2325 unless is_space_tok old
2329 for old in pending_table_character_tokens
2330 insert_character old
2332 for old in pending_table_character_tokens
2333 ins_mode_in_table_else old
2334 pending_table_character_tokens = []
2335 ins_mode = original_ins_mode
2338 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# "in caption" insertion mode. Both caption-closing branches below require a
# caption in table scope, take elements off the front of open_els while
# checking for the HTML <caption>, call clear_afe_to_marker() and switch to
# ins_mode_in_table.
2339 ins_mode_in_caption = (t) ->
2340 if t.type is TYPE_END_TAG and t.name is 'caption'
2341 if is_in_table_scope 'caption', NS_HTML
2342 generate_implied_end_tags()
# NOTE(review): unlike the loop below, this check compares only the name, not
# the namespace
2343 if open_els[0].name isnt 'caption'
2346 el = open_els.shift()
2347 if el.name is 'caption' and el.namespace is NS_HTML
2349 clear_afe_to_marker()
2350 ins_mode = ins_mode_in_table
# table-structure start tags and </table> also close the caption
2355 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2357 if is_in_table_scope 'caption', NS_HTML
2359 el = open_els.shift()
2360 if el.name is 'caption' and el.namespace is NS_HTML
2362 clear_afe_to_marker()
2363 ins_mode = ins_mode_in_table
2365 # else fragment case
2367 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2373 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# "in column group" insertion mode: handles token t while a <colgroup> is being
# built. <col> start tags are inserted and their self-closing flag is
# acknowledged; </colgroup> returns to ins_mode_in_table when the current node
# is the HTML colgroup element.
2374 ins_mode_in_column_group = (t) ->
2378 if t.type is TYPE_COMMENT
2381 if t.type is TYPE_DOCTYPE
2384 if t.type is TYPE_START_TAG and t.name is 'html'
2387 if t.type is TYPE_START_TAG and t.name is 'col'
2388 el = insert_html_element t
2390 t.acknowledge_self_closing()
2392 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# The namespace belongs to the current node (open_els[0]). Reading it from the
# open_els array itself always yields undefined, so this test could never pass.
2393 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2395 ins_mode = ins_mode_in_table
2399 if t.type is TYPE_END_TAG and t.name is 'col'
2402 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2405 if t.type is TYPE_EOF
2409 if open_els[0].name isnt 'colgroup'
2413 ins_mode = ins_mode_in_table
2417 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# "in table body" insertion mode: handles token t while a tbody/thead/tfoot is
# the table section being filled.
2418 ins_mode_in_table_body = (t) ->
2419 if t.type is TYPE_START_TAG and t.name is 'tr'
2420 clear_stack_to_table_body_context()
2421 insert_html_element t
2422 ins_mode = ins_mode_in_row
2424 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2426 clear_stack_to_table_body_context()
# a cell directly inside a table section gets a synthesized <tr> inserted
2427 insert_html_element new_open_tag 'tr'
2428 ins_mode = ins_mode_in_row
2431 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2432 unless is_in_table_scope t.name, NS_HTML
2435 clear_stack_to_table_body_context()
2437 ins_mode = ins_mode_in_table
2439 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# the two tests below examine an element el for being an HTML table section
# and for being listed in table_scopers
# NOTE(review): presumably el iterates over open_els as a hand-rolled
# "tbody/thead/tfoot in table scope" check - confirm
2442 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2445 if table_scopers[el.name] is el.namespace
2450 clear_stack_to_table_body_context()
2452 ins_mode = ins_mode_in_table
2455 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2461 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# "in row" insertion mode: handles token t while a <tr> is open. Cell start
# tags move to ins_mode_in_cell; the row-closing branches clear the stack back
# to a row context and fall back to ins_mode_in_table_body.
2462 ins_mode_in_row = (t) ->
2463 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2464 clear_stack_to_table_row_context()
2465 insert_html_element t
2466 ins_mode = ins_mode_in_cell
2469 if t.type is TYPE_END_TAG and t.name is 'tr'
2470 if is_in_table_scope 'tr', NS_HTML
2471 clear_stack_to_table_row_context()
2473 ins_mode = ins_mode_in_table_body
2477 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2478 if is_in_table_scope 'tr', NS_HTML
2479 clear_stack_to_table_row_context()
2481 ins_mode = ins_mode_in_table_body
2486 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# both the named section and a tr must be in table scope for the row to close
2487 if is_in_table_scope t.name, NS_HTML
2488 if is_in_table_scope 'tr', NS_HTML
2489 clear_stack_to_table_row_context()
2491 ins_mode = ins_mode_in_table_body
2496 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2502 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Closes the open table cell: generates implied end tags, checks that the
# current node is an HTML td or th, takes elements off the front of open_els
# while checking for that cell, clears the active formatting elements up to the
# last marker and switches to ins_mode_in_row.
2504 generate_implied_end_tags()
# Compare the element's *name* with 'th'. Comparing the element object itself
# with a string is always false, which made every <th> cell fail this check.
2505 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2508 el = open_els.shift()
2509 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2511 clear_afe_to_marker()
2512 ins_mode = ins_mode_in_row
2514 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# "in cell" insertion mode: handles token t while a td/th is open.
2515 ins_mode_in_cell = (t) ->
2516 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2517 if is_in_table_scope t.name, NS_HTML
2518 generate_implied_end_tags()
# the current node is expected to be the HTML element named by the end tag
2519 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
# elements are taken off the front of open_els, checking each for the matching cell
2522 el = open_els.shift()
2523 if el.name is t.name and el.namespace is NS_HTML
2525 clear_afe_to_marker()
2526 ins_mode = ins_mode_in_row
2530 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# the two tests below examine an element el for being an HTML td/th and for
# being listed in table_scopers
# NOTE(review): presumably el iterates over open_els as a hand-rolled
# "td or th in table scope" check - confirm
2533 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2536 if table_scopers[el.name] is el.namespace
2544 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2547 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2548 if is_in_table_scope t.name, NS_HTML
2557 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# "in select" insertion mode: handles token t while a <select> is open. The
# option/optgroup branches inspect the current node (open_els[0]) before
# inserting or closing; the branches that close the select take elements off
# the front of open_els while checking for the HTML <select>.
2558 ins_mode_in_select = (t) ->
2559 if t.type is TYPE_TEXT and t.text is "\u0000"
2562 if t.type is TYPE_TEXT
2565 if t.type is TYPE_COMMENT
2568 if t.type is TYPE_DOCTYPE
2571 if t.type is TYPE_START_TAG and t.name is 'html'
2574 if t.type is TYPE_START_TAG and t.name is 'option'
2575 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2577 insert_html_element t
2579 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2580 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2582 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2584 insert_html_element t
2586 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# The namespace check is an equality test ("is"); the previous "in" performed a
# membership test against NS_HTML instead of comparing with it.
2587 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# The node just before the current node is open_els[1], so its own namespace
# is the one to check (the previous code re-tested open_els[0]).
2588 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2590 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2595 if t.type is TYPE_END_TAG and t.name is 'option'
2596 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2601 if t.type is TYPE_END_TAG and t.name is 'select'
2602 if is_in_select_scope 'select', NS_HTML
2604 el = open_els.shift()
2605 if el.name is 'select' and el.namespace is NS_HTML
2611 if t.type is TYPE_START_TAG and t.name is 'select'
2614 el = open_els.shift()
2615 if el.name is 'select' and el.namespace is NS_HTML
2618 # spec says that this is the same as </select> but it doesn't say
2619 # to check scope first
2621 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2623 if is_in_select_scope 'select', NS_HTML
2626 el = open_els.shift()
2627 if el.name is 'select' and el.namespace is NS_HTML
2632 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2635 if t.type is TYPE_EOF
2642 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# "in select in table" insertion mode: table-structure start and end tags get
# special handling that takes elements off open_els while checking for the
# HTML <select>; every other token is delegated to ins_mode_in_select.
2643 ins_mode_in_select_in_table = (t) ->
2644 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2647 el = open_els.shift()
2648 if el.name is 'select' and el.namespace is NS_HTML
2653 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
# the end-tag branch first tests whether an element with the tag's name is in table scope
2655 unless is_in_table_scope t.name, NS_HTML
2658 el = open_els.shift()
2659 if el.name is 'select' and el.namespace is NS_HTML
2665 ins_mode_in_select t
2668 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# "in template" insertion mode. For start tags that imply a context, the entry
# at the front of template_ins_modes is replaced with the matching insertion
# mode and ins_mode is set to that same mode.
2669 ins_mode_in_template = (t) ->
2670 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2673 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
# table-level content -> in table
2676 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2677 template_ins_modes.shift()
2678 template_ins_modes.unshift ins_mode_in_table
2679 ins_mode = ins_mode_in_table
# <col> -> in column group
2682 if t.type is TYPE_START_TAG and t.name is 'col'
2683 template_ins_modes.shift()
2684 template_ins_modes.unshift ins_mode_in_column_group
2685 ins_mode = ins_mode_in_column_group
# <tr> -> in table body
2688 if t.type is TYPE_START_TAG and t.name is 'tr'
2689 template_ins_modes.shift()
2690 template_ins_modes.unshift ins_mode_in_table_body
2691 ins_mode = ins_mode_in_table_body
# <td>/<th> -> in row
2694 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2695 template_ins_modes.shift()
2696 template_ins_modes.unshift ins_mode_in_row
2697 ins_mode = ins_mode_in_row
# any other start tag -> in body
2700 if t.type is TYPE_START_TAG
2701 template_ins_modes.shift()
2702 template_ins_modes.unshift ins_mode_in_body
2703 ins_mode = ins_mode_in_body
2706 if t.type is TYPE_END_TAG
2709 if t.type is TYPE_EOF
2710 unless template_tag_is_open()
# elements are taken off open_els while checking for the HTML <template>; then
# the formatting list is cleared to its last marker and the front entry of
# template_ins_modes is dropped
2715 el = open_els.shift()
2716 if el.name is 'template' and el.namespace is NS_HTML
2718 clear_afe_to_marker()
2719 template_ins_modes.shift()
2723 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# "after body" insertion mode: handles token t once the body has been closed.
2724 ins_mode_after_body = (t) ->
2728 if t.type is TYPE_COMMENT
# the last entry of open_els is the first element that was opened (see the
# stack-direction notes at the top of the file); the comment is inserted at
# the end of that element's children
2729 first = open_els[open_els.length - 1]
2730 insert_comment t, [first, first.children.length]
2732 if t.type is TYPE_DOCTYPE
2735 if t.type is TYPE_START_TAG and t.name is 'html'
2738 if t.type is TYPE_END_TAG and t.name is 'html'
2739 if flag_fragment_parsing
2742 ins_mode = ins_mode_after_after_body
2744 if t.type is TYPE_EOF
# tokens with no case above send the parser back to "in body"
2749 ins_mode = ins_mode_in_body
2752 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# "in frameset" insertion mode: handles token t while a <frameset> is open.
2753 ins_mode_in_frameset = (t) ->
2757 if t.type is TYPE_COMMENT
2760 if t.type is TYPE_DOCTYPE
2763 if t.type is TYPE_START_TAG and t.name is 'html'
2766 if t.type is TYPE_START_TAG and t.name is 'frameset'
2767 insert_html_element t
2769 if t.type is TYPE_END_TAG and t.name is 'frameset'
# a stack holding a single element has nothing to close
2770 if open_els.length is 1
2772 return # fragment case
# outside fragment parsing, once the current node is no longer a frameset the
# parser moves on to "after frameset"
2774 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2775 ins_mode = ins_mode_after_frameset
2777 if t.type is TYPE_START_TAG and t.name is 'frame'
2778 insert_html_element t
2780 t.acknowledge_self_closing()
2782 if t.type is TYPE_START_TAG and t.name is 'noframes'
2785 if t.type is TYPE_EOF
2786 if open_els.length isnt 1
2794 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# "after frameset" insertion mode: dispatches on token t after the outermost
# frameset has been closed; </html> advances to ins_mode_after_after_frameset.
2795 ins_mode_after_frameset = (t) ->
2799 if t.type is TYPE_COMMENT
2802 if t.type is TYPE_DOCTYPE
2805 if t.type is TYPE_START_TAG and t.name is 'html'
2808 if t.type is TYPE_END_TAG and t.name is 'html'
2809 ins_mode = ins_mode_after_after_frameset
2811 if t.type is TYPE_START_TAG and t.name is 'noframes'
2814 if t.type is TYPE_EOF
2821 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# "after after body" insertion mode: comments are appended to the document
# node; tokens with no case here send the parser back to ins_mode_in_body.
2822 ins_mode_after_after_body = (t) ->
2823 if t.type is TYPE_COMMENT
# insertion position is the end of doc's children
2824 insert_comment t, [doc, doc.children.length]
2826 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2829 if t.type is TYPE_EOF
2834 ins_mode = ins_mode_in_body
2838 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# "after after frameset" insertion mode: comments are appended to the document
# node; the other visible cases are DOCTYPE / whitespace / <html>, EOF and
# <noframes>.
2839 ins_mode_after_after_frameset = (t) ->
2840 if t.type is TYPE_COMMENT
# insertion position is the end of doc's children
2841 insert_comment t, [doc, doc.children.length]
2843 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2846 if t.type is TYPE_EOF
2849 if t.type is TYPE_START_TAG and t.name is 'noframes'
2856 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Helper for in_foreign_content's <font> case: tests attribute entries a, whose
# first item a[0] is the attribute name, for 'color', 'face' or 'size'.
# NOTE(review): presumably a iterates over t.attrs_a and the result is a
# boolean - confirm
2857 has_color_face_or_size = (t) ->
2859 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Handler for the end of a script element in foreign content. It is called by
# in_foreign_content_other_start for a self-closing script start tag and by
# in_foreign_content for </script> when the current node is an SVG script.
2862 in_foreign_content_end_script = ->
# "Any other start tag" handling for foreign content: adjusts t according to
# the namespace of the adjusted current node, then inserts it as a foreign
# element in that namespace.
2866 in_foreign_content_other_start = (t) ->
2867 acn = adjusted_current_node()
2868 if acn.namespace is NS_MATHML
2869 adjust_mathml_attributes t
# SVG tag names with an entry in svg_name_fixes are replaced by that entry
2870 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2871 t.name = svg_name_fixes[t.name]
2872 if acn.namespace is NS_SVG
2873 adjust_svg_attributes t
2874 adjust_foreign_attributes t
2875 insert_foreign_element t, acn.namespace
2876 if t.flag 'self-closing'
# a self-closing script is acknowledged and then handled as a script end
2877 if t.name is 'script'
2878 t.acknowledge_self_closing()
2879 in_foreign_content_end_script()
2883 t.acknowledge_self_closing()
# Token handler for foreign (MathML/SVG) content.
2885 in_foreign_content = (t) ->
2886 if t.type is TYPE_TEXT and t.text is "\u0000"
# a NUL character is inserted as U+FFFD
2888 insert_character new_character_token "\ufffd"
2893 if t.type is TYPE_TEXT
2894 flag_frameset_ok = false
2897 if t.type is TYPE_COMMENT
2900 if t.type is TYPE_DOCTYPE
# start tags of the HTML elements listed here (and <font> carrying a color,
# face or size attribute) get separate handling from other start tags
2903 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
# during fragment parsing such a tag is treated like any other start tag
2905 if flag_fragment_parsing
2906 in_foreign_content_other_start t
2908 loop # is this safe?
# loop test: current node is a MathML text integration point, an HTML
# integration point, or an HTML element
2910 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
2914 if t.type is TYPE_START_TAG
2915 in_foreign_content_other_start t
2917 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2918 in_foreign_content_end_script()
2920 if t.type is TYPE_END_TAG
# node names are lowercased before being compared with the end tag's name
2923 if node.name.toLowerCase() isnt t.name
2926 if node is open_els[open_els.length - 1]
2928 if node.name.toLowerCase() is t.name
2930 el = open_els.shift()
2935 if node.namespace is NS_HTML
2937 ins_mode t # explicitly call HTML insertion mode
2940 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Data state: reads one character of txt (advancing cur) and dispatches on it.
# Visible outcomes: a text node built from parse_character_reference(), a
# switch to tok_state_tag_open, a U+FFFD text node, an EOF token, or a text
# node holding the character itself.
2942 switch c = txt.charAt(cur++)
2944 return new_text_node parse_character_reference()
2946 tok_state = tok_state_tag_open
2949 return new_text_node "\ufffd"
2951 return new_eof_token()
2953 return new_text_node c
2956 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2957 # not needed: tok_state_character_reference_in_data = ->
2958 # just call parse_character_reference()
2960 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# RCDATA state: reads one character of txt (advancing cur) and dispatches on it.
# Visible outcomes: a character-reference result, a switch to
# tok_state_rcdata_less_than_sign, a U+FFFD character token, an EOF token, or a
# character token for the character itself.
2961 tok_state_rcdata = ->
2962 switch c = txt.charAt(cur++)
# NOTE(review): this branch builds its token with new_text_node while the
# other branches use new_character_token - confirm they are interchangeable
2964 return new_text_node parse_character_reference()
2966 tok_state = tok_state_rcdata_less_than_sign
2969 return new_character_token "\ufffd"
2971 return new_eof_token()
2973 return new_character_token c
2976 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2977 # not needed: tok_state_character_reference_in_rcdata = ->
2978 # just call parse_character_reference()
2980 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# RAWTEXT state: reads one character of txt (advancing cur). Visible outcomes:
# a switch to tok_state_rawtext_less_than_sign, a U+FFFD character token, an
# EOF token, or a character token for the character itself.
2981 tok_state_rawtext = ->
2982 switch c = txt.charAt(cur++)
2984 tok_state = tok_state_rawtext_less_than_sign
2987 return new_character_token "\ufffd"
2989 return new_eof_token()
2991 return new_character_token c
2994 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Script data state: reads one character of txt (advancing cur). Visible
# outcomes: a switch to tok_state_script_data_less_than_sign, a U+FFFD
# character token, an EOF token, or a character token for the character itself.
2995 tok_state_script_data = ->
2996 switch c = txt.charAt(cur++)
2998 tok_state = tok_state_script_data_less_than_sign
3001 return new_character_token "\ufffd"
3003 return new_eof_token()
3005 return new_character_token c
3008 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# PLAINTEXT state: reads one character of txt (advancing cur) and returns a
# U+FFFD character token, an EOF token, or a character token for the character
# itself. No visible branch changes tok_state.
3009 tok_state_plaintext = ->
3010 switch c = txt.charAt(cur++)
3013 return new_character_token "\ufffd"
3015 return new_eof_token()
3017 return new_character_token c
3021 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tag open state: examines the character that follows "<" and either moves to
# a more specific tokenizer state, starts a new tag/comment token in
# tok_cur_tag, or gives up and emits "<" as text.
3022 tok_state_tag_open = ->
3023 c = txt.charAt(cur++)
3025 tok_state = tok_state_markup_declaration_open
3028 tok_state = tok_state_end_tag_open
# the tag name is started from c, lowercased in this branch
3031 tok_cur_tag = new_open_tag c.toLowerCase()
3032 tok_state = tok_state_tag_name
# the tag name is started from c unchanged in this branch
3035 tok_cur_tag = new_open_tag c
3036 tok_state = tok_state_tag_name
3040 tok_cur_tag = new_comment_token '?' # FIXME right?
3041 tok_state = tok_state_bogus_comment
# fallback: back to the data state, un-read c and emit the "<" as text
3045 tok_state = tok_state_data
3046 cur -= 1 # we didn't parse/handle the char after <
3047 return new_text_node '<'
3049 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# End tag open state: examines the character that follows "</". It starts an
# end-tag token in tok_cur_tag, returns to the data state (one branch emitting
# "</" as text), or starts a bogus comment.
3050 tok_state_end_tag_open = ->
3051 c = txt.charAt(cur++)
# the end tag name is started from c, lowercased in this branch
3053 tok_cur_tag = new_end_tag c.toLowerCase()
3054 tok_state = tok_state_tag_name
3057 tok_cur_tag = new_end_tag c
3058 tok_state = tok_state_tag_name
3062 tok_state = tok_state_data
3066 tok_state = tok_state_data
3067 return new_text_node '</'
# fallback: c becomes the first character of a bogus comment
3070 tok_cur_tag = new_comment_token c
3071 tok_state = tok_state_bogus_comment
3074 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tag name state: reads one character of txt (advancing cur). Whitespace ends
# the name and moves on to attribute parsing; other visible branches switch to
# the self-closing or data state, or append to tok_cur_tag.name (U+FFFD, the
# lowercased character, or the character as-is).
3075 tok_state_tag_name = ->
3076 switch c = txt.charAt(cur++)
3077 when "\t", "\n", "\u000c", ' '
3078 tok_state = tok_state_before_attribute_name
3080 tok_state = tok_state_self_closing_start_tag
3082 tok_state = tok_state_data
3088 tok_cur_tag.name += "\ufffd"
3091 tok_state = tok_state_data
3094 tok_cur_tag.name += c.toLowerCase()
3096 tok_cur_tag.name += c
3099 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# RCDATA less-than sign state: either resets temporary_buffer and moves to
# tok_state_rcdata_end_tag_open, or returns to tok_state_rcdata, un-reads the
# character and emits "<" as a character token.
3100 tok_state_rcdata_less_than_sign = ->
3101 c = txt.charAt(cur++)
3103 temporary_buffer = ''
3104 tok_state = tok_state_rcdata_end_tag_open
3107 tok_state = tok_state_rcdata
3108 cur -= 1 # reconsume the input character
3109 return new_character_token '<'
3111 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# RCDATA end tag open state: either starts an end-tag token from c (recording
# c in temporary_buffer as well) and moves to tok_state_rcdata_end_tag_name, or
# returns to tok_state_rcdata, un-reads c and emits "</" as character data.
3112 tok_state_rcdata_end_tag_open = ->
3113 c = txt.charAt(cur++)
# the tag name gets the lowercased character; temporary_buffer keeps c as read
3115 tok_cur_tag = new_end_tag c.toLowerCase()
3116 temporary_buffer += c
3117 tok_state = tok_state_rcdata_end_tag_name
3120 tok_cur_tag = new_end_tag c
3121 temporary_buffer += c
3122 tok_state = tok_state_rcdata_end_tag_name
3125 tok_state = tok_state_rcdata
3126 cur -= 1 # reconsume the input character
3127 return new_character_token "</" # fixfull separate these
3129 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when t is an end tag whose name equals the name of the current
# node (open_els[0]); also writes a debug_log line describing t and open_els.
3130 is_appropriate_end_tag = (t) ->
3131 # spec says to check against "the tag name of the last start tag to
3132 # have been emitted from this tokenizer", but this is only called from
3133 # the various "raw" states, so it's hopefully ok to assume that
3134 # open_els[0].name will work instead TODO: verify this after the script
3135 # data states are implemented
3136 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3137 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3139 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# RCDATA end tag name state: the tag-terminating branches only take effect
# when tok_cur_tag is an appropriate end tag. The accumulating branches append
# c to tok_cur_tag.name and temporary_buffer. The final fallback returns to
# tok_state_rcdata, un-reads c and emits "</" plus the buffered characters as
# character data.
3140 tok_state_rcdata_end_tag_name = ->
3141 c = txt.charAt(cur++)
3142 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3143 if is_appropriate_end_tag tok_cur_tag
3144 tok_state = tok_state_before_attribute_name
3146 # else fall through to "Anything else"
3148 if is_appropriate_end_tag tok_cur_tag
3149 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3151 # else fall through to "Anything else"
3153 if is_appropriate_end_tag tok_cur_tag
3154 tok_state = tok_state_data
3156 # else fall through to "Anything else"
# the tag name gets the lowercased character; temporary_buffer keeps c as read
3158 tok_cur_tag.name += c.toLowerCase()
3159 temporary_buffer += c
3162 tok_cur_tag.name += c
3163 temporary_buffer += c
3166 tok_state = tok_state_rcdata
3167 cur -= 1 # reconsume the input character
3168 return new_character_token '</' + temporary_buffer # fixfull separate these
3170 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# RAWTEXT less-than sign state: either resets temporary_buffer and moves to
# tok_state_rawtext_end_tag_open, or returns to tok_state_rawtext, un-reads the
# character and emits "<" as a character token.
3171 tok_state_rawtext_less_than_sign = ->
3172 c = txt.charAt(cur++)
3174 temporary_buffer = ''
3175 tok_state = tok_state_rawtext_end_tag_open
3178 tok_state = tok_state_rawtext
3179 cur -= 1 # reconsume the input character
3180 return new_character_token '<'
3182 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# RAWTEXT end tag open state: either starts an end-tag token from c (recording
# c in temporary_buffer as well) and moves to tok_state_rawtext_end_tag_name,
# or returns to tok_state_rawtext, un-reads c and emits "</" as character data.
3183 tok_state_rawtext_end_tag_open = ->
3184 c = txt.charAt(cur++)
# the tag name gets the lowercased character; temporary_buffer keeps c as read
3186 tok_cur_tag = new_end_tag c.toLowerCase()
3187 temporary_buffer += c
3188 tok_state = tok_state_rawtext_end_tag_name
3191 tok_cur_tag = new_end_tag c
3192 temporary_buffer += c
3193 tok_state = tok_state_rawtext_end_tag_name
3196 tok_state = tok_state_rawtext
3197 cur -= 1 # reconsume the input character
3198 return new_character_token "</" # fixfull separate these
3200 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# RAWTEXT end tag name state: the tag-terminating branches only take effect
# when tok_cur_tag is an appropriate end tag. The accumulating branches append
# c to tok_cur_tag.name and temporary_buffer. The final fallback returns to
# tok_state_rawtext, un-reads c and emits "</" plus the buffered characters as
# character data.
3201 tok_state_rawtext_end_tag_name = ->
3202 c = txt.charAt(cur++)
3203 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3204 if is_appropriate_end_tag tok_cur_tag
3205 tok_state = tok_state_before_attribute_name
3207 # else fall through to "Anything else"
3209 if is_appropriate_end_tag tok_cur_tag
3210 tok_state = tok_state_self_closing_start_tag
3212 # else fall through to "Anything else"
3214 if is_appropriate_end_tag tok_cur_tag
3215 tok_state = tok_state_data
3217 # else fall through to "Anything else"
# the tag name gets the lowercased character; temporary_buffer keeps c as read
3219 tok_cur_tag.name += c.toLowerCase()
3220 temporary_buffer += c
3223 tok_cur_tag.name += c
3224 temporary_buffer += c
3227 tok_state = tok_state_rawtext
3228 cur -= 1 # reconsume the input character
3229 return new_character_token '</' + temporary_buffer # fixfull separate these
3231 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Script data less-than sign state. Three visible outcomes: reset
# temporary_buffer and move to the end-tag-open state; move to the escape-start
# state and emit "<!"; or return to tok_state_script_data, un-read the
# character and emit "<".
3232 tok_state_script_data_less_than_sign = ->
3233 c = txt.charAt(cur++)
3235 temporary_buffer = ''
3236 tok_state = tok_state_script_data_end_tag_open
3239 tok_state = tok_state_script_data_escape_start
3240 return new_character_token '<!' # fixfull split
3242 tok_state = tok_state_script_data
3243 cur -= 1 # Reconsume
3244 return new_character_token '<'
3246 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Script data end tag open state: either starts an end-tag token from c
# (recording c in temporary_buffer as well) and moves to
# tok_state_script_data_end_tag_name, or returns to tok_state_script_data,
# un-reads c and emits "</" as character data.
3247 tok_state_script_data_end_tag_open = ->
3248 c = txt.charAt(cur++)
# the tag name gets the lowercased character; temporary_buffer keeps c as read
3250 tok_cur_tag = new_end_tag c.toLowerCase()
3251 temporary_buffer += c
3252 tok_state = tok_state_script_data_end_tag_name
3255 tok_cur_tag = new_end_tag c
3256 temporary_buffer += c
3257 tok_state = tok_state_script_data_end_tag_name
3260 tok_state = tok_state_script_data
3261 cur -= 1 # Reconsume
3262 return new_character_token '</'
3264 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Script data end tag name state: the tag-terminating branches only take
# effect when tok_cur_tag is an appropriate end tag. The accumulating branches
# append c to tok_cur_tag.name and temporary_buffer. The final fallback returns
# to tok_state_script_data, un-reads c and emits "</" plus the buffered
# characters as character data.
3265 tok_state_script_data_end_tag_name = ->
3266 c = txt.charAt(cur++)
3267 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3268 if is_appropriate_end_tag tok_cur_tag
3269 tok_state = tok_state_before_attribute_name
3273 if is_appropriate_end_tag tok_cur_tag
3274 tok_state = tok_state_self_closing_start_tag
3278 if is_appropriate_end_tag tok_cur_tag
3279 tok_state = tok_state_data
# the tag name gets the lowercased character; temporary_buffer keeps c as read
3283 tok_cur_tag.name += c.toLowerCase()
3284 temporary_buffer += c
3287 tok_cur_tag.name += c
3288 temporary_buffer += c
3291 tok_state = tok_state_script_data
3292 cur -= 1 # Reconsume
3293 return new_character_token "</#{temporary_buffer}" # fixfull split
3295 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Script data escape start state: either moves to the escape-start-dash state
# and emits "-", or returns to tok_state_script_data and un-reads the character.
3296 tok_state_script_data_escape_start = ->
3297 c = txt.charAt(cur++)
3299 tok_state = tok_state_script_data_escape_start_dash
3300 return new_character_token '-'
3302 tok_state = tok_state_script_data
3303 cur -= 1 # Reconsume
3306 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Script data escape start dash state: either moves to the escaped-dash-dash
# state and emits "-", or returns to tok_state_script_data and un-reads the
# character.
3307 tok_state_script_data_escape_start_dash = ->
3308 c = txt.charAt(cur++)
3310 tok_state = tok_state_script_data_escaped_dash_dash
3311 return new_character_token '-'
3313 tok_state = tok_state_script_data
3314 cur -= 1 # Reconsume
3317 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Script data escaped state. Visible outcomes: move to the escaped-dash state
# and emit "-"; move to the escaped-less-than-sign state; emit U+FFFD; return
# to tok_state_data and un-read the character; or emit the character itself.
3318 tok_state_script_data_escaped = ->
3319 c = txt.charAt(cur++)
3321 tok_state = tok_state_script_data_escaped_dash
3322 return new_character_token '-'
3324 tok_state = tok_state_script_data_escaped_less_than_sign
3328 return new_character_token "\ufffd"
3330 tok_state = tok_state_data
3332 cur -= 1 # Reconsume
3335 return new_character_token c
3337 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Handler for the "script data escaped dash" tokenizer state. Reads one
# character. Visible outcomes, in order: emit '-' and move to escaped-dash-dash;
# move to escaped-less-than-sign; emit U+FFFD and return to escaped; switch to
# data and back cur up; and a last branch that returns to escaped and emits the
# character unchanged.
3338 tok_state_script_data_escaped_dash = ->
3339 c = txt.charAt(cur++)
3341 tok_state = tok_state_script_data_escaped_dash_dash
3342 return new_character_token '-'
3344 tok_state = tok_state_script_data_escaped_less_than_sign
3348 tok_state = tok_state_script_data_escaped
3349 return new_character_token "\ufffd"
3351 tok_state = tok_state_data
3353 cur -= 1 # Reconsume
3356 tok_state = tok_state_script_data_escaped
3357 return new_character_token c
3359 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Handler for the "script data escaped dash dash" tokenizer state. Reads one
# character. Visible outcomes, in order: emit '-' without changing state; move
# to escaped-less-than-sign; emit '>' and return to plain script data; emit
# U+FFFD and return to escaped; switch to data and back cur up; and a last
# branch that returns to escaped and emits the character unchanged.
3360 tok_state_script_data_escaped_dash_dash = ->
3361 c = txt.charAt(cur++)
3363 return new_character_token '-'
3365 tok_state = tok_state_script_data_escaped_less_than_sign
3368 tok_state = tok_state_script_data
3369 return new_character_token '>'
3372 tok_state = tok_state_script_data_escaped
3373 return new_character_token "\ufffd"
3376 tok_state = tok_state_data
3377 cur -= 1 # Reconsume
3380 tok_state = tok_state_script_data_escaped
3381 return new_character_token c
3383 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Handler for the "script data escaped less-than sign" tokenizer state. Reads
# one character. Visible outcomes, in order: clear temporary_buffer and move to
# escaped-end-tag-open; two branches that seed temporary_buffer with c
# (lowercased in the first), move to double-escape-start and emit "<" plus the
# original character; and a fallback that returns to escaped, backs cur up and
# emits '<'.
3384 tok_state_script_data_escaped_less_than_sign = ->
3385 c = txt.charAt(cur++)
3387 temporary_buffer = ''
3388 tok_state = tok_state_script_data_escaped_end_tag_open
3391 temporary_buffer = c.toLowerCase() # yes, really
3392 tok_state = tok_state_script_data_double_escape_start
3393 return new_character_token "<#{c}" # fixfull split
3395 temporary_buffer = c
3396 tok_state = tok_state_script_data_double_escape_start
3397 return new_character_token "<#{c}" # fixfull split
3399 tok_state = tok_state_script_data_escaped
3400 cur -= 1 # Reconsume
3401 return new_character_token '<'
3403 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Handler for the "script data escaped end tag open" tokenizer state. Reads one
# character. Two visible branches start a new end tag token from c (lowercased
# in the first), append c to temporary_buffer and move to
# escaped-end-tag-name; the fallback returns to escaped, backs cur up and emits
# "</" as character data.
3404 tok_state_script_data_escaped_end_tag_open = ->
3405 c = txt.charAt(cur++)
3407 tok_cur_tag = new_end_tag c.toLowerCase()
3408 temporary_buffer += c
3409 tok_state = tok_state_script_data_escaped_end_tag_name
3412 tok_cur_tag = new_end_tag c
3413 temporary_buffer += c
3414 tok_state = tok_state_script_data_escaped_end_tag_name
3417 tok_state = tok_state_script_data_escaped
3418 cur -= 1 # Reconsume
3419 return new_character_token '</' # fixfull split
3421 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Handler for the "script data escaped end tag name" tokenizer state. Reads one
# character from txt (advancing cur). Not every branch condition is visible in
# this listing; the visible outcomes are: three is_appropriate_end_tag checks
# that move to before-attribute-name, self-closing-start-tag and data; two
# branches that append c to tok_cur_tag.name (lowercased in the first) and to
# temporary_buffer; and a final fallback that returns to the escaped state,
# backs cur up one position, and emits "</" plus temporary_buffer as text.
#
# temporary_buffer must hold the characters exactly as they appeared in the
# input: the fallback re-emits it as character data, so lowercasing it would
# change the case of the document's text. Only the tag name is lowercased.
3422 tok_state_script_data_escaped_end_tag_name = ->
3423 c = txt.charAt(cur++)
3424 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3425 if is_appropriate_end_tag tok_cur_tag
3426 tok_state = tok_state_before_attribute_name
3430 if is_appropriate_end_tag tok_cur_tag
3431 tok_state = tok_state_self_closing_start_tag
3435 if is_appropriate_end_tag tok_cur_tag
3436 tok_state = tok_state_data
3440 tok_cur_tag.name += c.toLowerCase()
# raw character, not lowercased: it may be emitted verbatim by the fallback
3441 temporary_buffer += c
3444 tok_cur_tag.name += c
3445 temporary_buffer += c
3448 tok_state = tok_state_script_data_escaped
3449 cur -= 1 # Reconsume
3450 return new_character_token "</#{temporary_buffer}" # fixfull split
3452 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Handler for the "script data double escape start" tokenizer state. Reads one
# character. On whitespace, '/' or '>' it checks whether temporary_buffer
# spells 'script' to choose between the double-escaped and escaped states, and
# emits the character. Two further branches append c to temporary_buffer
# (lowercased in the first) and emit it; the fallback returns to escaped and
# backs cur up.
3453 tok_state_script_data_double_escape_start = ->
3454 c = txt.charAt(cur++)
3455 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3456 if temporary_buffer is 'script'
3457 tok_state = tok_state_script_data_double_escaped
3459 tok_state = tok_state_script_data_escaped
3460 return new_character_token c
3462 temporary_buffer += c.toLowerCase() # yes, really lowercase
3463 return new_character_token c
3465 temporary_buffer += c
3466 return new_character_token c
3468 tok_state = tok_state_script_data_escaped
3469 cur -= 1 # Reconsume
3472 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Handler for the "script data double escaped" tokenizer state. Reads one
# character. Visible outcomes, in order: emit '-' and move to
# double-escaped-dash; emit '<' and move to double-escaped-less-than-sign; emit
# U+FFFD; switch to data and back cur up; and a last branch that emits the
# character unchanged.
3473 tok_state_script_data_double_escaped = ->
3474 c = txt.charAt(cur++)
3476 tok_state = tok_state_script_data_double_escaped_dash
3477 return new_character_token '-'
3479 tok_state = tok_state_script_data_double_escaped_less_than_sign
3480 return new_character_token '<'
3483 return new_character_token "\ufffd"
3486 tok_state = tok_state_data
3487 cur -= 1 # Reconsume
3490 return new_character_token c
3492 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Handler for the "script data double escaped dash" tokenizer state. Reads one
# character. Visible outcomes, in order: emit '-' and move to
# double-escaped-dash-dash; emit '<' and move to double-escaped-less-than-sign;
# emit U+FFFD and return to double-escaped; switch to data and back cur up; and
# a last branch that returns to double-escaped and emits the character.
3493 tok_state_script_data_double_escaped_dash = ->
3494 c = txt.charAt(cur++)
3496 tok_state = tok_state_script_data_double_escaped_dash_dash
3497 return new_character_token '-'
3499 tok_state = tok_state_script_data_double_escaped_less_than_sign
3500 return new_character_token '<'
3503 tok_state = tok_state_script_data_double_escaped
3504 return new_character_token "\ufffd"
3507 tok_state = tok_state_data
3508 cur -= 1 # Reconsume
3511 tok_state = tok_state_script_data_double_escaped
3512 return new_character_token c
3514 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Handler for the "script data double escaped dash dash" tokenizer state. Reads
# one character. Visible outcomes, in order: emit '-' without changing state;
# emit '<' and move to double-escaped-less-than-sign; emit '>' and return to
# plain script data; emit U+FFFD and return to double-escaped; switch to data
# and back cur up; and a last branch that returns to double-escaped and emits
# the character unchanged.
3515 tok_state_script_data_double_escaped_dash_dash = ->
3516 c = txt.charAt(cur++)
3518 return new_character_token '-'
3520 tok_state = tok_state_script_data_double_escaped_less_than_sign
3521 return new_character_token '<'
3523 tok_state = tok_state_script_data
3524 return new_character_token '>'
3527 tok_state = tok_state_script_data_double_escaped
3528 return new_character_token "\ufffd"
3531 tok_state = tok_state_data
3532 cur -= 1 # Reconsume
3535 tok_state = tok_state_script_data_double_escaped
3536 return new_character_token c
3538 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Handler for the "script data double escaped less-than sign" tokenizer state.
# Reads one character. One branch clears temporary_buffer, moves to
# double-escape-end and emits '/'; the other returns to double-escaped and backs
# cur up so the character is read again.
3539 tok_state_script_data_double_escaped_less_than_sign = ->
3540 c = txt.charAt(cur++)
3542 temporary_buffer = ''
3543 tok_state = tok_state_script_data_double_escape_end
3544 return new_character_token '/'
3546 tok_state = tok_state_script_data_double_escaped
3547 cur -= 1 # Reconsume
3550 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Handler for the "script data double escape end" tokenizer state. Reads one
# character. On whitespace, '/' or '>' it checks whether temporary_buffer
# spells 'script' to choose between the escaped and double-escaped states (the
# reverse of the double-escape-start handler), and emits the character. Two
# further branches append c to temporary_buffer (lowercased in the first) and
# emit it; the fallback returns to double-escaped and backs cur up.
3551 tok_state_script_data_double_escape_end = ->
3552 c = txt.charAt(cur++)
3553 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3554 if temporary_buffer is 'script'
3555 tok_state = tok_state_script_data_escaped
3557 tok_state = tok_state_script_data_double_escaped
3558 return new_character_token c
3560 temporary_buffer += c.toLowerCase() # yes, really lowercase
3561 return new_character_token c
3563 temporary_buffer += c
3564 return new_character_token c
3566 tok_state = tok_state_script_data_double_escaped
3567 cur -= 1 # Reconsume
3570 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Handler for the "before attribute name" tokenizer state. Reads one character
# and switches on it. Visible outcomes: transitions to self-closing-start-tag
# and to data, and three ways of choosing the first character of a new
# attribute name (U+FFFD, or c lowercased; the quote/'<'/'=' case is listed
# separately). The new attribute is then started below.
3571 tok_state_before_attribute_name = ->
3573 switch c = txt.charAt(cur++)
3574 when "\t", "\n", "\u000c", ' '
3577 tok_state = tok_state_self_closing_start_tag
3580 tok_state = tok_state_data
3586 attr_name = "\ufffd"
3587 when '"', "'", '<', '='
3592 tok_state = tok_state_data
3595 attr_name = c.toLowerCase()
# new attributes go on the FRONT of attrs_a as [name, value], so attrs_a[0]
# is the attribute currently being built
3599 tok_cur_tag.attrs_a.unshift [attr_name, '']
3600 tok_state = tok_state_attribute_name
3603 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Handler for the "attribute name" tokenizer state. Reads one character and
# switches on it. Whitespace moves to after-attribute-name; other visible
# branches move to self-closing-start-tag, before-attribute-value or data, or
# extend the current attribute's name (tok_cur_tag.attrs_a[0][0]) with U+FFFD,
# with c, or with c lowercased.
3604 tok_state_attribute_name = ->
3605 switch c = txt.charAt(cur++)
3606 when "\t", "\n", "\u000c", ' '
3607 tok_state = tok_state_after_attribute_name
3609 tok_state = tok_state_self_closing_start_tag
3611 tok_state = tok_state_before_attribute_value
3613 tok_state = tok_state_data
3619 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3622 tok_cur_tag.attrs_a[0][0] += c
3625 tok_state = tok_state_data
3628 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3630 tok_cur_tag.attrs_a[0][0] += c
3633 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Handler for the "after attribute name" tokenizer state. Reads one character.
# Visible outcomes: transitions to self-closing-start-tag,
# before-attribute-value and data; starting a new attribute (unshifted onto the
# front of tok_cur_tag.attrs_a) named c lowercased, U+FFFD, or c as-is, then
# moving to attribute-name; and one branch that switches to data and backs cur
# up. The quote/'<' check near the end deliberately falls through to the
# "anything else" handling that follows it.
3634 tok_state_after_attribute_name = ->
3635 c = txt.charAt(cur++)
3636 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3639 tok_state = tok_state_self_closing_start_tag
3642 tok_state = tok_state_before_attribute_value
3645 tok_state = tok_state_data
3648 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3649 tok_state = tok_state_attribute_name
3653 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3654 tok_state = tok_state_attribute_name
3658 tok_state = tok_state_data
3659 cur -= 1 # reconsume
3661 if c is '"' or c is "'" or c is '<'
3663 # fall through to Anything else
3665 tok_cur_tag.attrs_a.unshift [c, '']
3666 tok_state = tok_state_attribute_name
3668 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Handler for the "before attribute value" tokenizer state. Reads one character
# and switches on it. Visible outcomes: transitions to the double-quoted,
# unquoted and single-quoted value states and to data; and branches that start
# the current attribute's value (tok_cur_tag.attrs_a[0][1]) with U+FFFD or with
# c before moving to the unquoted value state.
3669 tok_state_before_attribute_value = ->
3670 switch c = txt.charAt(cur++)
3671 when "\t", "\n", "\u000c", ' '
3674 tok_state = tok_state_attribute_value_double_quoted
3676 tok_state = tok_state_attribute_value_unquoted
3679 tok_state = tok_state_attribute_value_single_quoted
3682 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3683 tok_state = tok_state_attribute_value_unquoted
3686 tok_state = tok_state_data
3692 tok_state = tok_state_data
3694 tok_cur_tag.attrs_a[0][1] += c
3695 tok_state = tok_state_attribute_value_unquoted
3698 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Handler for the "attribute value (double-quoted)" tokenizer state. Reads one
# character and switches on it. Visible outcomes: move to
# after-attribute-value-quoted; append a decoded character reference (with '"'
# as the extra allowed character, in attribute mode); append U+FFFD; switch to
# data; or append c to the current attribute's value.
3699 tok_state_attribute_value_double_quoted = ->
3700 switch c = txt.charAt(cur++)
3702 tok_state = tok_state_after_attribute_value_quoted
3704 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3707 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3710 tok_state = tok_state_data
3712 tok_cur_tag.attrs_a[0][1] += c
3715 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Handler for the "attribute value (single-quoted)" tokenizer state. Reads one
# character and switches on it. Visible outcomes: move to
# after-attribute-value-quoted; append a decoded character reference (with "'"
# as the extra allowed character, in attribute mode); append U+FFFD; switch to
# data; or append c to the current attribute's value.
3716 tok_state_attribute_value_single_quoted = ->
3717 switch c = txt.charAt(cur++)
3719 tok_state = tok_state_after_attribute_value_quoted
3721 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3724 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3727 tok_state = tok_state_data
3729 tok_cur_tag.attrs_a[0][1] += c
3732 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Handler for the "attribute value (unquoted)" tokenizer state. Reads one
# character and switches on it. Whitespace ends the value and moves to
# before-attribute-name. Other visible outcomes: append a decoded character
# reference (with '>' as the extra allowed character, in attribute mode);
# switch to data; append U+FFFD; or append c to the current attribute's value.
3733 tok_state_attribute_value_unquoted = ->
3734 switch c = txt.charAt(cur++)
3735 when "\t", "\n", "\u000c", ' '
3736 tok_state = tok_state_before_attribute_name
3738 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3740 tok_state = tok_state_data
3745 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3748 tok_state = tok_state_data
3750 # Parse Error if ", ', <, = or ` (backtick)
3751 tok_cur_tag.attrs_a[0][1] += c
3754 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Handler for the "after attribute value (quoted)" tokenizer state. Reads one
# character and switches on it. Whitespace moves to before-attribute-name;
# other visible branches move to self-closing-start-tag or data. The last
# branch moves to before-attribute-name and backs cur up so that state sees the
# unhandled character.
3755 tok_state_after_attribute_value_quoted = ->
3756 switch c = txt.charAt(cur++)
3757 when "\t", "\n", "\u000c", ' '
3758 tok_state = tok_state_before_attribute_name
3760 tok_state = tok_state_self_closing_start_tag
3762 tok_state = tok_state_data
3768 tok_state = tok_state_data
3771 tok_state = tok_state_before_attribute_name
3772 cur -= 1 # we didn't handle that char
3775 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Handler for the "self-closing start tag" tokenizer state. Reads one
# character. One branch sets the 'self-closing' flag on tok_cur_tag and
# switches to data; another switches to data and backs cur up; the last moves
# to before-attribute-name and backs cur up so the character is read again.
3776 tok_state_self_closing_start_tag = ->
3777 c = txt.charAt(cur++)
3779 tok_cur_tag.flag 'self-closing', true
3780 tok_state = tok_state_data
3784 tok_state = tok_state_data
3785 cur -= 1 # Reconsume
3789 tok_state = tok_state_before_attribute_name
3790 cur -= 1 # Reconsume
3793 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3794 # WARNING: put a comment token in tok_cur_tag before setting this state
# Handler for the "bogus comment" tokenizer state. Instead of stepping one
# character at a time it looks for the next '>' at or after cur and takes
# either the rest of txt or the text up to that '>' as the comment body. Any
# U+0000 in that text is replaced with U+FFFD, the result is appended to
# tok_cur_tag.text, and the tokenizer returns to the data state.
3795 tok_state_bogus_comment = ->
3796 next_gt = txt.indexOf '>', cur
3798 val = txt.substr cur
3801 val = txt.substr cur, (next_gt - cur)
3803 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3804 tok_cur_tag.text += val
3805 tok_state = tok_state_data
3808 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Handler for the "markup declaration open" tokenizer state. Looks ahead in txt
# from cur without consuming a character first:
#   - "--" starts a comment (empty comment token, comment-start state)
#   - "doctype" in any letter case moves to the doctype state
#   - "[CDATA[" (exact case) moves to the CDATA section state, but only when
#     the adjusted current node exists and is not in the HTML namespace
#   - anything else becomes a bogus comment (empty comment token)
3809 tok_state_markup_declaration_open = ->
3810 if txt.substr(cur, 2) is '--'
3812 tok_cur_tag = new_comment_token ''
3813 tok_state = tok_state_comment_start
3815 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3817 tok_state = tok_state_doctype
3819 acn = adjusted_current_node()
3820 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3822 tok_state = tok_state_cdata_section
3826 tok_cur_tag = new_comment_token ''
3827 tok_state = tok_state_bogus_comment
3830 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Handler for the "comment start" tokenizer state. Reads one character and
# switches on it. Not every branch condition is visible in this listing; the
# visible outcomes are: move to comment-start-dash; append U+FFFD to the
# comment's text and move to the comment state; two branches that switch to
# data (the second backs cur up); and a last branch that appends c to the
# comment's text and moves to the comment state.
3831 tok_state_comment_start = ->
3832 switch c = txt.charAt(cur++)
3834 tok_state = tok_state_comment_start_dash
3837 tok_state = tok_state_comment
# The replacement character is part of the comment's data, so it is appended
# to the comment token (as the other comment states do) rather than emitted
# as a separate character token ahead of the comment.
3838 tok_cur_tag.text += "\ufffd"
3841 tok_state = tok_state_data
3845 tok_state = tok_state_data
3846 cur -= 1 # Reconsume
3849 tok_cur_tag.text += c
3850 tok_state = tok_state_comment
3853 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Handler for the "comment start dash" tokenizer state. Reads one character and
# switches on it. Visible outcomes: move to comment-end; append "-" plus U+FFFD
# to the comment's text and move to comment; two branches that switch to data
# (the second backs cur up); and a last branch that appends "-" plus c and
# moves to comment. The "-" is the dash that led into this state.
3854 tok_state_comment_start_dash = ->
3855 switch c = txt.charAt(cur++)
3857 tok_state = tok_state_comment_end
3860 tok_cur_tag.text += "-\ufffd"
3861 tok_state = tok_state_comment
3864 tok_state = tok_state_data
3868 tok_state = tok_state_data
3869 cur -= 1 # Reconsume
3872 tok_cur_tag.text += "-#{c}"
3873 tok_state = tok_state_comment
3876 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Handler for the "comment" tokenizer state. Reads one character and switches
# on it. Visible outcomes: move to comment-end-dash; append U+FFFD to the
# comment's text; switch to data and back cur up; or append c to the comment's
# text.
3877 tok_state_comment = ->
3878 switch c = txt.charAt(cur++)
3880 tok_state = tok_state_comment_end_dash
3883 tok_cur_tag.text += "\ufffd"
3886 tok_state = tok_state_data
3887 cur -= 1 # Reconsume
3890 tok_cur_tag.text += c
3893 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Handler for the "comment end dash" tokenizer state. Reads one character and
# switches on it. Visible outcomes: move to comment-end; append "-" plus U+FFFD
# and return to comment; switch to data and back cur up; or append "-" plus c
# and return to comment. The "-" is the dash that led into this state, which
# had not yet been added to the comment's text.
3894 tok_state_comment_end_dash = ->
3895 switch c = txt.charAt(cur++)
3897 tok_state = tok_state_comment_end
3900 tok_cur_tag.text += "-\ufffd"
3901 tok_state = tok_state_comment
3904 tok_state = tok_state_data
3905 cur -= 1 # Reconsume
3908 tok_cur_tag.text += "-#{c}"
3909 tok_state = tok_state_comment
3912 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Handler for the "comment end" tokenizer state (two dashes already seen).
# Reads one character and switches on it. Visible outcomes: switch to data;
# append "--" plus U+FFFD and return to comment; move to comment-end-bang;
# append a single '-' (staying in this state); switch to data and back cur up;
# or append "--" plus c and return to comment.
3913 tok_state_comment_end = ->
3914 switch c = txt.charAt(cur++)
3916 tok_state = tok_state_data
3920 tok_cur_tag.text += "--\ufffd"
3921 tok_state = tok_state_comment
3924 tok_state = tok_state_comment_end_bang
3927 tok_cur_tag.text += '-'
3930 tok_state = tok_state_data
3931 cur -= 1 # Reconsume
3935 tok_cur_tag.text += "--#{c}"
3936 tok_state = tok_state_comment
3939 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Handler for the "comment end bang" tokenizer state (reached after "--!").
# Reads one character and switches on it. Not every branch condition is
# visible in this listing; the visible outcomes are: append "--!" and move to
# comment-end-dash; switch to data; append "--!" plus U+FFFD and return to
# comment; switch to data and back cur up; or append "--!" plus c and return to
# comment.
3940 tok_state_comment_end_bang = ->
3941 switch c = txt.charAt(cur++)
# Only "--!" is appended here. The dash just read is NOT added: the
# comment-end-dash state appends that pending '-' itself when it turns out
# not to start "-->", so adding it here too would duplicate it.
3943 tok_cur_tag.text += "--!"
3944 tok_state = tok_state_comment_end_dash
3946 tok_state = tok_state_data
3950 tok_cur_tag.text += "--!\ufffd"
3951 tok_state = tok_state_comment
3954 tok_state = tok_state_data
3955 cur -= 1 # Reconsume
3958 tok_cur_tag.text += "--!#{c}"
3959 tok_state = tok_state_comment
3962 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Handler for the "DOCTYPE" tokenizer state. Reads one character and switches
# on it. Whitespace moves to before-doctype-name. Another visible branch
# switches to data, builds an empty doctype token with the 'force-quirks' flag
# set, and backs cur up. The last branch moves to before-doctype-name and backs
# cur up so that state sees the character.
3963 tok_state_doctype = ->
3964 switch c = txt.charAt(cur++)
3965 when "\t", "\u000a", "\u000c", ' '
3966 tok_state = tok_state_before_doctype_name
3969 tok_state = tok_state_data
3970 el = new_doctype_token ''
3971 el.flag 'force-quirks', true
3972 cur -= 1 # Reconsume
3976 tok_state = tok_state_before_doctype_name
3977 cur -= 1 # Reconsume
3980 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Handler for the "before DOCTYPE name" tokenizer state. Reads one character.
# Visible outcomes: three branches that start a new doctype token in
# tok_cur_tag (named c lowercased, U+FFFD, or c as-is) and move to
# doctype-name; and two branches that build an empty doctype token with
# 'force-quirks' set and switch to data (the second also backs cur up).
3981 tok_state_before_doctype_name = ->
3982 c = txt.charAt(cur++)
3983 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3986 tok_cur_tag = new_doctype_token c.toLowerCase()
3987 tok_state = tok_state_doctype_name
3991 tok_cur_tag = new_doctype_token "\ufffd"
3992 tok_state = tok_state_doctype_name
3996 el = new_doctype_token ''
3997 el.flag 'force-quirks', true
3998 tok_state = tok_state_data
4002 tok_state = tok_state_data
4003 el = new_doctype_token ''
4004 el.flag 'force-quirks', true
4005 cur -= 1 # Reconsume
4008 tok_cur_tag = new_doctype_token c
4009 tok_state = tok_state_doctype_name
4012 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Handler for the "DOCTYPE name" tokenizer state. Reads one character.
# Whitespace moves to after-doctype-name. Other visible outcomes: switch to
# data; append c lowercased, U+FFFD, or c as-is to tok_cur_tag.name; or switch
# to data with 'force-quirks' set and cur backed up.
4013 tok_state_doctype_name = ->
4014 c = txt.charAt(cur++)
4015 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4016 tok_state = tok_state_after_doctype_name
4019 tok_state = tok_state_data
4022 tok_cur_tag.name += c.toLowerCase()
4026 tok_cur_tag.name += "\ufffd"
4030 tok_state = tok_state_data
4031 tok_cur_tag.flag 'force-quirks', true
4032 cur -= 1 # Reconsume
4035 tok_cur_tag.name += c
4038 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Handler for the "after DOCTYPE name" tokenizer state. Reads one character.
# Visible outcomes: switch to data; switch to data with 'force-quirks' set and
# cur backed up; recognise the keywords "public" / "system" in any letter case;
# otherwise set 'force-quirks' and move to bogus-doctype.
4039 tok_state_after_doctype_name = ->
4040 c = txt.charAt(cur++)
4041 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4044 tok_state = tok_state_data
4048 tok_state = tok_state_data
4049 tok_cur_tag.flag 'force-quirks', true
4050 cur -= 1 # Reconsume
# cur was already advanced past c, so the keyword starts at cur - 1
4053 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4055 tok_state = tok_state_after_doctype_public_keyword
4057 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4059 tok_state = tok_state_after_doctype_system_keyword
4062 tok_cur_tag.flag 'force-quirks', true
4063 tok_state = tok_state_bogus_doctype
4066 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Handler for the "after DOCTYPE public keyword" tokenizer state. Reads one
# character. Whitespace moves to before-doctype-public-identifier. Other
# visible outcomes: initialise public_identifier to '' and enter the double- or
# single-quoted public identifier state; set 'force-quirks' and switch to data
# (once directly, once with cur backed up); or set 'force-quirks' and move to
# bogus-doctype.
4067 tok_state_after_doctype_public_keyword = ->
4068 c = txt.charAt(cur++)
4069 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4070 tok_state = tok_state_before_doctype_public_identifier
4074 tok_cur_tag.public_identifier = ''
4075 tok_state = tok_state_doctype_public_identifier_double_quoted
4079 tok_cur_tag.public_identifier = ''
4080 tok_state = tok_state_doctype_public_identifier_single_quoted
4084 tok_cur_tag.flag 'force-quirks', true
4085 tok_state = tok_state_data
4089 tok_state = tok_state_data
4090 tok_cur_tag.flag 'force-quirks', true
4091 cur -= 1 # Reconsume
4095 tok_cur_tag.flag 'force-quirks', true
4096 tok_state = tok_state_bogus_doctype
4099 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Handler for the "before DOCTYPE public identifier" tokenizer state. Reads one
# character. Visible outcomes: initialise public_identifier to '' and enter the
# double- or single-quoted public identifier state; set 'force-quirks' and
# switch to data (once directly, once with cur backed up); or set
# 'force-quirks' and move to bogus-doctype.
4100 tok_state_before_doctype_public_identifier = ->
4101 c = txt.charAt(cur++)
4102 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4106 tok_cur_tag.public_identifier = ''
4107 tok_state = tok_state_doctype_public_identifier_double_quoted
4111 tok_cur_tag.public_identifier = ''
4112 tok_state = tok_state_doctype_public_identifier_single_quoted
4116 tok_cur_tag.flag 'force-quirks', true
4117 tok_state = tok_state_data
4121 tok_state = tok_state_data
4122 tok_cur_tag.flag 'force-quirks', true
4123 cur -= 1 # Reconsume
4127 tok_cur_tag.flag 'force-quirks', true
4128 tok_state = tok_state_bogus_doctype
4132 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Handler for the "DOCTYPE public identifier (double-quoted)" tokenizer state.
# Reads one character. Visible outcomes: move to
# after-doctype-public-identifier; append U+FFFD to public_identifier; set
# 'force-quirks' and switch to data (once directly, once with cur backed up);
# or append c to public_identifier.
4133 tok_state_doctype_public_identifier_double_quoted = ->
4134 c = txt.charAt(cur++)
4136 tok_state = tok_state_after_doctype_public_identifier
4140 tok_cur_tag.public_identifier += "\ufffd"
4144 tok_cur_tag.flag 'force-quirks', true
4145 tok_state = tok_state_data
4149 tok_state = tok_state_data
4150 tok_cur_tag.flag 'force-quirks', true
4151 cur -= 1 # Reconsume
4154 tok_cur_tag.public_identifier += c
4157 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Handler for the "DOCTYPE public identifier (single-quoted)" tokenizer state.
# Reads one character. Visible outcomes: move to
# after-doctype-public-identifier; append U+FFFD to public_identifier; set
# 'force-quirks' and switch to data (once directly, once with cur backed up);
# or append c to public_identifier.
4158 tok_state_doctype_public_identifier_single_quoted = ->
4159 c = txt.charAt(cur++)
4161 tok_state = tok_state_after_doctype_public_identifier
4165 tok_cur_tag.public_identifier += "\ufffd"
4169 tok_cur_tag.flag 'force-quirks', true
4170 tok_state = tok_state_data
4174 tok_state = tok_state_data
4175 tok_cur_tag.flag 'force-quirks', true
4176 cur -= 1 # Reconsume
4179 tok_cur_tag.public_identifier += c
4182 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Handler for the "after DOCTYPE public identifier" tokenizer state. Reads one
# character. Whitespace moves to
# between-doctype-public-and-system-identifiers. Other visible outcomes: switch
# to data; initialise system_identifier to '' and enter the double- or
# single-quoted system identifier state; switch to data with 'force-quirks' set
# and cur backed up; or set 'force-quirks' and move to bogus-doctype.
4183 tok_state_after_doctype_public_identifier = ->
4184 c = txt.charAt(cur++)
4185 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4186 tok_state = tok_state_between_doctype_public_and_system_identifiers
4189 tok_state = tok_state_data
4193 tok_cur_tag.system_identifier = ''
4194 tok_state = tok_state_doctype_system_identifier_double_quoted
4198 tok_cur_tag.system_identifier = ''
4199 tok_state = tok_state_doctype_system_identifier_single_quoted
4203 tok_state = tok_state_data
4204 tok_cur_tag.flag 'force-quirks', true
4205 cur -= 1 # Reconsume
4209 tok_cur_tag.flag 'force-quirks', true
4210 tok_state = tok_state_bogus_doctype
4213 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Handler for the "between DOCTYPE public and system identifiers" tokenizer
# state. Reads one character. Visible outcomes: switch to data; initialise
# system_identifier to '' and enter the double- or single-quoted system
# identifier state; switch to data with 'force-quirks' set and cur backed up;
# or set 'force-quirks' and move to bogus-doctype.
4214 tok_state_between_doctype_public_and_system_identifiers = ->
4215 c = txt.charAt(cur++)
4216 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4219 tok_state = tok_state_data
4223 tok_cur_tag.system_identifier = ''
4224 tok_state = tok_state_doctype_system_identifier_double_quoted
4228 tok_cur_tag.system_identifier = ''
4229 tok_state = tok_state_doctype_system_identifier_single_quoted
4233 tok_state = tok_state_data
4234 tok_cur_tag.flag 'force-quirks', true
4235 cur -= 1 # Reconsume
4239 tok_cur_tag.flag 'force-quirks', true
4240 tok_state = tok_state_bogus_doctype
4243 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Handler for the "after DOCTYPE system keyword" tokenizer state. Reads one
# character. Whitespace moves to before-doctype-system-identifier. Other
# visible outcomes: initialise system_identifier to '' and enter the double- or
# single-quoted system identifier state; set 'force-quirks' and switch to data
# (once directly, once with cur backed up); or set 'force-quirks' and move to
# bogus-doctype.
4244 tok_state_after_doctype_system_keyword = ->
4245 c = txt.charAt(cur++)
4246 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4247 tok_state = tok_state_before_doctype_system_identifier
4251 tok_cur_tag.system_identifier = ''
4252 tok_state = tok_state_doctype_system_identifier_double_quoted
4256 tok_cur_tag.system_identifier = ''
4257 tok_state = tok_state_doctype_system_identifier_single_quoted
4261 tok_cur_tag.flag 'force-quirks', true
4262 tok_state = tok_state_data
4266 tok_state = tok_state_data
4267 tok_cur_tag.flag 'force-quirks', true
4268 cur -= 1 # Reconsume
4272 tok_cur_tag.flag 'force-quirks', true
4273 tok_state = tok_state_bogus_doctype
4276 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Handler for the "before DOCTYPE system identifier" tokenizer state. Reads one
# character. Visible outcomes: initialise system_identifier to '' and enter the
# double- or single-quoted system identifier state; set 'force-quirks' and
# switch to data (once directly, once with cur backed up); or set
# 'force-quirks' and move to bogus-doctype.
4277 tok_state_before_doctype_system_identifier = ->
4278 c = txt.charAt(cur++)
4279 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4282 tok_cur_tag.system_identifier = ''
4283 tok_state = tok_state_doctype_system_identifier_double_quoted
4286 tok_cur_tag.system_identifier = ''
4287 tok_state = tok_state_doctype_system_identifier_single_quoted
4291 tok_cur_tag.flag 'force-quirks', true
4292 tok_state = tok_state_data
4296 tok_state = tok_state_data
4297 tok_cur_tag.flag 'force-quirks', true
4298 cur -= 1 # Reconsume
4302 tok_cur_tag.flag 'force-quirks', true
4303 tok_state = tok_state_bogus_doctype
4306 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Handler for the "DOCTYPE system identifier (double-quoted)" tokenizer state.
# Reads one character. Visible outcomes: move to
# after-doctype-system-identifier; append U+FFFD to system_identifier; set
# 'force-quirks' and switch to data (once directly, once with cur backed up);
# or append c to system_identifier.
4307 tok_state_doctype_system_identifier_double_quoted = ->
4308 c = txt.charAt(cur++)
4310 tok_state = tok_state_after_doctype_system_identifier
4314 tok_cur_tag.system_identifier += "\ufffd"
4318 tok_cur_tag.flag 'force-quirks', true
4319 tok_state = tok_state_data
4323 tok_state = tok_state_data
4324 tok_cur_tag.flag 'force-quirks', true
4325 cur -= 1 # Reconsume
4328 tok_cur_tag.system_identifier += c
4331 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Handler for the "DOCTYPE system identifier (single-quoted)" tokenizer state.
# Reads one character. Visible outcomes: move to
# after-doctype-system-identifier; append U+FFFD to system_identifier; set
# 'force-quirks' and switch to data (once directly, once with cur backed up);
# or append c to system_identifier.
4332 tok_state_doctype_system_identifier_single_quoted = ->
4333 c = txt.charAt(cur++)
4335 tok_state = tok_state_after_doctype_system_identifier
4339 tok_cur_tag.system_identifier += "\ufffd"
4343 tok_cur_tag.flag 'force-quirks', true
4344 tok_state = tok_state_data
4348 tok_state = tok_state_data
4349 tok_cur_tag.flag 'force-quirks', true
4350 cur -= 1 # Reconsume
4353 tok_cur_tag.system_identifier += c
4356 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Handler for the "after DOCTYPE system identifier" tokenizer state. Reads one
# character. Visible outcomes: switch to data; switch to data with
# 'force-quirks' set and cur backed up; or move to bogus-doctype. Unlike the
# other doctype states, the bogus-doctype branch here intentionally leaves
# 'force-quirks' unset.
4357 tok_state_after_doctype_system_identifier = ->
4358 c = txt.charAt(cur++)
4359 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4362 tok_state = tok_state_data
4366 tok_state = tok_state_data
4367 tok_cur_tag.flag 'force-quirks', true
4368 cur -= 1 # Reconsume
4372 # do _not_ tok_cur_tag.flag 'force-quirks', true
4373 tok_state = tok_state_bogus_doctype
4376 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Handler for the "bogus DOCTYPE" tokenizer state. Reads one character; the two
# visible branches both switch to data, and the second also backs cur up so the
# character is read again. No visible line stores the character anywhere.
4377 tok_state_bogus_doctype = ->
4378 c = txt.charAt(cur++)
4380 tok_state = tok_state_data
4383 tok_state = tok_state_data
4384 cur -= 1 # Reconsume
4389 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Handler for the "CDATA section" tokenizer state. Switches straight back to
# the data state, then looks for the closing "]]>" at or after cur and takes
# either the rest of txt or the text up to that marker. The text is returned as
# a single character token.
4390 tok_state_cdata_section = ->
4391 tok_state = tok_state_data
# despite the name, next_gt holds the index of "]]>" here, not of a lone '>'
4392 next_gt = txt.indexOf ']]>', cur
4394 val = txt.substr cur
4397 val = txt.substr cur, (next_gt - cur)
4399 return new_character_token val # fixfull split
4401 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4402 # Don't set this as a state, just call it
4403 # returns a string (NOT a text node)
# Consumes a character reference starting at cur (just past the '&').
#
# allowed_char: an extra character that, like whitespace, '<', '&' or end of
#               input, means "this is not a character reference" (callers in
#               the attribute value states pass the closing quote or '>').
# in_attr:      true when called while tokenizing an attribute value.
#
# Many lines of this function are not visible in this listing. The visible
# parts show three paths: numeric references (digits collected from a charset,
# parsed with parseInt, remapped through unicode_fixes, range-checked, then
# converted with from_code_point); named references ending in ';' (decoded by
# decode_named_char_ref); and legacy named references without ';' (looked up in
# legacy_char_refs).
4404 parse_character_reference = (allowed_char = null, in_attr = false) ->
4405 if cur >= txt.length
4407 switch c = txt.charAt(cur)
4408 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4409 # explicitly not a parse error
4412 # there has to be "one or more" alnums between & and ; to be a parse error
4415 if cur + 1 >= txt.length
# an 'x' or 'X' after the '#' selects the hexadecimal form
4417 if txt.charAt(cur + 1).toLowerCase() is 'x'
# count how many characters from the allowed digit set follow
4426 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4431 if txt.charAt(start + i) is ';'
4435 code_point = txt.substr(start, i)
# drop leading zeros, keeping at least one digit
4436 while code_point.charAt(0) is '0' and code_point.length > 1
4437 code_point = code_point.substr 1
4438 code_point = parseInt(code_point, base)
4439 if unicode_fixes[code_point]?
4441 return unicode_fixes[code_point]
# surrogate range, or beyond the last Unicode code point
4443 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
# explicit list of control characters and noncharacters
4447 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4449 return from_code_point code_point
4453 if alnum.indexOf(txt.charAt(cur + i)) is -1
4456 # exit early, because parse_error() below needs at least one alnum
4458 if txt.charAt(cur + i) is ';'
4459 i += 1 # include ';' terminator in value
4460 decoded = decode_named_char_ref txt.substr(cur, i)
4467 # no ';' terminator (only legacy char refs)
4469 for i in [2..max] # no prefix matches, so ok to check shortest first
4470 c = legacy_char_refs[txt.substr(cur, i)]
4473 if txt.charAt(cur + i) is '='
4474 # "because some legacy user agents will
4475 # misinterpret the markup in those cases"
4478 if alnum.indexOf(txt.charAt(cur + i)) > -1
4479 # this makes attributes forgiving about url args
4481 # ok, and besides the weird exceptions for attributes...
4482 # return the matching char
4483 cur += i # consume entity chars
4484 parse_error() # because no terminating ";"
4488 return # never reached
# Initial state for one parse run: tree-construction variables first, then the
# tokenizer's starting state, then normalisation of the input text.
4490 # tree constructor initialization
4491 # see comments on TYPE_TAG/etc for the structure of this data
4494 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4496 afe = [] # active formatting elements
4497 template_ins_modes = []
4498 ins_mode = ins_mode_initial
4499 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to on when the caller does not supply args.scripting
4500 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4501 flag_frameset_ok = true
4503 flag_foster_parenting = false
4504 form_element_pointer = null
4505 temporary_buffer = null
4506 pending_table_character_tokens = []
4507 head_element_pointer = null
4508 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4509 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4511 # tokenizer initialization
4512 tok_state = tok_state_data
4514 # text pre-processing
4515 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# Every U+0000 becomes U+FFFD before tokenizing starts.
# NOTE(review): this presumably means the tokenizer's own NUL branches never
# fire on this input — confirm before relying on them.
4516 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
# CRLF is collapsed first so that the lone-CR pass below cannot turn one line
# break into two
4517 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4518 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# NOTE(review): looks like a leftover hook for one specific test case; its body
# is not visible in this listing — confirm whether it is still needed
4520 if args.name is "tests18.dat #17"
4523 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4528 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a string by appending the result of each element's
# serialize(shallow, show_ids) call. Only the signature and the accumulation
# line are visible in this listing.
# NOTE(review): presumably iterates over els and returns the accumulated
# string — confirm against the full source.
4531 serialize_els = (els, shallow, show_ids) ->
4537 serialized += t.serialize shallow, show_ids
# Public API: the parser entry point, the debug-log helpers, and the node-type
# and namespace constants, all attached to module.exports.
4540 module.exports.parse_html = parse_html
4541 module.exports.debug_log_reset = debug_log_reset
4542 module.exports.debug_log_each = debug_log_each
4543 module.exports.TYPE_TAG = TYPE_TAG
4544 module.exports.TYPE_TEXT = TYPE_TEXT
4545 module.exports.TYPE_COMMENT = TYPE_COMMENT
4546 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4547 module.exports.NS_HTML = NS_HTML
4548 module.exports.NS_MATHML = NS_MATHML
4549 module.exports.NS_SVG = NS_SVG