1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WTAG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # The spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
92 debug_log_each = (cb) ->
93 for str in g_debug_log
98 constructor: (type, args = {}) ->
99 @type = type # one of the TYPE_* constants above
100 @name = args.name ? '' # tag name
101 @text = args.text ? '' # contents for text/comment nodes
102 @attrs = args.attrs ? {}
103 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104 @children = args.children ? []
105 @namespace = args.namespace ? NS_HTML
106 @parent = args.parent ? null
107 @token = args.token ? null
108 @flags = args.flags ? {}
112 @id = "#{++prev_node_id}"
113 acknowledge_self_closing: ->
115 @token.flag 'did_self_close', true
117 @flag 'did_self_close', true
118 flag: (key, value = null) ->
123 serialize: (shallow = false, show_ids = false) -> # for unit tests
128 ret += JSON.stringify @name
143 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149 ret += c.serialize shallow, show_ids
153 ret += JSON.stringify @text
156 ret += JSON.stringify @text
158 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
161 when TYPE_AAA_BOOKMARK
162 ret += 'aaa_bookmark'
165 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
168 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (a Node of type TYPE_START_TAG) carrying only the
# given tag name; every other Node field takes the constructor's default.
new_open_tag = (name) ->
	new Node TYPE_START_TAG, {name}
# Build an end-tag token (a Node of type TYPE_END_TAG) for the given tag name.
new_end_tag = (name) ->
	new Node TYPE_END_TAG, {name}
# Build an element node (a Node of type TYPE_TAG) with the given tag name.
new_element = (name) ->
	new Node TYPE_TAG, {name}
# Build a text node (a Node of type TYPE_TEXT) whose text is txt.
new_text_node = (txt) ->
	new Node TYPE_TEXT, text: txt
# Character tokens are represented exactly like text nodes, so the same
# factory is reused under a second name.
new_character_token = new_text_node
# Build a comment token (a Node of type TYPE_COMMENT) whose text is txt.
new_comment_token = (txt) ->
	new Node TYPE_COMMENT, text: txt
# Build a doctype token (a Node of type TYPE_DOCTYPE) with the given name.
new_doctype_token = (name) ->
	new Node TYPE_DOCTYPE, {name}
183 return new Node TYPE_EOF
185 return new Node TYPE_AFE_MARKER
# Build a placeholder Node of type TYPE_AAA_BOOKMARK (used by the adoption
# agency algorithm to remember a position in the active formatting list).
new_aaa_bookmark = ->
	new Node TYPE_AAA_BOOKMARK
# Character classes used by the tokenizer, stored as plain strings so that
# membership can be tested with indexOf().
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
digits = "0123456789"
# letters (both cases) followed by the decimal digits
alnum = [lc_alpha, uc_alpha, digits].join ''
# decimal digits followed by the hex letters in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True when str is exactly one character long and that character appears
# in uc_alpha; false otherwise.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1
# True when str is exactly one character long and that character appears
# in lc_alpha; false otherwise.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
200 # some SVG elements have dashes in them
201 tag_name_chars = alnum + "-"
203 # http://www.w3.org/TR/html5/infrastructure.html#space-character
204 space_chars = "\u0009\u000a\u000c\u000d\u0020"
206 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token t is a TYPE_TEXT token whose text is a single character
# found in space_chars; false for every other token.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) > -1
210 is_input_hidden_tok = (t) ->
211 return false unless t.type is TYPE_START_TAG
214 if a[1].toLowerCase() is 'hidden'
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
255 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
275 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
276 raw_text_elements = ['script', 'style']
277 escapable_raw_text_elements = ['textarea', 'title']
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
280 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
299 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305 'determinant', 'diff', 'divergence', 'divide', 'domain',
306 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326 'vectorproduct', 'xor'
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
333 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
345 menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
347 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
348 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
349 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
350 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
351 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
352 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
353 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
356 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
357 'annotation-xml':NS_MATHML,
360 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
363 formatting_elements = {
364 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
365 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
369 mathml_text_integration = {
370 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el's name is a key of mathml_text_integration and the namespace
# recorded there is the same as el's own namespace.
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	expected_ns is el.namespace
374 is_html_integration = (el) -> # DON'T PASS A TOKEN
375 if el.namespace is NS_MATHML
376 if el.name is 'annotation-xml'
377 if el.attrs.encoding?
378 if el.attrs.encoding.toLowerCase() is 'text/html'
380 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
383 if el.namespace is NS_SVG
384 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
389 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
392 foster_parenting_targets = {
# True when special_elements lists e's name under e's own namespace.
el_is_special = (e) ->
	listed_ns = special_elements[e.name]
	listed_ns is e.namespace
# The three HTML elements ("a", "d", "p": address, div, p) that the check
# below carves out of the special category.
adp_els = {address: NS_HTML, div: NS_HTML, p: NS_HTML}
# True when el is listed in special_elements under its own namespace and is
# not one of the adp_els entries for that namespace.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
422 altglyphdef: 'altGlyphDef'
423 altglyphitem: 'altGlyphItem'
424 animatecolor: 'animateColor'
425 animatemotion: 'animateMotion'
426 animatetransform: 'animateTransform'
429 fecolormatrix: 'feColorMatrix'
430 fecomponenttransfer: 'feComponentTransfer'
431 fecomposite: 'feComposite'
432 feconvolvematrix: 'feConvolveMatrix'
433 fediffuselighting: 'feDiffuseLighting'
434 fedisplacementmap: 'feDisplacementMap'
435 fedistantlight: 'feDistantLight'
436 fedropshadow: 'feDropShadow'
442 fegaussianblur: 'feGaussianBlur'
445 femergenode: 'feMergeNode'
446 femorphology: 'feMorphology'
448 fepointlight: 'fePointLight'
449 fespecularlighting: 'feSpecularLighting'
450 fespotlight: 'feSpotLight'
452 feturbulence: 'feTurbulence'
453 foreignobject: 'foreignObject'
455 lineargradient: 'linearGradient'
456 radialgradient: 'radialGradient'
459 svg_attribute_fixes = {
460 attributename: 'attributeName'
461 attributetype: 'attributeType'
462 basefrequency: 'baseFrequency'
463 baseprofile: 'baseProfile'
465 clippathunits: 'clipPathUnits'
466 contentscripttype: 'contentScriptType'
467 contentstyletype: 'contentStyleType'
468 diffuseconstant: 'diffuseConstant'
470 externalresourcesrequired: 'externalResourcesRequired'
471 # WTAG removes this: filterres: 'filterRes'
472 filterunits: 'filterUnits'
474 gradienttransform: 'gradientTransform'
475 gradientunits: 'gradientUnits'
476 kernelmatrix: 'kernelMatrix'
477 kernelunitlength: 'kernelUnitLength'
478 keypoints: 'keyPoints'
479 keysplines: 'keySplines'
481 lengthadjust: 'lengthAdjust'
482 limitingconeangle: 'limitingConeAngle'
483 markerheight: 'markerHeight'
484 markerunits: 'markerUnits'
485 markerwidth: 'markerWidth'
486 maskcontentunits: 'maskContentUnits'
487 maskunits: 'maskUnits'
488 numoctaves: 'numOctaves'
489 pathlength: 'pathLength'
490 patterncontentunits: 'patternContentUnits'
491 patterntransform: 'patternTransform'
492 patternunits: 'patternUnits'
493 pointsatx: 'pointsAtX'
494 pointsaty: 'pointsAtY'
495 pointsatz: 'pointsAtZ'
496 preservealpha: 'preserveAlpha'
497 preserveaspectratio: 'preserveAspectRatio'
498 primitiveunits: 'primitiveUnits'
501 repeatcount: 'repeatCount'
502 repeatdur: 'repeatDur'
503 requiredextensions: 'requiredExtensions'
504 requiredfeatures: 'requiredFeatures'
505 specularconstant: 'specularConstant'
506 specularexponent: 'specularExponent'
507 spreadmethod: 'spreadMethod'
508 startoffset: 'startOffset'
509 stddeviation: 'stdDeviation'
510 stitchtiles: 'stitchTiles'
511 surfacescale: 'surfaceScale'
512 systemlanguage: 'systemLanguage'
513 tablevalues: 'tableValues'
516 textlength: 'textLength'
518 viewtarget: 'viewTarget'
519 xchannelselector: 'xChannelSelector'
520 ychannelselector: 'yChannelSelector'
521 zoomandpan: 'zoomAndPan'
523 foreign_attr_fixes = {
524 'xlink:actuate': 'xlink actuate'
525 'xlink:arcrole': 'xlink arcrole'
526 'xlink:href': 'xlink href'
527 'xlink:role': 'xlink role'
528 'xlink:show': 'xlink show'
529 'xlink:title': 'xlink title'
530 'xlink:type': 'xlink type'
531 'xml:base': 'xml base'
532 'xml:lang': 'xml lang'
533 'xml:space': 'xml space'
535 'xmlns:xlink': 'xmlns xlink'
537 adjust_mathml_attributes = (t) ->
539 if a[0] is 'definitionurl'
540 a[0] = 'definitionURL'
542 adjust_svg_attributes = (t) ->
544 if svg_attribute_fixes[a[0]]?
545 a[0] = svg_attribute_fixes[a[0]]
547 adjust_foreign_attributes = (t) ->
550 if foreign_attr_fixes[a[0]]?
551 a[0] = foreign_attr_fixes[a[0]]
554 # decode_named_char_ref()
556 # The list of named character references is _huge_ so ask the browser to decode
557 # for us instead of wasting bandwidth/space on including the table here.
559 # Pass without the "&" but with the ";" examples:
560 # for "&" pass "amp;"
561 # for "′" pass "x2032;"
564 textarea: document.createElement('textarea')
566 # TODO test this in IE8
567 decode_named_char_ref = (txt) ->
569 decoded = g_dncr.cache[txt]
570 return decoded if decoded?
571 g_dncr.textarea.innerHTML = txt
572 decoded = g_dncr.textarea.value
573 return null if decoded is txt
574 return g_dncr.cache[txt] = decoded
576 parse_html = (args) ->
578 cur = null # index of next char in txt to be parsed
579 # declare doc and tokenizer variables so they're in scope below
581 open_els = null # stack of open elements
582 afe = null # active formatting elements
583 template_ins_modes = null
585 original_ins_mode = null
587 tok_cur_tag = null # partially parsed tag
588 flag_scripting = null
589 flag_frameset_ok = null
591 flag_foster_parenting = null
592 form_element_pointer = null
593 temporary_buffer = null
594 pending_table_character_tokens = null
595 head_element_pointer = null
596 flag_fragment_parsing = null
597 context_element = null
606 console.log "Parse error at character #{cur} of #{txt.length}"
608 afe_push = (new_el) ->
611 if el.name is new_el.name and el.namespace is new_el.namespace
613 continue unless new_el.attrs[k] is v
614 for k, v of new_el.attrs
615 continue unless el.attrs[k] is v
622 afe.unshift new_afe_marker()
624 # the functions below implement the Tree Construction algorithm
625 # http://www.w3.org/TR/html5/syntax.html#tree-construction
627 # But first... the helpers
628 template_tag_is_open = ->
630 if t.name is 'template' and t.namespace is NS_HTML
633 is_in_scope_x = (tag_name, scope, namespace) ->
635 if t.name is tag_name and (namespace is null or namespace is t.namespace)
637 if scope[t.name] is t.namespace
640 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
642 if t.name is tag_name and (namespace is null or namespace is t.namespace)
644 if scope[t.name] is t.namespace
646 if scope2[t.name] is t.namespace
650 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
651 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
654 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
655 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
657 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope tables (tag name -> namespace) handed to the is_in_*_scope helpers:
# button_scopers and li_scopers are passed alongside standard_scopers, while
# table_scopers is passed on its own.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope test for tag_name using the standard_scopers table. A null namespace
# (the default) is forwarded unchanged to is_in_scope_x, which does the work.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, standard_scopers, namespace
# Scope test for tag_name using both standard_scopers and button_scopers;
# delegates to is_in_scope_x_y, forwarding namespace (null by default).
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
# Scope test for tag_name using only the table_scopers table; delegates to
# is_in_scope_x, forwarding namespace (null by default).
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, table_scopers, namespace
# aka is_in_list_item_scope
# Scope test for tag_name using both standard_scopers and li_scopers;
# delegates to is_in_scope_x_y, forwarding namespace (null by default).
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
671 is_in_select_scope = (tag_name, namespace = null) ->
673 if t.name is tag_name and (namespace is null or namespace is t.namespace)
675 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
678 # this checks for a particular element, not by name
679 # this requires a namespace match
680 el_is_in_scope = (needle) ->
684 if standard_scopers[el.name] is el.namespace
688 clear_to_table_stopers = {
693 clear_stack_to_table_context = ->
695 if clear_to_table_stopers[open_els[0].name]?
699 clear_to_table_body_stopers = {
706 clear_stack_to_table_body_context = ->
708 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
712 clear_to_table_row_stopers = {
717 clear_stack_to_table_row_context = ->
719 if clear_to_table_row_stopers[open_els[0].name]?
723 clear_afe_to_marker = ->
725 return unless afe.length > 0 # this happens in fragment case, ?spec error
727 if el.type is TYPE_AFE_MARKER
732 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
734 # 1. Let last be false.
736 # 2. Let node be the last node in the stack of open elements.
738 node = open_els[node_i]
739 # 3. Loop: If node is the first node in the stack of open elements,
740 # then set last to true, and, if the parser was originally created as
741 # part of the HTML fragment parsing algorithm (fragment case) set node
742 # to the context element.
744 if node_i is open_els.length - 1
746 # fixfull (fragment case)
748 # 4. If node is a select element, run these substeps:
749 if node.name is 'select' and node.namespace is NS_HTML
750 # 1. If last is true, jump to the step below labeled done.
752 # 2. Let ancestor be node.
755 # 3. Loop: If ancestor is the first node in the stack of
756 # open elements, jump to the step below labeled done.
758 if ancestor_i is open_els.length - 1
760 # 4. Let ancestor be the node before ancestor in the stack
763 ancestor = open_els[ancestor_i]
764 # 5. If ancestor is a template node, jump to the step below
766 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
768 # 6. If ancestor is a table node, switch the insertion mode
769 # to "in select in table" and abort these steps.
770 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
771 ins_mode = ins_mode_in_select_in_table
773 # 7. Jump back to the step labeled loop.
774 # 8. Done: Switch the insertion mode to "in select" and abort
776 ins_mode = ins_mode_in_select
778 # 5. If node is a td or th element and last is false, then switch
779 # the insertion mode to "in cell" and abort these steps.
780 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
781 ins_mode = ins_mode_in_cell
783 # 6. If node is a tr element, then switch the insertion mode to "in
784 # row" and abort these steps.
785 if node.name is 'tr' and node.namespace is NS_HTML
786 ins_mode = ins_mode_in_row
788 # 7. If node is a tbody, thead, or tfoot element, then switch the
789 # insertion mode to "in table body" and abort these steps.
790 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
791 ins_mode = ins_mode_in_table_body
793 # 8. If node is a caption element, then switch the insertion mode
794 # to "in caption" and abort these steps.
795 if node.name is 'caption' and node.namespace is NS_HTML
796 ins_mode = ins_mode_in_caption
798 # 9. If node is a colgroup element, then switch the insertion mode
799 # to "in column group" and abort these steps.
800 if node.name is 'colgroup' and node.namespace is NS_HTML
801 ins_mode = ins_mode_in_column_group
803 # 10. If node is a table element, then switch the insertion mode to
804 # "in table" and abort these steps.
805 if node.name is 'table' and node.namespace is NS_HTML
806 ins_mode = ins_mode_in_table
808 # 11. If node is a template element, then switch the insertion mode
809 # to the current template insertion mode and abort these steps.
810 if node.name is 'template' and node.namespace is NS_HTML
811 ins_mode = template_ins_modes[0]
813 # 12. If node is a head element and last is true, then switch the
814 # insertion mode to "in body" ("in body"! not "in head"!) and abort
815 # these steps. (fragment case)
816 if node.name is 'head' and node.namespace is NS_HTML and last
817 ins_mode = ins_mode_in_body
819 # 13. If node is a head element and last is false, then switch the
820 # insertion mode to "in head" and abort these steps.
821 if node.name is 'head' and node.namespace is NS_HTML and last is false
822 ins_mode = ins_mode_in_head
824 # 14. If node is a body element, then switch the insertion mode to
825 # "in body" and abort these steps.
826 if node.name is 'body' and node.namespace is NS_HTML
827 ins_mode = ins_mode_in_body
829 # 15. If node is a frameset element, then switch the insertion mode
830 # to "in frameset" and abort these steps. (fragment case)
831 if node.name is 'frameset' and node.namespace is NS_HTML
832 ins_mode = ins_mode_in_frameset
834 # 16. If node is an html element, run these substeps:
835 if node.name is 'html' and node.namespace is NS_HTML
836 # 1. If the head element pointer is null, switch the insertion
837 # mode to "before head" and abort these steps. (fragment case)
838 if head_element_pointer is null
839 ins_mode = ins_mode_before_head
841 # 2. Otherwise, the head element pointer is not null,
842 # switch the insertion mode to "after head" and abort these
844 ins_mode = ins_mode_after_head
846 # 17. If last is true, then switch the insertion mode to "in body"
847 # and abort these steps. (fragment case)
849 ins_mode = ins_mode_in_body
851 # 18. Let node now be the node before node in the stack of open
854 node = open_els[node_i]
855 # 19. Return to the step labeled loop.
859 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
860 adjusted_current_node = ->
861 if open_els.length is 1 and flag_fragment_parsing
862 return context_element
865 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
866 # this implementation is structured (mostly) as described at the link above.
867 # capitalized comments are the "labels" described at the link above.
869 return if afe.length is 0
870 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
875 if i is afe.length - 1
878 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
883 el = insert_html_element afe[i].token
888 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
889 # adoption agency algorithm
891 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
892 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
893 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
894 adoption_agency = (subject) ->
895 debug_log "adoption_agency()"
896 debug_log "tree: #{serialize_els doc.children, false, true}"
897 debug_log "open_els: #{serialize_els open_els, true, true}"
898 debug_log "afe: #{serialize_els afe, true, true}"
899 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
902 # remove it from the list of active formatting elements (if found)
907 debug_log "aaa: starting off with subject on top of stack, exiting"
914 # 5. Let formatting element be the last element in the list of
915 # active formatting elements that: is between the end of the list
916 # and the last scope marker in the list, if any, or the start of
917 # the list otherwise, and has the tag name subject.
919 for t, fe_of_afe in afe
920 if t.type is TYPE_AFE_MARKER
925 # If there is no such element, then abort these steps and instead
926 # act as described in the "any other end tag" entry above.
928 debug_log "aaa: fe not found in afe"
929 in_body_any_other_end_tag subject
931 # 6. If formatting element is not in the stack of open elements,
932 # then this is a parse error; remove the element from the list, and
935 for t, fe_of_open_els in open_els
940 debug_log "aaa: fe not found in open_els"
942 # "remove it from the list" must mean afe, since it's not in open_els
943 afe.splice fe_of_afe, 1
945 # 7. If formatting element is in the stack of open elements, but
946 # the element is not in scope, then this is a parse error; abort
948 unless el_is_in_scope fe
949 debug_log "aaa: fe not in scope"
952 # 8. If formatting element is not the current node, this is a parse
953 # error. (But do not abort these steps.)
954 unless open_els[0] is fe
957 # 9. Let furthest block be the topmost node in the stack of open
958 # elements that is lower in the stack than formatting element, and
959 # is an element in the special category. There might not be one.
961 fb_of_open_els = null
968 # and continue, to see if there's one that's more "topmost"
969 # 10. If there is no furthest block, then the UA must first pop all
970 # the nodes from the bottom of the stack of open elements, from the
971 # current node up to and including formatting element, then remove
972 # formatting element from the list of active formatting elements,
973 # and finally abort these steps.
975 debug_log "aaa: no fb"
979 afe.splice fe_of_afe, 1
981 # 11. Let common ancestor be the element immediately above
982 # formatting element in the stack of open elements.
983 ca = open_els[fe_of_open_els + 1] # common ancestor
985 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
986 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
987 bookmark = new_aaa_bookmark()
990 afe.splice i, 0, bookmark
992 node = last_node = fb
996 # 3. Let node be the element immediately above node in the
997 # stack of open elements, or if node is no longer in the stack
998 # of open elements (e.g. because it got removed by this
999 # algorithm), the element that was immediately above node in
1000 # the stack of open elements before node was removed.
1002 for t, i in open_els
1004 node_next = open_els[i + 1]
1006 node = node_next ? node_above
1007 debug_log "inner loop #{inner}"
1008 debug_log "tree: #{serialize_els doc.children, false, true}"
1009 debug_log "open_els: #{serialize_els open_els, true, true}"
1010 debug_log "afe: #{serialize_els afe, true, true}"
1011 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1012 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1013 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1014 debug_log "node: #{node.serialize true, true}"
1015 # TODO make sure node_above gets re-set if/when node is removed from open_els
1017 # 4. If node is formatting element, then go to the next step in
1018 # the overall algorithm.
1021 debug_log "the meat"
1022 # 5. If inner loop counter is greater than three and node is in
1023 # the list of active formatting elements, then remove node from
1024 # the list of active formatting elements.
1030 debug_log "max out inner"
1035 # 6. If node is not in the list of active formatting elements,
1036 # then remove node from the stack of open elements and then go
1037 # back to the step labeled inner loop.
1039 debug_log "not in afe"
1040 for t, i in open_els
1042 node_above = open_els[i + 1]
1043 open_els.splice i, 1
1046 debug_log "the bones"
1047 # 7. create an element for the token for which the element node
1048 # was created, in the HTML namespace, with common ancestor as
1049 # the intended parent; replace the entry for node in the list
1050 # of active formatting elements with an entry for the new
1051 # element, replace the entry for node in the stack of open
1052 # elements with an entry for the new element, and let node be
1054 new_node = token_to_element node.token, NS_HTML, ca
1058 debug_log "replaced in afe"
1060 for t, i in open_els
1062 node_above = open_els[i + 1]
1063 open_els[i] = new_node
1064 debug_log "replaced in open_els"
1067 # 8. If last node is furthest block, then move the
1068 # aforementioned bookmark to be immediately after the new node
1069 # in the list of active formatting elements.
1074 debug_log "removed bookmark"
1078 # "after" means lower
1079 afe.splice i, 0, bookmark # "after as <-
1080 debug_log "placed bookmark after node"
1081 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1083 # 9. Insert last node into node, first removing it from its
1084 # previous parent node if any.
1085 if last_node.parent?
1086 debug_log "last_node has parent"
1087 for c, i in last_node.parent.children
1089 debug_log "removing last_node from parent"
1090 last_node.parent.children.splice i, 1
1092 node.children.push last_node
1093 last_node.parent = node
1094 # 10. Let last node be node.
1097 # 11. Return to the step labeled inner loop.
1098 # 14. Insert whatever last node ended up being in the previous step
1099 # at the appropriate place for inserting a node, but using common
1100 # ancestor as the override target.
1102 # In the case where fe is immediately followed by fb:
1103 # * inner loop exits out early (node==fe)
1105 # * last_node is still in the tree (not a duplicate)
1106 if last_node.parent?
1107 debug_log "FEFIRST? last_node has parent"
1108 for c, i in last_node.parent.children
1110 debug_log "removing last_node from parent"
1111 last_node.parent.children.splice i, 1
1114 debug_log "after aaa inner loop"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1119 debug_log "tree: #{serialize_els doc.children, false, true}"
1124 # can't use standard insert token thing, because it's already in
1125 # open_els and must stay at its current position in open_els
1126 dest = adjusted_insertion_location ca
1127 dest[0].children.splice dest[1], 0, last_node
1128 last_node.parent = dest[0]
1131 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1132 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1133 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1134 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1135 debug_log "tree: #{serialize_els doc.children, false, true}"
1137 # 15. Create an element for the token for which formatting element
1138 # was created, in the HTML namespace, with furthest block as the
1140 new_element = token_to_element fe.token, NS_HTML, fb
1141 # 16. Take all of the child nodes of furthest block and append them
1142 # to the element created in the last step.
1143 while fb.children.length
1144 t = fb.children.shift()
1145 t.parent = new_element
1146 new_element.children.push t
1147 # 17. Append that new element to furthest block.
1148 new_element.parent = fb
1149 fb.children.push new_element
1150 # 18. Remove formatting element from the list of active formatting
1151 # elements, and insert the new element into the list of active
1152 # formatting elements at the position of the aforementioned
1160 afe[i] = new_element
1162 # 19. Remove formatting element from the stack of open elements,
1163 # and insert the new element into the stack of open elements
1164 # immediately below the position of furthest block in that stack.
1165 for t, i in open_els
1167 open_els.splice i, 1
1169 for t, i in open_els
1171 open_els.splice i, 0, new_element
1173 # 20. Jump back to the step labeled outer loop.
1174 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1175 debug_log "tree: #{serialize_els doc.children, false, true}"
1176 debug_log "open_els: #{serialize_els open_els, true, true}"
1177 debug_log "afe: #{serialize_els afe, true, true}"
1178 debug_log "AAA DONE"
1180 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes the open HTML "p" element: first generates implied end tags (for
# everything except "p" itself), then pops entries off the front of open_els.
# Takes no arguments; operates on the shared open_els stack.
1181 close_p_element = ->
1182 generate_implied_end_tags 'p' # arg is exception
# check whether the current node (open_els[0]) is an HTML "p" element
1183 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
# pop from index 0 (the current node); the "> 1" guard means the last
# remaining entry of open_els is never popped by this loop
1185 while open_els.length > 1 # just in case
1186 el = open_els.shift()
# test whether the element just popped was the HTML "p"
# (presumably the point where popping stops -- confirm)
1187 if el.name is 'p' and el.namespace is NS_HTML
# Convenience guard used by the "in body" rules before inserting block-level
# content: tests whether an HTML "p" element is currently in button scope.
# NOTE(review): the guarded action is presumably close_p_element() -- confirm.
1189 close_p_if_in_button_scope = ->
1190 if is_in_button_scope 'p', NS_HTML
1193 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1194 # aka insert_a_character = (t) ->
# Inserts the text token t at the adjusted insertion location.
#   t: a text token/node to place into the tree
1195 insert_character = (t) ->
# dest is a [parent_node, index_within_parent.children] pair
1196 dest = adjusted_insertion_location()
1197 # fixfull check for Document node
# prev is the sibling immediately before the insertion point
1199 prev = dest[0].children[dest[1] - 1]
# is that sibling a text node? (presumably t is merged into it when so -- confirm)
1200 if prev.type is TYPE_TEXT
# place t into the parent's children at the insertion index
1203 dest[0].children.splice dest[1], 0, t
1206 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for a single token t. The tests below inspect
# the adjusted current node (acn) and the token type/name; a token that is not
# claimed by one of them is handed to in_foreign_content.
# NOTE(review): each test presumably routes t to the current insertion mode
# and returns -- confirm against the full source.
1207 process_token = (t) ->
1208 acn = adjusted_current_node()
# adjusted current node is in the HTML namespace
1212 if acn.namespace is NS_HTML
# MathML text integration point: start tags other than mglyph/malignmark, and text
1215 if is_mathml_text_integration_point(acn)
1216 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1219 if t.type is TYPE_TEXT
# an <svg> start tag directly inside MathML annotation-xml
1222 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
# HTML integration point: start tags and text
1225 if is_html_integration acn
1226 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
# end of file
1229 if t.type is TYPE_EOF
# none of the above: process by the rules for foreign content
1232 in_foreign_content t
1236 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1237 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the "appropriate place for inserting a node".
#   override_target: optional node to use instead of the current node
#   returns: [target, target_i] -- the parent node and the index within
#            target.children at which a new child should be spliced in
1238 adjusted_insertion_location = (override_target = null) ->
1239 # 1. If there was an override target specified, then let target be the
1242 target = override_target
1243 else # Otherwise, let target be the current node.
1244 target = open_els[0]
1245 # 2. Determine the adjusted insertion location using the first matching
1246 # steps from the following list:
1248 # If foster parenting is enabled and target is a table, tbody, tfoot,
1249 # thead, or tr element Foster parenting happens when content is
1250 # misnested in tables.
# foster_parenting_targets maps element name -> namespace, so this tests both
1251 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1252 loop # once. this is here so we can ``break`` to "abort these substeps"
1253 # 1. Let last template be the last template element in the
1254 # stack of open elements, if any.
1255 last_template = null
1256 last_template_i = null
# open_els[0] is the current (most recently added) node, so scanning from
# index 0 meets the most recently added matching element first
1257 for el, i in open_els
1258 if el.name is 'template' and el.namespace is NS_HTML
1262 # 2. Let last table be the last table element in the stack of
1263 # open elements, if any.
1266 for el, i in open_els
1267 if el.name is 'table' and el.namespace is NS_HTML
1271 # 3. If there is a last template and either there is no last
1272 # table, or there is one, but last template is lower (more
1273 # recently added) than last table in the stack of open
1274 # elements, then: let adjusted insertion location be inside
1275 # last template's template contents, after its last child (if
1276 # any), and abort these substeps.
# a smaller index in open_els means "lower" / more recently added
1277 if last_template and (last_table is null or last_template_i < last_table_i)
1278 target = last_template # fixfull should be its contents
1279 target_i = target.children.length
1281 # 4. If there is no last table, then let adjusted insertion
1282 # location be inside the first element in the stack of open
1283 # elements (the html element), after its last child (if any),
1284 # and abort these substeps. (fragment case)
1285 if last_table is null
# the last array entry is the first element pushed (the html element)
1287 target = open_els[open_els.length - 1]
1288 target_i = target.children.length
1290 # 5. If last table has a parent element, then let adjusted
1291 # insertion location be inside last table's parent element,
1292 # immediately before last table, and abort these substeps.
1293 if last_table.parent?
1294 for c, i in last_table.parent.children
1296 target = last_table.parent
1300 # 6. Let previous element be the element immediately above last
1301 # table in the stack of open elements.
1303 # huh? how could it not have a parent?
# "above" in the stack means the next-higher index in open_els
1304 previous_element = open_els[last_table_i + 1]
1305 # 7. Let adjusted insertion location be inside previous
1306 # element, after its last child (if any).
1307 target = previous_element
1308 target_i = target.children.length
1309 # Note: These steps are involved in part because it's possible
1310 # for elements, the table element in this case in particular,
1311 # to have been moved by a script around in the DOM, or indeed
1312 # removed from the DOM entirely, after the element was inserted
1314 break # don't really loop
1316 # Otherwise Let adjusted insertion location be inside target, after
1317 # its last child (if any).
1318 target_i = target.children.length
1320 # 3. If the adjusted insertion location is inside a template element,
1321 # let it instead be inside the template element's template contents,
1322 # after its last child (if any).
1323 # fixfull (template)
1325 # 4. Return the adjusted insertion location.
1326 return [target, target_i]
1328 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1329 # aka create_an_element_for_token
# Builds a TYPE_TAG Node for a tag token.
#   t: the tag token; the new node keeps a reference to it in its "token"
#      option so an equivalent element can be created again later
#   namespace: namespace to give the new element
#   intended_parent: the node the element is going to be inserted into
1330 token_to_element = (t, namespace, intended_parent) ->
1331 # convert attributes into a hash
# each a is a [name, value] pair; a later pair with the same name overwrites
1334 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1335 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1337 # TODO 2. If the newly created element has an xmlns attribute in the
1338 # XMLNS namespace whose value is not exactly the same as the element's
1339 # namespace, that is a parse error. Similarly, if the newly created
1340 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1341 # value is not the XLink Namespace, that is a parse error.
1343 # fixfull: the spec says stuff about form pointers and ownerDocument
1347 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for token in the given namespace and splices it into the
# tree at the adjusted insertion location.
#   token: the tag token to create the element from
#   namespace: namespace for the new element
1348 insert_foreign_element = (token, namespace) ->
# ail is a [parent_node, index_within_children] pair
1349 ail = adjusted_insertion_location()
# ail_el / ail_i: presumably the two halves of ail (parent and index) -- confirm
1352 el = token_to_element token, namespace, ail_el
1353 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1355 ail_el.children.splice ail_i, 0, el
1358 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Shorthand for insert_foreign_element with the HTML namespace.
#   token: the tag token to create the element from
# The value of the insert_foreign_element call is the implicit return value;
# callers use it as the inserted element (el = insert_html_element t).
1359 insert_html_element = (token) ->
1360 insert_foreign_element token, NS_HTML
1362 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1363 # position should be [node, index_within_children]
# Inserts the comment token t into the tree.
#   t: the comment token/node to insert
#   position: optional explicit location; when omitted (null) the adjusted
#             insertion location is used
1364 insert_comment = (t, position = null) ->
# ?= only assigns when position is null/undefined
1365 position ?= adjusted_insertion_location()
# splice t into the parent's children at the given index
1366 position[0].children.splice position[1], 0, t
1369 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: inserts an HTML element for start tag t,
# switches the tokenizer to the RAWTEXT state, and switches the tree builder
# to the "text" insertion mode.
1370 parse_generic_raw_text = (t) ->
1371 insert_html_element t
1372 tok_state = tok_state_rawtext
# save the current mode before overwriting it (order matters)
1373 original_ins_mode = ins_mode
1374 ins_mode = ins_mode_text
# Generic RCDATA element parsing: same as parse_generic_raw_text, except the
# tokenizer is switched to the RCDATA state instead of RAWTEXT.
1375 parse_generic_rcdata_text = (t) ->
1376 insert_html_element t
1377 tok_state = tok_state_rcdata
# save the current mode before overwriting it (order matters)
1378 original_ins_mode = ins_mode
1379 ins_mode = ins_mode_text
1381 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1382 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) is an element whose end tag can
# be implied.
#   except: optional element name that stops the loop when it is the current node
# end_tag_implied maps element name -> namespace, so the first comparison
# checks name and namespace together.
# NOTE(review): the loop body presumably pops the current node -- confirm.
1383 generate_implied_end_tags = (except = null) ->
1384 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1387 # 8.2.5.4 The rules for parsing tokens in HTML content
1388 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1390 # 8.2.5.4.1 The "initial" insertion mode
1391 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handler for token t while the tree builder is in the "initial" mode.
# Both the DOCTYPE branch and the final fall-back switch the insertion mode
# to "before html".
1392 ins_mode_initial = (t) ->
# comment token
1395 if t.type is TYPE_COMMENT
# DOCTYPE token
1399 if t.type is TYPE_DOCTYPE
1400 # FIXME check identifiers, set quirks, etc
1403 ins_mode = ins_mode_before_html
# anything else
1406 #fixfull (iframe, quirks)
1407 ins_mode = ins_mode_before_html
1411 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handler for token t in the "before html" mode. An <html> start tag (or,
# failing that, a synthesized one) becomes the root element: it is appended to
# doc.children and the mode switches to "before head".
1412 ins_mode_before_html = (t) ->
1413 if t.type is TYPE_DOCTYPE
1416 if t.type is TYPE_COMMENT
# explicit <html>: create the root element from the token itself
1421 if t.type is TYPE_START_TAG and t.name is 'html'
1422 el = token_to_element t, NS_HTML, doc
1423 doc.children.push el
# the root becomes the current node (front of open_els)
1424 open_els.unshift(el)
1425 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1426 ins_mode = ins_mode_before_head
1428 if t.type is TYPE_END_TAG
# only these four end tags are allowed to reach "anything else"
1429 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1430 # fall through to "anything else"
# anything else: synthesize an <html> start tag and create the root from it
1435 html_tok = new_open_tag 'html'
1436 el = token_to_element html_tok, NS_HTML, doc
1437 doc.children.push el
1439 # ?fixfull browsing context
1440 ins_mode = ins_mode_before_head
1444 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handler for token t in the "before head" mode. A <head> start tag (or,
# failing that, a synthesized one) is inserted, remembered in
# head_element_pointer, and the mode switches to "in head".
1445 ins_mode_before_head = (t) ->
1448 if t.type is TYPE_COMMENT
1451 if t.type is TYPE_DOCTYPE
1454 if t.type is TYPE_START_TAG and t.name is 'html'
# explicit <head>
1457 if t.type is TYPE_START_TAG and t.name is 'head'
1458 el = insert_html_element t
1459 head_element_pointer = el
1460 ins_mode = ins_mode_in_head
1462 if t.type is TYPE_END_TAG
# only these four end tags are allowed to reach "anything else"
1463 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1464 # fall through to Anything else below
# anything else: synthesize a <head> start tag and insert it
1469 head_tok = new_open_tag 'head'
1470 el = insert_html_element head_tok
1471 head_element_pointer = el
1472 ins_mode = ins_mode_in_head
1475 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" mode: pops the current node off
# open_els and switches the insertion mode to "after head".
#   t: the token being processed
1476 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1477 open_els.shift() # spec says this will be a 'head' node
1478 ins_mode = ins_mode_after_head
# Handler for token t in the "in head" insertion mode: metadata elements
# (base, link, meta, title, style, script, template, ...) are inserted here;
# anything unrecognised goes through ins_mode_in_head_else, which pops the
# current node and switches to "after head".
1480 ins_mode_in_head = (t) ->
# whitespace character tokens
1481 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1484 if t.type is TYPE_COMMENT
1487 if t.type is TYPE_DOCTYPE
1490 if t.type is TYPE_START_TAG and t.name is 'html'
# void metadata elements: insert, then acknowledge a self-closing flag
1493 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1494 el = insert_html_element t
1496 t.acknowledge_self_closing()
1498 if t.type is TYPE_START_TAG and t.name is 'meta'
1499 el = insert_html_element t
1501 t.acknowledge_self_closing()
1502 # fixfull encoding stuff
1504 if t.type is TYPE_START_TAG and t.name is 'title'
1505 parse_generic_rcdata_text t
# noscript is raw text only when scripting is enabled
1507 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1508 parse_generic_raw_text t
1510 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1511 insert_html_element t
1512 ins_mode = ins_mode_in_head_noscript
1514 if t.type is TYPE_START_TAG and t.name is 'script'
# ail is a [parent_node, index_within_children] pair
1515 ail = adjusted_insertion_location()
# pass the parent node (ail[0]) as the intended parent, not the whole pair
1516 el = token_to_element t, NS_HTML, ail[0]
1517 el.flag 'parser-inserted', true
1518 # fixfull fragment case
1519 ail[0].children.splice ail[1], 0, el
1521 tok_state = tok_state_script_data
1522 original_ins_mode = ins_mode # make sure orig... is defined
1523 ins_mode = ins_mode_text
1525 if t.type is TYPE_END_TAG and t.name is 'head'
1526 open_els.shift() # will be a head element... spec says so
1527 ins_mode = ins_mode_after_head
1529 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1530 ins_mode_in_head_else t
1532 if t.type is TYPE_START_TAG and t.name is 'template'
1533 insert_html_element t
1535 flag_frameset_ok = false
1536 ins_mode = ins_mode_in_template
# push "in template" onto the stack of template insertion modes
1537 template_ins_modes.unshift ins_mode_in_template
1539 if t.type is TYPE_END_TAG and t.name is 'template'
1540 if template_tag_is_open()
# must be an actual call: a bare identifier is only a reference in CoffeeScript
1541 generate_implied_end_tags()
1542 if open_els[0].name isnt 'template'
# pop up to and including the template element
1545 el = open_els.shift()
1546 if el.name is 'template' and el.namespace is NS_HTML
1548 clear_afe_to_marker()
1549 template_ins_modes.shift()
# stray <head> start tag or any other end tag
1554 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
# anything else
1557 ins_mode_in_head_else t
1559 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" mode; it switches the
# insertion mode back to "in head".
#   t: the token being processed
1560 ins_mode_in_head_noscript_else = (t) ->
1563 ins_mode = ins_mode_in_head
# Handler for token t in the "in head noscript" insertion mode (a <noscript>
# inside <head> while scripting is disabled).
1565 ins_mode_in_head_noscript = (t) ->
1566 if t.type is TYPE_DOCTYPE
1569 if t.type is TYPE_START_TAG and t.name is 'html'
# </noscript> ends this mode and returns to "in head"
1572 if t.type is TYPE_END_TAG and t.name is 'noscript'
1574 ins_mode = ins_mode_in_head
# whitespace, comments and this set of metadata start tags share one branch
1576 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
# </br> is treated as "anything else"
1579 if t.type is TYPE_END_TAG and t.name is 'br'
1580 ins_mode_in_head_noscript_else t
# stray <head>/<noscript> start tags and any other end tag
1582 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
# anything else
1586 ins_mode_in_head_noscript_else t
1591 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" mode: synthesizes a <body> start
# tag, inserts it, and switches the insertion mode to "in body".
#   t: the token being processed
1592 ins_mode_after_head_else = (t) ->
1593 body_tok = new_open_tag 'body'
1594 insert_html_element body_tok
1595 ins_mode = ins_mode_in_body
# Handler for token t in the "after head" insertion mode (between </head> and
# <body>/<frameset>). Metadata start tags that show up here are handled with
# the head element temporarily pushed back onto open_els.
1598 ins_mode_after_head = (t) ->
1602 if t.type is TYPE_COMMENT
1605 if t.type is TYPE_DOCTYPE
1608 if t.type is TYPE_START_TAG and t.name is 'html'
1611 if t.type is TYPE_START_TAG and t.name is 'body'
1612 insert_html_element t
1613 flag_frameset_ok = false
1614 ins_mode = ins_mode_in_body
1616 if t.type is TYPE_START_TAG and t.name is 'frameset'
1617 insert_html_element t
1618 ins_mode = ins_mode_in_frameset
# head-only elements appearing after </head>
1620 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# temporarily make the head element the current node again
1622 open_els.unshift head_element_pointer
# find the head element again (it need not be at index 0 any more) and remove
# it; "in" iterates array values with their index, whereas "of" would bind el
# to the index and i to the element, so the comparison could never match
1624 for el, i in open_els
1625 if el is head_element_pointer
1626 open_els.splice i, 1
1628 console.log "warning: 23904 couldn't find head element in open_els"
1630 if t.type is TYPE_END_TAG and t.name is 'template'
# these end tags are treated as "anything else"
1633 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1634 ins_mode_after_head_else t
# stray <head> start tag or any other end tag
1636 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
# anything else
1640 ins_mode_after_head_else t
1642 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling for the "in body" mode.
#   name: tag name of the end tag being processed
# Walks open_els starting at the current node (index 0) looking for an HTML
# element with that name.
1643 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1644 for el, i in open_els
# found the matching element
1645 if el.name is name and el.namespace is NS_HTML
1646 generate_implied_end_tags name # arg is exception
# it is a parse error when the match is not the current node
1647 parse_error() unless i is 0
# special_elements maps element name -> namespace; a special element is
# reached before any match
1652 if special_elements[el.name] is el.namespace
1656 ins_mode_in_body = (t) ->
1657 if t.type is TYPE_TEXT and t.text is "\u0000"
1664 if t.type is TYPE_TEXT
1667 flag_frameset_ok = false
1669 if t.type is TYPE_COMMENT
1672 if t.type is TYPE_DOCTYPE
1675 if t.type is TYPE_START_TAG and t.name is 'html'
1677 return if template_tag_is_open()
1678 root_attrs = open_els[open_els.length - 1].attrs
1680 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1683 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1686 if t.type is TYPE_START_TAG and t.name is 'body'
1688 return if open_els.length < 2
1689 second = open_els[open_els.length - 2]
1690 return unless second.namespace is NS_HTML
1691 return unless second.name is 'body'
1692 return if template_tag_is_open()
1693 flag_frameset_ok = false
1695 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1697 if t.type is TYPE_START_TAG and t.name is 'frameset'
1699 return if open_els.length < 2
1700 second_i = open_els.length - 2
1701 second = open_els[second_i]
1702 return unless second.namespace is NS_HTML
1703 return unless second.name is 'body'
1704 if flag_frameset_ok is false
1707 for el, i in second.parent.children
1709 second.parent.children.splice i, 1
1711 open_els.splice second_i, 1
1712 # pop everything except the "root html element"
1713 while open_els.length > 1
1715 insert_html_element t
1716 ins_mode = ins_mode_in_frameset
1718 if t.type is TYPE_EOF
1720 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1721 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1722 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1725 unless ok_tags[t.name] is el.namespace
1728 if template_ins_modes.length > 0
1729 ins_mode_in_template t
1733 if t.type is TYPE_END_TAG and t.name is 'body'
1734 unless is_in_scope 'body', NS_HTML
1738 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1739 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1740 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1741 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1745 unless ok_tags[t.name] is el.namespace
1748 ins_mode = ins_mode_after_body
1750 if t.type is TYPE_END_TAG and t.name is 'html'
1751 unless is_in_scope 'body', NS_HTML
1755 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1756 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1757 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1758 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1762 unless ok_tags[t.name] is el.namespace
1765 ins_mode = ins_mode_after_body
1768 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1769 close_p_if_in_button_scope()
1770 insert_html_element t
1772 if t.type is TYPE_START_TAG and h_tags[t.name]?
1773 close_p_if_in_button_scope()
1774 if h_tags[open_els[0].name] is open_els[0].namespace
1777 insert_html_element t
1779 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1780 close_p_if_in_button_scope()
1781 insert_html_element t
1782 # spec: If the next token is a "LF" (U+000A) character token, then
1783 # ignore that token and move on to the next one. (Newlines at the
1784 # start of pre blocks are ignored as an authoring convenience.)
1785 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1787 flag_frameset_ok = false
1789 if t.type is TYPE_START_TAG and t.name is 'form'
1790 unless form_element_pointer is null or template_tag_is_open()
1793 close_p_if_in_button_scope()
1794 el = insert_html_element t
1795 unless template_tag_is_open()
1796 form_element_pointer = el
1798 if t.type is TYPE_START_TAG and t.name is 'li'
1799 flag_frameset_ok = false
1800 for node in open_els
1801 if node.name is 'li' and node.namespace is NS_HTML
1802 generate_implied_end_tags 'li' # arg is exception
1803 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1806 el = open_els.shift()
1807 if el.name is 'li' and el.namespace is NS_HTML
1810 if el_is_special_not_adp node
1812 close_p_if_in_button_scope()
1813 insert_html_element t
1815 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1816 flag_frameset_ok = false
1817 for node in open_els
1818 if node.name is 'dd' and node.namespace is NS_HTML
1819 generate_implied_end_tags 'dd' # arg is exception
1820 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1823 el = open_els.shift()
1824 if el.name is 'dd' and el.namespace is NS_HTML
1827 if node.name is 'dt' and node.namespace is NS_HTML
1828 generate_implied_end_tags 'dt' # arg is exception
1829 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1832 el = open_els.shift()
1833 if el.name is 'dt' and el.namespace is NS_HTML
1836 if el_is_special_not_adp node
1838 close_p_if_in_button_scope()
1839 insert_html_element t
1841 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1842 close_p_if_in_button_scope()
1843 insert_html_element t
1844 tok_state = tok_state_plaintext
1846 if t.type is TYPE_START_TAG and t.name is 'button'
1847 if is_in_scope 'button', NS_HTML
1849 generate_implied_end_tags()
1851 el = open_els.shift()
1852 if el.name is 'button' and el.namespace is NS_HTML
1855 insert_html_element t
1856 flag_frameset_ok = false
1858 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1859 unless is_in_scope t.name, NS_HTML
1862 generate_implied_end_tags()
1863 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1866 el = open_els.shift()
1867 if el.name is t.name and el.namespace is NS_HTML
1870 if t.type is TYPE_END_TAG and t.name is 'form'
1871 unless template_tag_is_open()
1872 node = form_element_pointer
1873 form_element_pointer = null
1874 if node is null or not el_is_in_scope node
1877 generate_implied_end_tags()
1878 if open_els[0] isnt node
1880 for el, i in open_els
1882 open_els.splice i, 1
1885 unless is_in_scope 'form', NS_HTML
1888 generate_implied_end_tags()
1889 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1892 el = open_els.shift()
1893 if el.name is 'form' and el.namespace is NS_HTML
1896 if t.type is TYPE_END_TAG and t.name is 'p'
1897 unless is_in_button_scope 'p', NS_HTML
1899 insert_html_element new_open_tag 'p'
1902 if t.type is TYPE_END_TAG and t.name is 'li'
1903 unless is_in_li_scope 'li', NS_HTML
1906 generate_implied_end_tags 'li' # arg is exception
1907 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1910 el = open_els.shift()
1911 if el.name is 'li' and el.namespace is NS_HTML
1914 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1915 unless is_in_scope t.name, NS_HTML
1918 generate_implied_end_tags t.name # arg is exception
1919 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1922 el = open_els.shift()
1923 if el.name is t.name and el.namespace is NS_HTML
1926 if t.type is TYPE_END_TAG and h_tags[t.name]?
1929 if h_tags[el.name] is el.namespace
1932 if standard_scopers[el.name] is el.namespace
1937 generate_implied_end_tags()
1938 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1941 el = open_els.shift()
1942 if h_tags[el.name] is el.namespace
1946 if t.type is TYPE_START_TAG and t.name is 'a'
1947 # If the list of active formatting elements contains an a element
1948 # between the end of the list and the last marker on the list (or
1949 # the start of the list if there is no marker on the list), then
1950 # this is a parse error; run the adoption agency algorithm for the
1951 # tag name "a", then remove that element from the list of active
1952 # formatting elements and the stack of open elements if the
1953 # adoption agency algorithm didn't already remove it (it might not
1954 # have if the element is not in table scope).
1957 if el.type is TYPE_AFE_MARKER
1959 if el.name is 'a' and el.namespace is NS_HTML
1967 for el, i in open_els
1969 open_els.splice i, 1
1971 el = insert_html_element t
1974 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1976 el = insert_html_element t
1979 if t.type is TYPE_START_TAG and t.name is 'nobr'
1981 el = insert_html_element t
1984 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1985 adoption_agency t.name
1987 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1989 insert_html_element t
1991 flag_frameset_ok = false
1993 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1994 unless is_in_scope t.name, NS_HTML
1997 generate_implied_end_tags()
1998 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2001 el = open_els.shift()
2002 if el.name is t.name and el.namespace is NS_HTML
2004 clear_afe_to_marker()
2006 if t.type is TYPE_START_TAG and t.name is 'table'
2007 close_p_if_in_button_scope() # fixfull quirksmode thing
2008 insert_html_element t
2009 flag_frameset_ok = false
2010 ins_mode = ins_mode_in_table
2012 if t.type is TYPE_END_TAG and t.name is 'br'
2014 t.type is TYPE_START_TAG
2016 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2018 insert_html_element t
2020 t.acknowledge_self_closing()
2021 flag_frameset_ok = false
2023 if t.type is TYPE_START_TAG and t.name is 'input'
2025 insert_html_element t
2027 t.acknowledge_self_closing()
2028 unless is_input_hidden_tok t
2029 flag_frameset_ok = false
2031 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2032 insert_html_element t
2034 t.acknowledge_self_closing()
2036 if t.type is TYPE_START_TAG and t.name is 'hr'
2037 close_p_if_in_button_scope()
2038 insert_html_element t
2040 t.acknowledge_self_closing()
2041 flag_frameset_ok = false
2043 if t.type is TYPE_START_TAG and t.name is 'image'
2048 if t.type is TYPE_START_TAG and t.name is 'isindex'
2050 if template_tag_is_open() is false and form_element_pointer isnt null
2052 t.acknowledge_self_closing()
2053 flag_frameset_ok = false
2054 close_p_if_in_button_scope()
2055 el = insert_html_element new_open_tag 'form'
2056 unless template_tag_is_open()
2057 form_element_pointer = el
2060 el.attrs['action'] = a[1]
2062 insert_html_element new_open_tag 'hr'
2065 insert_html_element new_open_tag 'label'
2066 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2067 input_el = new_open_tag 'input'
2072 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2073 input_el.attrs_a.push [a[0], a[1]]
2074 input_el.attrs_a.push ['name', 'isindex']
2075 # fixfull this next bit is in english... internationalize?
2076 prompt ?= "This is a searchable index. Enter search keywords: "
2077 insert_character new_character_token prompt # fixfull split
2078 # TODO submit typo "balue" in spec
2079 insert_html_element input_el
2081 # insert_character '' # you can put chars here if prompt attr missing
2083 insert_html_element new_open_tag 'hr'
2086 unless template_tag_is_open()
2087 form_element_pointer = null
2089 if t.type is TYPE_START_TAG and t.name is 'textarea'
2090 insert_html_element t
2091 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2093 tok_state = tok_state_rcdata
2094 original_ins_mode = ins_mode
2095 flag_frameset_ok = false
2096 ins_mode = ins_mode_text
2098 if t.type is TYPE_START_TAG and t.name is 'xmp'
2099 close_p_if_in_button_scope()
2101 flag_frameset_ok = false
2102 parse_generic_raw_text t
2104 if t.type is TYPE_START_TAG and t.name is 'iframe'
2105 flag_frameset_ok = false
2106 parse_generic_raw_text t
2108 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2109 parse_generic_raw_text t
2111 if t.type is TYPE_START_TAG and t.name is 'select'
2113 insert_html_element t
2114 flag_frameset_ok = false
2115 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2116 ins_mode = ins_mode_in_select_in_table
2118 ins_mode = ins_mode_in_select
2120 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2121 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2124 insert_html_element t
2126 # this comment block implements the W3C spec
2127 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2128 # if is_in_scope 'ruby', NS_HTML
2129 # generate_implied_end_tags()
2130 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2132 # insert_html_element t
2134 # if t.type is TYPE_START_TAG and t.name is 'rt'
2135 # if is_in_scope 'ruby', NS_HTML
2136 # generate_implied_end_tags 'rtc' # arg is exception
2137 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2139 # insert_html_element t
# below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2142 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2143 if is_in_scope 'ruby', NS_HTML
2144 generate_implied_end_tags()
2145 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2147 insert_html_element t
2149 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2150 if is_in_scope 'ruby', NS_HTML
2151 generate_implied_end_tags 'rtc'
2152 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2154 insert_html_element t
2157 if t.type is TYPE_START_TAG and t.name is 'math'
2159 adjust_mathml_attributes t
2160 adjust_foreign_attributes t
2161 insert_foreign_element t, NS_MATHML
2162 if t.flag 'self-closing'
2164 t.acknowledge_self_closing()
2166 if t.type is TYPE_START_TAG and t.name is 'svg'
2168 adjust_svg_attributes t
2169 adjust_foreign_attributes t
2170 insert_foreign_element t, NS_SVG
2171 if t.flag 'self-closing'
2173 t.acknowledge_self_closing()
2175 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2178 if t.type is TYPE_START_TAG # any other start tag
2180 insert_html_element t
2182 if t.type is TYPE_END_TAG # any other end tag
2183 in_body_any_other_end_tag t.name
2187 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2188 ins_mode_text = (t) ->
2189 if t.type is TYPE_TEXT
2192 if t.type is TYPE_EOF
2194 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2195 open_els[0].flag 'already started', true
2197 ins_mode = original_ins_mode
2200 if t.type is TYPE_END_TAG and t.name is 'script'
2202 ins_mode = original_ins_mode
2203 # fixfull the spec seems to assume that I'm going to run the script
2204 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2206 if t.type is TYPE_END_TAG
2208 ins_mode = original_ins_mode
2210 console.log 'warning: end of ins_mode_text reached'
# the functions below implement the tokenizer states described here:
2213 # http://www.w3.org/TR/html5/syntax.html#tokenization
2215 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2216 ins_mode_in_table_else = (t) ->
2218 flag_foster_parenting = true
2220 flag_foster_parenting = false
2222 ins_mode_in_table = (t) ->
2225 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2226 pending_table_character_tokens = []
2227 original_ins_mode = ins_mode
2228 ins_mode = ins_mode_in_table_text
2231 ins_mode_in_table_else t
2239 clear_stack_to_table_context()
2241 insert_html_element t
2242 ins_mode = ins_mode_in_caption
2244 clear_stack_to_table_context()
2245 insert_html_element t
2246 ins_mode = ins_mode_in_column_group
2248 clear_stack_to_table_context()
2249 insert_html_element new_open_tag 'colgroup'
2250 ins_mode = ins_mode_in_column_group
2252 when 'tbody', 'tfoot', 'thead'
2253 clear_stack_to_table_context()
2254 insert_html_element t
2255 ins_mode = ins_mode_in_table_body
2256 when 'td', 'th', 'tr'
2257 clear_stack_to_table_context()
2258 insert_html_element new_open_tag 'tbody'
2259 ins_mode = ins_mode_in_table_body
2263 if is_in_table_scope 'table', NS_HTML
2265 el = open_els.shift()
2266 if el.name is 'table' and el.namespace is NS_HTML
2270 when 'style', 'script', 'template'
2273 unless is_input_hidden_tok t
2274 ins_mode_in_table_else t
2277 el = insert_html_element t
2279 t.acknowledge_self_closing()
2282 if form_element_pointer?
2284 if template_tag_is_open()
2286 form_element_pointer = insert_html_element t
2289 ins_mode_in_table_else t
2293 if is_in_table_scope 'table', NS_HTML
2295 el = open_els.shift()
2296 if el.name is 'table' and el.namespace is NS_HTML
2301 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2306 ins_mode_in_table_else t
2310 ins_mode_in_table_else t
2313 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2314 ins_mode_in_table_text = (t) ->
2315 if t.type is TYPE_TEXT and t.text is "\u0000"
2319 if t.type is TYPE_TEXT
2320 pending_table_character_tokens.push t
2324 for old in pending_table_character_tokens
2325 unless is_space_tok old
2329 for old in pending_table_character_tokens
2330 insert_character old
2332 for old in pending_table_character_tokens
2333 ins_mode_in_table_else old
2334 pending_table_character_tokens = []
2335 ins_mode = original_ins_mode
2338 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2339 ins_mode_in_caption = (t) ->
2340 if t.type is TYPE_END_TAG and t.name is 'caption'
2341 if is_in_table_scope 'caption', NS_HTML
2342 generate_implied_end_tags()
2343 if open_els[0].name isnt 'caption'
2346 el = open_els.shift()
2347 if el.name is 'caption' and el.namespace is NS_HTML
2349 clear_afe_to_marker()
2350 ins_mode = ins_mode_in_table
2355 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2357 if is_in_table_scope 'caption', NS_HTML
2359 el = open_els.shift()
2360 if el.name is 'caption' and el.namespace is NS_HTML
2362 clear_afe_to_marker()
2363 ins_mode = ins_mode_in_table
2365 # else fragment case
2367 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2373 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2374 ins_mode_in_column_group = (t) ->
2378 if t.type is TYPE_COMMENT
2381 if t.type is TYPE_DOCTYPE
2384 if t.type is TYPE_START_TAG and t.name is 'html'
2387 if t.type is TYPE_START_TAG and t.name is 'col'
2388 el = insert_html_element t
2390 t.acknowledge_self_closing()
2392 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# The current node (open_els[0]) must be an HTML colgroup element. The
# namespace has to be read from that element: the open_els array itself has
# no namespace property, so testing it made this condition always false.
if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2395 ins_mode = ins_mode_in_table
2399 if t.type is TYPE_END_TAG and t.name is 'col'
2402 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2405 if t.type is TYPE_EOF
2409 if open_els[0].name isnt 'colgroup'
2413 ins_mode = ins_mode_in_table
2417 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2418 ins_mode_in_table_body = (t) ->
2419 if t.type is TYPE_START_TAG and t.name is 'tr'
2420 clear_stack_to_table_body_context()
2421 insert_html_element t
2422 ins_mode = ins_mode_in_row
2424 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2426 clear_stack_to_table_body_context()
2427 insert_html_element new_open_tag 'tr'
2428 ins_mode = ins_mode_in_row
2431 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2432 unless is_in_table_scope t.name, NS_HTML
2435 clear_stack_to_table_body_context()
2437 ins_mode = ins_mode_in_table
2439 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2442 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2445 if table_scopers[el.name] is el.namespace
2450 clear_stack_to_table_body_context()
2452 ins_mode = ins_mode_in_table
2455 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2461 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2462 ins_mode_in_row = (t) ->
2463 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2464 clear_stack_to_table_row_context()
2465 insert_html_element t
2466 ins_mode = ins_mode_in_cell
2469 if t.type is TYPE_END_TAG and t.name is 'tr'
2470 if is_in_table_scope 'tr', NS_HTML
2471 clear_stack_to_table_row_context()
2473 ins_mode = ins_mode_in_table_body
2477 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2478 if is_in_table_scope 'tr', NS_HTML
2479 clear_stack_to_table_row_context()
2481 ins_mode = ins_mode_in_table_body
2486 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2487 if is_in_table_scope t.name, NS_HTML
2488 if is_in_table_scope 'tr', NS_HTML
2489 clear_stack_to_table_row_context()
2491 ins_mode = ins_mode_in_table_body
2496 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2502 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2504 generate_implied_end_tags()
# Guard is skipped only when the current node is an HTML td or th element.
# Both alternatives compare the element's name; comparing the element object
# itself to 'th' could never match.
unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2508 el = open_els.shift()
2509 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2511 clear_afe_to_marker()
2512 ins_mode = ins_mode_in_row
2514 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2515 ins_mode_in_cell = (t) ->
2516 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2517 if is_in_table_scope t.name, NS_HTML
2518 generate_implied_end_tags()
2519 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2522 el = open_els.shift()
2523 if el.name is t.name and el.namespace is NS_HTML
2525 clear_afe_to_marker()
2526 ins_mode = ins_mode_in_row
2530 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2533 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2536 if table_scopers[el.name] is el.namespace
2544 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2547 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2548 if is_in_table_scope t.name, NS_HTML
2557 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2558 ins_mode_in_select = (t) ->
2559 if t.type is TYPE_TEXT and t.text is "\u0000"
2562 if t.type is TYPE_TEXT
2565 if t.type is TYPE_COMMENT
2568 if t.type is TYPE_DOCTYPE
2571 if t.type is TYPE_START_TAG and t.name is 'html'
2574 if t.type is TYPE_START_TAG and t.name is 'option'
2575 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2577 insert_html_element t
2579 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2580 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2582 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2584 insert_html_element t
2586 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# </optgroup> rule: the current node (index 0) must be an HTML option whose
# immediate predecessor on the stack (index 1) is an HTML optgroup. Namespaces
# are compared with `is` (equality), and each element is checked against its
# own namespace.
if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
	if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2590 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2595 if t.type is TYPE_END_TAG and t.name is 'option'
2596 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2601 if t.type is TYPE_END_TAG and t.name is 'select'
2602 if is_in_select_scope 'select', NS_HTML
2604 el = open_els.shift()
2605 if el.name is 'select' and el.namespace is NS_HTML
2611 if t.type is TYPE_START_TAG and t.name is 'select'
2614 el = open_els.shift()
2615 if el.name is 'select' and el.namespace is NS_HTML
2618 # spec says that this is the same as </select> but it doesn't say
2619 # to check scope first
2621 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2623 if is_in_select_scope 'select', NS_HTML
2626 el = open_els.shift()
2627 if el.name is 'select' and el.namespace is NS_HTML
2632 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2635 if t.type is TYPE_EOF
2642 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2643 ins_mode_in_select_in_table = (t) ->
2644 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2647 el = open_els.shift()
2648 if el.name is 'select' and el.namespace is NS_HTML
2653 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2655 unless is_in_table_scope t.name, NS_HTML
2658 el = open_els.shift()
2659 if el.name is 'select' and el.namespace is NS_HTML
2665 ins_mode_in_select t
2668 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2669 ins_mode_in_template = (t) ->
2670 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2673 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2676 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2677 template_ins_modes.shift()
2678 template_ins_modes.unshift ins_mode_in_table
2679 ins_mode = ins_mode_in_table
2682 if t.type is TYPE_START_TAG and t.name is 'col'
2683 template_ins_modes.shift()
2684 template_ins_modes.unshift ins_mode_in_column_group
2685 ins_mode = ins_mode_in_column_group
2688 if t.type is TYPE_START_TAG and t.name is 'tr'
2689 template_ins_modes.shift()
2690 template_ins_modes.unshift ins_mode_in_table_body
2691 ins_mode = ins_mode_in_table_body
2694 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2695 template_ins_modes.shift()
2696 template_ins_modes.unshift ins_mode_in_row
2697 ins_mode = ins_mode_in_row
2700 if t.type is TYPE_START_TAG
2701 template_ins_modes.shift()
2702 template_ins_modes.unshift ins_mode_in_body
2703 ins_mode = ins_mode_in_body
2706 if t.type is TYPE_END_TAG
2709 if t.type is TYPE_EOF
2710 unless template_tag_is_open()
2715 el = open_els.shift()
2716 if el.name is 'template' and el.namespace is NS_HTML
2718 clear_afe_to_marker()
2719 template_ins_modes.shift()
2723 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2724 ins_mode_after_body = (t) ->
2728 if t.type is TYPE_COMMENT
2729 first = open_els[open_els.length - 1]
2730 insert_comment t, [first, first.children.length]
2732 if t.type is TYPE_DOCTYPE
2735 if t.type is TYPE_START_TAG and t.name is 'html'
2738 if t.type is TYPE_END_TAG and t.name is 'html'
2739 if flag_fragment_parsing
2742 ins_mode = ins_mode_after_after_body
2744 if t.type is TYPE_EOF
2749 ins_mode = ins_mode_in_body
2752 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2753 ins_mode_in_frameset = (t) ->
2757 if t.type is TYPE_COMMENT
2760 if t.type is TYPE_DOCTYPE
2763 if t.type is TYPE_START_TAG and t.name is 'html'
2766 if t.type is TYPE_START_TAG and t.name is 'frameset'
2767 insert_html_element t
2769 if t.type is TYPE_END_TAG and t.name is 'frameset'
2770 if open_els.length is 1
2772 return # fragment case
2774 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2775 ins_mode = ins_mode_after_frameset
2777 if t.type is TYPE_START_TAG and t.name is 'frame'
2778 insert_html_element t
2780 t.acknowledge_self_closing()
2782 if t.type is TYPE_START_TAG and t.name is 'noframes'
2785 if t.type is TYPE_EOF
2786 if open_els.length isnt 1
2794 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2795 ins_mode_after_frameset = (t) ->
2799 if t.type is TYPE_COMMENT
2802 if t.type is TYPE_DOCTYPE
2805 if t.type is TYPE_START_TAG and t.name is 'html'
2808 if t.type is TYPE_END_TAG and t.name is 'html'
2809 ins_mode = ins_mode_after_after_frameset
2811 if t.type is TYPE_START_TAG and t.name is 'noframes'
2814 if t.type is TYPE_EOF
2821 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2822 ins_mode_after_after_body = (t) ->
2823 if t.type is TYPE_COMMENT
2824 insert_comment t, [doc, doc.children.length]
2826 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2829 if t.type is TYPE_EOF
2834 ins_mode = ins_mode_in_body
2838 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2839 ins_mode_after_after_frameset = (t) ->
2840 if t.type is TYPE_COMMENT
2841 insert_comment t, [doc, doc.children.length]
2843 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2846 if t.type is TYPE_EOF
2849 if t.type is TYPE_START_TAG and t.name is 'noframes'
2856 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2857 has_color_face_or_size = (t) ->
2859 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2862 in_foreign_content_end_script = ->
2866 in_foreign_content_other_start = (t) ->
2867 acn = adjusted_current_node()
2868 if acn.namespace is NS_MATHML
2869 adjust_mathml_attributes t
2870 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2871 t.name = svg_name_fixes[t.name]
2872 if acn.namespace is NS_SVG
2873 adjust_svg_attributes t
2874 adjust_foreign_attributes t
2875 insert_foreign_element t, acn.namespace
2876 if t.flag 'self-closing'
2877 if t.name is 'script'
2878 t.acknowledge_self_closing()
2879 in_foreign_content_end_script()
2883 t.acknowledge_self_closing()
2885 in_foreign_content = (t) ->
2886 if t.type is TYPE_TEXT and t.text is "\u0000"
2888 insert_character new_character_token "\ufffd"
2893 if t.type is TYPE_TEXT
2894 flag_frameset_ok = false
2897 if t.type is TYPE_COMMENT
2900 if t.type is TYPE_DOCTYPE
2903 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2905 if flag_fragment_parsing
2906 in_foreign_content_other_start t
2908 loop # is this safe?
2910 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
2914 if t.type is TYPE_START_TAG
2915 in_foreign_content_other_start t
2917 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2918 in_foreign_content_end_script()
2920 if t.type is TYPE_END_TAG
2923 if node.name.toLowerCase() isnt t.name
2926 if node is open_els[open_els.length - 1]
2928 if node.name.toLowerCase() is t.name
2930 el = open_els.shift()
2935 if node.namespace is NS_HTML
2937 ins_mode t # explicitly call HTML insertion mode
2940 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2942 switch c = txt.charAt(cur++)
2944 return new_text_node parse_character_reference()
2946 tok_state = tok_state_tag_open
2949 return new_text_node "\ufffd"
2951 return new_eof_token()
2953 return new_text_node c
2956 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2957 # not needed: tok_state_character_reference_in_data = ->
2958 # just call parse_character_reference()
2960 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2961 tok_state_rcdata = ->
2962 switch c = txt.charAt(cur++)
2964 return new_text_node parse_character_reference()
2966 tok_state = tok_state_rcdata_less_than_sign
2969 return new_character_token "\ufffd"
2971 return new_eof_token()
2973 return new_character_token c
2976 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2977 # not needed: tok_state_character_reference_in_rcdata = ->
2978 # just call parse_character_reference()
2980 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2981 tok_state_rawtext = ->
2982 switch c = txt.charAt(cur++)
2984 tok_state = tok_state_rawtext_less_than_sign
2987 return new_character_token "\ufffd"
2989 return new_eof_token()
2991 return new_character_token c
2994 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2995 tok_state_script_data = ->
2996 switch c = txt.charAt(cur++)
2998 tok_state = tok_state_script_data_less_than_sign
3001 return new_character_token "\ufffd"
3003 return new_eof_token()
3005 return new_character_token c
3008 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3009 tok_state_plaintext = ->
3010 switch c = txt.charAt(cur++)
3013 return new_character_token "\ufffd"
3015 return new_eof_token()
3017 return new_character_token c
3021 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3022 tok_state_tag_open = ->
3023 c = txt.charAt(cur++)
3025 tok_state = tok_state_markup_declaration_open
3028 tok_state = tok_state_end_tag_open
3031 tok_cur_tag = new_open_tag c.toLowerCase()
3032 tok_state = tok_state_tag_name
3035 tok_cur_tag = new_open_tag c
3036 tok_state = tok_state_tag_name
3040 tok_cur_tag = new_comment_token '?' # FIXME right?
3041 tok_state = tok_state_bogus_comment
3045 tok_state = tok_state_data
3046 cur -= 1 # we didn't parse/handle the char after <
3047 return new_text_node '<'
3049 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3050 tok_state_end_tag_open = ->
3051 switch c = txt.charAt(cur++)
3054 tok_state = tok_state_data
3057 tok_state = tok_state_data
3058 return new_text_node '</'
3061 tok_cur_tag = new_end_tag c.toLowerCase()
3062 tok_state = tok_state_tag_name
3063 else if is_lc_alpha(c)
3064 tok_cur_tag = new_end_tag c
3065 tok_state = tok_state_tag_name
3068 tok_cur_tag = new_comment_token '/'
3069 tok_state = tok_state_bogus_comment
3072 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3073 tok_state_tag_name = ->
3074 switch c = txt.charAt(cur++)
3075 when "\t", "\n", "\u000c", ' '
3076 tok_state = tok_state_before_attribute_name
3078 tok_state = tok_state_self_closing_start_tag
3080 tok_state = tok_state_data
3086 tok_cur_tag.name += "\ufffd"
3089 tok_state = tok_state_data
3092 tok_cur_tag.name += c.toLowerCase()
3094 tok_cur_tag.name += c
3097 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3098 tok_state_rcdata_less_than_sign = ->
3099 c = txt.charAt(cur++)
3101 temporary_buffer = ''
3102 tok_state = tok_state_rcdata_end_tag_open
3105 tok_state = tok_state_rcdata
3106 cur -= 1 # reconsume the input character
3107 return new_character_token '<'
3109 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3110 tok_state_rcdata_end_tag_open = ->
3111 c = txt.charAt(cur++)
3113 tok_cur_tag = new_end_tag c.toLowerCase()
3114 temporary_buffer += c
3115 tok_state = tok_state_rcdata_end_tag_name
3118 tok_cur_tag = new_end_tag c
3119 temporary_buffer += c
3120 tok_state = tok_state_rcdata_end_tag_name
3123 tok_state = tok_state_rcdata
3124 cur -= 1 # reconsume the input character
3125 return new_character_token "</" # fixfull separate these
# http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
is_appropriate_end_tag = (t) ->
	# The spec defines "appropriate" by comparing against the tag name of the
	# last start tag emitted by the tokenizer. This is only called from the
	# various "raw" tokenizer states, so the name of the current node
	# (open_els[0]) is assumed to be an acceptable stand-in.
	# TODO: verify this after the script data states are implemented
	debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
	# anything that is not an end tag can never be appropriate
	return false unless t.type is TYPE_END_TAG
	return t.name is open_els[0].name
3137 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3138 tok_state_rcdata_end_tag_name = ->
3139 c = txt.charAt(cur++)
3140 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3141 if is_appropriate_end_tag tok_cur_tag
3142 tok_state = tok_state_before_attribute_name
3144 # else fall through to "Anything else"
3146 if is_appropriate_end_tag tok_cur_tag
3147 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3149 # else fall through to "Anything else"
3151 if is_appropriate_end_tag tok_cur_tag
3152 tok_state = tok_state_data
3154 # else fall through to "Anything else"
3156 tok_cur_tag.name += c.toLowerCase()
3157 temporary_buffer += c
3160 tok_cur_tag.name += c
3161 temporary_buffer += c
3164 tok_state = tok_state_rcdata
3165 cur -= 1 # reconsume the input character
3166 return new_character_token '</' + temporary_buffer # fixfull separate these
3168 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3169 tok_state_rawtext_less_than_sign = ->
3170 c = txt.charAt(cur++)
3172 temporary_buffer = ''
3173 tok_state = tok_state_rawtext_end_tag_open
3176 tok_state = tok_state_rawtext
3177 cur -= 1 # reconsume the input character
3178 return new_character_token '<'
3180 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3181 tok_state_rawtext_end_tag_open = ->
3182 c = txt.charAt(cur++)
3184 tok_cur_tag = new_end_tag c.toLowerCase()
3185 temporary_buffer += c
3186 tok_state = tok_state_rawtext_end_tag_name
3189 tok_cur_tag = new_end_tag c
3190 temporary_buffer += c
3191 tok_state = tok_state_rawtext_end_tag_name
3194 tok_state = tok_state_rawtext
3195 cur -= 1 # reconsume the input character
3196 return new_character_token "</" # fixfull separate these
3198 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3199 tok_state_rawtext_end_tag_name = ->
3200 c = txt.charAt(cur++)
3201 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3202 if is_appropriate_end_tag tok_cur_tag
3203 tok_state = tok_state_before_attribute_name
3205 # else fall through to "Anything else"
3207 if is_appropriate_end_tag tok_cur_tag
3208 tok_state = tok_state_self_closing_start_tag
3210 # else fall through to "Anything else"
3212 if is_appropriate_end_tag tok_cur_tag
3213 tok_state = tok_state_data
3215 # else fall through to "Anything else"
3217 tok_cur_tag.name += c.toLowerCase()
3218 temporary_buffer += c
3221 tok_cur_tag.name += c
3222 temporary_buffer += c
3225 tok_state = tok_state_rawtext
3226 cur -= 1 # reconsume the input character
3227 return new_character_token '</' + temporary_buffer # fixfull separate these
3229 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3230 tok_state_script_data_less_than_sign = ->
3231 c = txt.charAt(cur++)
3233 temporary_buffer = ''
3234 tok_state = tok_state_script_data_end_tag_open
3237 tok_state = tok_state_script_data_escape_start
3238 return new_character_token '<!' # fixfull split
3240 tok_state = tok_state_script_data
3241 cur -= 1 # Reconsume
3242 return new_character_token '<'
3244 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3245 tok_state_script_data_end_tag_open = ->
3246 c = txt.charAt(cur++)
3248 tok_cur_tag = new_end_tag c.toLowerCase()
3249 temporary_buffer += c
3250 tok_state = tok_state_script_data_end_tag_name
3253 tok_cur_tag = new_end_tag c
3254 temporary_buffer += c
3255 tok_state = tok_state_script_data_end_tag_name
3258 tok_state = tok_state_script_data
3259 cur -= 1 # Reconsume
3260 return new_character_token '</'
# 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3263 tok_state_script_data_end_tag_name = ->
3264 c = txt.charAt(cur++)
3265 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3266 if is_appropriate_end_tag tok_cur_tag
3267 tok_state = tok_state_before_attribute_name
3271 if is_appropriate_end_tag tok_cur_tag
3272 tok_state = tok_state_self_closing_start_tag
3276 if is_appropriate_end_tag tok_cur_tag
3277 tok_state = tok_state_data
3281 tok_cur_tag.name += c.toLowerCase()
3282 temporary_buffer += c
3285 tok_cur_tag.name += c
3286 temporary_buffer += c
3289 tok_state = tok_state_script_data
3290 cur -= 1 # Reconsume
3291 return new_character_token "</#{temporary_buffer}" # fixfull split
3293 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3294 tok_state_script_data_escape_start = ->
3295 c = txt.charAt(cur++)
3297 tok_state = tok_state_script_data_escape_start_dash
3298 return new_character_token '-'
3300 tok_state = tok_state_script_data
3301 cur -= 1 # Reconsume
3304 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3305 tok_state_script_data_escape_start_dash = ->
3306 c = txt.charAt(cur++)
3308 tok_state = tok_state_script_data_escaped_dash_dash
3309 return new_character_token '-'
3311 tok_state = tok_state_script_data
3312 cur -= 1 # Reconsume
3315 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3316 tok_state_script_data_escaped = ->
3317 c = txt.charAt(cur++)
3319 tok_state = tok_state_script_data_escaped_dash
3320 return new_character_token '-'
3322 tok_state = tok_state_script_data_escaped_less_than_sign
3326 return new_character_token "\ufffd"
3328 tok_state = tok_state_data
3330 cur -= 1 # Reconsume
3333 return new_character_token c
3335 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Tokenizer state handler: script data escaped dash.
# Like the script-data-escaped handler, except the U+FFFD and plain-character
# paths explicitly switch back to tok_state_script_data_escaped first.
# NOTE(review): branch conditions are not visible in this listing.
3336 tok_state_script_data_escaped_dash = ->
3337 c = txt.charAt(cur++)
3339 tok_state = tok_state_script_data_escaped_dash_dash
3340 return new_character_token '-'
3342 tok_state = tok_state_script_data_escaped_less_than_sign
3346 tok_state = tok_state_script_data_escaped
3347 return new_character_token "\ufffd"
3349 tok_state = tok_state_data
3351 cur -= 1 # Reconsume
3354 tok_state = tok_state_script_data_escaped
3355 return new_character_token c
3357 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Tokenizer state handler: script data escaped dash dash.
# Visible paths: return '-' without changing state; go to the escaped
# less-than-sign state; leave escaping (back to script data) returning '>';
# return U+FFFD or c after switching to the escaped state; or drop to the data
# state and un-consume. Branch conditions are not visible in this listing.
3358 tok_state_script_data_escaped_dash_dash = ->
3359 c = txt.charAt(cur++)
3361 return new_character_token '-'
3363 tok_state = tok_state_script_data_escaped_less_than_sign
3366 tok_state = tok_state_script_data
3367 return new_character_token '>'
3370 tok_state = tok_state_script_data_escaped
3371 return new_character_token "\ufffd"
3374 tok_state = tok_state_data
3375 cur -= 1 # Reconsume
3378 tok_state = tok_state_script_data_escaped
3379 return new_character_token c
3381 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer state handler: script data escaped less-than sign.
# Either clears temporary_buffer and moves to the escaped end-tag-open state,
# seeds temporary_buffer with the char and starts a double escape (returning
# "<" plus the char as text), or returns to the escaped state, un-consumes, and
# returns '<'. Branch conditions are not visible in this listing.
3382 tok_state_script_data_escaped_less_than_sign = ->
3383 c = txt.charAt(cur++)
3385 temporary_buffer = ''
3386 tok_state = tok_state_script_data_escaped_end_tag_open
# the buffer gets the lowercased char, but the emitted text keeps c as typed
3389 temporary_buffer = c.toLowerCase() # yes, really
3390 tok_state = tok_state_script_data_double_escape_start
3391 return new_character_token "<#{c}" # fixfull split
3393 temporary_buffer = c
3394 tok_state = tok_state_script_data_double_escape_start
3395 return new_character_token "<#{c}" # fixfull split
3397 tok_state = tok_state_script_data_escaped
3398 cur -= 1 # Reconsume
3399 return new_character_token '<'
3401 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Tokenizer state handler: script data escaped end tag open.
# Starts a new end tag token from the char (lowercased in one branch), records
# the char in temporary_buffer, and moves to the escaped end-tag-name state;
# otherwise returns to the escaped state, un-consumes, and returns "</" as text.
# NOTE(review): branch conditions are not visible in this listing.
3402 tok_state_script_data_escaped_end_tag_open = ->
3403 c = txt.charAt(cur++)
3405 tok_cur_tag = new_end_tag c.toLowerCase()
3406 temporary_buffer += c
3407 tok_state = tok_state_script_data_escaped_end_tag_name
3410 tok_cur_tag = new_end_tag c
3411 temporary_buffer += c
3412 tok_state = tok_state_script_data_escaped_end_tag_name
3415 tok_state = tok_state_script_data_escaped
3416 cur -= 1 # Reconsume
3417 return new_character_token '</' # fixfull split
3419 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer state handler: script data escaped end tag name.
# Reads one char, then either hands off to another tokenizer state (when
# tok_cur_tag is an appropriate end tag), extends tok_cur_tag.name and
# temporary_buffer, or bails back to the escaped state and re-emits the
# buffered text.
# NOTE(review): several branch conditions are not visible in this listing.
3420 tok_state_script_data_escaped_end_tag_name = ->
3421 c = txt.charAt(cur++)
3422 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3423 if is_appropriate_end_tag tok_cur_tag
3424 tok_state = tok_state_before_attribute_name
3428 if is_appropriate_end_tag tok_cur_tag
3429 tok_state = tok_state_self_closing_start_tag
3433 if is_appropriate_end_tag tok_cur_tag
3434 tok_state = tok_state_data
# Only the tag name is lowercased. temporary_buffer must keep the character
# exactly as it appeared in the input, because it is re-emitted verbatim as
# text below when this turns out not to be a real end tag (this matches
# tok_state_script_data_end_tag_name).
3438 tok_cur_tag.name += c.toLowerCase()
3439 temporary_buffer += c
3442 tok_cur_tag.name += c
3443 temporary_buffer += c
# last visible branch: back to the escaped state, un-consume c, and emit the
# buffered text prefixed with "</" as a character token
3446 tok_state = tok_state_script_data_escaped
3447 cur -= 1 # Reconsume
3448 return new_character_token "</#{temporary_buffer}" # fixfull split
3450 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Tokenizer state handler: script data double escape start.
# On whitespace, '/' or '>': enters the double-escaped state if the text
# buffered so far is exactly 'script', otherwise the escaped state; c is returned
# as a character token. Other visible branches append c (lowercased in one of
# them) to temporary_buffer and return c, or fall back to the escaped state and
# un-consume. Conditions for those branches are not visible in this listing.
3451 tok_state_script_data_double_escape_start = ->
3452 c = txt.charAt(cur++)
3453 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3454 if temporary_buffer is 'script'
3455 tok_state = tok_state_script_data_double_escaped
3457 tok_state = tok_state_script_data_escaped
3458 return new_character_token c
3460 temporary_buffer += c.toLowerCase() # yes, really lowercase
3461 return new_character_token c
3463 temporary_buffer += c
3464 return new_character_token c
3466 tok_state = tok_state_script_data_escaped
3467 cur -= 1 # Reconsume
3470 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer state handler: script data double escaped.
# Visible paths: move to the double-escaped-dash state returning '-'; move to
# the double-escaped less-than-sign state returning '<'; return U+FFFD; drop to
# the data state and un-consume; or return c unchanged.
# NOTE(review): branch conditions are not visible in this listing.
3471 tok_state_script_data_double_escaped = ->
3472 c = txt.charAt(cur++)
3474 tok_state = tok_state_script_data_double_escaped_dash
3475 return new_character_token '-'
3477 tok_state = tok_state_script_data_double_escaped_less_than_sign
3478 return new_character_token '<'
3481 return new_character_token "\ufffd"
3484 tok_state = tok_state_data
3485 cur -= 1 # Reconsume
3488 return new_character_token c
3490 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer state handler: script data double escaped dash.
# Same shape as the double-escaped handler, except the U+FFFD and
# plain-character paths switch back to tok_state_script_data_double_escaped.
# NOTE(review): branch conditions are not visible in this listing.
3491 tok_state_script_data_double_escaped_dash = ->
3492 c = txt.charAt(cur++)
3494 tok_state = tok_state_script_data_double_escaped_dash_dash
3495 return new_character_token '-'
3497 tok_state = tok_state_script_data_double_escaped_less_than_sign
3498 return new_character_token '<'
3501 tok_state = tok_state_script_data_double_escaped
3502 return new_character_token "\ufffd"
3505 tok_state = tok_state_data
3506 cur -= 1 # Reconsume
3509 tok_state = tok_state_script_data_double_escaped
3510 return new_character_token c
3512 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer state handler: script data double escaped dash dash.
# Visible paths: return '-' staying in this state; go to the double-escaped
# less-than-sign state returning '<'; return to script data returning '>';
# return U+FFFD or c after switching to the double-escaped state; or drop to
# the data state and un-consume. Branch conditions are not visible here.
3513 tok_state_script_data_double_escaped_dash_dash = ->
3514 c = txt.charAt(cur++)
3516 return new_character_token '-'
3518 tok_state = tok_state_script_data_double_escaped_less_than_sign
3519 return new_character_token '<'
3521 tok_state = tok_state_script_data
3522 return new_character_token '>'
3525 tok_state = tok_state_script_data_double_escaped
3526 return new_character_token "\ufffd"
3529 tok_state = tok_state_data
3530 cur -= 1 # Reconsume
3533 tok_state = tok_state_script_data_double_escaped
3534 return new_character_token c
3536 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer state handler: script data double escaped less-than sign.
# One visible path clears temporary_buffer, moves to the double-escape-end
# state, and returns '/'; the other returns to the double-escaped state and
# un-consumes the char.
3537 tok_state_script_data_double_escaped_less_than_sign = ->
3538 c = txt.charAt(cur++)
3540 temporary_buffer = ''
3541 tok_state = tok_state_script_data_double_escape_end
3542 return new_character_token '/'
3544 tok_state = tok_state_script_data_double_escaped
3545 cur -= 1 # Reconsume
3548 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer state handler: script data double escape end.
# Mirror image of the double-escape-start handler: on whitespace, '/' or '>',
# a buffered 'script' drops back to the (single) escaped state, anything else
# stays double-escaped; c is returned as a character token. Other visible
# branches append c to temporary_buffer and return c, or fall back to the
# double-escaped state and un-consume (their conditions are not visible here).
3549 tok_state_script_data_double_escape_end = ->
3550 c = txt.charAt(cur++)
3551 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3552 if temporary_buffer is 'script'
3553 tok_state = tok_state_script_data_escaped
3555 tok_state = tok_state_script_data_double_escaped
3556 return new_character_token c
3558 temporary_buffer += c.toLowerCase() # yes, really lowercase
3559 return new_character_token c
3561 temporary_buffer += c
3562 return new_character_token c
3564 tok_state = tok_state_script_data_double_escaped
3565 cur -= 1 # Reconsume
3568 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer state handler: before attribute name.
# Switches on the next char. Visible outcomes: move to the self-closing or data
# state, or pick an initial attr_name (U+FFFD, or the lowercased char), push a
# new [name, value] pair onto the FRONT of tok_cur_tag.attrs_a, and move to the
# attribute-name state. Most `when` conditions are not visible in this listing.
3569 tok_state_before_attribute_name = ->
3571 switch c = txt.charAt(cur++)
3572 when "\t", "\n", "\u000c", ' '
3575 tok_state = tok_state_self_closing_start_tag
3578 tok_state = tok_state_data
3584 attr_name = "\ufffd"
3585 when '"', "'", '<', '='
3590 tok_state = tok_state_data
3593 attr_name = c.toLowerCase()
# attrs_a[0] is therefore always the attribute currently being built
3597 tok_cur_tag.attrs_a.unshift [attr_name, '']
3598 tok_state = tok_state_attribute_name
3601 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state handler: attribute name.
# Switches on the next char: whitespace ends the name (after-attribute-name
# state); other visible branches move to the self-closing, before-value or
# data state, or append to the name of the attribute under construction
# (tok_cur_tag.attrs_a[0][0]) as U+FFFD, c, or lowercased c.
# NOTE(review): most `when` conditions are not visible in this listing.
3602 tok_state_attribute_name = ->
3603 switch c = txt.charAt(cur++)
3604 when "\t", "\n", "\u000c", ' '
3605 tok_state = tok_state_after_attribute_name
3607 tok_state = tok_state_self_closing_start_tag
3609 tok_state = tok_state_before_attribute_value
3611 tok_state = tok_state_data
3617 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3620 tok_cur_tag.attrs_a[0][0] += c
3623 tok_state = tok_state_data
3626 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3628 tok_cur_tag.attrs_a[0][0] += c
3631 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer state handler: after attribute name.
# Visible outcomes: move to the self-closing, before-value or data state, or
# start a new attribute by unshifting a [name, ''] pair onto tok_cur_tag.attrs_a
# (name is lowercased c, U+FFFD, or c) and moving to the attribute-name state.
# NOTE(review): most branch conditions are not visible in this listing.
3632 tok_state_after_attribute_name = ->
3633 c = txt.charAt(cur++)
3634 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3637 tok_state = tok_state_self_closing_start_tag
3640 tok_state = tok_state_before_attribute_value
3643 tok_state = tok_state_data
3646 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3647 tok_state = tok_state_attribute_name
3651 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3652 tok_state = tok_state_attribute_name
3656 tok_state = tok_state_data
3657 cur -= 1 # reconsume
# quote and '<' characters still end up starting an attribute name below
3659 if c is '"' or c is "'" or c is '<'
3661 # fall through to Anything else
3663 tok_cur_tag.attrs_a.unshift [c, '']
3664 tok_state = tok_state_attribute_name
3666 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer state handler: before attribute value.
# Switches on the next char and picks how the value will be read: the
# double-quoted, single-quoted or unquoted value state. Two visible branches
# first append to the current attribute's value (tok_cur_tag.attrs_a[0][1]) —
# U+FFFD or c — before going unquoted; two others go to the data state.
# NOTE(review): most `when` conditions are not visible in this listing.
3667 tok_state_before_attribute_value = ->
3668 switch c = txt.charAt(cur++)
3669 when "\t", "\n", "\u000c", ' '
3672 tok_state = tok_state_attribute_value_double_quoted
3674 tok_state = tok_state_attribute_value_unquoted
3677 tok_state = tok_state_attribute_value_single_quoted
3680 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3681 tok_state = tok_state_attribute_value_unquoted
3684 tok_state = tok_state_data
3690 tok_state = tok_state_data
3692 tok_cur_tag.attrs_a[0][1] += c
3693 tok_state = tok_state_attribute_value_unquoted
3696 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer state handler: attribute value (double-quoted).
# Appends to the current attribute's value (tok_cur_tag.attrs_a[0][1]): the
# result of parse_character_reference (called with '"' as the allowed char and
# in_attr = true), U+FFFD, or c. Other visible branches move to the
# after-attribute-value-quoted state or to the data state.
# NOTE(review): the `when` conditions are not visible in this listing.
3697 tok_state_attribute_value_double_quoted = ->
3698 switch c = txt.charAt(cur++)
3700 tok_state = tok_state_after_attribute_value_quoted
3702 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3705 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3708 tok_state = tok_state_data
3710 tok_cur_tag.attrs_a[0][1] += c
3713 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer state handler: attribute value (single-quoted).
# Same shape as the double-quoted handler, but parse_character_reference is
# called with "'" as the allowed char.
# NOTE(review): the `when` conditions are not visible in this listing.
3714 tok_state_attribute_value_single_quoted = ->
3715 switch c = txt.charAt(cur++)
3717 tok_state = tok_state_after_attribute_value_quoted
3719 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3722 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3725 tok_state = tok_state_data
3727 tok_cur_tag.attrs_a[0][1] += c
3730 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer state handler: attribute value (unquoted).
# Whitespace ends the value (back to before-attribute-name). Other visible
# branches append to the current attribute's value — a decoded character
# reference (allowed char '>', in_attr = true), U+FFFD, or c — or move to the
# data state. Most `when` conditions are not visible in this listing.
3731 tok_state_attribute_value_unquoted = ->
3732 switch c = txt.charAt(cur++)
3733 when "\t", "\n", "\u000c", ' '
3734 tok_state = tok_state_before_attribute_name
3736 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3738 tok_state = tok_state_data
3743 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3746 tok_state = tok_state_data
3748 # Parse Error if ', <, = or ` (backtick)
3749 tok_cur_tag.attrs_a[0][1] += c
3752 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state handler: after attribute value (quoted).
# Whitespace goes to before-attribute-name; other visible branches go to the
# self-closing or data state. The last visible branch also goes to
# before-attribute-name but un-consumes the char so that state handles it.
# NOTE(review): most `when` conditions are not visible in this listing.
3753 tok_state_after_attribute_value_quoted = ->
3754 switch c = txt.charAt(cur++)
3755 when "\t", "\n", "\u000c", ' '
3756 tok_state = tok_state_before_attribute_name
3758 tok_state = tok_state_self_closing_start_tag
3760 tok_state = tok_state_data
3766 tok_state = tok_state_data
3769 tok_state = tok_state_before_attribute_name
3770 cur -= 1 # we didn't handle that char
3773 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer state handler: self-closing start tag.
# One visible path sets the 'self-closing' flag on tok_cur_tag and returns to
# the data state; the others un-consume the char and continue in the data
# state or the before-attribute-name state.
# NOTE(review): branch conditions are not visible in this listing.
3774 tok_state_self_closing_start_tag = ->
3775 c = txt.charAt(cur++)
3777 tok_cur_tag.flag 'self-closing', true
3778 tok_state = tok_state_data
3782 tok_state = tok_state_data
3783 cur -= 1 # Reconsume
3787 tok_state = tok_state_before_attribute_name
3788 cur -= 1 # Reconsume
3791 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3792 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer state handler: bogus comment.
# Unlike the per-character states, this scans ahead with indexOf for the next
# '>' and takes the text in one go: val is either the rest of txt from cur, or
# the span from cur up to next_gt. NULs in val become U+FFFD, val is appended
# to tok_cur_tag.text, and the tokenizer returns to the data state.
# NOTE(review): the cur adjustment and the return are not visible in this listing.
3793 tok_state_bogus_comment = ->
3794 next_gt = txt.indexOf '>', cur
3796 val = txt.substr cur
3799 val = txt.substr cur, (next_gt - cur)
3801 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3802 tok_cur_tag.text += val
3803 tok_state = tok_state_data
3806 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer state handler: markup declaration open.
# Looks ahead at txt from cur without consuming a char first:
#   '--'                          -> new empty comment token, comment-start state
#   'doctype' (case-insensitive)  -> doctype state
#   '[CDATA[' (case-sensitive), only when the adjusted current node exists and
#             is not in the HTML namespace -> CDATA section state
#   last visible branch           -> new empty comment token, bogus-comment state
# NOTE(review): the lines that advance cur past the matched text are not
# visible in this listing.
3807 tok_state_markup_declaration_open = ->
3808 if txt.substr(cur, 2) is '--'
3810 tok_cur_tag = new_comment_token ''
3811 tok_state = tok_state_comment_start
3813 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3815 tok_state = tok_state_doctype
3817 acn = adjusted_current_node()
3818 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3820 tok_state = tok_state_cdata_section
3824 tok_cur_tag = new_comment_token ''
3825 tok_state = tok_state_bogus_comment
3828 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state handler: comment start.
# Switches on the next char. Visible outcomes: move to the comment-start-dash
# state; append to the comment under construction (tok_cur_tag.text) and move
# to the comment state; or finish in the data state (un-consuming the char in
# one branch).
# NOTE(review): the `when` conditions are not visible in this listing.
3829 tok_state_comment_start = ->
3830 switch c = txt.charAt(cur++)
3832 tok_state = tok_state_comment_start_dash
3835 tok_state = tok_state_comment
# The replacement character belongs INSIDE the comment being built. Returning it
# as a character token (as this line used to) would emit stray text ahead of the
# comment. All sibling comment states append to tok_cur_tag.text instead.
3836 tok_cur_tag.text += "\ufffd"
3839 tok_state = tok_state_data
3843 tok_state = tok_state_data
3844 cur -= 1 # Reconsume
3847 tok_cur_tag.text += c
3848 tok_state = tok_state_comment
3851 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer state handler: comment start dash.
# Visible outcomes: move to the comment-end state; append "-" plus U+FFFD or
# "-" plus c to tok_cur_tag.text and move to the comment state (the leading "-"
# restores the dash consumed by the previous state); or finish in the data
# state (un-consuming in one branch).
# NOTE(review): the `when` conditions are not visible in this listing.
3852 tok_state_comment_start_dash = ->
3853 switch c = txt.charAt(cur++)
3855 tok_state = tok_state_comment_end
3858 tok_cur_tag.text += "-\ufffd"
3859 tok_state = tok_state_comment
3862 tok_state = tok_state_data
3866 tok_state = tok_state_data
3867 cur -= 1 # Reconsume
3870 tok_cur_tag.text += "-#{c}"
3871 tok_state = tok_state_comment
3874 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer state handler: comment (the body of a comment).
# Visible outcomes: move to the comment-end-dash state; append U+FFFD or c to
# tok_cur_tag.text; or return to the data state and un-consume the char.
# NOTE(review): the `when` conditions are not visible in this listing.
3875 tok_state_comment = ->
3876 switch c = txt.charAt(cur++)
3878 tok_state = tok_state_comment_end_dash
3881 tok_cur_tag.text += "\ufffd"
3884 tok_state = tok_state_data
3885 cur -= 1 # Reconsume
3888 tok_cur_tag.text += c
3891 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer state handler: comment end dash (one '-' seen inside a comment).
# Visible outcomes: move to the comment-end state; append "-" plus U+FFFD or
# "-" plus c to tok_cur_tag.text and go back to the comment state (re-adding the
# dash that turned out not to start the terminator); or return to the data
# state and un-consume. The `when` conditions are not visible in this listing.
3892 tok_state_comment_end_dash = ->
3893 switch c = txt.charAt(cur++)
3895 tok_state = tok_state_comment_end
3898 tok_cur_tag.text += "-\ufffd"
3899 tok_state = tok_state_comment
3902 tok_state = tok_state_data
3903 cur -= 1 # Reconsume
3906 tok_cur_tag.text += "-#{c}"
3907 tok_state = tok_state_comment
3910 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer state handler: comment end (two dashes seen inside a comment).
# Visible outcomes: finish in the data state; append "--" plus U+FFFD or "--"
# plus c and go back to the comment state; move to the comment-end-bang state;
# append a single '-' (state unchanged); or return to the data state and
# un-consume. The `when` conditions are not visible in this listing.
3911 tok_state_comment_end = ->
3912 switch c = txt.charAt(cur++)
3914 tok_state = tok_state_data
3918 tok_cur_tag.text += "--\ufffd"
3919 tok_state = tok_state_comment
3922 tok_state = tok_state_comment_end_bang
3925 tok_cur_tag.text += '-'
3928 tok_state = tok_state_data
3929 cur -= 1 # Reconsume
3933 tok_cur_tag.text += "--#{c}"
3934 tok_state = tok_state_comment
3937 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer state handler: comment end bang ("--!" seen inside a comment).
# Visible outcomes: hand off to the comment-end-dash state; finish in the data
# state; append "--!" plus U+FFFD or "--!" plus c to tok_cur_tag.text and go
# back to the comment state; or return to the data state and un-consume.
# NOTE(review): the `when` conditions are not visible in this listing.
3938 tok_state_comment_end_bang = ->
3939 switch c = txt.charAt(cur++)
# Append only "--!" here. The comment-end-dash state that takes over next adds
# the pending '-' itself ("-#{c}" / "-\ufffd"), so also appending c at this
# point put one dash too many into the comment text.
3941 tok_cur_tag.text += "--!"
3942 tok_state = tok_state_comment_end_dash
3944 tok_state = tok_state_data
3948 tok_cur_tag.text += "--!\ufffd"
3949 tok_state = tok_state_comment
3952 tok_state = tok_state_data
3953 cur -= 1 # Reconsume
3956 tok_cur_tag.text += "--!#{c}"
3957 tok_state = tok_state_comment
3960 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer state handler: DOCTYPE.
# Whitespace moves to the before-doctype-name state. Another visible branch
# returns to the data state, builds an empty doctype token flagged
# 'force-quirks', and un-consumes the char. The last visible branch moves to
# before-doctype-name and un-consumes so that state sees the char.
# NOTE(review): the `when` conditions and return lines are not visible here.
3961 tok_state_doctype = ->
3962 switch c = txt.charAt(cur++)
3963 when "\t", "\u000a", "\u000c", ' '
3964 tok_state = tok_state_before_doctype_name
3967 tok_state = tok_state_data
3968 el = new_doctype_token ''
3969 el.flag 'force-quirks', true
3970 cur -= 1 # Reconsume
3974 tok_state = tok_state_before_doctype_name
3975 cur -= 1 # Reconsume
3978 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer state handler: before DOCTYPE name.
# Visible outcomes: start a doctype token in tok_cur_tag whose name begins with
# lowercased c, U+FFFD, or c, and move to the doctype-name state; or build an
# empty 'force-quirks' doctype token in the local `el` and return to the data
# state (un-consuming the char in one of those branches).
# NOTE(review): most branch conditions and the return lines are not visible here.
3979 tok_state_before_doctype_name = ->
3980 c = txt.charAt(cur++)
3981 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3984 tok_cur_tag = new_doctype_token c.toLowerCase()
3985 tok_state = tok_state_doctype_name
3989 tok_cur_tag = new_doctype_token "\ufffd"
3990 tok_state = tok_state_doctype_name
3994 el = new_doctype_token ''
3995 el.flag 'force-quirks', true
3996 tok_state = tok_state_data
4000 tok_state = tok_state_data
4001 el = new_doctype_token ''
4002 el.flag 'force-quirks', true
4003 cur -= 1 # Reconsume
4006 tok_cur_tag = new_doctype_token c
4007 tok_state = tok_state_doctype_name
4010 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer state handler: DOCTYPE name.
# Whitespace ends the name (after-doctype-name state). Other visible branches
# return to the data state, extend tok_cur_tag.name (lowercased c, U+FFFD, or
# c), or flag 'force-quirks' and un-consume while returning to the data state.
# NOTE(review): most branch conditions are not visible in this listing.
4011 tok_state_doctype_name = ->
4012 c = txt.charAt(cur++)
4013 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4014 tok_state = tok_state_after_doctype_name
4017 tok_state = tok_state_data
4020 tok_cur_tag.name += c.toLowerCase()
4024 tok_cur_tag.name += "\ufffd"
4028 tok_state = tok_state_data
4029 tok_cur_tag.flag 'force-quirks', true
4030 cur -= 1 # Reconsume
4033 tok_cur_tag.name += c
4036 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer state handler: after DOCTYPE name.
# Besides the data-state exits, this looks for the keywords 'public' and
# 'system' (case-insensitive) and moves to the matching after-keyword state;
# otherwise the doctype is flagged 'force-quirks' and handed to the
# bogus-doctype state.
# NOTE(review): some branch conditions and the lines that skip past a matched
# keyword are not visible in this listing.
4037 tok_state_after_doctype_name = ->
4038 c = txt.charAt(cur++)
4039 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4042 tok_state = tok_state_data
4046 tok_state = tok_state_data
4047 tok_cur_tag.flag 'force-quirks', true
4048 cur -= 1 # Reconsume
# cur was already advanced past c above, so the keyword starts at cur - 1
4051 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4053 tok_state = tok_state_after_doctype_public_keyword
4055 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4057 tok_state = tok_state_after_doctype_system_keyword
4060 tok_cur_tag.flag 'force-quirks', true
4061 tok_state = tok_state_bogus_doctype
4064 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer state handler: after DOCTYPE public keyword.
# Whitespace moves to before-doctype-public-identifier. Other visible branches
# initialise tok_cur_tag.public_identifier to '' and enter the double- or
# single-quoted identifier state, or flag 'force-quirks' and end in the data
# state (un-consuming in one branch) or the bogus-doctype state.
# NOTE(review): most branch conditions are not visible in this listing.
4065 tok_state_after_doctype_public_keyword = ->
4066 c = txt.charAt(cur++)
4067 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4068 tok_state = tok_state_before_doctype_public_identifier
4072 tok_cur_tag.public_identifier = ''
4073 tok_state = tok_state_doctype_public_identifier_double_quoted
4077 tok_cur_tag.public_identifier = ''
4078 tok_state = tok_state_doctype_public_identifier_single_quoted
4082 tok_cur_tag.flag 'force-quirks', true
4083 tok_state = tok_state_data
4087 tok_state = tok_state_data
4088 tok_cur_tag.flag 'force-quirks', true
4089 cur -= 1 # Reconsume
4093 tok_cur_tag.flag 'force-quirks', true
4094 tok_state = tok_state_bogus_doctype
4097 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer state handler: before DOCTYPE public identifier.
# Visible branches initialise tok_cur_tag.public_identifier to '' and enter the
# double- or single-quoted identifier state, or flag 'force-quirks' and end in
# the data state (un-consuming in one branch) or the bogus-doctype state.
# NOTE(review): most branch conditions are not visible in this listing.
4098 tok_state_before_doctype_public_identifier = ->
4099 c = txt.charAt(cur++)
4100 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4104 tok_cur_tag.public_identifier = ''
4105 tok_state = tok_state_doctype_public_identifier_double_quoted
4109 tok_cur_tag.public_identifier = ''
4110 tok_state = tok_state_doctype_public_identifier_single_quoted
4114 tok_cur_tag.flag 'force-quirks', true
4115 tok_state = tok_state_data
4119 tok_state = tok_state_data
4120 tok_cur_tag.flag 'force-quirks', true
4121 cur -= 1 # Reconsume
4125 tok_cur_tag.flag 'force-quirks', true
4126 tok_state = tok_state_bogus_doctype
4130 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer state handler: DOCTYPE public identifier (double-quoted).
# Visible outcomes: move to the after-doctype-public-identifier state; append
# U+FFFD or c to tok_cur_tag.public_identifier; or flag 'force-quirks' and
# return to the data state (un-consuming in one branch).
# NOTE(review): branch conditions are not visible in this listing.
4131 tok_state_doctype_public_identifier_double_quoted = ->
4132 c = txt.charAt(cur++)
4134 tok_state = tok_state_after_doctype_public_identifier
4138 tok_cur_tag.public_identifier += "\ufffd"
4142 tok_cur_tag.flag 'force-quirks', true
4143 tok_state = tok_state_data
4147 tok_state = tok_state_data
4148 tok_cur_tag.flag 'force-quirks', true
4149 cur -= 1 # Reconsume
4152 tok_cur_tag.public_identifier += c
4155 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer state handler: DOCTYPE public identifier (single-quoted).
# The visible statements are the same as in the double-quoted handler.
# NOTE(review): branch conditions are not visible in this listing.
4156 tok_state_doctype_public_identifier_single_quoted = ->
4157 c = txt.charAt(cur++)
4159 tok_state = tok_state_after_doctype_public_identifier
4163 tok_cur_tag.public_identifier += "\ufffd"
4167 tok_cur_tag.flag 'force-quirks', true
4168 tok_state = tok_state_data
4172 tok_state = tok_state_data
4173 tok_cur_tag.flag 'force-quirks', true
4174 cur -= 1 # Reconsume
4177 tok_cur_tag.public_identifier += c
4180 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer state handler: after DOCTYPE public identifier.
# Whitespace moves to the between-public-and-system-identifiers state. Other
# visible branches return to the data state, initialise
# tok_cur_tag.system_identifier to '' and enter a quoted system-identifier
# state, or flag 'force-quirks' and end in the data or bogus-doctype state.
# NOTE(review): most branch conditions are not visible in this listing.
4181 tok_state_after_doctype_public_identifier = ->
4182 c = txt.charAt(cur++)
4183 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4184 tok_state = tok_state_between_doctype_public_and_system_identifiers
4187 tok_state = tok_state_data
4191 tok_cur_tag.system_identifier = ''
4192 tok_state = tok_state_doctype_system_identifier_double_quoted
4196 tok_cur_tag.system_identifier = ''
4197 tok_state = tok_state_doctype_system_identifier_single_quoted
4201 tok_state = tok_state_data
4202 tok_cur_tag.flag 'force-quirks', true
4203 cur -= 1 # Reconsume
4207 tok_cur_tag.flag 'force-quirks', true
4208 tok_state = tok_state_bogus_doctype
4211 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer state handler: between DOCTYPE public and system identifiers.
# Visible branches return to the data state, initialise
# tok_cur_tag.system_identifier to '' and enter a quoted system-identifier
# state, or flag 'force-quirks' and end in the data or bogus-doctype state.
# NOTE(review): most branch conditions are not visible in this listing.
4212 tok_state_between_doctype_public_and_system_identifiers = ->
4213 c = txt.charAt(cur++)
4214 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4217 tok_state = tok_state_data
4221 tok_cur_tag.system_identifier = ''
4222 tok_state = tok_state_doctype_system_identifier_double_quoted
4226 tok_cur_tag.system_identifier = ''
4227 tok_state = tok_state_doctype_system_identifier_single_quoted
4231 tok_state = tok_state_data
4232 tok_cur_tag.flag 'force-quirks', true
4233 cur -= 1 # Reconsume
4237 tok_cur_tag.flag 'force-quirks', true
4238 tok_state = tok_state_bogus_doctype
4241 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer state handler: after DOCTYPE system keyword.
# Whitespace moves to before-doctype-system-identifier. Other visible branches
# initialise tok_cur_tag.system_identifier to '' and enter a quoted
# system-identifier state, or flag 'force-quirks' and end in the data state
# (un-consuming in one branch) or the bogus-doctype state.
# NOTE(review): most branch conditions are not visible in this listing.
4242 tok_state_after_doctype_system_keyword = ->
4243 c = txt.charAt(cur++)
4244 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4245 tok_state = tok_state_before_doctype_system_identifier
4249 tok_cur_tag.system_identifier = ''
4250 tok_state = tok_state_doctype_system_identifier_double_quoted
4254 tok_cur_tag.system_identifier = ''
4255 tok_state = tok_state_doctype_system_identifier_single_quoted
4259 tok_cur_tag.flag 'force-quirks', true
4260 tok_state = tok_state_data
4264 tok_state = tok_state_data
4265 tok_cur_tag.flag 'force-quirks', true
4266 cur -= 1 # Reconsume
4270 tok_cur_tag.flag 'force-quirks', true
4271 tok_state = tok_state_bogus_doctype
4274 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer state handler: before DOCTYPE system identifier.
# Visible branches initialise tok_cur_tag.system_identifier to '' and enter a
# quoted system-identifier state, or flag 'force-quirks' and end in the data
# state (un-consuming in one branch) or the bogus-doctype state.
# NOTE(review): most branch conditions are not visible in this listing.
4275 tok_state_before_doctype_system_identifier = ->
4276 c = txt.charAt(cur++)
4277 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4280 tok_cur_tag.system_identifier = ''
4281 tok_state = tok_state_doctype_system_identifier_double_quoted
4284 tok_cur_tag.system_identifier = ''
4285 tok_state = tok_state_doctype_system_identifier_single_quoted
4289 tok_cur_tag.flag 'force-quirks', true
4290 tok_state = tok_state_data
4294 tok_state = tok_state_data
4295 tok_cur_tag.flag 'force-quirks', true
4296 cur -= 1 # Reconsume
4300 tok_cur_tag.flag 'force-quirks', true
4301 tok_state = tok_state_bogus_doctype
4304 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state handler: DOCTYPE system identifier (double-quoted).
# Visible outcomes: move to the after-doctype-system-identifier state; append
# U+FFFD or c to tok_cur_tag.system_identifier; or flag 'force-quirks' and
# return to the data state (un-consuming in one branch).
# NOTE(review): branch conditions are not visible in this listing.
4305 tok_state_doctype_system_identifier_double_quoted = ->
4306 c = txt.charAt(cur++)
4308 tok_state = tok_state_after_doctype_system_identifier
4312 tok_cur_tag.system_identifier += "\ufffd"
4316 tok_cur_tag.flag 'force-quirks', true
4317 tok_state = tok_state_data
4321 tok_state = tok_state_data
4322 tok_cur_tag.flag 'force-quirks', true
4323 cur -= 1 # Reconsume
4326 tok_cur_tag.system_identifier += c
4329 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state handler: DOCTYPE system identifier (single-quoted).
# The visible statements are the same as in the double-quoted handler.
# NOTE(review): branch conditions are not visible in this listing.
4330 tok_state_doctype_system_identifier_single_quoted = ->
4331 c = txt.charAt(cur++)
4333 tok_state = tok_state_after_doctype_system_identifier
4337 tok_cur_tag.system_identifier += "\ufffd"
4341 tok_cur_tag.flag 'force-quirks', true
4342 tok_state = tok_state_data
4346 tok_state = tok_state_data
4347 tok_cur_tag.flag 'force-quirks', true
4348 cur -= 1 # Reconsume
4351 tok_cur_tag.system_identifier += c
4354 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state handler: after DOCTYPE system identifier.
# Visible outcomes: return to the data state; flag 'force-quirks', un-consume
# and return to the data state; or move to the bogus-doctype state — this last
# path deliberately does NOT set 'force-quirks' (see the inline note).
# NOTE(review): most branch conditions are not visible in this listing.
4355 tok_state_after_doctype_system_identifier = ->
4356 c = txt.charAt(cur++)
4357 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4360 tok_state = tok_state_data
4364 tok_state = tok_state_data
4365 tok_cur_tag.flag 'force-quirks', true
4366 cur -= 1 # Reconsume
4370 # do _not_ tok_cur_tag.flag 'force-quirks', true
4371 tok_state = tok_state_bogus_doctype
4374 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state handler: bogus DOCTYPE.
# Both visible branches return to the data state; one of them also un-consumes
# the char. NOTE(review): branch conditions are not visible in this listing.
4375 tok_state_bogus_doctype = ->
4376 c = txt.charAt(cur++)
4378 tok_state = tok_state_data
4381 tok_state = tok_state_data
4382 cur -= 1 # Reconsume
4387 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer state handler: CDATA section.
# Immediately schedules a return to the data state, then grabs the section
# body in one go: val is either the rest of txt from cur, or the span from cur
# up to the next ']]>'. val is returned as a single character token.
# Despite its name, next_gt holds the index of ']]>', not of a lone '>'.
# NOTE(review): the lines that advance cur are not visible in this listing.
4388 tok_state_cdata_section = ->
4389 tok_state = tok_state_data
4390 next_gt = txt.indexOf ']]>', cur
4392 val = txt.substr cur
4395 val = txt.substr cur, (next_gt - cur)
4397 return new_character_token val # fixfull split
4399 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4400 # Don't set this as a state, just call it
4401 # returns a string (NOT a text node)
# Decodes the character reference at txt[cur] (numeric or named).
#   allowed_char - extra character that, like whitespace, '<', '&' and
#                  end-of-input, means "this is not a character reference"
#                  (the attribute-value states pass their delimiter here)
#   in_attr      - true when called from an attribute-value state
#                  NOTE(review): its use is not visible in this listing;
#                  presumably it gates the attribute-only exceptions below
# NOTE(review): many guard/return lines are not visible in this listing, so
# the comments below describe only the statements that are shown.
4402 parse_character_reference = (allowed_char = null, in_attr = false) ->
4403 if cur >= txt.length
# peek only: cur is not advanced here
4405 switch c = txt.charAt(cur)
4406 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4407 # explicitly not a parse error
4410 # there has to be "one or more" alnums between & and ; to be a parse error
4413 if cur + 1 >= txt.length
# numeric reference: an 'x'/'X' after the current char selects the hex form
4415 if txt.charAt(cur + 1).toLowerCase() is 'x'
# count the run of digits (chars present in charset) beginning at start
4424 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4429 if txt.charAt(start + i) is ';'
4433 code_point = txt.substr(start, i)
# strip leading zeros but keep at least one digit
4434 while code_point.charAt(0) is '0' and code_point.length > 1
4435 code_point = code_point.substr 1
4436 code_point = parseInt(code_point, base)
# unicode_fixes maps specific code points to replacement strings
4437 if unicode_fixes[code_point]?
4439 return unicode_fixes[code_point]
# surrogates and values beyond U+10FFFF
4441 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
# control characters and noncharacters; these are still decoded below
4445 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4447 return from_code_point code_point
# named reference: scan the run of alphanumerics beginning at cur
4451 if alnum.indexOf(txt.charAt(cur + i)) is -1
4454 # exit early, because parse_error() below needs at least one alnum
4456 if txt.charAt(cur + i) is ';'
4457 i += 1 # include ';' terminator in value
4458 decoded = decode_named_char_ref txt.substr(cur, i)
4465 # no ';' terminator (only legacy char refs)
4467 for i in [2..max] # no prefix matches, so ok to check shortest first
4468 c = legacy_char_refs[txt.substr(cur, i)]
4471 if txt.charAt(cur + i) is '='
4472 # "because some legacy user agents will
4473 # misinterpret the markup in those cases"
4476 if alnum.indexOf(txt.charAt(cur + i)) > -1
4477 # this makes attributes forgiving about url args
4479 # ok, and besides the weird exceptions for attributes...
4480 # return the matching char
4481 cur += i # consume entity chars
4482 parse_error() # because no terminating ";"
4486 return # never reached
4488 # tree constructor initialization
4489 # see comments on TYPE_TAG/etc for the structure of this data
# Per-parse state. These assignments set up the tree constructor, the
# tokenizer, and the normalised input text before parsing starts.
# doc is the root node; it is created as an 'html' tag in the HTML namespace.
4492 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4494 afe = [] # active formatting elements
4495 template_ins_modes = []
4496 ins_mode = ins_mode_initial
4497 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to enabled unless the caller passes args.scripting
4498 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4499 flag_frameset_ok = true
4501 flag_foster_parenting = false
4502 form_element_pointer = null
4503 temporary_buffer = null
4504 pending_table_character_tokens = []
4505 head_element_pointer = null
4506 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4507 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4509 # tokenizer initialization
4510 tok_state = tok_state_data
4512 # text pre-processing
4513 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# NUL becomes U+FFFD up front, then CRLF and lone CR are normalised to LF.
# Order matters: CRLF must be collapsed before the lone-CR pass.
4514 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4515 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4516 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# NOTE(review): hook keyed on one specific test name; its body is not visible
# in this listing — presumably a debugging aid, confirm before shipping
4518 if args.name is "tests18.dat #17"
4521 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4526 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Serialises a collection of nodes into one string by appending the result of
# each node's serialize(shallow, show_ids) call.
#   els      - the nodes to serialise
#   shallow  - passed through unchanged to each node's serialize()
#   show_ids - passed through unchanged to each node's serialize()
# NOTE(review): the loop header, separator handling and return statement are
# not visible in this listing.
4529 serialize_els = (els, shallow, show_ids) ->
4535 serialized += t.serialize shallow, show_ids
# Public API: the parser entry point, the debug-log helpers, and the node-type
# and namespace constants callers need to interpret the parse result.
4538 module.exports.parse_html = parse_html
4539 module.exports.debug_log_reset = debug_log_reset
4540 module.exports.debug_log_each = debug_log_each
4541 module.exports.TYPE_TAG = TYPE_TAG
4542 module.exports.TYPE_TEXT = TYPE_TEXT
4543 module.exports.TYPE_COMMENT = TYPE_COMMENT
4544 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4545 module.exports.NS_HTML = NS_HTML
4546 module.exports.NS_MATHML = NS_MATHML
4547 module.exports.NS_SVG = NS_SVG