1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WTAG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # the spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
92 debug_log_each = (cb) ->
93 for str in g_debug_log
98 constructor: (type, args = {}) ->
99 @type = type # one of the TYPE_* constants above
100 @name = args.name ? '' # tag name
101 @text = args.text ? '' # contents for text/comment nodes
102 @attrs = args.attrs ? {}
103 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104 @children = args.children ? []
105 @namespace = args.namespace ? NS_HTML
106 @parent = args.parent ? null
107 @token = args.token ? null
108 @flags = args.flags ? {}
112 @id = "#{++prev_node_id}"
113 acknowledge_self_closing: ->
115 @token.flag 'did_self_close', true
117 @flag 'did_self_close', true
118 flag: (key, value = null) ->
123 serialize: (shallow = false, show_ids = false) -> # for unit tests
128 ret += JSON.stringify @name
143 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149 ret += c.serialize shallow, show_ids
153 ret += JSON.stringify @text
156 ret += JSON.stringify @text
158 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
161 when TYPE_AAA_BOOKMARK
162 ret += 'aaa_bookmark'
165 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
168 # helpers: (only take args that are normally known when parser creates nodes)
# Builds a start-tag token: a Node of type TYPE_START_TAG carrying only the
# given tag name. Every other Node field takes the default assigned by the
# Node constructor.
169 new_open_tag = (name) ->
170 return new Node TYPE_START_TAG, name: name
# Builds an end-tag token: a Node of type TYPE_END_TAG with the given tag name.
171 new_end_tag = (name) ->
172 return new Node TYPE_END_TAG, name: name
# Builds an element node (type TYPE_TAG) with the given tag name. No namespace
# is passed, so the Node constructor's default (NS_HTML) applies.
173 new_element = (name) ->
174 return new Node TYPE_TAG, name: name
# Builds a text node (type TYPE_TEXT) whose text is txt.
175 new_text_node = (txt) ->
176 return new Node TYPE_TEXT, text: txt
# Character tokens and text nodes share one representation, so the tokenizer
# helper is simply an alias for new_text_node.
177 new_character_token = new_text_node
# Builds a comment token (type TYPE_COMMENT) whose text is txt.
178 new_comment_token = (txt) ->
179 return new Node TYPE_COMMENT, text: txt
# Builds a doctype token (type TYPE_DOCTYPE) with the given name. Only the
# name is set here; no public or system identifier is passed.
180 new_doctype_token = (name) ->
181 return new Node TYPE_DOCTYPE, name: name
183 return new Node TYPE_EOF
185 return new Node TYPE_AFE_MARKER
# Builds a placeholder Node of type TYPE_AAA_BOOKMARK. The adoption agency
# algorithm splices it into the list of active formatting elements to
# remember a position there.
186 new_aaa_bookmark = ->
187 return new Node TYPE_AAA_BOOKMARK
# Character classes, stored as plain strings so that membership can be tested
# with indexOf().
# ASCII lowercase letters
189 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
# ASCII uppercase letters
190 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# ASCII decimal digits
191 digits = "0123456789"
# ASCII letters (both cases) and digits
192 alnum = lc_alpha + uc_alpha + digits
# hexadecimal digits, both cases
193 hex_chars = digits + "abcdefABCDEF"
# Returns true only when str is exactly one character long and that character
# is an ASCII uppercase letter (a member of uc_alpha). The length check matters:
# indexOf alone would also accept "" and multi-character substrings like "AB".
195 is_uc_alpha = (str) ->
196 return str.length is 1 and uc_alpha.indexOf(str) > -1
# Returns true only when str is exactly one character long and that character
# is an ASCII lowercase letter (a member of lc_alpha). The length check matters:
# indexOf alone would also accept "" and multi-character substrings like "ab".
197 is_lc_alpha = (str) ->
198 return str.length is 1 and lc_alpha.indexOf(str) > -1
200 # some SVG elements have dashes in them
201 tag_name_chars = alnum + "-"
203 # http://www.w3.org/TR/html5/infrastructure.html#space-character
204 space_chars = "\u0009\u000a\u000c\u000d\u0020"
206 return txt.length is 1 and space_chars.indexOf(txt) > -1
# Returns true when token t is a TYPE_TEXT node whose text is exactly one
# character and that character is one of space_chars (tab, LF, FF, CR, space).
207 is_space_tok = (t) ->
208 return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
210 is_input_hidden_tok = (t) ->
211 return false unless t.type is TYPE_START_TAG
214 if a[1].toLowerCase() is 'hidden'
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
255 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
275 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
276 raw_text_elements = ['script', 'style']
277 escapable_raw_text_elements = ['textarea', 'title']
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
280 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
299 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305 'determinant', 'diff', 'divergence', 'divide', 'domain',
306 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326 'vectorproduct', 'xor'
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
333 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
345 menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
347 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
348 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
349 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
350 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
351 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
352 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
353 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
356 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
357 'annotation-xml':NS_MATHML,
360 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
363 formatting_elements = {
364 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
365 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
369 mathml_text_integration = {
370 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# Returns true when el is a MathML text integration point, i.e. the
# mathml_text_integration table has an entry for el.name and that entry equals
# el.namespace (mi, mo, mn, ms or mtext in the MathML namespace). A name that
# is absent from the table yields undefined, which never equals a namespace.
372 is_mathml_text_integration_point = (el) ->
373 return mathml_text_integration[el.name] is el.namespace
374 is_html_integration = (el) -> # DON'T PASS A TOKEN
375 if el.namespace is NS_MATHML
376 if el.name is 'annotation-xml'
377 if el.attrs.encoding?
378 if el.attrs.encoding.toLowerCase() is 'text/html'
380 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
383 if el.namespace is NS_SVG
384 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
389 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
392 foster_parenting_targets = {
# Returns true when element e is in the "special" category: special_elements
# maps e.name to a namespace, and that namespace is e's own. Comparing against
# the namespace keeps same-named elements from other namespaces from matching.
413 el_is_special = (e) ->
414 return special_elements[e.name] is e.namespace
# The three HTML elements (address, div, p) excluded by el_is_special_not_adp.
416 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
# Returns true when el is in the special category (same test as el_is_special)
# but is not an HTML address, div or p element.
417 el_is_special_not_adp = (el) ->
418 return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
422 altglyphdef: 'altGlyphDef'
423 altglyphitem: 'altGlyphItem'
424 animatecolor: 'animateColor'
425 animatemotion: 'animateMotion'
426 animatetransform: 'animateTransform'
429 fecolormatrix: 'feColorMatrix'
430 fecomponenttransfer: 'feComponentTransfer'
431 fecomposite: 'feComposite'
432 feconvolvematrix: 'feConvolveMatrix'
433 fediffuselighting: 'feDiffuseLighting'
434 fedisplacementmap: 'feDisplacementMap'
435 fedistantlight: 'feDistantLight'
436 fedropshadow: 'feDropShadow'
442 fegaussianblur: 'feGaussianBlur'
445 femergenode: 'feMergeNode'
446 femorphology: 'feMorphology'
448 fepointlight: 'fePointLight'
449 fespecularlighting: 'feSpecularLighting'
450 fespotlight: 'feSpotLight'
452 feturbulence: 'feTurbulence'
453 foreignobject: 'foreignObject'
455 lineargradient: 'linearGradient'
456 radialgradient: 'radialGradient'
459 svg_attribute_fixes = {
460 attributename: 'attributeName'
461 attributetype: 'attributeType'
462 basefrequency: 'baseFrequency'
463 baseprofile: 'baseProfile'
465 clippathunits: 'clipPathUnits'
466 contentscripttype: 'contentScriptType'
467 contentstyletype: 'contentStyleType'
468 diffuseconstant: 'diffuseConstant'
470 externalresourcesrequired: 'externalResourcesRequired'
471 # WTAG removes this: filterres: 'filterRes'
472 filterunits: 'filterUnits'
474 gradienttransform: 'gradientTransform'
475 gradientunits: 'gradientUnits'
476 kernelmatrix: 'kernelMatrix'
477 kernelunitlength: 'kernelUnitLength'
478 keypoints: 'keyPoints'
479 keysplines: 'keySplines'
481 lengthadjust: 'lengthAdjust'
482 limitingconeangle: 'limitingConeAngle'
483 markerheight: 'markerHeight'
484 markerunits: 'markerUnits'
485 markerwidth: 'markerWidth'
486 maskcontentunits: 'maskContentUnits'
487 maskunits: 'maskUnits'
488 numoctaves: 'numOctaves'
489 pathlength: 'pathLength'
490 patterncontentunits: 'patternContentUnits'
491 patterntransform: 'patternTransform'
492 patternunits: 'patternUnits'
493 pointsatx: 'pointsAtX'
494 pointsaty: 'pointsAtY'
495 pointsatz: 'pointsAtZ'
496 preservealpha: 'preserveAlpha'
497 preserveaspectratio: 'preserveAspectRatio'
498 primitiveunits: 'primitiveUnits'
501 repeatcount: 'repeatCount'
502 repeatdur: 'repeatDur'
503 requiredextensions: 'requiredExtensions'
504 requiredfeatures: 'requiredFeatures'
505 specularconstant: 'specularConstant'
506 specularexponent: 'specularExponent'
507 spreadmethod: 'spreadMethod'
508 startoffset: 'startOffset'
509 stddeviation: 'stdDeviation'
510 stitchtiles: 'stitchTiles'
511 surfacescale: 'surfaceScale'
512 systemlanguage: 'systemLanguage'
513 tablevalues: 'tableValues'
516 textlength: 'textLength'
518 viewtarget: 'viewTarget'
519 xchannelselector: 'xChannelSelector'
520 ychannelselector: 'yChannelSelector'
521 zoomandpan: 'zoomAndPan'
523 foreign_attr_fixes = {
524 'xlink:actuate': 'xlink actuate'
525 'xlink:arcrole': 'xlink arcrole'
526 'xlink:href': 'xlink href'
527 'xlink:role': 'xlink role'
528 'xlink:show': 'xlink show'
529 'xlink:title': 'xlink title'
530 'xlink:type': 'xlink type'
531 'xml:base': 'xml base'
532 'xml:lang': 'xml lang'
533 'xml:space': 'xml space'
535 'xmlns:xlink': 'xmlns xlink'
537 adjust_mathml_attributes = (t) ->
539 if a[0] is 'definitionurl'
540 a[0] = 'definitionURL'
542 adjust_svg_attributes = (t) ->
544 if svg_attribute_fixes[a[0]]?
545 a[0] = svg_attribute_fixes[a[0]]
547 adjust_foreign_attributes = (t) ->
550 if foreign_attr_fixes[a[0]]?
551 a[0] = foreign_attr_fixes[a[0]]
554 # decode_named_char_ref()
556 # The list of named character references is _huge_ so ask the browser to decode
557 # for us instead of wasting bandwidth/space on including the table here.
559 # Pass without the "&" but with the ";" examples:
560 # for "&" pass "amp;"
561 # for "′" pass "x2032;"
564 textarea: document.createElement('textarea')
566 # TODO test this in IE8
567 decode_named_char_ref = (txt) ->
569 decoded = g_dncr.cache[txt]
570 return decoded if decoded?
571 g_dncr.textarea.innerHTML = txt
572 decoded = g_dncr.textarea.value
573 return null if decoded is txt
574 return g_dncr.cache[txt] = decoded
576 parse_html = (args) ->
578 cur = null # index of next char in txt to be parsed
579 # declare doc and tokenizer variables so they're in scope below
581 open_els = null # stack of open elements
582 afe = null # active formatting elements
583 template_ins_modes = null
585 original_ins_mode = null
587 tok_cur_tag = null # partially parsed tag
588 flag_scripting = null
589 flag_frameset_ok = null
591 flag_foster_parenting = null
592 form_element_pointer = null
593 temporary_buffer = null
594 pending_table_character_tokens = null
595 head_element_pointer = null
596 flag_fragment_parsing = null
597 context_element = null
606 console.log "Parse error at character #{cur} of #{txt.length}"
608 afe_push = (new_el) ->
611 if el.name is new_el.name and el.namespace is new_el.namespace
613 continue unless new_el.attrs[k] is v
614 for k, v of new_el.attrs
615 continue unless el.attrs[k] is v
622 afe.unshift new_afe_marker()
624 # the functions below implement the Tree Construction algorithm
625 # http://www.w3.org/TR/html5/syntax.html#tree-construction
627 # But first... the helpers
628 template_tag_is_open = ->
630 if t.name is 'template' and t.namespace is NS_HTML
633 is_in_scope_x = (tag_name, scope, namespace) ->
635 if t.name is tag_name and (namespace is null or namespace is t.namespace)
637 if scope[t.name] is t.namespace
640 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
642 if t.name is tag_name and (namespace is null or namespace is t.namespace)
644 if scope[t.name] is t.namespace
646 if scope2[t.name] is t.namespace
650 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
651 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
654 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
655 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
657 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-boundary tables, each mapping tag name -> namespace.
# button_scopers and li_scopers are passed alongside standard_scopers (see
# is_in_button_scope / is_in_li_scope); table_scopers is used on its own
# (see is_in_table_scope).
659 button_scopers = button: NS_HTML
660 li_scopers = ol: NS_HTML, ul: NS_HTML
661 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# "Has an element in scope": delegates to is_in_scope_x with standard_scopers
# as the set of scope boundaries.
# namespace defaults to null, which is_in_scope_x treats as "match tag_name in
# any namespace".
662 is_in_scope = (tag_name, namespace = null) ->
663 return is_in_scope_x tag_name, standard_scopers, namespace
# "Has an element in button scope": delegates to is_in_scope_x_y, using
# standard_scopers plus button_scopers as the scope boundaries.
# namespace defaults to null, which is_in_scope_x_y treats as "match tag_name
# in any namespace".
664 is_in_button_scope = (tag_name, namespace = null) ->
665 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
# "Has an element in table scope": delegates to is_in_scope_x with only
# table_scopers (html, table, template) as scope boundaries; standard_scopers
# is not consulted here.
# namespace defaults to null, which is_in_scope_x treats as "match tag_name in
# any namespace".
666 is_in_table_scope = (tag_name, namespace = null) ->
667 return is_in_scope_x tag_name, table_scopers, namespace
668 # aka is_in_list_item_scope
# "Has an element in list item scope": delegates to is_in_scope_x_y, using
# standard_scopers plus li_scopers (ol, ul) as the scope boundaries.
# namespace defaults to null, which is_in_scope_x_y treats as "match tag_name
# in any namespace".
669 is_in_li_scope = (tag_name, namespace = null) ->
670 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
# "Has an element in select scope": looks for an element named tag_name
# (optionally restricted to namespace; null matches any namespace).
# NOTE(review): the boundary test below joins all three conditions with `and`,
# so it is only true for non-HTML elements. The spec defines select scope as
# bounded by every element type except HTML optgroup and option, which would
# read `t.namespace isnt NS_HTML or (t.name isnt 'optgroup' and t.name isnt
# 'option')`. As written, an HTML element such as div does not satisfy the
# test -- confirm whether that is intended.
671 is_in_select_scope = (tag_name, namespace = null) ->
673 if t.name is tag_name and (namespace is null or namespace is t.namespace)
675 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
678 # this checks for a particular element, not by name
679 # this requires a namespace match
680 el_is_in_scope = (needle) ->
684 if standard_scopers[el.name] is el.namespace
688 clear_to_table_stopers = {
# "Clear the stack back to a table context": tests the current node
# (open_els[0]) against clear_to_table_stopers.
# NOTE(review): this only checks that an entry exists for the current node's
# name (`?`), whereas clear_stack_to_table_body_context compares the entry
# with the node's namespace. A same-named element from another namespace
# would pass this test -- confirm whether a namespace comparison is wanted.
693 clear_stack_to_table_context = ->
695 if clear_to_table_stopers[open_els[0].name]?
699 clear_to_table_body_stopers = {
706 clear_stack_to_table_body_context = ->
708 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
712 clear_to_table_row_stopers = {
# "Clear the stack back to a table row context": tests the current node
# (open_els[0]) against clear_to_table_row_stopers.
# NOTE(review): this only checks that an entry exists for the current node's
# name (`?`), whereas clear_stack_to_table_body_context compares the entry
# with the node's namespace. A same-named element from another namespace
# would pass this test -- confirm whether a namespace comparison is wanted.
717 clear_stack_to_table_row_context = ->
719 if clear_to_table_row_stopers[open_els[0].name]?
723 clear_afe_to_marker = ->
725 return unless afe.length > 0 # this happens in fragment case, ?spec error
727 if el.type is TYPE_AFE_MARKER
732 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
734 # 1. Let last be false.
736 # 2. Let node be the last node in the stack of open elements.
738 node = open_els[node_i]
739 # 3. Loop: If node is the first node in the stack of open elements,
740 # then set last to true, and, if the parser was originally created as
741 # part of the HTML fragment parsing algorithm (fragment case) set node
742 # to the context element.
744 if node_i is open_els.length - 1
746 # fixfull (fragment case)
748 # 4. If node is a select element, run these substeps:
749 if node.name is 'select' and node.namespace is NS_HTML
750 # 1. If last is true, jump to the step below labeled done.
752 # 2. Let ancestor be node.
755 # 3. Loop: If ancestor is the first node in the stack of
756 # open elements, jump to the step below labeled done.
758 if ancestor_i is open_els.length - 1
760 # 4. Let ancestor be the node before ancestor in the stack
763 ancestor = open_els[ancestor_i]
764 # 5. If ancestor is a template node, jump to the step below
766 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
768 # 6. If ancestor is a table node, switch the insertion mode
769 # to "in select in table" and abort these steps.
770 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
771 ins_mode = ins_mode_in_select_in_table
773 # 7. Jump back to the step labeled loop.
774 # 8. Done: Switch the insertion mode to "in select" and abort
776 ins_mode = ins_mode_in_select
778 # 5. If node is a td or th element and last is false, then switch
779 # the insertion mode to "in cell" and abort these steps.
780 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
781 ins_mode = ins_mode_in_cell
783 # 6. If node is a tr element, then switch the insertion mode to "in
784 # row" and abort these steps.
785 if node.name is 'tr' and node.namespace is NS_HTML
786 ins_mode = ins_mode_in_row
788 # 7. If node is a tbody, thead, or tfoot element, then switch the
789 # insertion mode to "in table body" and abort these steps.
790 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
791 ins_mode = ins_mode_in_table_body
793 # 8. If node is a caption element, then switch the insertion mode
794 # to "in caption" and abort these steps.
795 if node.name is 'caption' and node.namespace is NS_HTML
796 ins_mode = ins_mode_in_caption
798 # 9. If node is a colgroup element, then switch the insertion mode
799 # to "in column group" and abort these steps.
800 if node.name is 'colgroup' and node.namespace is NS_HTML
801 ins_mode = ins_mode_in_column_group
803 # 10. If node is a table element, then switch the insertion mode to
804 # "in table" and abort these steps.
805 if node.name is 'table' and node.namespace is NS_HTML
806 ins_mode = ins_mode_in_table
808 # 11. If node is a template element, then switch the insertion mode
809 # to the current template insertion mode and abort these steps.
810 if node.name is 'template' and node.namespace is NS_HTML
811 ins_mode = template_ins_modes[0]
813 # 12. If node is a head element and last is true, then switch the
814 # insertion mode to "in body" ("in body"! not "in head"!) and abort
815 # these steps. (fragment case)
816 if node.name is 'head' and node.namespace is NS_HTML and last
817 ins_mode = ins_mode_in_body
819 # 13. If node is a head element and last is false, then switch the
820 # insertion mode to "in head" and abort these steps.
821 if node.name is 'head' and node.namespace is NS_HTML and last is false
822 ins_mode = ins_mode_in_head
824 # 14. If node is a body element, then switch the insertion mode to
825 # "in body" and abort these steps.
826 if node.name is 'body' and node.namespace is NS_HTML
827 ins_mode = ins_mode_in_body
829 # 15. If node is a frameset element, then switch the insertion mode
830 # to "in frameset" and abort these steps. (fragment case)
831 if node.name is 'frameset' and node.namespace is NS_HTML
832 ins_mode = ins_mode_in_frameset
834 # 16. If node is an html element, run these substeps:
835 if node.name is 'html' and node.namespace is NS_HTML
836 # 1. If the head element pointer is null, switch the insertion
837 # mode to "before head" and abort these steps. (fragment case)
838 if head_element_pointer is null
839 ins_mode = ins_mode_before_head
841 # 2. Otherwise, the head element pointer is not null,
842 # switch the insertion mode to "after head" and abort these
844 ins_mode = ins_mode_after_head
846 # 17. If last is true, then switch the insertion mode to "in body"
847 # and abort these steps. (fragment case)
849 ins_mode = ins_mode_in_body
851 # 18. Let node now be the node before node in the stack of open
854 node = open_els[node_i]
855 # 19. Return to the step labeled loop.
859 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
860 adjusted_current_node = ->
861 if open_els.length is 1 and flag_fragment_parsing
862 return context_element
865 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
866 # this implementation is structured (mostly) as described at the link above.
867 # capitalized comments are the "labels" described at the link above.
869 return if afe.length is 0
870 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
875 if i is afe.length - 1
878 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
883 el = insert_html_element afe[i].token
888 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
889 # adoption agency algorithm
891 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
892 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
893 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
894 adoption_agency = (subject) ->
895 debug_log "adoption_agency()"
896 debug_log "tree: #{serialize_els doc.children, false, true}"
897 debug_log "open_els: #{serialize_els open_els, true, true}"
898 debug_log "afe: #{serialize_els afe, true, true}"
899 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
902 # remove it from the list of active formatting elements (if found)
907 debug_log "aaa: starting off with subject on top of stack, exiting"
914 # 5. Let formatting element be the last element in the list of
915 # active formatting elements that: is between the end of the list
916 # and the last scope marker in the list, if any, or the start of
917 # the list otherwise, and has the tag name subject.
919 for t, fe_of_afe in afe
920 if t.type is TYPE_AFE_MARKER
925 # If there is no such element, then abort these steps and instead
926 # act as described in the "any other end tag" entry above.
928 debug_log "aaa: fe not found in afe"
929 in_body_any_other_end_tag subject
931 # 6. If formatting element is not in the stack of open elements,
932 # then this is a parse error; remove the element from the list, and
935 for t, fe_of_open_els in open_els
940 debug_log "aaa: fe not found in open_els"
942 # "remove it from the list" must mean afe, since it's not in open_els
943 afe.splice fe_of_afe, 1
945 # 7. If formatting element is in the stack of open elements, but
946 # the element is not in scope, then this is a parse error; abort
948 unless el_is_in_scope fe
949 debug_log "aaa: fe not in scope"
952 # 8. If formatting element is not the current node, this is a parse
953 # error. (But do not abort these steps.)
954 unless open_els[0] is fe
957 # 9. Let furthest block be the topmost node in the stack of open
958 # elements that is lower in the stack than formatting element, and
959 # is an element in the special category. There might not be one.
961 fb_of_open_els = null
968 # and continue, to see if there's one that's more "topmost"
969 # 10. If there is no furthest block, then the UA must first pop all
970 # the nodes from the bottom of the stack of open elements, from the
971 # current node up to and including formatting element, then remove
972 # formatting element from the list of active formatting elements,
973 # and finally abort these steps.
975 debug_log "aaa: no fb"
979 afe.splice fe_of_afe, 1
981 # 11. Let common ancestor be the element immediately above
982 # formatting element in the stack of open elements.
983 ca = open_els[fe_of_open_els + 1] # common ancestor
985 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
986 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
987 bookmark = new_aaa_bookmark()
990 afe.splice i, 0, bookmark
992 node = last_node = fb
996 # 3. Let node be the element immediately above node in the
997 # stack of open elements, or if node is no longer in the stack
998 # of open elements (e.g. because it got removed by this
999 # algorithm), the element that was immediately above node in
1000 # the stack of open elements before node was removed.
1002 for t, i in open_els
1004 node_next = open_els[i + 1]
1006 node = node_next ? node_above
1007 debug_log "inner loop #{inner}"
1008 debug_log "tree: #{serialize_els doc.children, false, true}"
1009 debug_log "open_els: #{serialize_els open_els, true, true}"
1010 debug_log "afe: #{serialize_els afe, true, true}"
1011 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1012 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1013 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1014 debug_log "node: #{node.serialize true, true}"
1015 # TODO make sure node_above gets re-set if/when node is removed from open_els
1017 # 4. If node is formatting element, then go to the next step in
1018 # the overall algorithm.
1021 debug_log "the meat"
1022 # 5. If inner loop counter is greater than three and node is in
1023 # the list of active formatting elements, then remove node from
1024 # the list of active formatting elements.
1030 debug_log "max out inner"
1035 # 6. If node is not in the list of active formatting elements,
1036 # then remove node from the stack of open elements and then go
1037 # back to the step labeled inner loop.
1039 debug_log "not in afe"
1040 for t, i in open_els
1042 node_above = open_els[i + 1]
1043 open_els.splice i, 1
1046 debug_log "the bones"
1047 # 7. create an element for the token for which the element node
1048 # was created, in the HTML namespace, with common ancestor as
1049 # the intended parent; replace the entry for node in the list
1050 # of active formatting elements with an entry for the new
1051 # element, replace the entry for node in the stack of open
1052 # elements with an entry for the new element, and let node be
1054 new_node = token_to_element node.token, NS_HTML, ca
1058 debug_log "replaced in afe"
1060 for t, i in open_els
1062 node_above = open_els[i + 1]
1063 open_els[i] = new_node
1064 debug_log "replaced in open_els"
1067 # 8. If last node is furthest block, then move the
1068 # aforementioned bookmark to be immediately after the new node
1069 # in the list of active formatting elements.
1074 debug_log "removed bookmark"
1078 # "after" means lower
1079 afe.splice i, 0, bookmark # "after as <-
1080 debug_log "placed bookmark after node"
1081 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1083 # 9. Insert last node into node, first removing it from its
1084 # previous parent node if any.
1085 if last_node.parent?
1086 debug_log "last_node has parent"
1087 for c, i in last_node.parent.children
1089 debug_log "removing last_node from parent"
1090 last_node.parent.children.splice i, 1
1092 node.children.push last_node
1093 last_node.parent = node
1094 # 10. Let last node be node.
1097 # 11. Return to the step labeled inner loop.
1098 # 14. Insert whatever last node ended up being in the previous step
1099 # at the appropriate place for inserting a node, but using common
1100 # ancestor as the override target.
1102 # In the case where fe is immediately followed by fb:
1103 # * inner loop exits out early (node==fe)
1105 # * last_node is still in the tree (not a duplicate)
1106 if last_node.parent?
1107 debug_log "FEFIRST? last_node has parent"
1108 for c, i in last_node.parent.children
1110 debug_log "removing last_node from parent"
1111 last_node.parent.children.splice i, 1
1114 debug_log "after aaa inner loop"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1119 debug_log "tree: #{serialize_els doc.children, false, true}"
1124 # can't use the standard insert-token routine, because it's already in
1125 # open_els and must stay at its current position in open_els
1126 dest = adjusted_insertion_location ca
1127 dest[0].children.splice dest[1], 0, last_node
1128 last_node.parent = dest[0]
1131 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1132 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1133 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1134 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1135 debug_log "tree: #{serialize_els doc.children, false, true}"
1137 # 15. Create an element for the token for which formatting element
1138 # was created, in the HTML namespace, with furthest block as the
1140 new_element = token_to_element fe.token, NS_HTML, fb
1141 # 16. Take all of the child nodes of furthest block and append them
1142 # to the element created in the last step.
1143 while fb.children.length
1144 t = fb.children.shift()
1145 t.parent = new_element
1146 new_element.children.push t
1147 # 17. Append that new element to furthest block.
1148 new_element.parent = fb
1149 fb.children.push new_element
1150 # 18. Remove formatting element from the list of active formatting
1151 # elements, and insert the new element into the list of active
1152 # formatting elements at the position of the aforementioned
1160 afe[i] = new_element
1162 # 19. Remove formatting element from the stack of open elements,
1163 # and insert the new element into the stack of open elements
1164 # immediately below the position of furthest block in that stack.
1165 for t, i in open_els
1167 open_els.splice i, 1
1169 for t, i in open_els
1171 open_els.splice i, 0, new_element
1173 # 20. Jump back to the step labeled outer loop.
1174 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1175 debug_log "tree: #{serialize_els doc.children, false, true}"
1176 debug_log "open_els: #{serialize_els open_els, true, true}"
1177 debug_log "afe: #{serialize_els afe, true, true}"
1178 debug_log "AAA DONE"
1180 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes an open HTML <p>: first generates implied end tags (sparing <p>
# itself), then pops entries off the front of open_els (index 0 is the
# current node) while looking for a <p> in the HTML namespace.
# NOTE(review): the embedded line numbers skip (1183 -> 1185, 1187 -> 1189),
# so the statements guarded by the `unless` and by the final `if` are not
# visible here; presumably a parse_error() call and a loop exit -- confirm
# against the full source.
1181 close_p_element = ->
1182 generate_implied_end_tags 'p' # 'p' is the exception: it is not popped as an implied end tag
1183 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1185 while open_els.length > 1 # just in case (never empties the stack completely)
1186 el = open_els.shift()
1187 if el.name is 'p' and el.namespace is NS_HTML
# Convenience guard used by the in-body rules before inserting block-level
# elements: tests whether an HTML <p> element is in button scope.
# NOTE(review): the guarded statement (original line 1191) is not visible in
# this listing; presumably it calls close_p_element() -- confirm.
1189 close_p_if_in_button_scope = ->
1190 if is_in_button_scope 'p', NS_HTML
1193 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1194 # aka insert_a_character = (t) ->
# Inserts the character token `t` at the adjusted insertion location.
# `dest` is the [parent_node, child_index] pair returned by
# adjusted_insertion_location(). The node just before the insertion point is
# inspected to see whether it is a text node; otherwise `t` is spliced into
# the parent's children at the insertion index.
# NOTE(review): original lines 1198 and 1201-1202 are missing from this
# listing. Presumably they guard against dest[1] being 0 (no previous
# sibling) and merge `t` into the previous text node -- confirm.
1195 insert_character = (t) ->
1196 dest = adjusted_insertion_location()
1197 # fixfull check for Document node
1199 prev = dest[0].children[dest[1] - 1] # sibling immediately before the insertion point
1200 if prev.type is TYPE_TEXT
1203 dest[0].children.splice dest[1], 0, t
1206 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for a single token `t`. It inspects the
# adjusted current node (`acn`) and the token type/name through the series of
# tests below; the last visible statement hands the token to
# in_foreign_content.
# NOTE(review): the statements under each test are not visible in this
# listing (the embedded line numbers skip). Per the linked spec section each
# matching test should process the token with the current insertion mode and
# stop -- confirm against the full source.
1207 process_token = (t) ->
1208 acn = adjusted_current_node()
1212 if acn.namespace is NS_HTML
1215 if is_mathml_text_integration_point(acn)
1216 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1219 if t.type is TYPE_TEXT
1222 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1225 if is_html_integration acn
1226 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1229 if t.type is TYPE_EOF
1232 in_foreign_content t
1236 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1237 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the "appropriate place for inserting a node".
#   override_target: optional node to use instead of the current node
#   returns: a two-element array [target, target_i] -- the parent node and the
#            index within target.children at which the new node should go
# Reminder: open_els[0] is the current node, so a smaller index means "lower"
# (more recently added) and open_els[open_els.length - 1] is the html element.
# NOTE(review): the embedded line numbers skip in several places, so a few
# statements (e.g. the assignments inside the two search loops and the
# `break`s after each substep) are not visible in this listing.
1238 adjusted_insertion_location = (override_target = null) ->
1239 # 1. If there was an override target specified, then let target be the
1242 target = override_target
1243 else # Otherwise, let target be the current node.
1244 target = open_els[0]
1245 # 2. Determine the adjusted insertion location using the first matching
1246 # steps from the following list:
1248 # If foster parenting is enabled and target is a table, tbody, tfoot,
1249 # thead, or tr element. (Foster parenting happens when content is
1250 # misnested in tables.)
1251 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1252 loop # once. this is here so we can ``break`` to "abort these substeps"
1253 # 1. Let last template be the last template element in the
1254 # stack of open elements, if any.
1255 last_template = null
1256 last_template_i = null
1257 for el, i in open_els
1258 if el.name is 'template' and el.namespace is NS_HTML
1262 # 2. Let last table be the last table element in the stack of
1263 # open elements, if any.
1266 for el, i in open_els
1267 if el.name is 'table' and el.namespace is NS_HTML
1271 # 3. If there is a last template and either there is no last
1272 # table, or there is one, but last template is lower (more
1273 # recently added) than last table in the stack of open
1274 # elements, then: let adjusted insertion location be inside
1275 # last template's template contents, after its last child (if
1276 # any), and abort these substeps.
1277 if last_template and (last_table is null or last_template_i < last_table_i)
1278 target = last_template # fixfull: should be its template contents
1279 target_i = target.children.length
1281 # 4. If there is no last table, then let adjusted insertion
1282 # location be inside the first element in the stack of open
1283 # elements (the html element), after its last child (if any),
1284 # and abort these substeps. (fragment case)
1285 if last_table is null
1287 target = open_els[open_els.length - 1]
1288 target_i = target.children.length
1290 # 5. If last table has a parent element, then let adjusted
1291 # insertion location be inside last table's parent element,
1292 # immediately before last table, and abort these substeps.
1293 if last_table.parent?
1294 for c, i in last_table.parent.children
1296 target = last_table.parent
1300 # 6. Let previous element be the element immediately above last
1301 # table in the stack of open elements.
1303 # huh? how could it not have a parent?
1304 previous_element = open_els[last_table_i + 1] # "above" means a higher index
1305 # 7. Let adjusted insertion location be inside previous
1306 # element, after its last child (if any).
1307 target = previous_element
1308 target_i = target.children.length
1309 # Note: These steps are involved in part because it's possible
1310 # for elements, the table element in this case in particular,
1311 # to have been moved by a script around in the DOM, or indeed
1312 # removed from the DOM entirely, after the element was inserted
1314 break # don't really loop
1316 # Otherwise: let adjusted insertion location be inside target, after
1317 # its last child (if any).
1318 target_i = target.children.length
1320 # 3. If the adjusted insertion location is inside a template element,
1321 # let it instead be inside the template element's template contents,
1322 # after its last child (if any).
1323 # fixfull (template)
1325 # 4. Return the adjusted insertion location.
1326 return [target, target_i]
1328 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1329 # aka create_an_element_for_token
# Builds a new element Node (TYPE_TAG) from tag token `t`.
#   t: the tag token; its name is copied and the token itself is kept on the
#      node as `token`
#   namespace: namespace to give the new element
#   intended_parent: accepted for parity with the spec; no use of it is
#      visible in this listing
# NOTE(review): original lines 1332-1333 (presumably initialising `attrs` and
# looping over the token's attribute pairs) and 1344-1346 (presumably the
# return of `el`) are missing from this listing -- confirm.
1330 token_to_element = (t, namespace, intended_parent) ->
1331 # convert attributes into a hash
1334 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs (as written, a later one overwrites)
1335 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1337 # TODO 2. If the newly created element has an xmlns attribute in the
1338 # XMLNS namespace whose value is not exactly the same as the element's
1339 # namespace, that is a parse error. Similarly, if the newly created
1340 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1341 # value is not the XLink Namespace, that is a parse error.
1343 # fixfull: the spec says stuff about form pointers and ownerDocument
1347 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the
# children of the node chosen by adjusted_insertion_location().
# NOTE(review): original lines 1350-1351 (presumably unpacking `ail` into
# ail_el / ail_i), 1354 and 1356-1357 (presumably setting el.parent, pushing
# onto open_els and returning el -- callers such as ins_mode_before_head use
# the return value) are missing from this listing -- confirm.
1348 insert_foreign_element = (token, namespace) ->
1349 ail = adjusted_insertion_location()
1352 el = token_to_element token, namespace, ail_el
1353 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1355 ail_el.children.splice ail_i, 0, el
1358 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Thin wrapper: inserts an element for `token` in the HTML namespace and
# yields whatever insert_foreign_element yields (implicit return).
1359 insert_html_element = (token) ->
1360 insert_foreign_element token, NS_HTML
1362 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1363 # position should be [node, index_within_children]
# Splices comment token `t` into position[0].children at index position[1].
# When no position is given, the adjusted insertion location is used.
1364 insert_comment = (t, position = null) ->
1365 position ?= adjusted_insertion_location()
1366 position[0].children.splice position[1], 0, t
1369 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: inserts an HTML element for `t`, switches
# the tokenizer to the RAWTEXT state, remembers the current insertion mode in
# original_ins_mode and switches the insertion mode to "text".
1370 parse_generic_raw_text = (t) ->
1371 insert_html_element t
1372 tok_state = tok_state_rawtext
1373 original_ins_mode = ins_mode # saved before ins_mode is overwritten on the next line
1374 ins_mode = ins_mode_text
# Generic RCDATA element parsing: same steps as parse_generic_raw_text, except
# the tokenizer is switched to the RCDATA state.
1375 parse_generic_rcdata_text = (t) ->
1376 insert_html_element t
1377 tok_state = tok_state_rcdata
1378 original_ins_mode = ins_mode # saved before ins_mode is overwritten on the next line
1379 ins_mode = ins_mode_text
1381 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1382 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) is listed in end_tag_implied for
# its namespace and its name differs from `except`.
#   except: optional tag name that must NOT be closed implicitly
# NOTE(review): the loop body (original line 1385) is missing from this
# listing; presumably it pops the current node with open_els.shift() --
# confirm.
1383 generate_implied_end_tags = (except = null) ->
1384 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1387 # 8.2.5.4 The rules for parsing tokens in HTML content
1388 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1390 # 8.2.5.4.1 The "initial" insertion mode
1391 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handler for the "initial" insertion mode. Comment and DOCTYPE tokens are
# tested for explicitly; both the DOCTYPE branch and the trailing
# "anything else" code switch ins_mode to ins_mode_before_html.
# NOTE(review): the embedded line numbers skip, so the statements under the
# comment test, the leading check before it (original lines 1393-1394) and
# whatever follows each mode switch are not visible in this listing.
1392 ins_mode_initial = (t) ->
1395 if t.type is TYPE_COMMENT
1399 if t.type is TYPE_DOCTYPE
1400 # FIXME check identifiers, set quirks, etc
1403 ins_mode = ins_mode_before_html
1406 # fixfull (iframe, quirks)
1407 ins_mode = ins_mode_before_html
1411 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handler for the "before html" insertion mode. An <html> start tag becomes
# the root element: it is created with `doc` as intended parent, appended to
# doc.children, pushed onto open_els, and the mode moves to "before head".
# The trailing "anything else" code synthesises an <html> start tag with
# new_open_tag and does the same.
# NOTE(review): the embedded line numbers skip, so several statements are
# missing from this listing -- notably original line 1438, which presumably
# pushes the synthesised element onto open_els as line 1424 does for the
# real tag -- confirm.
1412 ins_mode_before_html = (t) ->
1413 if t.type is TYPE_DOCTYPE
1416 if t.type is TYPE_COMMENT
1421 if t.type is TYPE_START_TAG and t.name is 'html'
1422 el = token_to_element t, NS_HTML, doc
1423 doc.children.push el
1424 open_els.unshift(el) # the root becomes the current node (index 0)
1425 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1426 ins_mode = ins_mode_before_head
1428 if t.type is TYPE_END_TAG
1429 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1430 # fall through to "anything else"
1435 html_tok = new_open_tag 'html'
1436 el = token_to_element html_tok, NS_HTML, doc
1437 doc.children.push el
1439 # ?fixfull browsing context
1440 ins_mode = ins_mode_before_head
1444 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handler for the "before head" insertion mode. A <head> start tag is
# inserted, remembered in head_element_pointer, and the mode moves to
# "in head". The trailing "anything else" code does the same with a
# synthesised <head> start tag.
# NOTE(review): the embedded line numbers skip, so the statements under the
# comment / DOCTYPE / <html> / end-tag tests and after each mode switch are
# not visible in this listing.
1445 ins_mode_before_head = (t) ->
1448 if t.type is TYPE_COMMENT
1451 if t.type is TYPE_DOCTYPE
1454 if t.type is TYPE_START_TAG and t.name is 'html'
1457 if t.type is TYPE_START_TAG and t.name is 'head'
1458 el = insert_html_element t
1459 head_element_pointer = el
1460 ins_mode = ins_mode_in_head
1462 if t.type is TYPE_END_TAG
1463 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1464 # fall through to "anything else" below
1469 head_tok = new_open_tag 'head'
1470 el = insert_html_element head_tok
1471 head_element_pointer = el
1472 ins_mode = ins_mode_in_head
1475 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" mode: pops the current node and
# switches to the "after head" insertion mode.
# NOTE(review): original line 1479 is missing from this listing; presumably
# it reprocesses `t` in the new mode -- confirm.
1476 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1477 open_els.shift() # spec says this will be a 'head' node
1478 ins_mode = ins_mode_after_head
# Handler for the "in head" insertion mode: dispatches on token type and tag
# name, one test per case of the spec's "in head" rules.
# NOTE(review): the embedded line numbers skip, so some statements (e.g. the
# `return` after most branches) are not visible in this listing; nothing has
# been added for them here.
1480 ins_mode_in_head = (t) ->
1481 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1484 if t.type is TYPE_COMMENT
1487 if t.type is TYPE_DOCTYPE
1490 if t.type is TYPE_START_TAG and t.name is 'html'
# void head elements: inserted, then the self-closing flag is acknowledged
1493 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1494 el = insert_html_element t
1496 t.acknowledge_self_closing()
1498 if t.type is TYPE_START_TAG and t.name is 'meta'
1499 el = insert_html_element t
1501 t.acknowledge_self_closing()
1502 # fixfull encoding stuff
1504 if t.type is TYPE_START_TAG and t.name is 'title'
1505 parse_generic_rcdata_text t
1507 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1508 parse_generic_raw_text t
1510 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1511 insert_html_element t
1512 ins_mode = ins_mode_in_head_noscript
1514 if t.type is TYPE_START_TAG and t.name is 'script'
1515 ail = adjusted_insertion_location() # [parent_node, child_index]
1516 el = token_to_element t, NS_HTML, ail[0] # intended parent is the node, not the [node, index] pair
1517 el.flag 'parser-inserted', true
1518 # fixfull fragment case
1519 ail[0].children.splice ail[1], 0, el
1521 tok_state = tok_state_script_data
1522 original_ins_mode = ins_mode # make sure orig... is defined
1523 ins_mode = ins_mode_text
1525 if t.type is TYPE_END_TAG and t.name is 'head'
1526 open_els.shift() # will be a head element... spec says so
1527 ins_mode = ins_mode_after_head
1529 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1530 ins_mode_in_head_else t
1532 if t.type is TYPE_START_TAG and t.name is 'template'
1533 insert_html_element t
1535 flag_frameset_ok = false
1536 ins_mode = ins_mode_in_template
1537 template_ins_modes.unshift ins_mode_in_template
1539 if t.type is TYPE_END_TAG and t.name is 'template'
1540 if template_tag_is_open()
1541 generate_implied_end_tags() # parentheses required: a bare name is only a reference in CoffeeScript, not a call
1542 if open_els[0].name isnt 'template'
1545 el = open_els.shift()
1546 if el.name is 'template' and el.namespace is NS_HTML
1548 clear_afe_to_marker()
1549 template_ins_modes.shift()
1554 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
# anything else
1557 ins_mode_in_head_else t
1559 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" mode; the only statement
# visible here switches the insertion mode back to "in head".
# NOTE(review): original lines 1561-1562 and 1564 are missing from this
# listing (per the spec: parse error, pop the noscript element, reprocess the
# token) -- confirm against the full source.
1560 ins_mode_in_head_noscript_else = (t) ->
1563 ins_mode = ins_mode_in_head
# Handler for the "in head noscript" insertion mode. A </noscript> end tag
# returns the parser to "in head"; </br> and the trailing "anything else"
# code delegate to ins_mode_in_head_noscript_else.
# NOTE(review): the embedded line numbers skip, so the statements under most
# tests (and the `return`s between branches) are not visible in this listing.
1565 ins_mode_in_head_noscript = (t) ->
1566 if t.type is TYPE_DOCTYPE
1569 if t.type is TYPE_START_TAG and t.name is 'html'
1572 if t.type is TYPE_END_TAG and t.name is 'noscript'
1574 ins_mode = ins_mode_in_head
1576 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1579 if t.type is TYPE_END_TAG and t.name is 'br'
1580 ins_mode_in_head_noscript_else t
1582 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
# anything else
1586 ins_mode_in_head_noscript_else t
1591 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" mode: inserts a synthesised
# <body> element and switches to the "in body" insertion mode.
# NOTE(review): original lines 1596-1597 are missing from this listing;
# presumably the token `t` is reprocessed in the new mode -- confirm.
1592 ins_mode_after_head_else = (t) ->
1593 body_tok = new_open_tag 'body'
1594 insert_html_element body_tok
1595 ins_mode = ins_mode_in_body
# Handler for the "after head" insertion mode: dispatches on token type and
# tag name, one test per case of the spec's "after head" rules.
# NOTE(review): the embedded line numbers skip, so some statements (e.g. the
# `return` after most branches) are not visible in this listing; nothing has
# been added for them here.
1598 ins_mode_after_head = (t) ->
1602 if t.type is TYPE_COMMENT
1605 if t.type is TYPE_DOCTYPE
1608 if t.type is TYPE_START_TAG and t.name is 'html'
1611 if t.type is TYPE_START_TAG and t.name is 'body'
1612 insert_html_element t
1613 flag_frameset_ok = false
1614 ins_mode = ins_mode_in_body
1616 if t.type is TYPE_START_TAG and t.name is 'frameset'
1617 insert_html_element t
1618 ins_mode = ins_mode_in_frameset
# head-only elements after </head>: the head element is temporarily put back
# on the stack, and must be removed again afterwards
1620 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1622 open_els.unshift head_element_pointer
# `in` (not `of`): iterate the array's values so `el` is a node and `i` its
# numeric index; with `of`, `el` would be the key and never match
1624 for el, i in open_els
1625 if el is head_element_pointer
1626 open_els.splice i, 1
1628 console.log "warning: 23904 couldn't find head element in open_els"
1630 if t.type is TYPE_END_TAG and t.name is 'template'
1633 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1634 ins_mode_after_head_else t
1636 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
# anything else
1640 ins_mode_after_head_else t
1642 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" mode. Walks open_els from the
# current node (index 0) upward looking for an HTML element called `name`;
# on a match it generates implied end tags (sparing `name`) and reports a
# parse error if the match is still not the current node. The walk also
# tests each node against special_elements.
# NOTE(review): original lines 1648-1651 and 1653-1655 are missing from this
# listing (per the spec: pop up to and including the match and stop; on a
# special element, parse error and ignore the token) -- nothing has been
# added for them here.
1643 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1644 for el, i in open_els
1645 if el.name is name and el.namespace is NS_HTML
1646 generate_implied_end_tags name # arg is exception
# compare against the current node itself: `i` was computed before implied
# end tags were generated, so it may no longer be el's position
1647 parse_error() unless el is open_els[0]
1652 if special_elements[el.name] is el.namespace
1656 ins_mode_in_body = (t) ->
1657 if t.type is TYPE_TEXT and t.text is "\u0000"
1664 if t.type is TYPE_TEXT
1667 flag_frameset_ok = false
1669 if t.type is TYPE_COMMENT
1672 if t.type is TYPE_DOCTYPE
1675 if t.type is TYPE_START_TAG and t.name is 'html'
1677 return if template_tag_is_open()
1678 root_attrs = open_els[open_els.length - 1].attrs
1680 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1683 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1686 if t.type is TYPE_START_TAG and t.name is 'body'
1688 return if open_els.length < 2
1689 second = open_els[open_els.length - 2]
1690 return unless second.namespace is NS_HTML
1691 return unless second.name is 'body'
1692 return if template_tag_is_open()
1693 flag_frameset_ok = false
1695 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1697 if t.type is TYPE_START_TAG and t.name is 'frameset'
1699 return if open_els.length < 2
1700 second_i = open_els.length - 2
1701 second = open_els[second_i]
1702 return unless second.namespace is NS_HTML
1703 return unless second.name is 'body'
1704 if flag_frameset_ok is false
1707 for el, i in second.parent.children
1709 second.parent.children.splice i, 1
1711 open_els.splice second_i, 1
1712 # pop everything except the "root html element"
1713 while open_els.length > 1
1715 insert_html_element t
1716 ins_mode = ins_mode_in_frameset
1718 if t.type is TYPE_EOF
1720 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1721 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1722 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1725 unless ok_tags[t.name] is el.namespace
1728 if template_ins_modes.length > 0
1729 ins_mode_in_template t
1733 if t.type is TYPE_END_TAG and t.name is 'body'
1734 unless is_in_scope 'body', NS_HTML
1738 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1739 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1740 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1741 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1745 unless ok_tags[t.name] is el.namespace
1748 ins_mode = ins_mode_after_body
1750 if t.type is TYPE_END_TAG and t.name is 'html'
1751 unless is_in_scope 'body', NS_HTML
1755 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1756 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1757 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1758 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1762 unless ok_tags[t.name] is el.namespace
1765 ins_mode = ins_mode_after_body
1768 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1769 close_p_if_in_button_scope()
1770 insert_html_element t
1772 if t.type is TYPE_START_TAG and h_tags[t.name]?
1773 close_p_if_in_button_scope()
1774 if h_tags[open_els[0].name] is open_els[0].namespace
1777 insert_html_element t
1779 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1780 close_p_if_in_button_scope()
1781 insert_html_element t
1782 # spec: If the next token is a "LF" (U+000A) character token, then
1783 # ignore that token and move on to the next one. (Newlines at the
1784 # start of pre blocks are ignored as an authoring convenience.)
1785 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1787 flag_frameset_ok = false
1789 if t.type is TYPE_START_TAG and t.name is 'form'
1790 unless form_element_pointer is null or template_tag_is_open()
1793 close_p_if_in_button_scope()
1794 el = insert_html_element t
1795 unless template_tag_is_open()
1796 form_element_pointer = el
1798 if t.type is TYPE_START_TAG and t.name is 'li'
1799 flag_frameset_ok = false
1800 for node in open_els
1801 if node.name is 'li' and node.namespace is NS_HTML
1802 generate_implied_end_tags 'li' # arg is exception
1803 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1806 el = open_els.shift()
1807 if el.name is 'li' and el.namespace is NS_HTML
1810 if el_is_special_not_adp node
1812 close_p_if_in_button_scope()
1813 insert_html_element t
1815 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1816 flag_frameset_ok = false
1817 for node in open_els
1818 if node.name is 'dd' and node.namespace is NS_HTML
1819 generate_implied_end_tags 'dd' # arg is exception
1820 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1823 el = open_els.shift()
1824 if el.name is 'dd' and el.namespace is NS_HTML
1827 if node.name is 'dt' and node.namespace is NS_HTML
1828 generate_implied_end_tags 'dt' # arg is exception
1829 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1832 el = open_els.shift()
1833 if el.name is 'dt' and el.namespace is NS_HTML
1836 if el_is_special_not_adp node
1838 close_p_if_in_button_scope()
1839 insert_html_element t
1841 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1842 close_p_if_in_button_scope()
1843 insert_html_element t
1844 tok_state = tok_state_plaintext
1846 if t.type is TYPE_START_TAG and t.name is 'button'
1847 if is_in_scope 'button', NS_HTML
1849 generate_implied_end_tags()
1851 el = open_els.shift()
1852 if el.name is 'button' and el.namespace is NS_HTML
1855 insert_html_element t
1856 flag_frameset_ok = false
1858 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1859 unless is_in_scope t.name, NS_HTML
1862 generate_implied_end_tags()
1863 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1866 el = open_els.shift()
1867 if el.name is t.name and el.namespace is NS_HTML
1870 if t.type is TYPE_END_TAG and t.name is 'form'
1871 unless template_tag_is_open()
1872 node = form_element_pointer
1873 form_element_pointer = null
1874 if node is null or not el_is_in_scope node
1877 generate_implied_end_tags()
1878 if open_els[0] isnt node
1880 for el, i in open_els
1882 open_els.splice i, 1
1885 unless is_in_scope 'form', NS_HTML
1888 generate_implied_end_tags()
1889 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1892 el = open_els.shift()
1893 if el.name is 'form' and el.namespace is NS_HTML
1896 if t.type is TYPE_END_TAG and t.name is 'p'
1897 unless is_in_button_scope 'p', NS_HTML
1899 insert_html_element new_open_tag 'p'
1902 if t.type is TYPE_END_TAG and t.name is 'li'
1903 unless is_in_li_scope 'li', NS_HTML
1906 generate_implied_end_tags 'li' # arg is exception
1907 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1910 el = open_els.shift()
1911 if el.name is 'li' and el.namespace is NS_HTML
1914 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1915 unless is_in_scope t.name, NS_HTML
1918 generate_implied_end_tags t.name # arg is exception
1919 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1922 el = open_els.shift()
1923 if el.name is t.name and el.namespace is NS_HTML
1926 if t.type is TYPE_END_TAG and h_tags[t.name]?
1929 if h_tags[el.name] is el.namespace
1932 if standard_scopers[el.name] is el.namespace
1937 generate_implied_end_tags()
1938 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1941 el = open_els.shift()
1942 if h_tags[el.name] is el.namespace
1946 if t.type is TYPE_START_TAG and t.name is 'a'
1947 # If the list of active formatting elements contains an a element
1948 # between the end of the list and the last marker on the list (or
1949 # the start of the list if there is no marker on the list), then
1950 # this is a parse error; run the adoption agency algorithm for the
1951 # tag name "a", then remove that element from the list of active
1952 # formatting elements and the stack of open elements if the
1953 # adoption agency algorithm didn't already remove it (it might not
1954 # have if the element is not in table scope).
1957 if el.type is TYPE_AFE_MARKER
1959 if el.name is 'a' and el.namespace is NS_HTML
1967 for el, i in open_els
1969 open_els.splice i, 1
1971 el = insert_html_element t
1974 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1976 el = insert_html_element t
1979 if t.type is TYPE_START_TAG and t.name is 'nobr'
1981 el = insert_html_element t
1984 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1985 adoption_agency t.name
1987 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1989 insert_html_element t
1991 flag_frameset_ok = false
1993 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1994 unless is_in_scope t.name, NS_HTML
1997 generate_implied_end_tags()
1998 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2001 el = open_els.shift()
2002 if el.name is t.name and el.namespace is NS_HTML
2004 clear_afe_to_marker()
2006 if t.type is TYPE_START_TAG and t.name is 'table'
2007 close_p_if_in_button_scope() # fixfull quirksmode thing
2008 insert_html_element t
2009 flag_frameset_ok = false
2010 ins_mode = ins_mode_in_table
2012 if t.type is TYPE_END_TAG and t.name is 'br'
2014 t.type is TYPE_START_TAG
2016 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2018 insert_html_element t
2020 t.acknowledge_self_closing()
2021 flag_frameset_ok = false
2023 if t.type is TYPE_START_TAG and t.name is 'input'
2025 insert_html_element t
2027 t.acknowledge_self_closing()
2028 unless is_input_hidden_tok t
2029 flag_frameset_ok = false
2031 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2032 insert_html_element t
2034 t.acknowledge_self_closing()
2036 if t.type is TYPE_START_TAG and t.name is 'hr'
2037 close_p_if_in_button_scope()
2038 insert_html_element t
2040 t.acknowledge_self_closing()
2041 flag_frameset_ok = false
2043 if t.type is TYPE_START_TAG and t.name is 'image'
2048 if t.type is TYPE_START_TAG and t.name is 'isindex'
2050 if template_tag_is_open() is false and form_element_pointer isnt null
2052 t.acknowledge_self_closing()
2053 flag_frameset_ok = false
2054 close_p_if_in_button_scope()
2055 el = insert_html_element new_open_tag 'form'
2056 unless template_tag_is_open()
2057 form_element_pointer = el
2060 el.attrs['action'] = a[1]
2062 insert_html_element new_open_tag 'hr'
2065 insert_html_element new_open_tag 'label'
2066 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2067 input_el = new_open_tag 'input'
2072 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2073 input_el.attrs_a.push [a[0], a[1]]
2074 input_el.attrs_a.push ['name', 'isindex']
2075 # fixfull this next bit is in english... internationalize?
2076 prompt ?= "This is a searchable index. Enter search keywords: "
2077 insert_character new_character_token prompt # fixfull split
2078 # TODO submit typo "balue" in spec
2079 insert_html_element input_el
2081 # insert_character '' # you can put chars here if the prompt attr is missing
2083 insert_html_element new_open_tag 'hr'
2086 unless template_tag_is_open()
2087 form_element_pointer = null
2089 if t.type is TYPE_START_TAG and t.name is 'textarea'
2090 insert_html_element t
2091 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2093 tok_state = tok_state_rcdata
2094 original_ins_mode = ins_mode
2095 flag_frameset_ok = false
2096 ins_mode = ins_mode_text
2098 if t.type is TYPE_START_TAG and t.name is 'xmp'
2099 close_p_if_in_button_scope()
2101 flag_frameset_ok = false
2102 parse_generic_raw_text t
2104 if t.type is TYPE_START_TAG and t.name is 'iframe'
2105 flag_frameset_ok = false
2106 parse_generic_raw_text t
2108 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2109 parse_generic_raw_text t
2111 if t.type is TYPE_START_TAG and t.name is 'select'
2113 insert_html_element t
2114 flag_frameset_ok = false
2115 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2116 ins_mode = ins_mode_in_select_in_table
2118 ins_mode = ins_mode_in_select
2120 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2121 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2124 insert_html_element t
2126 # this comment block implements the W3C spec
2127 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2128 # if is_in_scope 'ruby', NS_HTML
2129 # generate_implied_end_tags()
2130 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2132 # insert_html_element t
2134 # if t.type is TYPE_START_TAG and t.name is 'rt'
2135 # if is_in_scope 'ruby', NS_HTML
2136 # generate_implied_end_tags 'rtc' # arg is exception
2137 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2139 # insert_html_element t
2141 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2142 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2143 if is_in_scope 'ruby', NS_HTML
2144 generate_implied_end_tags()
2145 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2147 insert_html_element t
2149 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2150 if is_in_scope 'ruby', NS_HTML
2151 generate_implied_end_tags 'rtc'
2152 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2154 insert_html_element t
2157 if t.type is TYPE_START_TAG and t.name is 'math'
2159 adjust_mathml_attributes t
2160 adjust_foreign_attributes t
2161 insert_foreign_element t, NS_MATHML
2162 if t.flag 'self-closing'
2164 t.acknowledge_self_closing()
2166 if t.type is TYPE_START_TAG and t.name is 'svg'
2168 adjust_svg_attributes t
2169 adjust_foreign_attributes t
2170 insert_foreign_element t, NS_SVG
2171 if t.flag 'self-closing'
2173 t.acknowledge_self_closing()
2175 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2178 if t.type is TYPE_START_TAG # any other start tag
2180 insert_html_element t
2182 if t.type is TYPE_END_TAG # any other end tag
2183 in_body_any_other_end_tag t.name
2187 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2188 ins_mode_text = (t) ->
2189 if t.type is TYPE_TEXT
2192 if t.type is TYPE_EOF
2194 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2195 open_els[0].flag 'already started', true
2197 ins_mode = original_ins_mode
2200 if t.type is TYPE_END_TAG and t.name is 'script'
2202 ins_mode = original_ins_mode
2203 # fixfull the spec seems to assume that I'm going to run the script
2204 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2206 if t.type is TYPE_END_TAG
2208 ins_mode = original_ins_mode
2210 console.log 'warning: end of ins_mode_text reached'
2212 # the functions below implement the tokenizer states described here:
2213 # http://www.w3.org/TR/html5/syntax.html#tokenization
2215 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2216 ins_mode_in_table_else = (t) ->
2218 flag_foster_parenting = true
2220 flag_foster_parenting = false
2222 ins_mode_in_table = (t) ->
2225 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2226 pending_table_character_tokens = []
2227 original_ins_mode = ins_mode
2228 ins_mode = ins_mode_in_table_text
2231 ins_mode_in_table_else t
2239 clear_stack_to_table_context()
2241 insert_html_element t
2242 ins_mode = ins_mode_in_caption
2244 clear_stack_to_table_context()
2245 insert_html_element t
2246 ins_mode = ins_mode_in_column_group
2248 clear_stack_to_table_context()
2249 insert_html_element new_open_tag 'colgroup'
2250 ins_mode = ins_mode_in_column_group
2252 when 'tbody', 'tfoot', 'thead'
2253 clear_stack_to_table_context()
2254 insert_html_element t
2255 ins_mode = ins_mode_in_table_body
2256 when 'td', 'th', 'tr'
2257 clear_stack_to_table_context()
2258 insert_html_element new_open_tag 'tbody'
2259 ins_mode = ins_mode_in_table_body
2263 if is_in_table_scope 'table', NS_HTML
2265 el = open_els.shift()
2266 if el.name is 'table' and el.namespace is NS_HTML
2270 when 'style', 'script', 'template'
2273 unless is_input_hidden_tok t
2274 ins_mode_in_table_else t
2277 el = insert_html_element t
2279 t.acknowledge_self_closing()
2282 if form_element_pointer?
2284 if template_tag_is_open()
2286 form_element_pointer = insert_html_element t
2289 ins_mode_in_table_else t
2293 if is_in_table_scope 'table', NS_HTML
2295 el = open_els.shift()
2296 if el.name is 'table' and el.namespace is NS_HTML
2301 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2306 ins_mode_in_table_else t
2310 ins_mode_in_table_else t
2313 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2314 ins_mode_in_table_text = (t) ->
2315 if t.type is TYPE_TEXT and t.text is "\u0000"
2319 if t.type is TYPE_TEXT
2320 pending_table_character_tokens.push t
2324 for old in pending_table_character_tokens
2325 unless is_space_tok old
2329 for old in pending_table_character_tokens
2330 insert_character old
2332 for old in pending_table_character_tokens
2333 ins_mode_in_table_else old
2334 pending_table_character_tokens = []
2335 ins_mode = original_ins_mode
2338 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2339 ins_mode_in_caption = (t) ->
2340 if t.type is TYPE_END_TAG and t.name is 'caption'
2341 if is_in_table_scope 'caption', NS_HTML
2342 generate_implied_end_tags()
2343 if open_els[0].name isnt 'caption'
2346 el = open_els.shift()
2347 if el.name is 'caption' and el.namespace is NS_HTML
2349 clear_afe_to_marker()
2350 ins_mode = ins_mode_in_table
2355 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2357 if is_in_table_scope 'caption', NS_HTML
2359 el = open_els.shift()
2360 if el.name is 'caption' and el.namespace is NS_HTML
2362 clear_afe_to_marker()
2363 ins_mode = ins_mode_in_table
2365 # else fragment case
2367 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2373 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Insertion mode "in column group": examines the token's type and tag name.
# Branches shown here cover comments, doctypes, <html>, <col>, </colgroup>,
# </col>, <template>/</template> and EOF; the trailing statements test whether
# the current node is a colgroup and set ins_mode to ins_mode_in_table.
#   t: the token being processed (reads t.type and t.name; a <col> token is
#      inserted with insert_html_element and has acknowledge_self_closing()
#      called on it)
2374 ins_mode_in_column_group = (t) ->
2378 if t.type is TYPE_COMMENT
2381 if t.type is TYPE_DOCTYPE
2384 if t.type is TYPE_START_TAG and t.name is 'html'
2387 if t.type is TYPE_START_TAG and t.name is 'col'
2388 el = insert_html_element t
2390 t.acknowledge_self_closing()
2392 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# open_els[0] is the current node (stacks in this file grow downward). The
# namespace must be read from that element: `open_els.namespace` is a property
# lookup on the array itself and is always undefined, so the check for an HTML
# colgroup could never succeed.
2393 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2395 ins_mode = ins_mode_in_table
2399 if t.type is TYPE_END_TAG and t.name is 'col'
2402 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2405 if t.type is TYPE_EOF
2409 if open_els[0].name isnt 'colgroup'
2413 ins_mode = ins_mode_in_table
2417 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2418 ins_mode_in_table_body = (t) ->
2419 if t.type is TYPE_START_TAG and t.name is 'tr'
2420 clear_stack_to_table_body_context()
2421 insert_html_element t
2422 ins_mode = ins_mode_in_row
2424 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2426 clear_stack_to_table_body_context()
2427 insert_html_element new_open_tag 'tr'
2428 ins_mode = ins_mode_in_row
2431 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2432 unless is_in_table_scope t.name, NS_HTML
2435 clear_stack_to_table_body_context()
2437 ins_mode = ins_mode_in_table
2439 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2442 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2445 if table_scopers[el.name] is el.namespace
2450 clear_stack_to_table_body_context()
2452 ins_mode = ins_mode_in_table
2455 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2461 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2462 ins_mode_in_row = (t) ->
2463 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2464 clear_stack_to_table_row_context()
2465 insert_html_element t
2466 ins_mode = ins_mode_in_cell
2469 if t.type is TYPE_END_TAG and t.name is 'tr'
2470 if is_in_table_scope 'tr', NS_HTML
2471 clear_stack_to_table_row_context()
2473 ins_mode = ins_mode_in_table_body
2477 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2478 if is_in_table_scope 'tr', NS_HTML
2479 clear_stack_to_table_row_context()
2481 ins_mode = ins_mode_in_table_body
2486 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2487 if is_in_table_scope t.name, NS_HTML
2488 if is_in_table_scope 'tr', NS_HTML
2489 clear_stack_to_table_row_context()
2491 ins_mode = ins_mode_in_table_body
2496 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2502 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps: calls generate_implied_end_tags(), inspects the
# current node, shifts elements off open_els and compares each against an HTML
# td/th, then calls clear_afe_to_marker() and sets ins_mode to ins_mode_in_row.
2504 generate_implied_end_tags()
# The current node is open_els[0]; both alternatives must compare its .name.
# Comparing the element object itself with the string 'th' is never true, so a
# <th> current node was always treated as "not a cell" by this test.
2505 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2508 el = open_els.shift()
2509 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2511 clear_afe_to_marker()
2512 ins_mode = ins_mode_in_row
2514 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2515 ins_mode_in_cell = (t) ->
2516 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2517 if is_in_table_scope t.name, NS_HTML
2518 generate_implied_end_tags()
2519 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2522 el = open_els.shift()
2523 if el.name is t.name and el.namespace is NS_HTML
2525 clear_afe_to_marker()
2526 ins_mode = ins_mode_in_row
2530 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2533 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2536 if table_scopers[el.name] is el.namespace
2544 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2547 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2548 if is_in_table_scope t.name, NS_HTML
2557 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Insertion mode "in select": examines the token's type and tag name.
# Branches shown here cover U+0000 text, other text, comments, doctypes,
# <html>, <option>, <optgroup>, </optgroup>, </option>, </select>, <select>,
# <input>/<keygen>/<textarea>, <script>/<template> and EOF.
#   t: the token being processed (reads t.type, t.name and t.text)
# Throughout, open_els[0] is the current node and open_els[1] is the element
# immediately before it in the stack of open elements.
2558 ins_mode_in_select = (t) ->
2559 if t.type is TYPE_TEXT and t.text is "\u0000"
2562 if t.type is TYPE_TEXT
2565 if t.type is TYPE_COMMENT
2568 if t.type is TYPE_DOCTYPE
2571 if t.type is TYPE_START_TAG and t.name is 'html'
2574 if t.type is TYPE_START_TAG and t.name is 'option'
2575 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2577 insert_html_element t
2579 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2580 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2582 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2584 insert_html_element t
2586 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# Namespace equality needs `is`. CoffeeScript's `in` is a membership test
# against a list, so `namespace in NS_HTML` did not compare the two values.
2587 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# Both halves of this test describe the element before the current node, so
# the namespace is read from open_els[1] as well (it previously re-checked
# open_els[0]).
2588 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2590 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2595 if t.type is TYPE_END_TAG and t.name is 'option'
2596 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2601 if t.type is TYPE_END_TAG and t.name is 'select'
2602 if is_in_select_scope 'select', NS_HTML
2604 el = open_els.shift()
2605 if el.name is 'select' and el.namespace is NS_HTML
2611 if t.type is TYPE_START_TAG and t.name is 'select'
2614 el = open_els.shift()
2615 if el.name is 'select' and el.namespace is NS_HTML
2618 # spec says that this is the same as </select> but it doesn't say
2619 # to check scope first
2621 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2623 if is_in_select_scope 'select', NS_HTML
2626 el = open_els.shift()
2627 if el.name is 'select' and el.namespace is NS_HTML
2632 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2635 if t.type is TYPE_EOF
2642 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2643 ins_mode_in_select_in_table = (t) ->
2644 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2647 el = open_els.shift()
2648 if el.name is 'select' and el.namespace is NS_HTML
2653 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2655 unless is_in_table_scope t.name, NS_HTML
2658 el = open_els.shift()
2659 if el.name is 'select' and el.namespace is NS_HTML
2665 ins_mode_in_select t
2668 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2669 ins_mode_in_template = (t) ->
2670 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2673 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2676 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2677 template_ins_modes.shift()
2678 template_ins_modes.unshift ins_mode_in_table
2679 ins_mode = ins_mode_in_table
2682 if t.type is TYPE_START_TAG and t.name is 'col'
2683 template_ins_modes.shift()
2684 template_ins_modes.unshift ins_mode_in_column_group
2685 ins_mode = ins_mode_in_column_group
2688 if t.type is TYPE_START_TAG and t.name is 'tr'
2689 template_ins_modes.shift()
2690 template_ins_modes.unshift ins_mode_in_table_body
2691 ins_mode = ins_mode_in_table_body
2694 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2695 template_ins_modes.shift()
2696 template_ins_modes.unshift ins_mode_in_row
2697 ins_mode = ins_mode_in_row
2700 if t.type is TYPE_START_TAG
2701 template_ins_modes.shift()
2702 template_ins_modes.unshift ins_mode_in_body
2703 ins_mode = ins_mode_in_body
2706 if t.type is TYPE_END_TAG
2709 if t.type is TYPE_EOF
2710 unless template_tag_is_open()
2715 el = open_els.shift()
2716 if el.name is 'template' and el.namespace is NS_HTML
2718 clear_afe_to_marker()
2719 template_ins_modes.shift()
2723 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Insertion mode "after body": examines the token's type and tag name.
# Branches shown here cover comments, doctypes, <html>, </html> (consults
# flag_fragment_parsing and sets ins_mode to ins_mode_after_after_body) and
# EOF; the last statement sets ins_mode to ins_mode_in_body.
#   t: the token being processed (reads t.type and t.name)
2724 ins_mode_after_body = (t) ->
2728 if t.type is TYPE_COMMENT
# HTML5 "after body": the comment becomes the last child of the *first*
# element in the stack of open elements. Stacks in this file grow downward:
# index 0 is the current node and the first element sits at
# open_els.length - 1, so open_els[0] was the wrong parent here.
2729 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2731 if t.type is TYPE_DOCTYPE
2734 if t.type is TYPE_START_TAG and t.name is 'html'
2737 if t.type is TYPE_END_TAG and t.name is 'html'
2738 if flag_fragment_parsing
2741 ins_mode = ins_mode_after_after_body
2743 if t.type is TYPE_EOF
2748 ins_mode = ins_mode_in_body
2751 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2752 ins_mode_in_frameset = (t) ->
2756 if t.type is TYPE_COMMENT
2759 if t.type is TYPE_DOCTYPE
2762 if t.type is TYPE_START_TAG and t.name is 'html'
2765 if t.type is TYPE_START_TAG and t.name is 'frameset'
2766 insert_html_element t
2768 if t.type is TYPE_END_TAG and t.name is 'frameset'
2769 if open_els.length is 1
2771 return # fragment case
2773 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2774 ins_mode = ins_mode_after_frameset
2776 if t.type is TYPE_START_TAG and t.name is 'frame'
2777 insert_html_element t
2779 t.acknowledge_self_closing()
2781 if t.type is TYPE_START_TAG and t.name is 'noframes'
2784 if t.type is TYPE_EOF
2785 if open_els.length isnt 1
2793 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion mode "after frameset": examines the token's type and tag name.
# Branches shown here cover comments, doctypes, <html>, </html> (switches to
# ins_mode_after_after_frameset), <noframes> and EOF.
#   t: the token being processed (reads t.type and t.name)
2794 ins_mode_after_frameset = (t) ->
2798 if t.type is TYPE_COMMENT
2801 if t.type is TYPE_DOCTYPE
2804 if t.type is TYPE_START_TAG and t.name is 'html'
2807 if t.type is TYPE_END_TAG and t.name is 'html'
# Assign ins_mode, the variable every other mode switch in this file writes.
# The previous target, `insert_mode`, is used nowhere else in the visible
# code, so the insertion mode was not actually changed by this branch.
2808 ins_mode = ins_mode_after_after_frameset
2810 if t.type is TYPE_START_TAG and t.name is 'noframes'
2813 if t.type is TYPE_EOF
2820 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2821 ins_mode_after_after_body = (t) ->
2822 if t.type is TYPE_COMMENT
2823 insert_comment t, [doc, doc.children.length]
2825 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2828 if t.type is TYPE_EOF
2833 ins_mode = ins_mode_in_body
2837 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2838 ins_mode_after_after_frameset = (t) ->
2839 if t.type is TYPE_COMMENT
2840 insert_comment t, [doc, doc.children.length]
2842 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2845 if t.type is TYPE_EOF
2848 if t.type is TYPE_START_TAG and t.name is 'noframes'
2855 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2856 has_color_face_or_size = (t) ->
2858 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2861 in_foreign_content_end_script = ->
2865 in_foreign_content_other_start = (t) ->
2866 acn = adjusted_current_node()
2867 if acn.namespace is NS_MATHML
2868 adjust_mathml_attributes t
2869 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2870 t.name = svg_name_fixes[t.name]
2871 if acn.namespace is NS_SVG
2872 adjust_svg_attributes t
2873 adjust_foreign_attributes t
2874 insert_foreign_element t, acn.namespace
2875 if t.flag 'self-closing'
2876 if t.name is 'script'
2877 t.acknowledge_self_closing()
2878 in_foreign_content_end_script()
2882 t.acknowledge_self_closing()
2884 in_foreign_content = (t) ->
2885 if t.type is TYPE_TEXT and t.text is "\u0000"
2887 insert_character new_character_token "\ufffd"
2892 if t.type is TYPE_TEXT
2893 flag_frameset_ok = false
2896 if t.type is TYPE_COMMENT
2899 if t.type is TYPE_DOCTYPE
2902 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2904 if flag_fragment_parsing
2905 in_foreign_content_other_start t
2907 loop # is this safe?
2909 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
2913 if t.type is TYPE_START_TAG
2914 in_foreign_content_other_start t
2916 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2917 in_foreign_content_end_script()
2919 if t.type is TYPE_END_TAG
2922 if node.name.toLowerCase() isnt t.name
2925 if node is open_els[open_els.length - 1]
2927 if node.name.toLowerCase() is t.name
2929 el = open_els.shift()
2934 if node.namespace is NS_HTML
2936 ins_mode t # explicitly call HTML insertion mode
2939 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2941 switch c = txt.charAt(cur++)
2943 return new_text_node parse_character_reference()
2945 tok_state = tok_state_tag_open
2948 return new_text_node "\ufffd"
2950 return new_eof_token()
2952 return new_text_node c
2955 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2956 # not needed: tok_state_character_reference_in_data = ->
2957 # just call parse_character_reference()
2959 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2960 tok_state_rcdata = ->
2961 switch c = txt.charAt(cur++)
2963 return new_text_node parse_character_reference()
2965 tok_state = tok_state_rcdata_less_than_sign
2968 return new_character_token "\ufffd"
2970 return new_eof_token()
2972 return new_character_token c
2975 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2976 # not needed: tok_state_character_reference_in_rcdata = ->
2977 # just call parse_character_reference()
2979 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2980 tok_state_rawtext = ->
2981 switch c = txt.charAt(cur++)
2983 tok_state = tok_state_rawtext_less_than_sign
2986 return new_character_token "\ufffd"
2988 return new_eof_token()
2990 return new_character_token c
2993 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2994 tok_state_script_data = ->
2995 switch c = txt.charAt(cur++)
2997 tok_state = tok_state_script_data_less_than_sign
3000 return new_character_token "\ufffd"
3002 return new_eof_token()
3004 return new_character_token c
3007 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3008 tok_state_plaintext = ->
3009 switch c = txt.charAt(cur++)
3012 return new_character_token "\ufffd"
3014 return new_eof_token()
3016 return new_character_token c
3020 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3021 tok_state_tag_open = ->
3022 c = txt.charAt(cur++)
3024 tok_state = tok_state_markup_declaration_open
3027 tok_state = tok_state_end_tag_open
3030 tok_cur_tag = new_open_tag c.toLowerCase()
3031 tok_state = tok_state_tag_name
3034 tok_cur_tag = new_open_tag c
3035 tok_state = tok_state_tag_name
3039 tok_cur_tag = new_comment_token '?' # FIXME right?
3040 tok_state = tok_state_bogus_comment
3044 tok_state = tok_state_data
3045 cur -= 1 # we didn't parse/handle the char after <
3046 return new_text_node '<'
3048 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3049 tok_state_end_tag_open = ->
3050 switch c = txt.charAt(cur++)
3053 tok_state = tok_state_data
3056 tok_state = tok_state_data
3057 return new_text_node '</'
3060 tok_cur_tag = new_end_tag c.toLowerCase()
3061 tok_state = tok_state_tag_name
3062 else if is_lc_alpha(c)
3063 tok_cur_tag = new_end_tag c
3064 tok_state = tok_state_tag_name
3067 tok_cur_tag = new_comment_token '/'
3068 tok_state = tok_state_bogus_comment
3071 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3072 tok_state_tag_name = ->
3073 switch c = txt.charAt(cur++)
3074 when "\t", "\n", "\u000c", ' '
3075 tok_state = tok_state_before_attribute_name
3077 tok_state = tok_state_self_closing_start_tag
3079 tok_state = tok_state_data
3085 tok_cur_tag.name += "\ufffd"
3088 tok_state = tok_state_data
3091 tok_cur_tag.name += c.toLowerCase()
3093 tok_cur_tag.name += c
3096 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3097 tok_state_rcdata_less_than_sign = ->
3098 c = txt.charAt(cur++)
3100 temporary_buffer = ''
3101 tok_state = tok_state_rcdata_end_tag_open
3104 tok_state = tok_state_rcdata
3105 cur -= 1 # reconsume the input character
3106 return new_character_token '<'
3108 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3109 tok_state_rcdata_end_tag_open = ->
3110 c = txt.charAt(cur++)
3112 tok_cur_tag = new_end_tag c.toLowerCase()
3113 temporary_buffer += c
3114 tok_state = tok_state_rcdata_end_tag_name
3117 tok_cur_tag = new_end_tag c
3118 temporary_buffer += c
3119 tok_state = tok_state_rcdata_end_tag_name
3122 tok_state = tok_state_rcdata
3123 cur -= 1 # reconsume the input character
3124 return new_character_token "</" # fixfull separate these
3126 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Reports whether token `t` is an end tag whose name equals the name of the
# current node, open_els[0]. Before answering it passes the token's type and
# name, plus a serialization of open_els, to debug_log.
#   t: token to examine (reads t.type and t.name)
#   returns: the boolean result of the type and name comparison
3127 is_appropriate_end_tag = (t) ->
3128 # spec says to check against "the tag name of the last start tag to
3129 # have been emitted from this tokenizer", but this is only called from
3130 # the various "raw" states, so it's hopefully ok to assume that
3131 # open_els[0].name will work instead TODO: verify this after the script
3132 # data states are implemented
3133 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
# only names are compared; the element's namespace is not consulted here
3134 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3136 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3137 tok_state_rcdata_end_tag_name = ->
3138 c = txt.charAt(cur++)
3139 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3140 if is_appropriate_end_tag tok_cur_tag
3141 tok_state = tok_state_before_attribute_name
3143 # else fall through to "Anything else"
3145 if is_appropriate_end_tag tok_cur_tag
3146 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3148 # else fall through to "Anything else"
3150 if is_appropriate_end_tag tok_cur_tag
3151 tok_state = tok_state_data
3153 # else fall through to "Anything else"
3155 tok_cur_tag.name += c.toLowerCase()
3156 temporary_buffer += c
3159 tok_cur_tag.name += c
3160 temporary_buffer += c
3163 tok_state = tok_state_rcdata
3164 cur -= 1 # reconsume the input character
3165 return new_character_token '</' + temporary_buffer # fixfull separate these
3167 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3168 tok_state_rawtext_less_than_sign = ->
3169 c = txt.charAt(cur++)
3171 temporary_buffer = ''
3172 tok_state = tok_state_rawtext_end_tag_open
3175 tok_state = tok_state_rawtext
3176 cur -= 1 # reconsume the input character
3177 return new_character_token '<'
3179 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3180 tok_state_rawtext_end_tag_open = ->
3181 c = txt.charAt(cur++)
3183 tok_cur_tag = new_end_tag c.toLowerCase()
3184 temporary_buffer += c
3185 tok_state = tok_state_rawtext_end_tag_name
3188 tok_cur_tag = new_end_tag c
3189 temporary_buffer += c
3190 tok_state = tok_state_rawtext_end_tag_name
3193 tok_state = tok_state_rawtext
3194 cur -= 1 # reconsume the input character
3195 return new_character_token "</" # fixfull separate these
3197 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3198 tok_state_rawtext_end_tag_name = ->
3199 c = txt.charAt(cur++)
3200 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3201 if is_appropriate_end_tag tok_cur_tag
3202 tok_state = tok_state_before_attribute_name
3204 # else fall through to "Anything else"
3206 if is_appropriate_end_tag tok_cur_tag
3207 tok_state = tok_state_self_closing_start_tag
3209 # else fall through to "Anything else"
3211 if is_appropriate_end_tag tok_cur_tag
3212 tok_state = tok_state_data
3214 # else fall through to "Anything else"
3216 tok_cur_tag.name += c.toLowerCase()
3217 temporary_buffer += c
3220 tok_cur_tag.name += c
3221 temporary_buffer += c
3224 tok_state = tok_state_rawtext
3225 cur -= 1 # reconsume the input character
3226 return new_character_token '</' + temporary_buffer # fixfull separate these
3228 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3229 tok_state_script_data_less_than_sign = ->
3230 c = txt.charAt(cur++)
3232 temporary_buffer = ''
3233 tok_state = tok_state_script_data_end_tag_open
3236 tok_state = tok_state_script_data_escape_start
3237 return new_character_token '<!' # fixfull split
3239 tok_state = tok_state_script_data
3240 cur -= 1 # Reconsume
3241 return new_character_token '<'
3243 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3244 tok_state_script_data_end_tag_open = ->
3245 c = txt.charAt(cur++)
3247 tok_cur_tag = new_end_tag c.toLowerCase()
3248 temporary_buffer += c
3249 tok_state = tok_state_script_data_end_tag_name
3252 tok_cur_tag = new_end_tag c
3253 temporary_buffer += c
3254 tok_state = tok_state_script_data_end_tag_name
3257 tok_state = tok_state_script_data
3258 cur -= 1 # Reconsume
3259 return new_character_token '</'
3261 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3262 tok_state_script_data_end_tag_name = ->
3263 c = txt.charAt(cur++)
3264 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3265 if is_appropriate_end_tag tok_cur_tag
3266 tok_state = tok_state_before_attribute_name
3270 if is_appropriate_end_tag tok_cur_tag
3271 tok_state = tok_state_self_closing_start_tag
3275 if is_appropriate_end_tag tok_cur_tag
3276 tok_state = tok_state_data
3280 tok_cur_tag.name += c.toLowerCase()
3281 temporary_buffer += c
3284 tok_cur_tag.name += c
3285 temporary_buffer += c
3288 tok_state = tok_state_script_data
3289 cur -= 1 # Reconsume
3290 return new_character_token "</#{temporary_buffer}" # fixfull split
3292 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3293 tok_state_script_data_escape_start = ->
3294 c = txt.charAt(cur++)
3296 tok_state = tok_state_script_data_escape_start_dash
3297 return new_character_token '-'
3299 tok_state = tok_state_script_data
3300 cur -= 1 # Reconsume
3303 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3304 tok_state_script_data_escape_start_dash = ->
3305 c = txt.charAt(cur++)
3307 tok_state = tok_state_script_data_escaped_dash_dash
3308 return new_character_token '-'
3310 tok_state = tok_state_script_data
3311 cur -= 1 # Reconsume
3314 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3315 tok_state_script_data_escaped = ->
3316 c = txt.charAt(cur++)
3318 tok_state = tok_state_script_data_escaped_dash
3319 return new_character_token '-'
3321 tok_state = tok_state_script_data_escaped_less_than_sign
3325 return new_character_token "\ufffd"
3327 tok_state = tok_state_data
3329 cur -= 1 # Reconsume
3332 return new_character_token c
3334 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3335 tok_state_script_data_escaped_dash = ->
3336 c = txt.charAt(cur++)
3338 tok_state = tok_state_script_data_escaped_dash_dash
3339 return new_character_token '-'
3341 tok_state = tok_state_script_data_escaped_less_than_sign
3345 tok_state = tok_state_script_data_escaped
3346 return new_character_token "\ufffd"
3348 tok_state = tok_state_data
3350 cur -= 1 # Reconsume
3353 tok_state = tok_state_script_data_escaped
3354 return new_character_token c
3356 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3357 tok_state_script_data_escaped_dash_dash = ->
3358 c = txt.charAt(cur++)
3360 return new_character_token '-'
3362 tok_state = tok_state_script_data_escaped_less_than_sign
3365 tok_state = tok_state_script_data
3366 return new_character_token '>'
3369 tok_state = tok_state_script_data_escaped
3370 return new_character_token "\ufffd"
3373 tok_state = tok_state_data
3374 cur -= 1 # Reconsume
3377 tok_state = tok_state_script_data_escaped
3378 return new_character_token c
3380 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3381 tok_state_script_data_escaped_less_than_sign = ->
3382 c = txt.charAt(cur++)
3384 temporary_buffer = ''
3385 tok_state = tok_state_script_data_escaped_end_tag_open
3388 temporary_buffer = c.toLowerCase() # yes, really
3389 tok_state = tok_state_script_data_double_escape_start
3390 return new_character_token "<#{c}" # fixfull split
3392 temporary_buffer = c
3393 tok_state = tok_state_script_data_double_escape_start
3394 return new_character_token "<#{c}" # fixfull split
3396 tok_state = tok_state_script_data_escaped
3397 cur -= 1 # Reconsume
3398 return new_character_token '<'
3400 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3401 tok_state_script_data_escaped_end_tag_open = ->
3402 c = txt.charAt(cur++)
3404 tok_cur_tag = new_end_tag c.toLowerCase()
3405 temporary_buffer += c
3406 tok_state = tok_state_script_data_escaped_end_tag_name
3409 tok_cur_tag = new_end_tag c
3410 temporary_buffer += c
3411 tok_state = tok_state_script_data_escaped_end_tag_name
3414 tok_state = tok_state_script_data_escaped
3415 cur -= 1 # Reconsume
3416 return new_character_token '</' # fixfull split
3418 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer handler for the "script data escaped end tag name" state.
# Reads one character from txt (advancing cur), grows tok_cur_tag.name and
# temporary_buffer with it, and only leaves towards the attribute / self-closing
# / data states when is_appropriate_end_tag accepts tok_cur_tag.
3419 tok_state_script_data_escaped_end_tag_name = ->
3420 c = txt.charAt(cur++)
3421 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3422 if is_appropriate_end_tag tok_cur_tag
3423 tok_state = tok_state_before_attribute_name
3427 if is_appropriate_end_tag tok_cur_tag
3428 tok_state = tok_state_self_closing_start_tag
3432 if is_appropriate_end_tag tok_cur_tag
3433 tok_state = tok_state_data
3437 tok_cur_tag.name += c.toLowerCase()
# NOTE(review): spec 8.2.4.27 appends the current input character unchanged to
# the temporary buffer. Lowercasing it here changes the text that the fallback
# character token at the end of this handler re-emits (e.g. "</FOO" would come
# out as "</foo"); tok_state_script_data_end_tag_name keeps the original case.
# Confirm whether this is intentional.
3438 temporary_buffer += c.toLowerCase()
3441 tok_cur_tag.name += c
3442 temporary_buffer += c.toLowerCase()
# Fallback: not a usable end tag, so go back to the escaped script data state,
# un-read the character, and emit what was buffered as plain text.
3445 tok_state = tok_state_script_data_escaped
3446 cur -= 1 # Reconsume
3447 return new_character_token "</#{temporary_buffer}" # fixfull split
3449 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3450 tok_state_script_data_double_escape_start = ->
3451 c = txt.charAt(cur++)
3452 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3453 if temporary_buffer is 'script'
3454 tok_state = tok_state_script_data_double_escaped
3456 tok_state = tok_state_script_data_escaped
3457 return new_character_token c
3459 temporary_buffer += c.toLowerCase() # yes, really lowercase
3460 return new_character_token c
3462 temporary_buffer += c
3463 return new_character_token c
3465 tok_state = tok_state_script_data_escaped
3466 cur -= 1 # Reconsume
3469 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3470 tok_state_script_data_double_escaped = ->
3471 c = txt.charAt(cur++)
3473 tok_state = tok_state_script_data_double_escaped_dash
3474 return new_character_token '-'
3476 tok_state = tok_state_script_data_double_escaped_less_than_sign
3477 return new_character_token '<'
3480 return new_character_token "\ufffd"
3483 tok_state = tok_state_data
3484 cur -= 1 # Reconsume
3487 return new_character_token c
3489 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3490 tok_state_script_data_double_escaped_dash = ->
3491 c = txt.charAt(cur++)
3493 tok_state = tok_state_script_data_double_escaped_dash_dash
3494 return new_character_token '-'
3496 tok_state = tok_state_script_data_double_escaped_less_than_sign
3497 return new_character_token '<'
3500 tok_state = tok_state_script_data_double_escaped
3501 return new_character_token "\ufffd"
3504 tok_state = tok_state_data
3505 cur -= 1 # Reconsume
3508 tok_state = tok_state_script_data_double_escaped
3509 return new_character_token c
3511 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3512 tok_state_script_data_double_escaped_dash_dash = ->
3513 c = txt.charAt(cur++)
3515 return new_character_token '-'
3517 tok_state = tok_state_script_data_double_escaped_less_than_sign
3518 return new_character_token '<'
3520 tok_state = tok_state_script_data
3521 return new_character_token '>'
3524 tok_state = tok_state_script_data_double_escaped
3525 return new_character_token "\ufffd"
3528 tok_state = tok_state_data
3529 cur -= 1 # Reconsume
3532 tok_state = tok_state_script_data_double_escaped
3533 return new_character_token c
3535 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3536 tok_state_script_data_double_escaped_less_than_sign = ->
3537 c = txt.charAt(cur++)
3539 temporary_buffer = ''
3540 tok_state = tok_state_script_data_double_escape_end
3541 return new_character_token '/'
3543 tok_state = tok_state_script_data_double_escaped
3544 cur -= 1 # Reconsume
3547 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3548 tok_state_script_data_double_escape_end = ->
3549 c = txt.charAt(cur++)
3550 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3551 if temporary_buffer is 'script'
3552 tok_state = tok_state_script_data_escaped
3554 tok_state = tok_state_script_data_double_escaped
3555 return new_character_token c
3557 temporary_buffer += c.toLowerCase() # yes, really lowercase
3558 return new_character_token c
3560 temporary_buffer += c
3561 return new_character_token c
3563 tok_state = tok_state_script_data_double_escaped
3564 cur -= 1 # Reconsume
3567 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3568 tok_state_before_attribute_name = ->
3570 switch c = txt.charAt(cur++)
3571 when "\t", "\n", "\u000c", ' '
3574 tok_state = tok_state_self_closing_start_tag
3577 tok_state = tok_state_data
3583 attr_name = "\ufffd"
3584 when '"', "'", '<', '='
3589 tok_state = tok_state_data
3592 attr_name = c.toLowerCase()
3596 tok_cur_tag.attrs_a.unshift [attr_name, '']
3597 tok_state = tok_state_attribute_name
3600 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3601 tok_state_attribute_name = ->
3602 switch c = txt.charAt(cur++)
3603 when "\t", "\n", "\u000c", ' '
3604 tok_state = tok_state_after_attribute_name
3606 tok_state = tok_state_self_closing_start_tag
3608 tok_state = tok_state_before_attribute_value
3610 tok_state = tok_state_data
3616 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3619 tok_cur_tag.attrs_a[0][0] += c
3622 tok_state = tok_state_data
3625 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3627 tok_cur_tag.attrs_a[0][0] += c
3630 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3631 tok_state_after_attribute_name = ->
3632 c = txt.charAt(cur++)
3633 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3636 tok_state = tok_state_self_closing_start_tag
3639 tok_state = tok_state_before_attribute_value
3642 tok_state = tok_state_data
3645 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3646 tok_state = tok_state_attribute_name
3650 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3651 tok_state = tok_state_attribute_name
3655 tok_state = tok_state_data
3656 cur -= 1 # reconsume
3658 if c is '"' or c is "'" or c is '<'
3660 # fall through to Anything else
3662 tok_cur_tag.attrs_a.unshift [c, '']
3663 tok_state = tok_state_attribute_name
3665 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3666 tok_state_before_attribute_value = ->
3667 switch c = txt.charAt(cur++)
3668 when "\t", "\n", "\u000c", ' '
3671 tok_state = tok_state_attribute_value_double_quoted
3673 tok_state = tok_state_attribute_value_unquoted
3676 tok_state = tok_state_attribute_value_single_quoted
3679 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3680 tok_state = tok_state_attribute_value_unquoted
3683 tok_state = tok_state_data
3689 tok_state = tok_state_data
3691 tok_cur_tag.attrs_a[0][1] += c
3692 tok_state = tok_state_attribute_value_unquoted
3695 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3696 tok_state_attribute_value_double_quoted = ->
3697 switch c = txt.charAt(cur++)
3699 tok_state = tok_state_after_attribute_value_quoted
3701 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3704 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3707 tok_state = tok_state_data
3709 tok_cur_tag.attrs_a[0][1] += c
3712 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3713 tok_state_attribute_value_single_quoted = ->
3714 switch c = txt.charAt(cur++)
3716 tok_state = tok_state_after_attribute_value_quoted
3718 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3721 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3724 tok_state = tok_state_data
3726 tok_cur_tag.attrs_a[0][1] += c
3729 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3730 tok_state_attribute_value_unquoted = ->
3731 switch c = txt.charAt(cur++)
3732 when "\t", "\n", "\u000c", ' '
3733 tok_state = tok_state_before_attribute_name
3735 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3737 tok_state = tok_state_data
3742 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3745 tok_state = tok_state_data
3747 # Parse Error if ", ', <, = or ` (backtick)
3748 tok_cur_tag.attrs_a[0][1] += c
3751 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3752 tok_state_after_attribute_value_quoted = ->
3753 switch c = txt.charAt(cur++)
3754 when "\t", "\n", "\u000c", ' '
3755 tok_state = tok_state_before_attribute_name
3757 tok_state = tok_state_self_closing_start_tag
3759 tok_state = tok_state_data
3765 tok_state = tok_state_data
3768 tok_state = tok_state_before_attribute_name
3769 cur -= 1 # we didn't handle that char
3772 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3773 tok_state_self_closing_start_tag = ->
3774 c = txt.charAt(cur++)
3776 tok_cur_tag.flag 'self-closing', true
3777 tok_state = tok_state_data
3781 tok_state = tok_state_data
3782 cur -= 1 # Reconsume
3786 tok_state = tok_state_before_attribute_name
3787 cur -= 1 # Reconsume
3790 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3791 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer handler for the "bogus comment" state: appends a run of input,
# starting at cur, to the text of the comment token already in tok_cur_tag,
# then hands control back to the data state.
3792 tok_state_bogus_comment = ->
# position of the next '>' at or after cur (indexOf yields -1 when there is none)
3793 next_gt = txt.indexOf '>', cur
# take everything from cur to the end of the input
3795 val = txt.substr cur
# take only the characters before the '>' that was found
3798 val = txt.substr cur, (next_gt - cur)
# NUL characters inside the comment become U+FFFD
3800 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3801 tok_cur_tag.text += val
3802 tok_state = tok_state_data
3805 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3806 tok_state_markup_declaration_open = ->
3807 if txt.substr(cur, 2) is '--'
3809 tok_cur_tag = new_comment_token ''
3810 tok_state = tok_state_comment_start
3812 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3814 tok_state = tok_state_doctype
3816 acn = adjusted_current_node()
3817 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3819 tok_state = tok_state_cdata_section
3823 tok_cur_tag = new_comment_token ''
3824 tok_state = tok_state_bogus_comment
3827 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer handler for the "comment start" state: looks at the first character
# after "<!--" and either moves to another comment/data state or starts
# accumulating comment text in tok_cur_tag.text.
3828 tok_state_comment_start = ->
3829 switch c = txt.charAt(cur++)
3831 tok_state = tok_state_comment_start_dash
3834 tok_state = tok_state_comment
# NOTE(review): presumably the U+0000 branch. The spec appends U+FFFD to the
# comment token's data here, and tok_state_comment_start_dash does so via
# tok_cur_tag.text; this branch returns a separate character token instead.
# Confirm whether that is intended.
3835 return new_character_token "\ufffd"
3838 tok_state = tok_state_data
3842 tok_state = tok_state_data
3843 cur -= 1 # Reconsume
# any other character becomes the first character of the comment text
3846 tok_cur_tag.text += c
3847 tok_state = tok_state_comment
3850 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3851 tok_state_comment_start_dash = ->
3852 switch c = txt.charAt(cur++)
3854 tok_state = tok_state_comment_end
3857 tok_cur_tag.text += "-\ufffd"
3858 tok_state = tok_state_comment
3861 tok_state = tok_state_data
3865 tok_state = tok_state_data
3866 cur -= 1 # Reconsume
3869 tok_cur_tag.text += "-#{c}"
3870 tok_state = tok_state_comment
3873 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3874 tok_state_comment = ->
3875 switch c = txt.charAt(cur++)
3877 tok_state = tok_state_comment_end_dash
3880 tok_cur_tag.text += "\ufffd"
3883 tok_state = tok_state_data
3884 cur -= 1 # Reconsume
3887 tok_cur_tag.text += c
3890 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3891 tok_state_comment_end_dash = ->
3892 switch c = txt.charAt(cur++)
3894 tok_state = tok_state_comment_end
3897 tok_cur_tag.text += "-\ufffd"
3898 tok_state = tok_state_comment
3901 tok_state = tok_state_data
3902 cur -= 1 # Reconsume
3905 tok_cur_tag.text += "-#{c}"
3906 tok_state = tok_state_comment
3909 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3910 tok_state_comment_end = ->
3911 switch c = txt.charAt(cur++)
3913 tok_state = tok_state_data
3917 tok_cur_tag.text += "--\ufffd"
3918 tok_state = tok_state_comment
3921 tok_state = tok_state_comment_end_bang
3924 tok_cur_tag.text += '-'
3927 tok_state = tok_state_data
3928 cur -= 1 # Reconsume
3932 tok_cur_tag.text += "--#{c}"
3933 tok_state = tok_state_comment
3936 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer handler for the "comment end bang" state (input so far ended in
# "--!"): decides whether the comment closes or whether the "--!" was just
# part of the comment text.
3937 tok_state_comment_end_bang = ->
3938 switch c = txt.charAt(cur++)
# NOTE(review): for the case that moves on to the comment end dash state the
# spec appends only "--!" to the comment data. Appending c as well (presumably
# '-') leaves one extra character in the comment text, because the following
# states account for that dash themselves. Confirm against spec 8.2.4.51.
3940 tok_cur_tag.text += "--!#{c}"
3941 tok_state = tok_state_comment_end_dash
3943 tok_state = tok_state_data
3947 tok_cur_tag.text += "--!\ufffd"
3948 tok_state = tok_state_comment
3951 tok_state = tok_state_data
3952 cur -= 1 # Reconsume
# anything else: the "--!" and the character are ordinary comment text
3955 tok_cur_tag.text += "--!#{c}"
3956 tok_state = tok_state_comment
3959 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3960 tok_state_doctype = ->
3961 switch c = txt.charAt(cur++)
3962 when "\t", "\u000a", "\u000c", ' '
3963 tok_state = tok_state_before_doctype_name
3966 tok_state = tok_state_data
3967 el = new_doctype_token ''
3968 el.flag 'force-quirks', true
3969 cur -= 1 # Reconsume
3973 tok_state = tok_state_before_doctype_name
3974 cur -= 1 # Reconsume
3977 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
3978 tok_state_before_doctype_name = ->
3979 c = txt.charAt(cur++)
3980 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3983 tok_cur_tag = new_doctype_token c.toLowerCase()
3984 tok_state = tok_state_doctype_name
3988 tok_cur_tag = new_doctype_token "\ufffd"
3989 tok_state = tok_state_doctype_name
3993 el = new_doctype_token ''
3994 el.flag 'force-quirks', true
3995 tok_state = tok_state_data
3999 tok_state = tok_state_data
4000 el = new_doctype_token ''
4001 el.flag 'force-quirks', true
4002 cur -= 1 # Reconsume
4005 tok_cur_tag = new_doctype_token c
4006 tok_state = tok_state_doctype_name
4009 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4010 tok_state_doctype_name = ->
4011 c = txt.charAt(cur++)
4012 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4013 tok_state = tok_state_after_doctype_name
4016 tok_state = tok_state_data
4019 tok_cur_tag.name += c.toLowerCase()
4023 tok_cur_tag.name += "\ufffd"
4027 tok_state = tok_state_data
4028 tok_cur_tag.flag 'force-quirks', true
4029 cur -= 1 # Reconsume
4032 tok_cur_tag.name += c
4035 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer handler for the "after DOCTYPE name" state: after the doctype name,
# looks for the PUBLIC / SYSTEM keywords (case-insensitively) and otherwise
# marks the doctype token in tok_cur_tag as force-quirks.
4036 tok_state_after_doctype_name = ->
4037 c = txt.charAt(cur++)
4038 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4041 tok_state = tok_state_data
4045 tok_state = tok_state_data
4046 tok_cur_tag.flag 'force-quirks', true
4047 cur -= 1 # Reconsume
# c was already consumed by charAt(cur++) above, so the six-character keyword
# starts at cur - 1
4050 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4052 tok_state = tok_state_after_doctype_public_keyword
4054 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4056 tok_state = tok_state_after_doctype_system_keyword
# neither keyword matched: flag the doctype and switch to the bogus doctype state
4059 tok_cur_tag.flag 'force-quirks', true
4060 tok_state = tok_state_bogus_doctype
4063 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4064 tok_state_after_doctype_public_keyword = ->
4065 c = txt.charAt(cur++)
4066 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4067 tok_state = tok_state_before_doctype_public_identifier
4071 tok_cur_tag.public_identifier = ''
4072 tok_state = tok_state_doctype_public_identifier_double_quoted
4076 tok_cur_tag.public_identifier = ''
4077 tok_state = tok_state_doctype_public_identifier_single_quoted
4081 tok_cur_tag.flag 'force-quirks', true
4082 tok_state = tok_state_data
4086 tok_state = tok_state_data
4087 tok_cur_tag.flag 'force-quirks', true
4088 cur -= 1 # Reconsume
4092 tok_cur_tag.flag 'force-quirks', true
4093 tok_state = tok_state_bogus_doctype
4096 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4097 tok_state_before_doctype_public_identifier = ->
4098 c = txt.charAt(cur++)
4099 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4103 tok_cur_tag.public_identifier = ''
4104 tok_state = tok_state_doctype_public_identifier_double_quoted
4108 tok_cur_tag.public_identifier = ''
4109 tok_state = tok_state_doctype_public_identifier_single_quoted
4113 tok_cur_tag.flag 'force-quirks', true
4114 tok_state = tok_state_data
4118 tok_state = tok_state_data
4119 tok_cur_tag.flag 'force-quirks', true
4120 cur -= 1 # Reconsume
4124 tok_cur_tag.flag 'force-quirks', true
4125 tok_state = tok_state_bogus_doctype
4129 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4130 tok_state_doctype_public_identifier_double_quoted = ->
4131 c = txt.charAt(cur++)
4133 tok_state = tok_state_after_doctype_public_identifier
4137 tok_cur_tag.public_identifier += "\ufffd"
4141 tok_cur_tag.flag 'force-quirks', true
4142 tok_state = tok_state_data
4146 tok_state = tok_state_data
4147 tok_cur_tag.flag 'force-quirks', true
4148 cur -= 1 # Reconsume
4151 tok_cur_tag.public_identifier += c
4154 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4155 tok_state_doctype_public_identifier_single_quoted = ->
4156 c = txt.charAt(cur++)
4158 tok_state = tok_state_after_doctype_public_identifier
4162 tok_cur_tag.public_identifier += "\ufffd"
4166 tok_cur_tag.flag 'force-quirks', true
4167 tok_state = tok_state_data
4171 tok_state = tok_state_data
4172 tok_cur_tag.flag 'force-quirks', true
4173 cur -= 1 # Reconsume
4176 tok_cur_tag.public_identifier += c
4179 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4180 tok_state_after_doctype_public_identifier = ->
4181 c = txt.charAt(cur++)
4182 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4183 tok_state = tok_state_between_doctype_public_and_system_identifiers
4186 tok_state = tok_state_data
4190 tok_cur_tag.system_identifier = ''
4191 tok_state = tok_state_doctype_system_identifier_double_quoted
4195 tok_cur_tag.system_identifier = ''
4196 tok_state = tok_state_doctype_system_identifier_single_quoted
4200 tok_state = tok_state_data
4201 tok_cur_tag.flag 'force-quirks', true
4202 cur -= 1 # Reconsume
4206 tok_cur_tag.flag 'force-quirks', true
4207 tok_state = tok_state_bogus_doctype
4210 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4211 tok_state_between_doctype_public_and_system_identifiers = ->
4212 c = txt.charAt(cur++)
4213 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4216 tok_state = tok_state_data
4220 tok_cur_tag.system_identifier = ''
4221 tok_state = tok_state_doctype_system_identifier_double_quoted
4225 tok_cur_tag.system_identifier = ''
4226 tok_state = tok_state_doctype_system_identifier_single_quoted
4230 tok_state = tok_state_data
4231 tok_cur_tag.flag 'force-quirks', true
4232 cur -= 1 # Reconsume
4236 tok_cur_tag.flag 'force-quirks', true
4237 tok_state = tok_state_bogus_doctype
4240 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4241 tok_state_after_doctype_system_keyword = ->
4242 c = txt.charAt(cur++)
4243 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4244 tok_state = tok_state_before_doctype_system_identifier
4248 tok_cur_tag.system_identifier = ''
4249 tok_state = tok_state_doctype_system_identifier_double_quoted
4253 tok_cur_tag.system_identifier = ''
4254 tok_state = tok_state_doctype_system_identifier_single_quoted
4258 tok_cur_tag.flag 'force-quirks', true
4259 tok_state = tok_state_data
4263 tok_state = tok_state_data
4264 tok_cur_tag.flag 'force-quirks', true
4265 cur -= 1 # Reconsume
4269 tok_cur_tag.flag 'force-quirks', true
4270 tok_state = tok_state_bogus_doctype
4273 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4274 tok_state_before_doctype_system_identifier = ->
4275 c = txt.charAt(cur++)
4276 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4279 tok_cur_tag.system_identifier = ''
4280 tok_state = tok_state_doctype_system_identifier_double_quoted
4283 tok_cur_tag.system_identifier = ''
4284 tok_state = tok_state_doctype_system_identifier_single_quoted
4288 tok_cur_tag.flag 'force-quirks', true
4289 tok_state = tok_state_data
4293 tok_state = tok_state_data
4294 tok_cur_tag.flag 'force-quirks', true
4295 cur -= 1 # Reconsume
4299 tok_cur_tag.flag 'force-quirks', true
4300 tok_state = tok_state_bogus_doctype
4303 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4304 tok_state_doctype_system_identifier_double_quoted = ->
4305 c = txt.charAt(cur++)
4307 tok_state = tok_state_after_doctype_system_identifier
4311 tok_cur_tag.system_identifier += "\ufffd"
4315 tok_cur_tag.flag 'force-quirks', true
4316 tok_state = tok_state_data
4320 tok_state = tok_state_data
4321 tok_cur_tag.flag 'force-quirks', true
4322 cur -= 1 # Reconsume
4325 tok_cur_tag.system_identifier += c
4328 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4329 tok_state_doctype_system_identifier_single_quoted = ->
4330 c = txt.charAt(cur++)
4332 tok_state = tok_state_after_doctype_system_identifier
4336 tok_cur_tag.system_identifier += "\ufffd"
4340 tok_cur_tag.flag 'force-quirks', true
4341 tok_state = tok_state_data
4345 tok_state = tok_state_data
4346 tok_cur_tag.flag 'force-quirks', true
4347 cur -= 1 # Reconsume
4350 tok_cur_tag.system_identifier += c
4353 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4354 tok_state_after_doctype_system_identifier = ->
4355 c = txt.charAt(cur++)
4356 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4359 tok_state = tok_state_data
4363 tok_state = tok_state_data
4364 tok_cur_tag.flag 'force-quirks', true
4365 cur -= 1 # Reconsume
4369 # do _not_ tok_cur_tag.flag 'force-quirks', true
4370 tok_state = tok_state_bogus_doctype
4373 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4374 tok_state_bogus_doctype = ->
4375 c = txt.charAt(cur++)
4377 tok_state = tok_state_data
4380 tok_state = tok_state_data
4381 cur -= 1 # Reconsume
4386 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer handler for the "CDATA section" state: returns the section's
# contents as a single character token and switches back to the data state.
4387 tok_state_cdata_section = ->
4388 tok_state = tok_state_data
# despite its name, next_gt is the index of the "]]>" terminator, not of a lone '>'
4389 next_gt = txt.indexOf ']]>', cur
# take everything from cur to the end of the input
4391 val = txt.substr cur
# take only the text before the "]]>" that was found
4394 val = txt.substr cur, (next_gt - cur)
4396 return new_character_token val # fixfull split
4398 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4399 # Don't set this as a state, just call it
4400 # returns a string (NOT a text node)
4401 parse_character_reference = (allowed_char = null, in_attr = false) ->
4402 if cur >= txt.length
4404 switch c = txt.charAt(cur)
4405 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4406 # explicitly not a parse error
4409 # there has to be "one or more" alnums between & and ; to be a parse error
4412 if cur + 1 >= txt.length
4414 if txt.charAt(cur + 1).toLowerCase() is 'x'
4423 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4428 if txt.charAt(start + i) is ';'
4432 code_point = txt.substr(start, i)
4433 while code_point.charAt(0) is '0' and code_point.length > 1
4434 code_point = code_point.substr 1
4435 code_point = parseInt(code_point, base)
4436 if unicode_fixes[code_point]?
4438 return unicode_fixes[code_point]
4440 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4444 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4446 return from_code_point code_point
4450 if alnum.indexOf(txt.charAt(cur + i)) is -1
4453 # exit early, because parse_error() below needs at least one alnum
4455 if txt.charAt(cur + i) is ';'
4456 i += 1 # include ';' terminator in value
4457 decoded = decode_named_char_ref txt.substr(cur, i)
4464 # no ';' terminator (only legacy char refs)
4466 for i in [2..max] # no prefix matches, so ok to check shortest first
4467 c = legacy_char_refs[txt.substr(cur, i)]
4470 if txt.charAt(cur + i) is '='
4471 # "because some legacy user agents will
4472 # misinterpret the markup in those cases"
4475 if alnum.indexOf(txt.charAt(cur + i)) > -1
4476 # this makes attributes forgiving about url args
4478 # ok, and besides the weird exceptions for attributes...
4479 # return the matching char
4480 cur += i # consume entity chars
4481 parse_error() # because no terminating ";"
4485 return # never reached
4487 # tree constructor initialization
4488 # see comments on TYPE_TAG/etc for the structure of this data
4491 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4493 afe = [] # active formatting elements
4494 template_ins_modes = []
4495 ins_mode = ins_mode_initial
4496 original_ins_mode = ins_mode # TODO check spec
4497 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4498 flag_frameset_ok = true
4500 flag_foster_parenting = false
4501 form_element_pointer = null
4502 temporary_buffer = null
4503 pending_table_character_tokens = []
4504 head_element_pointer = null
4505 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4506 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4508 # tokenizer initialization
4509 tok_state = tok_state_data
4511 # text pre-processing
4512 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4513 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4514 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4515 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4517 if args.name is "tests16.dat #25"
4520 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4525 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4528 serialize_els = (els, shallow, show_ids) ->
4534 serialized += t.serialize shallow, show_ids
4537 module.exports.parse_html = parse_html
4538 module.exports.debug_log_reset = debug_log_reset
4539 module.exports.debug_log_each = debug_log_each
4540 module.exports.TYPE_TAG = TYPE_TAG
4541 module.exports.TYPE_TEXT = TYPE_TEXT
4542 module.exports.TYPE_COMMENT = TYPE_COMMENT
4543 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4544 module.exports.NS_HTML = NS_HTML
4545 module.exports.NS_MATHML = NS_MATHML
4546 module.exports.NS_SVG = NS_SVG