1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WTAG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
# the spec uses many different words to indicate which ends of lists/stacks
# they are talking about (and relative movement within the lists/stacks). This
# section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
# Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
92 debug_log_each = (cb) ->
93 for str in g_debug_log
98 constructor: (type, args = {}) ->
99 @type = type # one of the TYPE_* constants above
100 @name = args.name ? '' # tag name
101 @text = args.text ? '' # contents for text/comment nodes
102 @attrs = args.attrs ? {}
103 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104 @children = args.children ? []
105 @namespace = args.namespace ? NS_HTML
106 @parent = args.parent ? null
107 @token = args.token ? null
108 @flags = args.flags ? {}
112 @id = "#{++prev_node_id}"
113 acknowledge_self_closing: ->
115 @token.flag 'did_self_close'
117 @flag 'did_self_close', true
118 flag: (key, value = null) ->
123 serialize: (shallow = false, show_ids = false) -> # for unit tests
128 ret += JSON.stringify @name
143 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149 ret += c.serialize shallow, show_ids
153 ret += JSON.stringify @text
156 ret += JSON.stringify @text
158 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
161 when TYPE_AAA_BOOKMARK
162 ret += 'aaa_bookmark'
165 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
168 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) that carries only the tag name.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name: name})
# Build an end-tag token (TYPE_END_TAG) that carries only the tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name: name})
# Build a tree element node (TYPE_TAG) with the given tag name; every other
# Node field keeps the constructor's default.
new_element = (name) ->
	new Node(TYPE_TAG, {name: name})
# Build a text node (TYPE_TEXT) whose text is txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are built exactly like text nodes, so this name is simply
# a second reference to the same helper.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose text is txt.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Build a doctype token (TYPE_DOCTYPE); the doctype's name is stored in the
# node's name field.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name: name})
183 return new Node TYPE_EOF
185 return new Node TYPE_AFE_MARKER
# Build the placeholder node (TYPE_AAA_BOOKMARK) that the adoption agency
# algorithm splices into the list of active formatting elements.
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character classes kept as plain strings; membership is tested with indexOf.
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# letters of both cases followed by the decimal digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# decimal digits followed by a-f in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True only when str is exactly one character long and that character
# appears in uc_alpha (A-Z).
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1
# True only when str is exactly one character long and that character
# appears in lc_alpha (a-z).
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
200 # some SVG elements have dashes in them
201 tag_name_chars = alnum + "-"
203 # http://www.w3.org/TR/html5/infrastructure.html#space-character
204 space_chars = "\u0009\u000a\u000c\u000d\u0020"
206 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a text token (TYPE_TEXT) whose text is exactly one character
# and that character is listed in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) > -1
210 is_input_hidden_tok = (t) ->
211 return false unless t.type is TYPE_START_TAG
214 if a[1].toLowerCase() is 'hidden'
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
255 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists grouped by element kind: void, raw text, and escapable raw
# text.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
	'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script'
	'style'
]
escapable_raw_text_elements = [
	'textarea'
	'title'
]
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
280 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
299 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305 'determinant', 'diff', 'divergence', 'divide', 'domain',
306 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326 'vectorproduct', 'xor'
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
333 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
345 menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
347 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
348 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
349 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
350 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
351 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
352 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
353 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
356 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
357 'annotation-xml':NS_MATHML,
360 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
363 formatting_elements = {
364 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
365 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
369 mathml_text_integration = {
370 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# An element counts as a MathML text integration point when the
# mathml_text_integration table maps its name to the element's own namespace.
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	expected_ns is el.namespace
374 is_html_integration = (el) -> # DON'T PASS A TOKEN
375 if el.namespace is NS_MATHML
376 if el.name is 'annotation-xml'
377 if el.attrs.encoding?
378 if el.attrs.encoding.toLowerCase() is 'text/html'
380 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
383 if el.namespace is NS_SVG
384 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
389 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
392 foster_parenting_targets = {
# An element is "special" when the special_elements table maps its name to
# the namespace the element actually has.
el_is_special = (e) ->
	registered_ns = special_elements[e.name]
	registered_ns is e.namespace
# address, div and p (HTML namespace): the entries el_is_special_not_adp
# leaves out of the "special" category.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
# Same lookup as el_is_special, except that an element matching adp_els
# (name and namespace) is reported as not special.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
422 altglyphdef: 'altGlyphDef'
423 altglyphitem: 'altGlyphItem'
424 animatecolor: 'animateColor'
425 animatemotion: 'animateMotion'
426 animatetransform: 'animateTransform'
429 fecolormatrix: 'feColorMatrix'
430 fecomponenttransfer: 'feComponentTransfer'
431 fecomposite: 'feComposite'
432 feconvolvematrix: 'feConvolveMatrix'
433 fediffuselighting: 'feDiffuseLighting'
434 fedisplacementmap: 'feDisplacementMap'
435 fedistantlight: 'feDistantLight'
436 fedropshadow: 'feDropShadow'
442 fegaussianblur: 'feGaussianBlur'
445 femergenode: 'feMergeNode'
446 femorphology: 'feMorphology'
448 fepointlight: 'fePointLight'
449 fespecularlighting: 'feSpecularLighting'
450 fespotlight: 'feSpotLight'
452 feturbulence: 'feTurbulence'
453 foreignobject: 'foreignObject'
455 lineargradient: 'linearGradient'
456 radialgradient: 'radialGradient'
459 svg_attribute_fixes = {
460 attributename: 'attributeName'
461 attributetype: 'attributeType'
462 basefrequency: 'baseFrequency'
463 baseprofile: 'baseProfile'
465 clippathunits: 'clipPathUnits'
466 contentscripttype: 'contentScriptType'
467 contentstyletype: 'contentStyleType'
468 diffuseconstant: 'diffuseConstant'
470 externalresourcesrequired: 'externalResourcesRequired'
471 filterres: 'filterRes'
472 filterunits: 'filterUnits'
474 gradienttransform: 'gradientTransform'
475 gradientunits: 'gradientUnits'
476 kernelmatrix: 'kernelMatrix'
477 kernelunitlength: 'kernelUnitLength'
478 keypoints: 'keyPoints'
479 keysplines: 'keySplines'
481 lengthadjust: 'lengthAdjust'
482 limitingconeangle: 'limitingConeAngle'
483 markerheight: 'markerHeight'
484 markerunits: 'markerUnits'
485 markerwidth: 'markerWidth'
486 maskcontentunits: 'maskContentUnits'
487 maskunits: 'maskUnits'
488 numoctaves: 'numOctaves'
489 pathlength: 'pathLength'
490 patterncontentunits: 'patternContentUnits'
491 patterntransform: 'patternTransform'
492 patternunits: 'patternUnits'
493 pointsatx: 'pointsAtX'
494 pointsaty: 'pointsAtY'
495 pointsatz: 'pointsAtZ'
496 preservealpha: 'preserveAlpha'
497 preserveaspectratio: 'preserveAspectRatio'
498 primitiveunits: 'primitiveUnits'
501 repeatcount: 'repeatCount'
502 repeatdur: 'repeatDur'
503 requiredextensions: 'requiredExtensions'
504 requiredfeatures: 'requiredFeatures'
505 specularconstant: 'specularConstant'
506 specularexponent: 'specularExponent'
507 spreadmethod: 'spreadMethod'
508 startoffset: 'startOffset'
509 stddeviation: 'stdDeviation'
510 stitchtiles: 'stitchTiles'
511 surfacescale: 'surfaceScale'
512 systemlanguage: 'systemLanguage'
513 tablevalues: 'tableValues'
516 textlength: 'textLength'
518 viewtarget: 'viewTarget'
519 xchannelselector: 'xChannelSelector'
520 ychannelselector: 'yChannelSelector'
521 zoomandpan: 'zoomAndPan'
523 foreign_attr_fixes = {
524 'xlink:actuate': 'xlink actuate'
525 'xlink:arcrole': 'xlink arcrole'
526 'xlink:href': 'xlink href'
527 'xlink:role': 'xlink role'
528 'xlink:show': 'xlink show'
529 'xlink:title': 'xlink title'
530 'xlink:type': 'xlink type'
531 'xml:base': 'xml base'
532 'xml:lang': 'xml lang'
533 'xml:space': 'xml space'
535 'xmlns:xlink': 'xmlns xlink'
537 adjust_mathml_attributes = (t) ->
539 if a[0] is 'definitionurl'
540 a[0] = 'definitionURL'
542 adjust_svg_attributes = (t) ->
544 if svg_attribute_fixes[a[0]]?
545 a[0] = svg_attribute_fixes[a[0]]
547 adjust_foreign_attributes = (t) ->
550 if foreign_attr_fixes[a[0]]?
551 a[0] = foreign_attr_fixes[a[0]]
554 # decode_named_char_ref()
556 # The list of named character references is _huge_ so ask the browser to decode
557 # for us instead of wasting bandwidth/space on including the table here.
559 # Pass without the "&" but with the ";" examples:
# for "&amp;" pass "amp;"
561 # for "′" pass "x2032;"
564 textarea: document.createElement('textarea')
566 # TODO test this in IE8
567 decode_named_char_ref = (txt) ->
569 decoded = g_dncr.cache[txt]
570 return decoded if decoded?
571 g_dncr.textarea.innerHTML = txt
572 decoded = g_dncr.textarea.value
573 return null if decoded is txt
574 return g_dncr.cache[txt] = decoded
576 parse_html = (args) ->
578 cur = null # index of next char in txt to be parsed
579 # declare doc and tokenizer variables so they're in scope below
581 open_els = null # stack of open elements
582 afe = null # active formatting elements
583 template_ins_modes = null
585 original_ins_mode = null
587 tok_cur_tag = null # partially parsed tag
588 flag_scripting = null
589 flag_frameset_ok = null
591 flag_foster_parenting = null
592 form_element_pointer = null
593 temporary_buffer = null
594 pending_table_character_tokens = null
595 head_element_pointer = null
596 flag_fragment_parsing = null
597 context_element = null
606 console.log "Parse error at character #{cur} of #{txt.length}"
608 afe_push = (new_el) ->
611 if el.name is new_el.name and el.namespace is new_el.namespace
613 continue unless new_el.attrs[k] is v
614 for k, v of new_el.attrs
615 continue unless el.attrs[k] is v
622 afe.unshift new_afe_marker()
# the functions below implement the Tree Construction algorithm
625 # http://www.w3.org/TR/html5/syntax.html#tree-construction
627 # But first... the helpers
628 template_tag_is_open = ->
630 if t.name is 'template' and t.namespace is NS_HTML
633 is_in_scope_x = (tag_name, scope, namespace) ->
635 if t.name is tag_name and (namespace is null or namespace is t.namespace)
637 if scope[t.name] is t.namespace
640 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
642 if t.name is tag_name and (namespace is null or namespace is t.namespace)
644 if scope[t.name] is t.namespace
646 if scope2[t.name] is t.namespace
650 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
651 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
652 template: NS_HTML, mi: NS_MATHML,
654 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
655 'annotation-xml': NS_MATHML,
657 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-boundary tables (tag name -> namespace) consulted by the
# button-, list-item- and table-scope checks.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check for tag_name using only the standard_scopers boundary table; a null namespace matches any namespace.
is_in_scope = (tag_name, namespace = null) -> is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check for tag_name using standard_scopers plus button_scopers as boundaries.
is_in_button_scope = (tag_name, namespace = null) -> is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check for tag_name using table_scopers as the only boundary table.
is_in_table_scope = (tag_name, namespace = null) -> is_in_scope_x(tag_name, table_scopers, namespace)
# Also known as is_in_list_item_scope: scope check for tag_name using standard_scopers plus li_scopers as boundaries.
is_in_li_scope = (tag_name, namespace = null) -> is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
671 is_in_select_scope = (tag_name, namespace = null) ->
673 if t.name is tag_name and (namespace is null or namespace is t.namespace)
675 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
678 # this checks for a particular element, not by name
679 # this requires a namespace match
680 el_is_in_scope = (needle) ->
684 if standard_scopers[el.name] is el.namespace
688 clear_to_table_stopers = {
693 clear_stack_to_table_context = ->
695 if clear_to_table_stopers[open_els[0].name]?
699 clear_to_table_body_stopers = {
706 clear_stack_to_table_body_context = ->
708 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
712 clear_to_table_row_stopers = {
717 clear_stack_to_table_row_context = ->
719 if clear_to_table_row_stopers[open_els[0].name]?
723 clear_afe_to_marker = ->
725 return unless afe.length > 0 # this happens in fragment case, ?spec error
727 if el.type is TYPE_AFE_MARKER
732 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
734 # 1. Let last be false.
736 # 2. Let node be the last node in the stack of open elements.
738 node = open_els[node_i]
739 # 3. Loop: If node is the first node in the stack of open elements,
740 # then set last to true, and, if the parser was originally created as
741 # part of the HTML fragment parsing algorithm (fragment case) set node
742 # to the context element.
744 if node_i is open_els.length - 1
746 # fixfull (fragment case)
748 # 4. If node is a select element, run these substeps:
749 if node.name is 'select' and node.namespace is NS_HTML
750 # 1. If last is true, jump to the step below labeled done.
752 # 2. Let ancestor be node.
755 # 3. Loop: If ancestor is the first node in the stack of
756 # open elements, jump to the step below labeled done.
758 if ancestor_i is open_els.length - 1
760 # 4. Let ancestor be the node before ancestor in the stack
763 ancestor = open_els[ancestor_i]
764 # 5. If ancestor is a template node, jump to the step below
766 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
768 # 6. If ancestor is a table node, switch the insertion mode
769 # to "in select in table" and abort these steps.
770 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
771 ins_mode = ins_mode_in_select_in_table
773 # 7. Jump back to the step labeled loop.
774 # 8. Done: Switch the insertion mode to "in select" and abort
776 ins_mode = ins_mode_in_select
778 # 5. If node is a td or th element and last is false, then switch
779 # the insertion mode to "in cell" and abort these steps.
780 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
781 ins_mode = ins_mode_in_cell
783 # 6. If node is a tr element, then switch the insertion mode to "in
784 # row" and abort these steps.
785 if node.name is 'tr' and node.namespace is NS_HTML
786 ins_mode = ins_mode_in_row
788 # 7. If node is a tbody, thead, or tfoot element, then switch the
789 # insertion mode to "in table body" and abort these steps.
790 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
791 ins_mode = ins_mode_in_table_body
793 # 8. If node is a caption element, then switch the insertion mode
794 # to "in caption" and abort these steps.
795 if node.name is 'caption' and node.namespace is NS_HTML
796 ins_mode = ins_mode_in_caption
798 # 9. If node is a colgroup element, then switch the insertion mode
799 # to "in column group" and abort these steps.
800 if node.name is 'colgroup' and node.namespace is NS_HTML
801 ins_mode = ins_mode_in_column_group
803 # 10. If node is a table element, then switch the insertion mode to
804 # "in table" and abort these steps.
805 if node.name is 'table' and node.namespace is NS_HTML
806 ins_mode = ins_mode_in_table
808 # 11. If node is a template element, then switch the insertion mode
809 # to the current template insertion mode and abort these steps.
810 if node.name is 'template' and node.namespace is NS_HTML
811 ins_mode = template_ins_modes[0]
813 # 12. If node is a head element and last is true, then switch the
814 # insertion mode to "in body" ("in body"! not "in head"!) and abort
815 # these steps. (fragment case)
816 if node.name is 'head' and node.namespace is NS_HTML and last
817 ins_mode = ins_mode_in_body
819 # 13. If node is a head element and last is false, then switch the
820 # insertion mode to "in head" and abort these steps.
821 if node.name is 'head' and node.namespace is NS_HTML and last is false
822 ins_mode = ins_mode_in_head
824 # 14. If node is a body element, then switch the insertion mode to
825 # "in body" and abort these steps.
826 if node.name is 'body' and node.namespace is NS_HTML
827 ins_mode = ins_mode_in_body
829 # 15. If node is a frameset element, then switch the insertion mode
830 # to "in frameset" and abort these steps. (fragment case)
831 if node.name is 'frameset' and node.namespace is NS_HTML
832 ins_mode = ins_mode_in_frameset
834 # 16. If node is an html element, run these substeps:
835 if node.name is 'html' and node.namespace is NS_HTML
836 # 1. If the head element pointer is null, switch the insertion
837 # mode to "before head" and abort these steps. (fragment case)
838 if head_element_pointer is null
839 ins_mode = ins_mode_before_head
841 # 2. Otherwise, the head element pointer is not null,
842 # switch the insertion mode to "after head" and abort these
844 ins_mode = ins_mode_after_head
846 # 17. If last is true, then switch the insertion mode to "in body"
847 # and abort these steps. (fragment case)
849 ins_mode = ins_mode_in_body
851 # 18. Let node now be the node before node in the stack of open
854 node = open_els[node_i]
855 # 19. Return to the step labeled loop.
859 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
860 adjusted_current_node = ->
861 if open_els.length is 1 and flag_fragment_parsing
862 return context_element
865 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
866 # this implementation is structured (mostly) as described at the link above.
867 # capitalized comments are the "labels" described at the link above.
869 return if afe.length is 0
870 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
875 if i is afe.length - 1
878 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
883 el = insert_html_element afe[i].token
888 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
889 # adoption agency algorithm
891 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
892 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
893 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
894 adoption_agency = (subject) ->
895 debug_log "adoption_agency()"
896 debug_log "tree: #{serialize_els doc.children, false, true}"
897 debug_log "open_els: #{serialize_els open_els, true, true}"
898 debug_log "afe: #{serialize_els afe, true, true}"
899 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
902 # remove it from the list of active formatting elements (if found)
907 debug_log "aaa: starting off with subject on top of stack, exiting"
914 # 5. Let formatting element be the last element in the list of
915 # active formatting elements that: is between the end of the list
916 # and the last scope marker in the list, if any, or the start of
917 # the list otherwise, and has the tag name subject.
919 for t, fe_of_afe in afe
920 if t.type is TYPE_AFE_MARKER
925 # If there is no such element, then abort these steps and instead
926 # act as described in the "any other end tag" entry above.
928 debug_log "aaa: fe not found in afe"
929 in_body_any_other_end_tag subject
931 # 6. If formatting element is not in the stack of open elements,
932 # then this is a parse error; remove the element from the list, and
935 for t, fe_of_open_els in open_els
940 debug_log "aaa: fe not found in open_els"
942 # "remove it from the list" must mean afe, since it's not in open_els
943 afe.splice fe_of_afe, 1
945 # 7. If formatting element is in the stack of open elements, but
946 # the element is not in scope, then this is a parse error; abort
948 unless el_is_in_scope fe
949 debug_log "aaa: fe not in scope"
952 # 8. If formatting element is not the current node, this is a parse
953 # error. (But do not abort these steps.)
954 unless open_els[0] is fe
957 # 9. Let furthest block be the topmost node in the stack of open
958 # elements that is lower in the stack than formatting element, and
959 # is an element in the special category. There might not be one.
961 fb_of_open_els = null
968 # and continue, to see if there's one that's more "topmost"
969 # 10. If there is no furthest block, then the UA must first pop all
970 # the nodes from the bottom of the stack of open elements, from the
971 # current node up to and including formatting element, then remove
972 # formatting element from the list of active formatting elements,
973 # and finally abort these steps.
975 debug_log "aaa: no fb"
979 afe.splice fe_of_afe, 1
981 # 11. Let common ancestor be the element immediately above
982 # formatting element in the stack of open elements.
983 ca = open_els[fe_of_open_els + 1] # common ancestor
985 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
986 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
987 bookmark = new_aaa_bookmark()
990 afe.splice i, 0, bookmark
992 node = last_node = fb
996 # 3. Let node be the element immediately above node in the
997 # stack of open elements, or if node is no longer in the stack
998 # of open elements (e.g. because it got removed by this
999 # algorithm), the element that was immediately above node in
1000 # the stack of open elements before node was removed.
1002 for t, i in open_els
1004 node_next = open_els[i + 1]
1006 node = node_next ? node_above
1007 debug_log "inner loop #{inner}"
1008 debug_log "tree: #{serialize_els doc.children, false, true}"
1009 debug_log "open_els: #{serialize_els open_els, true, true}"
1010 debug_log "afe: #{serialize_els afe, true, true}"
1011 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1012 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1013 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1014 debug_log "node: #{node.serialize true, true}"
1015 # TODO make sure node_above gets re-set if/when node is removed from open_els
1017 # 4. If node is formatting element, then go to the next step in
1018 # the overall algorithm.
1021 debug_log "the meat"
1022 # 5. If inner loop counter is greater than three and node is in
1023 # the list of active formatting elements, then remove node from
1024 # the list of active formatting elements.
1030 debug_log "max out inner"
1035 # 6. If node is not in the list of active formatting elements,
1036 # then remove node from the stack of open elements and then go
1037 # back to the step labeled inner loop.
1039 debug_log "not in afe"
1040 for t, i in open_els
1042 node_above = open_els[i + 1]
1043 open_els.splice i, 1
1046 debug_log "the bones"
1047 # 7. create an element for the token for which the element node
1048 # was created, in the HTML namespace, with common ancestor as
1049 # the intended parent; replace the entry for node in the list
1050 # of active formatting elements with an entry for the new
1051 # element, replace the entry for node in the stack of open
1052 # elements with an entry for the new element, and let node be
1054 new_node = token_to_element node.token, NS_HTML, ca
1058 debug_log "replaced in afe"
1060 for t, i in open_els
1062 node_above = open_els[i + 1]
1063 open_els[i] = new_node
1064 debug_log "replaced in open_els"
1067 # 8. If last node is furthest block, then move the
1068 # aforementioned bookmark to be immediately after the new node
1069 # in the list of active formatting elements.
1074 debug_log "removed bookmark"
1078 # "after" means lower
1079 afe.splice i, 0, bookmark # "after as <-
1080 debug_log "placed bookmark after node"
1081 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1083 # 9. Insert last node into node, first removing it from its
1084 # previous parent node if any.
1085 if last_node.parent?
1086 debug_log "last_node has parent"
1087 for c, i in last_node.parent.children
1089 debug_log "removing last_node from parent"
1090 last_node.parent.children.splice i, 1
1092 node.children.push last_node
1093 last_node.parent = node
1094 # 10. Let last node be node.
1097 # 11. Return to the step labeled inner loop.
1098 # 14. Insert whatever last node ended up being in the previous step
1099 # at the appropriate place for inserting a node, but using common
1100 # ancestor as the override target.
1102 # In the case where fe is immediately followed by fb:
1103 # * inner loop exits out early (node==fe)
1105 # * last_node is still in the tree (not a duplicate)
1106 if last_node.parent?
1107 debug_log "FEFIRST? last_node has parent"
1108 for c, i in last_node.parent.children
1110 debug_log "removing last_node from parent"
1111 last_node.parent.children.splice i, 1
1114 debug_log "after aaa inner loop"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1119 debug_log "tree: #{serialize_els doc.children, false, true}"
1124 # can't use standard insert token thing, because it's already in
1125 # open_els and must stay at its current position in open_els
1126 dest = adjusted_insertion_location ca
1127 dest[0].children.splice dest[1], 0, last_node
1128 last_node.parent = dest[0]
1131 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1132 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1133 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1134 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1135 debug_log "tree: #{serialize_els doc.children, false, true}"
1137 # 15. Create an element for the token for which formatting element
1138 # was created, in the HTML namespace, with furthest block as the
1140 new_element = token_to_element fe.token, NS_HTML, fb
1141 # 16. Take all of the child nodes of furthest block and append them
1142 # to the element created in the last step.
1143 while fb.children.length
1144 t = fb.children.shift()
1145 t.parent = new_element
1146 new_element.children.push t
1147 # 17. Append that new element to furthest block.
1148 new_element.parent = fb
1149 fb.children.push new_element
1150 # 18. Remove formatting element from the list of active formatting
1151 # elements, and insert the new element into the list of active
1152 # formatting elements at the position of the aforementioned
1160 afe[i] = new_element
1162 # 19. Remove formatting element from the stack of open elements,
1163 # and insert the new element into the stack of open elements
1164 # immediately below the position of furthest block in that stack.
1165 for t, i in open_els
1167 open_els.splice i, 1
1169 for t, i in open_els
1171 open_els.splice i, 0, new_element
1173 # 20. Jump back to the step labeled outer loop.
1174 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1175 debug_log "tree: #{serialize_els doc.children, false, true}"
1176 debug_log "open_els: #{serialize_els open_els, true, true}"
1177 debug_log "afe: #{serialize_els afe, true, true}"
1178 debug_log "AAA DONE"
1180 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Close a p element: generate implied end tags (treating p itself as the
# exception), then pop entries off the front of open_els (index 0 is the
# current node) while looking for a p element in the HTML namespace.
1181 close_p_element = ->
1182 generate_implied_end_tags 'p' # arg is exception
# spec: a current node that is not a p element at this point is a parse error
1183 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
# the length guard means the last remaining entry of open_els is never popped
1185 while open_els.length > 1 # just in case
1186 el = open_els.shift()
# spec: popping stops once a p element has been popped
1187 if el.name is 'p' and el.namespace is NS_HTML
# Convenience guard used by the "in body" rules: only acts when
# is_in_button_scope reports a p element (HTML namespace) in button scope.
# NOTE(review): presumably the guarded action is close_p_element() - confirm.
1189 close_p_if_in_button_scope = ->
1190 if is_in_button_scope 'p', NS_HTML
1193 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1194 # aka insert_a_character = (t) ->
# t: the text token to insert.
# The token is placed at the adjusted insertion location; the final visible
# statement splices t into the destination node's children at that index.
1195 insert_character = (t) ->
# dest is a pair: [parent node, index within parent.children]
1196 dest = adjusted_insertion_location()
1197 # fixfull check for Document node
# prev is the sibling immediately before the insertion point; the spec says to
# extend an adjacent text node rather than create a new one
1199 prev = dest[0].children[dest[1] - 1]
1200 if prev.type is TYPE_TEXT
# no adjacent text node to extend: insert t as a new child
1203 dest[0].children.splice dest[1], 0, t
1206 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for token t. The tests below inspect the
# adjusted current node (acn) and the token type, mirroring the conditions the
# spec lists for deciding between the current insertion mode and the rules for
# foreign content. The last visible statement hands t to in_foreign_content.
1207 process_token = (t) ->
1208 acn = adjusted_current_node()
# case: adjusted current node is an element in the HTML namespace
1212 if acn.namespace is NS_HTML
# case: MathML text integration point; mglyph/malignmark start tags are tested
# separately from text tokens
1215 if is_mathml_text_integration_point(acn)
1216 if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1219 if t.type is TYPE_TEXT
# case: MathML annotation-xml element receiving an svg start tag
1222 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
# case: HTML integration point receiving a start tag or text
1225 if is_html_integration acn
1226 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
# case: end of file
1229 if t.type is TYPE_EOF
1232 in_foreign_content t
1236 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1237 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Compute the "appropriate place for inserting a node".
# override_target: optional node to use instead of the current node
#   (open_els[0]) as the starting target.
# Returns a two-element array [target, target_i]: the node to insert into and
# the index within target.children at which to insert.
1238 adjusted_insertion_location = (override_target = null) ->
1239 # 1. If there was an override target specified, then let target be the
1242 target = override_target
1243 else # Otherwise, let target be the current node.
1244 target = open_els[0]
1245 # 2. Determine the adjusted insertion location using the first matching
1246 # steps from the following list:
1248 # If foster parenting is enabled and target is a table, tbody, tfoot,
1249 # thead, or tr element: (Foster parenting happens when content is
1250 # misnested in tables.)
# foster_parenting_targets is looked up by tag name and compared with the
# target's namespace
1251 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1252 loop # once. this is here so we can ``break`` to "abort these substeps"
1253 # 1. Let last template be the last template element in the
1254 # stack of open elements, if any.
1255 last_template = null
1256 last_template_i = null
1257 for el, i in open_els
1258 if el.name is 'template' and el.namespace is NS_HTML
1262 # 2. Let last table be the last table element in the stack of
1263 # open elements, if any.
1266 for el, i in open_els
1267 if el.name is 'table' and el.namespace is NS_HTML
1271 # 3. If there is a last template and either there is no last
1272 # table, or there is one, but last template is lower (more
1273 # recently added) than last table in the stack of open
1274 # elements, then: let adjusted insertion location be inside
1275 # last template's template contents, after its last child (if
1276 # any), and abort these substeps.
# a smaller index in open_els means more recently added (index 0 is current)
1277 if last_template and (last_table is null or last_template_i < last_table_i)
1278 target = last_template # fixfull should be its contents
1279 target_i = target.children.length
1281 # 4. If there is no last table, then let adjusted insertion
1282 # location be inside the first element in the stack of open
1283 # elements (the html element), after its last child (if any),
1284 # and abort these substeps. (fragment case)
1285 if last_table is null
# the "first" element of the stack lives at the highest index of open_els
1287 target = open_els[open_els.length - 1]
1288 target_i = target.children.length
1290 # 5. If last table has a parent element, then let adjusted
1291 # insertion location be inside last table's parent element,
1292 # immediately before last table, and abort these substeps.
1293 if last_table.parent?
1294 for c, i in last_table.parent.children
1296 target = last_table.parent
1300 # 6. Let previous element be the element immediately above last
1301 # table in the stack of open elements.
1303 # huh? how could it not have a parent?
# "above" in the stack means the next higher index in open_els
1304 previous_element = open_els[last_table_i + 1]
1305 # 7. Let adjusted insertion location be inside previous
1306 # element, after its last child (if any).
1307 target = previous_element
1308 target_i = target.children.length
1309 # Note: These steps are involved in part because it's possible
1310 # for elements, the table element in this case in particular,
1311 # to have been moved by a script around in the DOM, or indeed
1312 # removed from the DOM entirely, after the element was inserted
1314 break # don't really loop
1316 # Otherwise Let adjusted insertion location be inside target, after
1317 # its last child (if any).
1318 target_i = target.children.length
1320 # 3. If the adjusted insertion location is inside a template element,
1321 # let it instead be inside the template element's template contents,
1322 # after its last child (if any).
1323 # fixfull (template)
1325 # 4. Return the adjusted insertion location.
1326 return [target, target_i]
1328 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1329 # aka create_an_element_for_token
# Build an element Node from a tag token.
# t: the tag token; its name becomes the element name and the token itself is
#   stored on the node (token: t).
# namespace: namespace recorded on the new element.
# intended_parent: the node the element is meant to be inserted into.
# Callers in this file use the result as the newly created element.
1330 token_to_element = (t, namespace, intended_parent) ->
1331 # convert attributes into a hash
# each attribute a is a [name, value] pair
1334 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1335 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1337 # TODO 2. If the newly created element has an xmlns attribute in the
1338 # XMLNS namespace whose value is not exactly the same as the element's
1339 # namespace, that is a parse error. Similarly, if the newly created
1340 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1341 # value is not the XLink Namespace, that is a parse error.
1343 # fixfull: the spec says stuff about form pointers and ownerDocument
1347 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Create an element for `token` in `namespace` and splice it into the tree at
# the adjusted insertion location.
# Callers in this file use the result as the inserted element.
1348 insert_foreign_element = (token, namespace) ->
# ail is a pair: [node to insert into, index within its children]
1349 ail = adjusted_insertion_location()
# NOTE(review): ail_el / ail_i are presumably ail[0] / ail[1] - confirm.
1352 el = token_to_element token, namespace, ail_el
1353 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1355 ail_el.children.splice ail_i, 0, el
1358 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Shorthand for insert_foreign_element with the HTML namespace.
# token: the start tag token to create the element from.
1359 insert_html_element = (token) ->
1360 insert_foreign_element token, NS_HTML
1362 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1363 # position should be [node, index_within_children]
# t: the comment token; it is spliced into position[0].children as-is.
# position: optional insertion point; when omitted (null) the adjusted
#   insertion location is used.
1364 insert_comment = (t, position = null) ->
1365 position ?= adjusted_insertion_location()
1366 position[0].children.splice position[1], 0, t
1369 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: insert an HTML element for start tag t,
# switch the tokenizer to the RAWTEXT state, and switch to the "text"
# insertion mode.
1370 parse_generic_raw_text = (t) ->
1371 insert_html_element t
1372 tok_state = tok_state_rawtext
# remember the current insertion mode before overwriting it on the next line
1373 original_ins_mode = ins_mode
1374 ins_mode = ins_mode_text
# Generic RCDATA element parsing: same steps as parse_generic_raw_text, except
# the tokenizer is switched to the RCDATA state.
1375 parse_generic_rcdata_text = (t) ->
1376 insert_html_element t
1377 tok_state = tok_state_rcdata
# remember the current insertion mode before overwriting it on the next line
1378 original_ins_mode = ins_mode
1379 ins_mode = ins_mode_text
1381 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1382 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# except: optional tag name that must not be closed implicitly (null = none).
# The loop keeps going while the current node (open_els[0]) is listed in
# end_tag_implied (looked up by name, compared with the node's namespace) and
# is not the excepted name; per the spec each such node is popped.
1383 generate_implied_end_tags = (except = null) ->
1384 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1387 # 8.2.5.4 The rules for parsing tokens in HTML content
1388 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1390 # 8.2.5.4.1 The "initial" insertion mode
1391 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handle token t in the "initial" insertion mode. Comment and DOCTYPE tokens
# are tested explicitly; both visible assignments below switch ins_mode to
# ins_mode_before_html.
1392 ins_mode_initial = (t) ->
1395 if t.type is TYPE_COMMENT
1399 if t.type is TYPE_DOCTYPE
1400 # FIXME check identifiers, set quirks, etc
1403 ins_mode = ins_mode_before_html
1406 #fixfull (iframe, quirks)
1407 ins_mode = ins_mode_before_html
1411 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handle token t in the "before html" insertion mode. Either the html start
# tag from the input or a synthesized one (new_open_tag 'html') is turned into
# the root element and appended to doc.children, after which ins_mode becomes
# ins_mode_before_head.
1412 ins_mode_before_html = (t) ->
1413 if t.type is TYPE_DOCTYPE
1416 if t.type is TYPE_COMMENT
1421 if t.type is TYPE_START_TAG and t.name is 'html'
# the document node (doc) is the intended parent of the root element
1422 el = token_to_element t, NS_HTML, doc
1423 doc.children.push el
# unshift makes the new element the current node (index 0 of open_els)
1424 open_els.unshift(el)
1425 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1426 ins_mode = ins_mode_before_head
1428 if t.type is TYPE_END_TAG
1429 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1430 # fall through to "anything else"
# "anything else": create the html element from a synthesized start tag
1435 html_tok = new_open_tag 'html'
1436 el = token_to_element html_tok, NS_HTML, doc
1437 doc.children.push el
1439 # ?fixfull browsing context
1440 ins_mode = ins_mode_before_head
1444 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handle token t in the "before head" insertion mode. A head element is
# inserted either from the input's head start tag or from a synthesized one
# (new_open_tag 'head'); in both cases it is stored in head_element_pointer
# and ins_mode becomes ins_mode_in_head.
1445 ins_mode_before_head = (t) ->
1448 if t.type is TYPE_COMMENT
1451 if t.type is TYPE_DOCTYPE
1454 if t.type is TYPE_START_TAG and t.name is 'html'
1457 if t.type is TYPE_START_TAG and t.name is 'head'
1458 el = insert_html_element t
1459 head_element_pointer = el
1460 ins_mode = ins_mode_in_head
1462 if t.type is TYPE_END_TAG
1463 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1464 # fall through to Anything else below
# "anything else": insert a head element from a synthesized start tag
1469 head_tok = new_open_tag 'head'
1470 el = insert_html_element head_tok
1471 head_element_pointer = el
1472 ins_mode = ins_mode_in_head
1475 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode for token t: pop the
# current node off open_els and switch to the "after head" insertion mode.
1476 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1477 open_els.shift() # spec says this will be a 'head' node
1478 ins_mode = ins_mode_after_head
# Handle token t in the "in head" insertion mode. Each `if` below tests for
# one class of token from the spec section: whitespace, comment, DOCTYPE, and
# the various start/end tags. Anything unmatched goes to
# ins_mode_in_head_else.
1480 ins_mode_in_head = (t) ->
1481 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1484 if t.type is TYPE_COMMENT
1487 if t.type is TYPE_DOCTYPE
1490 if t.type is TYPE_START_TAG and t.name is 'html'
# void head elements: inserted, then the self-closing flag is acknowledged
1493 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1494 el = insert_html_element t
1496 t.acknowledge_self_closing()
1498 if t.type is TYPE_START_TAG and t.name is 'meta'
1499 el = insert_html_element t
1501 t.acknowledge_self_closing()
1502 # fixfull encoding stuff
1504 if t.type is TYPE_START_TAG and t.name is 'title'
1505 parse_generic_rcdata_text t
# noscript is parsed as raw text only when scripting is enabled
1507 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1508 parse_generic_raw_text t
1510 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1511 insert_html_element t
1512 ins_mode = ins_mode_in_head_noscript
1514 if t.type is TYPE_START_TAG and t.name is 'script'
# ail is a pair: [node to insert into, index within its children]
1515 ail = adjusted_insertion_location()
# the intended parent is the node of the insertion location, not the whole pair
1516 el = token_to_element t, NS_HTML, ail[0]
1517 el.flag 'parser-inserted', true
1518 # fixfull fragment case
1519 ail[0].children.splice ail[1], 0, el
1521 tok_state = tok_state_script_data
1522 original_ins_mode = ins_mode # make sure orig... is defined
1523 ins_mode = ins_mode_text
1525 if t.type is TYPE_END_TAG and t.name is 'head'
1526 open_els.shift() # will be a head element... spec says so
1527 ins_mode = ins_mode_after_head
1529 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1530 ins_mode_in_head_else t
1532 if t.type is TYPE_START_TAG and t.name is 'template'
1533 insert_html_element t
1535 flag_frameset_ok = false
1536 ins_mode = ins_mode_in_template
# push onto the stack of template insertion modes (index 0 is the current one)
1537 template_ins_modes.unshift ins_mode_in_template
1539 if t.type is TYPE_END_TAG and t.name is 'template'
1540 if template_tag_is_open()
# explicit parens are required: a bare identifier is only a reference in
# CoffeeScript and would not invoke the function
1541 generate_implied_end_tags()
1542 if open_els[0].name isnt 'template'
1545 el = open_els.shift()
1546 if el.name is 'template' and el.namespace is NS_HTML
1548 clear_afe_to_marker()
1549 template_ins_modes.shift()
1554 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1557 ins_mode_in_head_else t
1559 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" insertion mode for token t;
# the visible statement switches back to the "in head" insertion mode.
1560 ins_mode_in_head_noscript_else = (t) ->
1563 ins_mode = ins_mode_in_head
# Handle token t in the "in head noscript" insertion mode. The tests follow the
# order of the spec section; unmatched tokens are handed to
# ins_mode_in_head_noscript_else.
1565 ins_mode_in_head_noscript = (t) ->
1566 if t.type is TYPE_DOCTYPE
1569 if t.type is TYPE_START_TAG and t.name is 'html'
# </noscript>: return to the "in head" insertion mode
1572 if t.type is TYPE_END_TAG and t.name is 'noscript'
1574 ins_mode = ins_mode_in_head
# tokens the spec processes using the rules for the "in head" insertion mode
1576 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1579 if t.type is TYPE_END_TAG and t.name is 'br'
1580 ins_mode_in_head_noscript_else t
1582 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1586 ins_mode_in_head_noscript_else t
1591 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode for token t:
# insert a body element from a synthesized start tag and switch to the
# "in body" insertion mode.
1592 ins_mode_after_head_else = (t) ->
1593 body_tok = new_open_tag 'body'
1594 insert_html_element body_tok
1595 ins_mode = ins_mode_in_body
# Handle token t in the "after head" insertion mode. body and frameset start
# tags insert their element and switch insertion mode; head-only start tags
# temporarily put the head element back on open_els; unmatched tokens are
# handed to ins_mode_after_head_else.
1598 ins_mode_after_head = (t) ->
1602 if t.type is TYPE_COMMENT
1605 if t.type is TYPE_DOCTYPE
1608 if t.type is TYPE_START_TAG and t.name is 'html'
1611 if t.type is TYPE_START_TAG and t.name is 'body'
1612 insert_html_element t
1613 flag_frameset_ok = false
1614 ins_mode = ins_mode_in_body
1616 if t.type is TYPE_START_TAG and t.name is 'frameset'
1617 insert_html_element t
1618 ins_mode = ins_mode_in_frameset
1620 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# spec: push the head element back onto the stack of open elements so the
# token can be processed with it as current node
1622 open_els.unshift head_element_pointer
# spec: then remove the head element again; it might not be the current node
# any more. `in` iterates (element, index); `of` would yield (key, value) and
# the comparison below could never match.
1624 for el, i in open_els
1625 if el is head_element_pointer
1626 open_els.splice i, 1
1628 console.log "warning: 23904 couldn't find head element in open_els"
1630 if t.type is TYPE_END_TAG and t.name is 'template'
1633 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1634 ins_mode_after_head_else t
1636 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1640 ins_mode_after_head_else t
1642 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" insertion mode.
# name: tag name of the end tag being processed.
# Walks open_els from the current node (index 0) upward looking for an HTML
# element with that name; elements listed in special_elements are tested on
# the way.
1643 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1644 for el, i in open_els
1645 if el.name is name and el.namespace is NS_HTML
1646 generate_implied_end_tags name # arg is exception
# spec: parse error if node is not the current node. Compare against
# open_els[0] rather than the loop index, because generating implied end tags
# may have popped entries in front of el and left i stale.
1647 parse_error() unless el is open_els[0]
1652 if special_elements[el.name] is el.namespace
1656 ins_mode_in_body = (t) ->
1657 if t.type is TYPE_TEXT and t.text is "\u0000"
1664 if t.type is TYPE_TEXT
1667 flag_frameset_ok = false
1669 if t.type is TYPE_COMMENT
1672 if t.type is TYPE_DOCTYPE
1675 if t.type is TYPE_START_TAG and t.name is 'html'
1677 return if template_tag_is_open()
1678 root_attrs = open_els[open_els.length - 1].attrs
1680 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1683 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1686 if t.type is TYPE_START_TAG and t.name is 'body'
1688 return if open_els.length < 2
1689 second = open_els[open_els.length - 2]
1690 return unless second.namespace is NS_HTML
1691 return unless second.name is 'body'
1692 return if template_tag_is_open()
1693 flag_frameset_ok = false
1695 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1697 if t.type is TYPE_START_TAG and t.name is 'frameset'
1699 return if open_els.length < 2
1700 second_i = open_els.length - 2
1701 second = open_els[second_i]
1702 return unless second.namespace is NS_HTML
1703 return unless second.name is 'body'
1704 if flag_frameset_ok is false
1707 for el, i in second.parent.children
1709 second.parent.children.splice i, 1
1711 open_els.splice second_i, 1
1712 # pop everything except the "root html element"
1713 while open_els.length > 1
1715 insert_html_element t
1716 ins_mode = ins_mode_in_frameset
1718 if t.type is TYPE_EOF
1720 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1721 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1722 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1725 unless ok_tags[t.name] is el.namespace
1728 if template_ins_modes.length > 0
1729 ins_mode_in_template t
1733 if t.type is TYPE_END_TAG and t.name is 'body'
1734 unless is_in_scope 'body', NS_HTML
1738 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1739 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1740 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1741 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1745 unless ok_tags[t.name] is el.namespace
1748 ins_mode = ins_mode_after_body
1750 if t.type is TYPE_END_TAG and t.name is 'html'
1751 unless is_in_scope 'body', NS_HTML
1755 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1756 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1757 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1758 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1762 unless ok_tags[t.name] is el.namespace
1765 ins_mode = ins_mode_after_body
1768 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1769 close_p_if_in_button_scope()
1770 insert_html_element t
1772 if t.type is TYPE_START_TAG and h_tags[t.name]?
1773 close_p_if_in_button_scope()
1774 if h_tags[open_els[0].name] is open_els[0].namespace
1777 insert_html_element t
1779 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1780 close_p_if_in_button_scope()
1781 insert_html_element t
1782 # spec: If the next token is a "LF" (U+000A) character token, then
1783 # ignore that token and move on to the next one. (Newlines at the
1784 # start of pre blocks are ignored as an authoring convenience.)
1785 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1787 flag_frameset_ok = false
1789 if t.type is TYPE_START_TAG and t.name is 'form'
1790 unless form_element_pointer is null or template_tag_is_open()
1793 close_p_if_in_button_scope()
1794 el = insert_html_element t
1795 unless template_tag_is_open()
1796 form_element_pointer = el
1798 if t.type is TYPE_START_TAG and t.name is 'li'
1799 flag_frameset_ok = false
1800 for node in open_els
1801 if node.name is 'li' and node.namespace is NS_HTML
1802 generate_implied_end_tags 'li' # arg is exception
1803 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1806 el = open_els.shift()
1807 if el.name is 'li' and el.namespace is NS_HTML
1810 if el_is_special_not_adp node
1812 close_p_if_in_button_scope()
1813 insert_html_element t
1815 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1816 flag_frameset_ok = false
1817 for node in open_els
1818 if node.name is 'dd' and node.namespace is NS_HTML
1819 generate_implied_end_tags 'dd' # arg is exception
1820 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1823 el = open_els.shift()
1824 if el.name is 'dd' and el.namespace is NS_HTML
1827 if node.name is 'dt' and node.namespace is NS_HTML
1828 generate_implied_end_tags 'dt' # arg is exception
1829 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1832 el = open_els.shift()
1833 if el.name is 'dt' and el.namespace is NS_HTML
1836 if el_is_special_not_adp node
1838 close_p_if_in_button_scope()
1839 insert_html_element t
1841 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1842 close_p_if_in_button_scope()
1843 insert_html_element t
1844 tok_state = tok_state_plaintext
1846 if t.type is TYPE_START_TAG and t.name is 'button'
1847 if is_in_scope 'button', NS_HTML
1849 generate_implied_end_tags()
1851 el = open_els.shift()
1852 if el.name is 'button' and el.namespace is NS_HTML
1855 insert_html_element t
1856 flag_frameset_ok = false
1858 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1859 unless is_in_scope t.name, NS_HTML
1862 generate_implied_end_tags()
1863 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1866 el = open_els.shift()
1867 if el.name is t.name and el.namespace is NS_HTML
1870 if t.type is TYPE_END_TAG and t.name is 'form'
1871 unless template_tag_is_open()
1872 node = form_element_pointer
1873 form_element_pointer = null
1874 if node is null or not el_is_in_scope node
1877 generate_implied_end_tags()
1878 if open_els[0] isnt node
1880 for el, i in open_els
1882 open_els.splice i, 1
1885 unless is_in_scope 'form', NS_HTML
1888 generate_implied_end_tags()
1889 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1892 el = open_els.shift()
1893 if el.name is 'form' and el.namespace is NS_HTML
1896 if t.type is TYPE_END_TAG and t.name is 'p'
1897 unless is_in_button_scope 'p', NS_HTML
1899 insert_html_element new_open_tag 'p'
1902 if t.type is TYPE_END_TAG and t.name is 'li'
1903 unless is_in_li_scope 'li', NS_HTML
1906 generate_implied_end_tags 'li' # arg is exception
1907 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1910 el = open_els.shift()
1911 if el.name is 'li' and el.namespace is NS_HTML
1914 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1915 unless is_in_scope t.name, NS_HTML
1918 generate_implied_end_tags t.name # arg is exception
1919 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1922 el = open_els.shift()
1923 if el.name is t.name and el.namespace is NS_HTML
1926 if t.type is TYPE_END_TAG and h_tags[t.name]?
1929 if h_tags[el.name] is el.namespace
1932 if standard_scopers[el.name] is el.namespace
1937 generate_implied_end_tags()
1938 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1941 el = open_els.shift()
1942 if h_tags[el.name] is el.namespace
1946 if t.type is TYPE_START_TAG and t.name is 'a'
1947 # If the list of active formatting elements contains an a element
1948 # between the end of the list and the last marker on the list (or
1949 # the start of the list if there is no marker on the list), then
1950 # this is a parse error; run the adoption agency algorithm for the
1951 # tag name "a", then remove that element from the list of active
1952 # formatting elements and the stack of open elements if the
1953 # adoption agency algorithm didn't already remove it (it might not
1954 # have if the element is not in table scope).
1957 if el.type is TYPE_AFE_MARKER
1959 if el.name is 'a' and el.namespace is NS_HTML
1967 for el, i in open_els
1969 open_els.splice i, 1
1971 el = insert_html_element t
1974 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1976 el = insert_html_element t
1979 if t.type is TYPE_START_TAG and t.name is 'nobr'
1981 el = insert_html_element t
1984 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1985 adoption_agency t.name
1987 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1989 insert_html_element t
1991 flag_frameset_ok = false
1993 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1994 unless is_in_scope t.name, NS_HTML
1997 generate_implied_end_tags()
1998 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2001 el = open_els.shift()
2002 if el.name is t.name and el.namespace is NS_HTML
2004 clear_afe_to_marker()
2006 if t.type is TYPE_START_TAG and t.name is 'table'
2007 close_p_if_in_button_scope() # fixfull quirksmode thing
2008 insert_html_element t
2009 flag_frameset_ok = false
2010 ins_mode = ins_mode_in_table
2012 if t.type is TYPE_END_TAG and t.name is 'br'
2014 t.type is TYPE_START_TAG
2016 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2018 insert_html_element t
2020 t.acknowledge_self_closing()
2021 flag_frameset_ok = false
2023 if t.type is TYPE_START_TAG and t.name is 'input'
2025 insert_html_element t
2027 t.acknowledge_self_closing()
2028 unless is_input_hidden_tok t
2029 flag_frameset_ok = false
2031 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2032 insert_html_element t
2034 t.acknowledge_self_closing()
2036 if t.type is TYPE_START_TAG and t.name is 'hr'
2037 close_p_if_in_button_scope()
2038 insert_html_element t
2040 t.acknowledge_self_closing()
2041 flag_frameset_ok = false
2043 if t.type is TYPE_START_TAG and t.name is 'image'
2048 if t.type is TYPE_START_TAG and t.name is 'isindex'
2050 if template_tag_is_open() is false and form_element_pointer isnt null
2052 t.acknowledge_self_closing()
2053 flag_frameset_ok = false
2054 close_p_if_in_button_scope()
2055 el = insert_html_element new_open_tag 'form'
2056 unless template_tag_is_open()
2057 form_element_pointer = el
2060 el.attrs['action'] = a[1]
2062 insert_html_element new_open_tag 'hr'
2065 insert_html_element new_open_tag 'label'
2066 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2067 input_el = new_open_tag 'input'
2072 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2073 input_el.attrs_a.push [a[0], a[1]]
2074 input_el.attrs_a.push ['name', 'isindex']
2075 # fixfull this next bit is in english... internationalize?
2076 prompt ?= "This is a searchable index. Enter search keywords: "
2077 insert_character new_character_token prompt # fixfull split
2078 # TODO submit typo "balue" in spec
2079 insert_html_element input_el
2081 # insert_character '' # you can put chars here if prompt attr missing
2083 insert_html_element new_open_tag 'hr'
2086 unless template_tag_is_open()
2087 form_element_pointer = null
2089 if t.type is TYPE_START_TAG and t.name is 'textarea'
2090 insert_html_element t
2091 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2093 tok_state = tok_state_rcdata
2094 original_ins_mode = ins_mode
2095 flag_frameset_ok = false
2096 ins_mode = ins_mode_text
2098 if t.type is TYPE_START_TAG and t.name is 'xmp'
2099 close_p_if_in_button_scope()
2101 flag_frameset_ok = false
2102 parse_generic_raw_text t
2104 if t.type is TYPE_START_TAG and t.name is 'iframe'
2105 flag_frameset_ok = false
2106 parse_generic_raw_text t
2108 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2109 parse_generic_raw_text t
2111 if t.type is TYPE_START_TAG and t.name is 'select'
2113 insert_html_element t
2114 flag_frameset_ok = false
2115 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2116 ins_mode = ins_mode_in_select_in_table
2118 ins_mode = ins_mode_in_select
2120 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2121 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2124 insert_html_element t
2126 # this comment block implements the W3C spec
2127 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2128 # if is_in_scope 'ruby', NS_HTML
2129 # generate_implied_end_tags()
2130 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2132 # insert_html_element t
2134 # if t.type is TYPE_START_TAG and t.name is 'rt'
2135 # if is_in_scope 'ruby', NS_HTML
2136 # generate_implied_end_tags 'rtc' # arg is exception
2137 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2139 # insert_html_element t
2141 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2142 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2143 if is_in_scope 'ruby', NS_HTML
2144 generate_implied_end_tags()
2145 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2147 insert_html_element t
2149 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2150 if is_in_scope 'ruby', NS_HTML
2151 generate_implied_end_tags 'rtc'
2152 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2154 insert_html_element t
2157 if t.type is TYPE_START_TAG and t.name is 'math'
2159 adjust_mathml_attributes t
2160 adjust_foreign_attributes t
2161 insert_foreign_element t, NS_MATHML
2162 if t.flag 'self-closing'
2164 t.acknowledge_self_closing()
2166 if t.type is TYPE_START_TAG and t.name is 'svg'
2168 adjust_svg_attributes t
2169 adjust_foreign_attributes t
2170 insert_foreign_element t, NS_SVG
2171 if t.flag 'self-closing'
2173 t.acknowledge_self_closing()
2175 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2178 if t.type is TYPE_START_TAG # any other start tag
2180 insert_html_element t
2182 if t.type is TYPE_END_TAG # any other end tag
2183 in_body_any_other_end_tag t.name
2187 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2188 ins_mode_text = (t) ->
2189 if t.type is TYPE_TEXT
2192 if t.type is TYPE_EOF
2194 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2195 open_els[0].flag 'already started', true
2197 ins_mode = original_ins_mode
2200 if t.type is TYPE_END_TAG and t.name is 'script'
2202 ins_mode = original_ins_mode
2203 # fixfull the spec seems to assume that I'm going to run the script
2204 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2206 if t.type is TYPE_END_TAG
2208 ins_mode = original_ins_mode
2210 console.log 'warning: end of ins_mode_text reached'
2212 # the functions below implement the tokenizer states described here:
2213 # http://www.w3.org/TR/html5/syntax.html#tokenization
2215 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2216 ins_mode_in_table_else = (t) ->
2218 flag_foster_parenting = true
2220 flag_foster_parenting = false
2222 ins_mode_in_table = (t) ->
2225 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2226 pending_table_character_tokens = []
2227 original_ins_mode = ins_mode
2228 ins_mode = ins_mode_in_table_text
2231 ins_mode_in_table_else t
2239 clear_stack_to_table_context()
2241 insert_html_element t
2242 ins_mode = ins_mode_in_caption
2244 clear_stack_to_table_context()
2245 insert_html_element t
2246 ins_mode = ins_mode_in_column_group
2248 clear_stack_to_table_context()
2249 insert_html_element new_open_tag 'colgroup'
2250 ins_mode = ins_mode_in_column_group
2252 when 'tbody', 'tfoot', 'thead'
2253 clear_stack_to_table_context()
2254 insert_html_element t
2255 ins_mode = ins_mode_in_table_body
2256 when 'td', 'th', 'tr'
2257 clear_stack_to_table_context()
2258 insert_html_element new_open_tag 'tbody'
2259 ins_mode = ins_mode_in_table_body
2263 if is_in_table_scope 'table', NS_HTML
2265 el = open_els.shift()
2266 if el.name is 'table' and el.namespace is NS_HTML
2270 when 'style', 'script', 'template'
2273 unless is_input_hidden_tok t
2274 ins_mode_in_table_else t
2277 el = insert_html_element t
2279 t.acknowledge_self_closing()
2282 if form_element_pointer?
2284 if template_tag_is_open()
2286 form_element_pointer = insert_html_element t
2289 ins_mode_in_table_else t
2293 if is_in_table_scope 'table', NS_HTML
2295 el = open_els.shift()
2296 if el.name is 'table' and el.namespace is NS_HTML
2301 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2306 ins_mode_in_table_else t
2310 ins_mode_in_table_else t
2313 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2314 ins_mode_in_table_text = (t) ->
2315 if t.type is TYPE_TEXT and t.text is "\u0000"
2319 if t.type is TYPE_TEXT
2320 pending_table_character_tokens.push t
2324 for old in pending_table_character_tokens
2325 unless is_space_tok old
2329 for old in pending_table_character_tokens
2330 insert_character old
2332 for old in pending_table_character_tokens
2333 ins_mode_in_table_else old
2334 pending_table_character_tokens = []
2335 ins_mode = original_ins_mode
2338 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2339 ins_mode_in_caption = (t) ->
2340 if t.type is TYPE_END_TAG and t.name is 'caption'
2341 if is_in_table_scope 'caption', NS_HTML
2342 generate_implied_end_tags()
2343 if open_els[0].name isnt 'caption'
2346 el = open_els.shift()
2347 if el.name is 'caption' and el.namespace is NS_HTML
2349 clear_afe_to_marker()
2350 ins_mode = ins_mode_in_table
2355 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2357 if is_in_table_scope 'caption', NS_HTML
2359 el = open_els.shift()
2360 if el.name is 'caption' and el.namespace is NS_HTML
2362 clear_afe_to_marker()
2363 ins_mode = ins_mode_in_table
2365 # else fragment case
2367 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2373 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2374 ins_mode_in_column_group = (t) -> # "in column group" insertion mode: handles one token t
2378 if t.type is TYPE_COMMENT
2381 if t.type is TYPE_DOCTYPE
2384 if t.type is TYPE_START_TAG and t.name is 'html'
2387 if t.type is TYPE_START_TAG and t.name is 'col'
2388 el = insert_html_element t
2390 t.acknowledge_self_closing()
2392 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2393 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML # fixed: was open_els.namespace, which reads a property of the stack array rather than the current node's namespace
2395 ins_mode = ins_mode_in_table
2399 if t.type is TYPE_END_TAG and t.name is 'col'
2402 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2405 if t.type is TYPE_EOF
2409 if open_els[0].name isnt 'colgroup' # open_els[0] is the current node
2413 ins_mode = ins_mode_in_table
2417 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2418 ins_mode_in_table_body = (t) ->
2419 if t.type is TYPE_START_TAG and t.name is 'tr'
2420 clear_stack_to_table_body_context()
2421 insert_html_element t
2422 ins_mode = ins_mode_in_row
2424 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2426 clear_stack_to_table_body_context()
2427 insert_html_element new_open_tag 'tr'
2428 ins_mode = ins_mode_in_row
2431 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2432 unless is_in_table_scope t.name, NS_HTML
2435 clear_stack_to_table_body_context()
2437 ins_mode = ins_mode_in_table
2439 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2442 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2445 if table_scopers[el.name] is el.namespace
2450 clear_stack_to_table_body_context()
2452 ins_mode = ins_mode_in_table
2455 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2461 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2462 ins_mode_in_row = (t) ->
2463 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2464 clear_stack_to_table_row_context()
2465 insert_html_element t
2466 ins_mode = ins_mode_in_cell
2469 if t.type is TYPE_END_TAG and t.name is 'tr'
2470 if is_in_table_scope 'tr', NS_HTML
2471 clear_stack_to_table_row_context()
2473 ins_mode = ins_mode_in_table_body
2477 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2478 if is_in_table_scope 'tr', NS_HTML
2479 clear_stack_to_table_row_context()
2481 ins_mode = ins_mode_in_table_body
2486 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2487 if is_in_table_scope t.name, NS_HTML
2488 if is_in_table_scope 'tr', NS_HTML
2489 clear_stack_to_table_row_context()
2491 ins_mode = ins_mode_in_table_body
2496 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2502 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2504 generate_implied_end_tags()
2505 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML # fixed: the 'th' test compared the element object itself to a string, so a th current node was never recognised
2508 el = open_els.shift() # take the current node off the stack
2509 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2511 clear_afe_to_marker()
2512 ins_mode = ins_mode_in_row
2514 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2515 ins_mode_in_cell = (t) ->
2516 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2517 if is_in_table_scope t.name, NS_HTML
2518 generate_implied_end_tags()
2519 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2522 el = open_els.shift()
2523 if el.name is t.name and el.namespace is NS_HTML
2525 clear_afe_to_marker()
2526 ins_mode = ins_mode_in_row
2530 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2533 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2536 if table_scopers[el.name] is el.namespace
2544 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2547 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2548 if is_in_table_scope t.name, NS_HTML
2557 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2558 ins_mode_in_select = (t) -> # "in select" insertion mode: handles one token t
2559 if t.type is TYPE_TEXT and t.text is "\u0000"
2562 if t.type is TYPE_TEXT
2565 if t.type is TYPE_COMMENT
2568 if t.type is TYPE_DOCTYPE
2571 if t.type is TYPE_START_TAG and t.name is 'html'
2574 if t.type is TYPE_START_TAG and t.name is 'option'
2575 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2577 insert_html_element t
2579 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2580 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2582 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2584 insert_html_element t
2586 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2587 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML # fixed: was "in NS_HTML" (array membership), every other namespace test here uses "is"
2588 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML # fixed: namespace is now read from the same node (open_els[1]) whose name is tested
2590 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2595 if t.type is TYPE_END_TAG and t.name is 'option'
2596 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2601 if t.type is TYPE_END_TAG and t.name is 'select'
2602 if is_in_select_scope 'select', NS_HTML
2604 el = open_els.shift()
2605 if el.name is 'select' and el.namespace is NS_HTML
2611 if t.type is TYPE_START_TAG and t.name is 'select'
2614 el = open_els.shift()
2615 if el.name is 'select' and el.namespace is NS_HTML
2618 # spec says that this is the same as </select> but it doesn't say
2619 # to check scope first
2621 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2623 if is_in_select_scope 'select', NS_HTML
2626 el = open_els.shift()
2627 if el.name is 'select' and el.namespace is NS_HTML
2632 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2635 if t.type is TYPE_EOF
2642 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2643 ins_mode_in_select_in_table = (t) ->
2644 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2647 el = open_els.shift()
2648 if el.name is 'select' and el.namespace is NS_HTML
2653 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2655 unless is_in_table_scope t.name, NS_HTML
2658 el = open_els.shift()
2659 if el.name is 'select' and el.namespace is NS_HTML
2665 ins_mode_in_select t
2668 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2669 ins_mode_in_template = (t) -> # "in template" insertion mode: handles one token t
2670 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2673 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2676 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2677 template_ins_modes.shift() # drop the entry at index 0 of template_ins_modes ...
2678 template_ins_modes.unshift ins_mode_in_table # ... and put the table handler in its place
2679 ins_mode = ins_mode_in_table
2682 if t.type is TYPE_START_TAG and t.name is 'col'
2683 template_ins_modes.shift()
2684 template_ins_modes.unshift ins_mode_in_column_group # same replace-index-0 pattern, column group handler
2685 ins_mode = ins_mode_in_column_group
2688 if t.type is TYPE_START_TAG and t.name is 'tr'
2689 template_ins_modes.shift()
2690 template_ins_modes.unshift ins_mode_in_table_body # same pattern, table body handler
2691 ins_mode = ins_mode_in_table_body
2694 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2695 template_ins_modes.shift()
2696 template_ins_modes.unshift ins_mode_in_row # same pattern, row handler
2697 ins_mode = ins_mode_in_row
2700 if t.type is TYPE_START_TAG # start-tag test with no restriction on the tag name
2701 template_ins_modes.shift()
2702 template_ins_modes.unshift ins_mode_in_body # same pattern, body handler
2703 ins_mode = ins_mode_in_body
2706 if t.type is TYPE_END_TAG
2709 if t.type is TYPE_EOF
2710 unless template_tag_is_open()
2715 el = open_els.shift() # take the current node off the stack
2716 if el.name is 'template' and el.namespace is NS_HTML
2718 clear_afe_to_marker()
2719 template_ins_modes.shift() # discard the index-0 entry without replacing it
2723 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2724 ins_mode_after_body = (t) -> # "after body" insertion mode: handles one token t
2728 if t.type is TYPE_COMMENT
2729 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length] # fixed: append to the first (topmost, highest-index) element of the stack as the spec requires, not to open_els[0], the current node
2731 if t.type is TYPE_DOCTYPE
2734 if t.type is TYPE_START_TAG and t.name is 'html'
2737 if t.type is TYPE_END_TAG and t.name is 'html'
2738 if flag_fragment_parsing
2741 ins_mode = ins_mode_after_after_body
2743 if t.type is TYPE_EOF
2748 ins_mode = ins_mode_in_body
2751 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2752 ins_mode_in_frameset = (t) ->
2756 if t.type is TYPE_COMMENT
2759 if t.type is TYPE_DOCTYPE
2762 if t.type is TYPE_START_TAG and t.name is 'html'
2765 if t.type is TYPE_START_TAG and t.name is 'frameset'
2766 insert_html_element t
2768 if t.type is TYPE_END_TAG and t.name is 'frameset'
2769 if open_els.length is 1
2771 return # fragment case
2773 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2774 ins_mode = ins_mode_after_frameset
2776 if t.type is TYPE_START_TAG and t.name is 'frame'
2777 insert_html_element t
2779 t.acknowledge_self_closing()
2781 if t.type is TYPE_START_TAG and t.name is 'noframes'
2784 if t.type is TYPE_EOF
2785 if open_els.length isnt 1
2793 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2794 ins_mode_after_frameset = (t) -> # "after frameset" insertion mode: handles one token t
2798 if t.type is TYPE_COMMENT
2801 if t.type is TYPE_DOCTYPE
2804 if t.type is TYPE_START_TAG and t.name is 'html'
2807 if t.type is TYPE_END_TAG and t.name is 'html'
2808 ins_mode = ins_mode_after_after_frameset # fixed: was assigned to "insert_mode", a name used nowhere else here, so the mode switch never reached ins_mode
2810 if t.type is TYPE_START_TAG and t.name is 'noframes'
2813 if t.type is TYPE_EOF
2820 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2821 ins_mode_after_after_body = (t) ->
2822 if t.type is TYPE_COMMENT
2823 insert_comment t, [doc, doc.children.length]
2825 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2828 if t.type is TYPE_EOF
2833 ins_mode = ins_mode_in_body
2837 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2838 ins_mode_after_after_frameset = (t) ->
2839 if t.type is TYPE_COMMENT
2840 insert_comment t, [doc, doc.children.length]
2842 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2845 if t.type is TYPE_EOF
2848 if t.type is TYPE_START_TAG and t.name is 'noframes'
2855 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2856 has_color_face_or_size = (t) ->
2858 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2861 in_foreign_content_end_script = ->
2865 in_foreign_content_other_start = (t) -> # start-tag handling for foreign content; t is the start tag token
2866 acn = adjusted_current_node() # its namespace selects the adjustments below
2867 if acn.namespace is NS_MATHML
2868 adjust_mathml_attributes t
2869 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2870 t.name = svg_name_fixes[t.name] # swap in the replacement tag name listed in svg_name_fixes
2871 if acn.namespace is NS_SVG
2872 adjust_svg_attributes t
2873 adjust_foreign_attributes t
2874 insert_foreign_element t, acn.namespace # the new element is created in acn's namespace
2875 if t.flag 'self-closing' # FIXME CONTINUE this isn't getting set
2876 if t.name is 'script'
2877 t.acknowledge_self_closing()
2878 in_foreign_content_end_script()
2882 t.acknowledge_self_closing()
2884 in_foreign_content = (t) ->
2885 if t.type is TYPE_TEXT and t.text is "\u0000"
2887 insert_character new_character_token "\ufffd"
2892 if t.type is TYPE_TEXT
2893 flag_frameset_ok = false
2896 if t.type is TYPE_COMMENT
2899 if t.type is TYPE_DOCTYPE
2902 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2904 if flag_fragment_parsing
2905 in_foreign_content_other_start t
2907 loop # is this safe?
2910 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2914 if t.type is TYPE_START_TAG
2915 in_foreign_content_other_start t
2917 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2918 in_foreign_content_end_script()
2920 if t.type is TYPE_END_TAG
2921 if open_els[0].name.toLowerCase() isnt t.name
2923 for node in open_els
2924 if node is open_els[open_els.length - 1]
2926 if node.name.toLowerCase() is t.name
2928 el = open_els.shift()
2931 if node.namespace is NS_HTML
2933 ins_mode t # explicitly call HTML insertion mode
2936 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2938 switch c = txt.charAt(cur++)
2940 return new_text_node parse_character_reference()
2942 tok_state = tok_state_tag_open
2945 return new_text_node "\ufffd"
2947 return new_eof_token()
2949 return new_text_node c
2952 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2953 # not needed: tok_state_character_reference_in_data = ->
2954 # just call parse_character_reference()
2956 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2957 tok_state_rcdata = ->
2958 switch c = txt.charAt(cur++)
2960 return new_text_node parse_character_reference()
2962 tok_state = tok_state_rcdata_less_than_sign
2965 return new_character_token "\ufffd"
2967 return new_eof_token()
2969 return new_character_token c
2972 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2973 # not needed: tok_state_character_reference_in_rcdata = ->
2974 # just call parse_character_reference()
2976 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2977 tok_state_rawtext = ->
2978 switch c = txt.charAt(cur++)
2980 tok_state = tok_state_rawtext_less_than_sign
2983 return new_character_token "\ufffd"
2985 return new_eof_token()
2987 return new_character_token c
2990 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2991 tok_state_script_data = ->
2992 switch c = txt.charAt(cur++)
2994 tok_state = tok_state_script_data_less_than_sign
2997 return new_character_token "\ufffd"
2999 return new_eof_token()
3001 return new_character_token c
3004 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3005 tok_state_plaintext = ->
3006 switch c = txt.charAt(cur++)
3009 return new_character_token "\ufffd"
3011 return new_eof_token()
3013 return new_character_token c
3017 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3018 tok_state_tag_open = ->
3019 switch c = txt.charAt(cur++)
3021 tok_state = tok_state_markup_declaration_open
3023 tok_state = tok_state_end_tag_open
3026 tok_cur_tag = new_comment_token '?'
3027 tok_state = tok_state_bogus_comment
3030 tok_cur_tag = new_open_tag c
3031 tok_state = tok_state_tag_name
3032 else if is_uc_alpha(c)
3033 tok_cur_tag = new_open_tag c.toLowerCase()
3034 tok_state = tok_state_tag_name
3037 tok_state = tok_state_data
3038 cur -= 1 # we didn't parse/handle the char after <
3039 return new_text_node '<'
3042 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3043 tok_state_end_tag_open = ->
3044 switch c = txt.charAt(cur++)
3047 tok_state = tok_state_data
3050 tok_state = tok_state_data
3051 return new_text_node '</'
3054 tok_cur_tag = new_end_tag c.toLowerCase()
3055 tok_state = tok_state_tag_name
3056 else if is_lc_alpha(c)
3057 tok_cur_tag = new_end_tag c
3058 tok_state = tok_state_tag_name
3061 tok_cur_tag = new_comment_token '/'
3062 tok_state = tok_state_bogus_comment
3065 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3066 tok_state_tag_name = ->
3067 switch c = txt.charAt(cur++)
3068 when "\t", "\n", "\u000c", ' '
3069 tok_state = tok_state_before_attribute_name
3071 tok_state = tok_state_self_closing_start_tag
3073 tok_state = tok_state_data
3079 tok_cur_tag.name += "\ufffd"
3082 tok_state = tok_state_data
3085 tok_cur_tag.name += c.toLowerCase()
3087 tok_cur_tag.name += c
3090 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3091 tok_state_rcdata_less_than_sign = ->
3092 c = txt.charAt(cur++)
3094 temporary_buffer = ''
3095 tok_state = tok_state_rcdata_end_tag_open
3098 tok_state = tok_state_rcdata
3099 cur -= 1 # reconsume the input character
3100 return new_character_token '<'
3102 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3103 tok_state_rcdata_end_tag_open = ->
3104 c = txt.charAt(cur++)
3106 tok_cur_tag = new_end_tag c.toLowerCase()
3107 temporary_buffer += c
3108 tok_state = tok_state_rcdata_end_tag_name
3111 tok_cur_tag = new_end_tag c
3112 temporary_buffer += c
3113 tok_state = tok_state_rcdata_end_tag_name
3116 tok_state = tok_state_rcdata
3117 cur -= 1 # reconsume the input character
3118 return new_character_token "</" # fixfull separate these
3120 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3121 is_appropriate_end_tag = (t) -> # true when t is an end tag whose name equals the current node's name
3122 # spec says to check against "the tag name of the last start tag to
3123 # have been emitted from this tokenizer", but this is only called from
3124 # the various "raw" states, so it's hopefully ok to assume that
3125 # open_els[0].name will work instead. TODO: verify this after the script
3126 # data states are implemented
3127 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}" # trace the candidate token and the open-element stack
3128 return t.type is TYPE_END_TAG and t.name is open_els[0].name # open_els[0] is the current node
3130 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3131 tok_state_rcdata_end_tag_name = ->
3132 c = txt.charAt(cur++)
3133 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3134 if is_appropriate_end_tag tok_cur_tag
3135 tok_state = tok_state_before_attribute_name
3137 # else fall through to "Anything else"
3139 if is_appropriate_end_tag tok_cur_tag
3140 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3142 # else fall through to "Anything else"
3144 if is_appropriate_end_tag tok_cur_tag
3145 tok_state = tok_state_data
3147 # else fall through to "Anything else"
3149 tok_cur_tag.name += c.toLowerCase()
3150 temporary_buffer += c
3153 tok_cur_tag.name += c
3154 temporary_buffer += c
3157 tok_state = tok_state_rcdata
3158 cur -= 1 # reconsume the input character
3159 return new_character_token '</' + temporary_buffer # fixfull separate these
3161 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3162 tok_state_rawtext_less_than_sign = ->
3163 c = txt.charAt(cur++)
3165 temporary_buffer = ''
3166 tok_state = tok_state_rawtext_end_tag_open
3169 tok_state = tok_state_rawtext
3170 cur -= 1 # reconsume the input character
3171 return new_character_token '<'
3173 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3174 tok_state_rawtext_end_tag_open = ->
3175 c = txt.charAt(cur++)
3177 tok_cur_tag = new_end_tag c.toLowerCase()
3178 temporary_buffer += c
3179 tok_state = tok_state_rawtext_end_tag_name
3182 tok_cur_tag = new_end_tag c
3183 temporary_buffer += c
3184 tok_state = tok_state_rawtext_end_tag_name
3187 tok_state = tok_state_rawtext
3188 cur -= 1 # reconsume the input character
3189 return new_character_token "</" # fixfull separate these
3191 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3192 tok_state_rawtext_end_tag_name = ->
3193 c = txt.charAt(cur++)
3194 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3195 if is_appropriate_end_tag tok_cur_tag
3196 tok_state = tok_state_before_attribute_name
3198 # else fall through to "Anything else"
3200 if is_appropriate_end_tag tok_cur_tag
3201 tok_state = tok_state_self_closing_start_tag
3203 # else fall through to "Anything else"
3205 if is_appropriate_end_tag tok_cur_tag
3206 tok_state = tok_state_data
3208 # else fall through to "Anything else"
3210 tok_cur_tag.name += c.toLowerCase()
3211 temporary_buffer += c
3214 tok_cur_tag.name += c
3215 temporary_buffer += c
3218 tok_state = tok_state_rawtext
3219 cur -= 1 # reconsume the input character
3220 return new_character_token '</' + temporary_buffer # fixfull separate these
3222 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3223 tok_state_script_data_less_than_sign = ->
3224 c = txt.charAt(cur++)
3226 temporary_buffer = ''
3227 tok_state = tok_state_script_data_end_tag_open
3230 tok_state = tok_state_script_data_escape_start
3231 return new_character_token '<!' # fixfull split
3233 tok_state = tok_state_script_data
3234 cur -= 1 # Reconsume
3235 return new_character_token '<'
3237 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3238 tok_state_script_data_end_tag_open = ->
3239 c = txt.charAt(cur++)
3241 tok_cur_tag = new_end_tag c.toLowerCase()
3242 temporary_buffer += c
3243 tok_state = tok_state_script_data_end_tag_name
3246 tok_cur_tag = new_end_tag c
3247 temporary_buffer += c
3248 tok_state = tok_state_script_data_end_tag_name
3251 tok_state = tok_state_script_data
3252 cur -= 1 # Reconsume
3253 return new_character_token '</'
3255 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3256 tok_state_script_data_end_tag_name = ->
3257 c = txt.charAt(cur++)
3258 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3259 if is_appropriate_end_tag tok_cur_tag
3260 tok_state = tok_state_before_attribute_name
3264 if is_appropriate_end_tag tok_cur_tag
3265 tok_state = tok_state_self_closing_start_tag
3269 if is_appropriate_end_tag tok_cur_tag
3270 tok_state = tok_state_data
3274 tok_cur_tag.name += c.toLowerCase()
3275 temporary_buffer += c
3278 tok_cur_tag.name += c
3279 temporary_buffer += c
3282 tok_state = tok_state_script_data
3283 cur -= 1 # Reconsume
3284 return new_character_token "</#{temporary_buffer}" # fixfull split
3286 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3287 tok_state_script_data_escape_start = ->
3288 c = txt.charAt(cur++)
3290 tok_state = tok_state_script_data_escape_start_dash
3291 return new_character_token '-'
3293 tok_state = tok_state_script_data
3294 cur -= 1 # Reconsume
3297 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3298 tok_state_script_data_escape_start_dash = ->
3299 c = txt.charAt(cur++)
3301 tok_state = tok_state_script_data_escaped_dash_dash
3302 return new_character_token '-'
3304 tok_state = tok_state_script_data
3305 cur -= 1 # Reconsume
3308 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Reads one character of escaped script data. Visible outcomes: move to the
# escaped-dash state returning '-'; move to the escaped-less-than-sign state;
# return U+FFFD; fall back to the data state with the character reconsumed;
# or return the character itself as a character token.
3309 tok_state_script_data_escaped = ->
3310 c = txt.charAt(cur++)
3312 tok_state = tok_state_script_data_escaped_dash
3313 return new_character_token '-'
3315 tok_state = tok_state_script_data_escaped_less_than_sign
3319 return new_character_token "\ufffd"
3321 tok_state = tok_state_data
3323 cur -= 1 # Reconsume
3326 return new_character_token c
3328 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Reads one character after a single '-' in escaped script data. Visible
# outcomes: move to the escaped-dash-dash state returning '-'; move to the
# escaped-less-than-sign state; return to the escaped state with U+FFFD; fall
# back to the data state with the character reconsumed; or return to the
# escaped state and return the character itself.
3329 tok_state_script_data_escaped_dash = ->
3330 c = txt.charAt(cur++)
3332 tok_state = tok_state_script_data_escaped_dash_dash
3333 return new_character_token '-'
3335 tok_state = tok_state_script_data_escaped_less_than_sign
3339 tok_state = tok_state_script_data_escaped
3340 return new_character_token "\ufffd"
3342 tok_state = tok_state_data
3344 cur -= 1 # Reconsume
3347 tok_state = tok_state_script_data_escaped
3348 return new_character_token c
3350 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Reads one character after "--" in escaped script data. Visible outcomes:
# return '-' without changing state; move to the escaped-less-than-sign state;
# leave the escaped section by returning '>' and switching to the script data
# state; return to the escaped state with U+FFFD; fall back to the data state
# with the character reconsumed; or return to the escaped state and return the
# character itself.
3351 tok_state_script_data_escaped_dash_dash = ->
3352 c = txt.charAt(cur++)
3354 return new_character_token '-'
3356 tok_state = tok_state_script_data_escaped_less_than_sign
3359 tok_state = tok_state_script_data
3360 return new_character_token '>'
3363 tok_state = tok_state_script_data_escaped
3364 return new_character_token "\ufffd"
3367 tok_state = tok_state_data
3368 cur -= 1 # Reconsume
3371 tok_state = tok_state_script_data_escaped
3372 return new_character_token c
3374 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Runs after a '<' was seen inside escaped script data. Reads one character:
#  - one branch clears temporary_buffer and moves to the escaped
#    end-tag-open state;
#  - the letter branches seed temporary_buffer with the letter (lower-cased in
#    one of them), move to the double-escape-start state and return "<" plus
#    the letter as text;
#  - the final branch returns to the escaped state. As the letter branches
#    show, the '<' that led here is still pending, so it is what gets
#    returned; cur is backed up by one so the current character is handled
#    (and emitted) by the escaped state rather than being emitted twice.
3375 tok_state_script_data_escaped_less_than_sign = ->
3376 c = txt.charAt(cur++)
3378 temporary_buffer = ''
3379 tok_state = tok_state_script_data_escaped_end_tag_open
3382 temporary_buffer = c.toLowerCase() # yes, really
3383 tok_state = tok_state_script_data_double_escape_start
3384 return new_character_token "<#{c}" # fixfull split
3386 temporary_buffer = c
3387 tok_state = tok_state_script_data_double_escape_start
3388 return new_character_token "<#{c}" # fixfull split
3390 tok_state = tok_state_script_data_escaped
3391 cur -= 1 # Reconsume
3392 return new_character_token '<' # emit the pending '<'; c is reconsumed by the next state
3394 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Reads one character after "</" in escaped script data. The letter branches
# start a new end tag in tok_cur_tag via new_end_tag (lower-casing the letter
# in one of them), append the unchanged character to temporary_buffer and move
# to the escaped end-tag-name state. The last branch returns to the escaped
# state, backs cur up by one, and returns "</" as text.
3395 tok_state_script_data_escaped_end_tag_open = ->
3396 c = txt.charAt(cur++)
3398 tok_cur_tag = new_end_tag c.toLowerCase()
3399 temporary_buffer += c
3400 tok_state = tok_state_script_data_escaped_end_tag_name
3403 tok_cur_tag = new_end_tag c
3404 temporary_buffer += c
3405 tok_state = tok_state_script_data_escaped_end_tag_name
3408 tok_state = tok_state_script_data_escaped
3409 cur -= 1 # Reconsume
3410 return new_character_token '</' # fixfull split
3412 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Reads one character while an end-tag name is being accumulated inside
# escaped script data.
# Three branches consult is_appropriate_end_tag on tok_cur_tag and then move to
# attribute parsing, the self-closing state, or the data state respectively.
# Letter branches append to tok_cur_tag.name (lower-cased in one of them).
# temporary_buffer receives the character exactly as it appeared in the input:
# the final branch replays "</" plus temporary_buffer as text, so lower-casing
# it would alter the document's characters.
# The final branch returns to the escaped state and backs cur up by one so the
# character is read again there.
3413 tok_state_script_data_escaped_end_tag_name = ->
3414 c = txt.charAt(cur++)
3415 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3416 if is_appropriate_end_tag tok_cur_tag
3417 tok_state = tok_state_before_attribute_name
3421 if is_appropriate_end_tag tok_cur_tag
3422 tok_state = tok_state_self_closing_start_tag
3426 if is_appropriate_end_tag tok_cur_tag
3427 tok_state = tok_state_data
3431 tok_cur_tag.name += c.toLowerCase()
3432 temporary_buffer += c
3435 tok_cur_tag.name += c
3436 temporary_buffer += c
3439 tok_state = tok_state_script_data_escaped
3440 cur -= 1 # Reconsume
3441 return new_character_token "</#{temporary_buffer}" # fixfull split
3443 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Reads one character of a possible "<script" inside escaped script data.
# On whitespace, '/' or '>' the accumulated temporary_buffer decides the next
# state: 'script' enters the double-escaped state; the other visible
# assignment selects the escaped state. The character is returned as text.
# Letter branches append to temporary_buffer (lower-cased in one of them) and
# return the character as text. The last branch returns to the escaped state
# and backs cur up by one.
3444 tok_state_script_data_double_escape_start = ->
3445 c = txt.charAt(cur++)
3446 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3447 if temporary_buffer is 'script'
3448 tok_state = tok_state_script_data_double_escaped
3450 tok_state = tok_state_script_data_escaped
3451 return new_character_token c
3453 temporary_buffer += c.toLowerCase() # yes, really lowercase
3454 return new_character_token c
3456 temporary_buffer += c
3457 return new_character_token c
3459 tok_state = tok_state_script_data_escaped
3460 cur -= 1 # Reconsume
3463 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Reads one character of double-escaped script data. Visible outcomes: move
# to the double-escaped-dash state returning '-'; move to the
# double-escaped-less-than-sign state returning '<'; return U+FFFD; fall back
# to the data state with the character reconsumed; or return the character
# itself as text.
3464 tok_state_script_data_double_escaped = ->
3465 c = txt.charAt(cur++)
3467 tok_state = tok_state_script_data_double_escaped_dash
3468 return new_character_token '-'
3470 tok_state = tok_state_script_data_double_escaped_less_than_sign
3471 return new_character_token '<'
3474 return new_character_token "\ufffd"
3477 tok_state = tok_state_data
3478 cur -= 1 # Reconsume
3481 return new_character_token c
3483 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Reads one character after a single '-' in double-escaped script data.
# Visible outcomes: move to the double-escaped-dash-dash state returning '-';
# move to the double-escaped-less-than-sign state returning '<'; return to the
# double-escaped state with U+FFFD; fall back to the data state with the
# character reconsumed; or return to the double-escaped state and return the
# character itself.
3484 tok_state_script_data_double_escaped_dash = ->
3485 c = txt.charAt(cur++)
3487 tok_state = tok_state_script_data_double_escaped_dash_dash
3488 return new_character_token '-'
3490 tok_state = tok_state_script_data_double_escaped_less_than_sign
3491 return new_character_token '<'
3494 tok_state = tok_state_script_data_double_escaped
3495 return new_character_token "\ufffd"
3498 tok_state = tok_state_data
3499 cur -= 1 # Reconsume
3502 tok_state = tok_state_script_data_double_escaped
3503 return new_character_token c
3505 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Reads one character after "--" in double-escaped script data. Visible
# outcomes: return '-' without changing state; move to the
# double-escaped-less-than-sign state returning '<'; switch to the script data
# state returning '>'; return to the double-escaped state with U+FFFD; fall
# back to the data state with the character reconsumed; or return to the
# double-escaped state and return the character itself.
3506 tok_state_script_data_double_escaped_dash_dash = ->
3507 c = txt.charAt(cur++)
3509 return new_character_token '-'
3511 tok_state = tok_state_script_data_double_escaped_less_than_sign
3512 return new_character_token '<'
3514 tok_state = tok_state_script_data
3515 return new_character_token '>'
3518 tok_state = tok_state_script_data_double_escaped
3519 return new_character_token "\ufffd"
3522 tok_state = tok_state_data
3523 cur -= 1 # Reconsume
3526 tok_state = tok_state_script_data_double_escaped
3527 return new_character_token c
3529 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Reads one character after '<' in double-escaped script data. One branch
# clears temporary_buffer, moves to the double-escape-end state and returns
# '/' as text; the other returns to the double-escaped state and backs cur up
# by one so the character is read again there.
3530 tok_state_script_data_double_escaped_less_than_sign = ->
3531 c = txt.charAt(cur++)
3533 temporary_buffer = ''
3534 tok_state = tok_state_script_data_double_escape_end
3535 return new_character_token '/'
3537 tok_state = tok_state_script_data_double_escaped
3538 cur -= 1 # Reconsume
3541 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Reads one character of a possible "</script" inside double-escaped script
# data. On whitespace, '/' or '>' the accumulated temporary_buffer decides the
# next state: 'script' drops back to the (single) escaped state; the other
# visible assignment selects the double-escaped state. The character is
# returned as text. Letter branches append to temporary_buffer (lower-cased in
# one of them) and return the character as text. The last branch returns to
# the double-escaped state and backs cur up by one.
3542 tok_state_script_data_double_escape_end = ->
3543 c = txt.charAt(cur++)
3544 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3545 if temporary_buffer is 'script'
3546 tok_state = tok_state_script_data_escaped
3548 tok_state = tok_state_script_data_double_escaped
3549 return new_character_token c
3551 temporary_buffer += c.toLowerCase() # yes, really lowercase
3552 return new_character_token c
3554 temporary_buffer += c
3555 return new_character_token c
3557 tok_state = tok_state_script_data_double_escaped
3558 cur -= 1 # Reconsume
3561 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Reads one character before an attribute name. Visible outcomes: move to the
# self-closing state; move to the data state (two branches); or start a new
# attribute. A new attribute is stored as a [name, value] pair pushed onto the
# FRONT of tok_cur_tag.attrs_a, so index 0 is always the attribute currently
# being built. Its name starts as U+FFFD or as the lower-cased character.
3562 tok_state_before_attribute_name = ->
3564 switch c = txt.charAt(cur++)
3565 when "\t", "\n", "\u000c", ' '
3568 tok_state = tok_state_self_closing_start_tag
3571 tok_state = tok_state_data
3577 attr_name = "\ufffd"
3578 when '"', "'", '<', '='
3583 tok_state = tok_state_data
3586 attr_name = c.toLowerCase()
3590 tok_cur_tag.attrs_a.unshift [attr_name, '']
3591 tok_state = tok_state_attribute_name
3594 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Reads one character of an attribute name. Whitespace moves to the
# after-attribute-name state; other branches move to the self-closing,
# before-attribute-value or data states. The remaining branches extend the
# name of the attribute under construction (tok_cur_tag.attrs_a[0][0]) with
# U+FFFD, the character as-is, or its lower-cased form.
3595 tok_state_attribute_name = ->
3596 switch c = txt.charAt(cur++)
3597 when "\t", "\n", "\u000c", ' '
3598 tok_state = tok_state_after_attribute_name
3600 tok_state = tok_state_self_closing_start_tag
3602 tok_state = tok_state_before_attribute_value
3604 tok_state = tok_state_data
3610 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3613 tok_cur_tag.attrs_a[0][0] += c
3616 tok_state = tok_state_data
3619 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3621 tok_cur_tag.attrs_a[0][0] += c
3624 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Reads one character after an attribute name. Visible outcomes: move to the
# self-closing, before-attribute-value or data states; or start a further
# attribute by pushing a new [name, ''] pair onto the front of
# tok_cur_tag.attrs_a (name seeded with the lower-cased character, U+FFFD, or
# the character as-is) and moving to the attribute-name state. One branch
# returns to the data state and backs cur up by one.
3625 tok_state_after_attribute_name = ->
3626 c = txt.charAt(cur++)
3627 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3630 tok_state = tok_state_self_closing_start_tag
3633 tok_state = tok_state_before_attribute_value
3636 tok_state = tok_state_data
3639 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3640 tok_state = tok_state_attribute_name
3644 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3645 tok_state = tok_state_attribute_name
3649 tok_state = tok_state_data
3650 cur -= 1 # reconsume
3652 if c is '"' or c is "'" or c is '<'
3654 # fall through to Anything else
3656 tok_cur_tag.attrs_a.unshift [c, '']
3657 tok_state = tok_state_attribute_name
3659 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Reads one character before an attribute value and selects how the value is
# read: double-quoted, single-quoted or unquoted. Two branches start the value
# of the attribute under construction (tok_cur_tag.attrs_a[0][1]) with U+FFFD
# or with the character itself before moving to the unquoted state. Two other
# branches move to the data state.
3660 tok_state_before_attribute_value = ->
3661 switch c = txt.charAt(cur++)
3662 when "\t", "\n", "\u000c", ' '
3665 tok_state = tok_state_attribute_value_double_quoted
3667 tok_state = tok_state_attribute_value_unquoted
3670 tok_state = tok_state_attribute_value_single_quoted
3673 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3674 tok_state = tok_state_attribute_value_unquoted
3677 tok_state = tok_state_data
3683 tok_state = tok_state_data
3685 tok_cur_tag.attrs_a[0][1] += c
3686 tok_state = tok_state_attribute_value_unquoted
3689 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Reads one character of a double-quoted attribute value. Visible outcomes:
# move to the after-attribute-value (quoted) state; append the result of
# parse_character_reference (called with '"' as the allowed character and
# in_attr = true); append U+FFFD; move to the data state; or append the
# character itself. Appends go to tok_cur_tag.attrs_a[0][1].
3690 tok_state_attribute_value_double_quoted = ->
3691 switch c = txt.charAt(cur++)
3693 tok_state = tok_state_after_attribute_value_quoted
3695 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3698 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3701 tok_state = tok_state_data
3703 tok_cur_tag.attrs_a[0][1] += c
3706 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Reads one character of a single-quoted attribute value. Visible outcomes:
# move to the after-attribute-value (quoted) state; append the result of
# parse_character_reference (called with "'" as the allowed character and
# in_attr = true); append U+FFFD; move to the data state; or append the
# character itself. Appends go to tok_cur_tag.attrs_a[0][1].
3707 tok_state_attribute_value_single_quoted = ->
3708 switch c = txt.charAt(cur++)
3710 tok_state = tok_state_after_attribute_value_quoted
3712 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3715 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3718 tok_state = tok_state_data
3720 tok_cur_tag.attrs_a[0][1] += c
3723 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Reads one character of an unquoted attribute value. Whitespace ends the
# value and moves to the before-attribute-name state. Other visible outcomes:
# append the result of parse_character_reference (called with '>' as the
# allowed character and in_attr = true); move to the data state (two
# branches); append U+FFFD; or append the character itself. Appends go to
# tok_cur_tag.attrs_a[0][1].
3724 tok_state_attribute_value_unquoted = ->
3725 switch c = txt.charAt(cur++)
3726 when "\t", "\n", "\u000c", ' '
3727 tok_state = tok_state_before_attribute_name
3729 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3731 tok_state = tok_state_data
3736 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3739 tok_state = tok_state_data
3741 # Parse Error if ", ', <, = or ` (backtick)
3742 tok_cur_tag.attrs_a[0][1] += c
3745 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Reads one character after the closing quote of an attribute value.
# Whitespace moves to the before-attribute-name state; other branches move to
# the self-closing state or the data state. The last branch also moves to the
# before-attribute-name state but backs cur up by one so the character is
# processed there.
3746 tok_state_after_attribute_value_quoted = ->
3747 switch c = txt.charAt(cur++)
3748 when "\t", "\n", "\u000c", ' '
3749 tok_state = tok_state_before_attribute_name
3751 tok_state = tok_state_self_closing_start_tag
3753 tok_state = tok_state_data
3759 tok_state = tok_state_data
3762 tok_state = tok_state_before_attribute_name
3763 cur -= 1 # we didn't handle that char
3766 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Reads one character after '/' inside a tag. One branch sets the
# 'self-closing' flag on tok_cur_tag and moves to the data state. The other
# two back cur up by one and move to the data state or the
# before-attribute-name state, so the character is processed there.
3767 tok_state_self_closing_start_tag = ->
3768 c = txt.charAt(cur++)
3770 tok_cur_tag.flag 'self-closing'
3771 tok_state = tok_state_data
3775 tok_state = tok_state_data
3776 cur -= 1 # Reconsume
3780 tok_state = tok_state_before_attribute_name
3781 cur -= 1 # Reconsume
3784 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3785 # WARNING: put a comment token in tok_cur_tag before setting this state
# Consumes a whole bogus comment in one step instead of character by
# character: finds the next '>' at or after cur and takes either the rest of
# the input or the text up to that '>'. NUL characters in the taken text are
# replaced with U+FFFD, the text is appended to tok_cur_tag.text (the comment
# token the caller must have prepared), and the tokenizer returns to the data
# state.
3786 tok_state_bogus_comment = ->
3787 next_gt = txt.indexOf '>', cur
3789 val = txt.substr cur
3792 val = txt.substr cur, (next_gt - cur)
3794 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3795 tok_cur_tag.text += val
3796 tok_state = tok_state_data
3799 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Decides what follows "<!" by looking ahead in txt without consuming a
# character first:
#  - "--" starts a comment (fresh empty comment token, comment-start state);
#  - "doctype" in any letter case starts a doctype;
#  - "[CDATA[" (exact case) starts a CDATA section, but only when the adjusted
#    current node exists and is not in the HTML namespace;
#  - otherwise a fresh empty comment token is created and the bogus-comment
#    state takes over.
3800 tok_state_markup_declaration_open = ->
3801 if txt.substr(cur, 2) is '--'
3803 tok_cur_tag = new_comment_token ''
3804 tok_state = tok_state_comment_start
3806 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3808 tok_state = tok_state_doctype
3810 acn = adjusted_current_node()
3811 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3813 tok_state = tok_state_cdata_section
3817 tok_cur_tag = new_comment_token ''
3818 tok_state = tok_state_bogus_comment
3821 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Reads the first character after "<!--". Visible outcomes: move to the
# comment-start-dash state; move to the comment state; move to the data state
# (once directly, once with the character reconsumed); or append the character
# to the comment text (tok_cur_tag.text) and move to the comment state.
3822 tok_state_comment_start = ->
3823 switch c = txt.charAt(cur++)
3825 tok_state = tok_state_comment_start_dash
3828 tok_state = tok_state_comment
# NOTE(review): the other comment states append U+FFFD to tok_cur_tag.text;
# this branch returns it as a character token instead - confirm against the
# spec's comment start state.
3829 return new_character_token "\ufffd"
3832 tok_state = tok_state_data
3836 tok_state = tok_state_data
3837 cur -= 1 # Reconsume
3840 tok_cur_tag.text += c
3841 tok_state = tok_state_comment
3844 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Reads one character after "<!--" followed by a single '-'. Visible outcomes:
# move to the comment-end state; append "-" plus U+FFFD, or "-" plus the
# character, to the comment text and move to the comment state (the "-" is the
# dash that was held back); or move to the data state (once directly, once
# with the character reconsumed).
3845 tok_state_comment_start_dash = ->
3846 switch c = txt.charAt(cur++)
3848 tok_state = tok_state_comment_end
3851 tok_cur_tag.text += "-\ufffd"
3852 tok_state = tok_state_comment
3855 tok_state = tok_state_data
3859 tok_state = tok_state_data
3860 cur -= 1 # Reconsume
3863 tok_cur_tag.text += "-#{c}"
3864 tok_state = tok_state_comment
3867 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Reads one character of comment content. Visible outcomes: move to the
# comment-end-dash state; append U+FFFD to the comment text; move to the data
# state with the character reconsumed; or append the character to the comment
# text (tok_cur_tag.text).
3868 tok_state_comment = ->
3869 switch c = txt.charAt(cur++)
3871 tok_state = tok_state_comment_end_dash
3874 tok_cur_tag.text += "\ufffd"
3877 tok_state = tok_state_data
3878 cur -= 1 # Reconsume
3881 tok_cur_tag.text += c
3884 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Reads one character after a single '-' inside a comment. Visible outcomes:
# move to the comment-end state; append "-" plus U+FFFD, or "-" plus the
# character, to the comment text and return to the comment state; or move to
# the data state with the character reconsumed.
3885 tok_state_comment_end_dash = ->
3886 switch c = txt.charAt(cur++)
3888 tok_state = tok_state_comment_end
3891 tok_cur_tag.text += "-\ufffd"
3892 tok_state = tok_state_comment
3895 tok_state = tok_state_data
3896 cur -= 1 # Reconsume
3899 tok_cur_tag.text += "-#{c}"
3900 tok_state = tok_state_comment
3903 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Reads one character after "--" inside a comment. Visible outcomes: move to
# the data state; append "--" plus U+FFFD and return to the comment state;
# move to the comment-end-bang state; append a single '-' (state unchanged);
# move to the data state with the character reconsumed; or append "--" plus
# the character and return to the comment state.
3904 tok_state_comment_end = ->
3905 switch c = txt.charAt(cur++)
3907 tok_state = tok_state_data
3911 tok_cur_tag.text += "--\ufffd"
3912 tok_state = tok_state_comment
3915 tok_state = tok_state_comment_end_bang
3918 tok_cur_tag.text += '-'
3921 tok_state = tok_state_data
3922 cur -= 1 # Reconsume
3926 tok_cur_tag.text += "--#{c}"
3927 tok_state = tok_state_comment
3930 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Reads one character after "--!" inside a comment. Visible outcomes: append
# "--!" plus the character and move to the comment-end-dash state; move to the
# data state; append "--!" plus U+FFFD and return to the comment state; move
# to the data state with the character reconsumed; or append "--!" plus the
# character and return to the comment state.
3931 tok_state_comment_end_bang = ->
3932 switch c = txt.charAt(cur++)
3934 tok_cur_tag.text += "--!#{c}"
3935 tok_state = tok_state_comment_end_dash
3937 tok_state = tok_state_data
3941 tok_cur_tag.text += "--!\ufffd"
3942 tok_state = tok_state_comment
3945 tok_state = tok_state_data
3946 cur -= 1 # Reconsume
3949 tok_cur_tag.text += "--!#{c}"
3950 tok_state = tok_state_comment
3953 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Reads one character after "<!doctype". Whitespace moves to the
# before-doctype-name state. One branch returns to the data state, builds an
# empty-named doctype token flagged 'force-quirks' in the local el, and backs
# cur up by one. The last branch moves to the before-doctype-name state with
# the character reconsumed.
3954 tok_state_doctype = ->
3955 switch c = txt.charAt(cur++)
3956 when "\t", "\u000a", "\u000c", ' '
3957 tok_state = tok_state_before_doctype_name
3960 tok_state = tok_state_data
3961 el = new_doctype_token ''
3962 el.flag 'force-quirks', true
3963 cur -= 1 # Reconsume
3967 tok_state = tok_state_before_doctype_name
3968 cur -= 1 # Reconsume
3971 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Reads one character before the doctype name. Branches that start a name
# create the doctype token in tok_cur_tag (seeded with the lower-cased
# character, U+FFFD, or the character as-is) and move to the doctype-name
# state. Two branches instead build an empty-named doctype token flagged
# 'force-quirks' in the local el and return to the data state; one of them
# also backs cur up by one.
3972 tok_state_before_doctype_name = ->
3973 c = txt.charAt(cur++)
3974 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3977 tok_cur_tag = new_doctype_token c.toLowerCase()
3978 tok_state = tok_state_doctype_name
3982 tok_cur_tag = new_doctype_token "\ufffd"
3983 tok_state = tok_state_doctype_name
3987 el = new_doctype_token ''
3988 el.flag 'force-quirks', true
3989 tok_state = tok_state_data
3993 tok_state = tok_state_data
3994 el = new_doctype_token ''
3995 el.flag 'force-quirks', true
3996 cur -= 1 # Reconsume
3999 tok_cur_tag = new_doctype_token c
4000 tok_state = tok_state_doctype_name
4003 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Reads one character of the doctype name. Whitespace moves to the
# after-doctype-name state. Other visible outcomes: move to the data state;
# append the lower-cased character, U+FFFD, or the character as-is to
# tok_cur_tag.name; or flag 'force-quirks' and return to the data state with
# the character reconsumed.
4004 tok_state_doctype_name = ->
4005 c = txt.charAt(cur++)
4006 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4007 tok_state = tok_state_after_doctype_name
4010 tok_state = tok_state_data
4013 tok_cur_tag.name += c.toLowerCase()
4017 tok_cur_tag.name += "\ufffd"
4021 tok_state = tok_state_data
4022 tok_cur_tag.flag 'force-quirks', true
4023 cur -= 1 # Reconsume
4026 tok_cur_tag.name += c
4029 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Reads one character after the doctype name. Visible outcomes: move to the
# data state; flag 'force-quirks' and return to the data state with the
# character reconsumed; recognise the keywords "public" / "system" in any
# letter case; or flag 'force-quirks' and move to the bogus-doctype state.
# The keyword checks start at cur - 1 because the keyword's first character
# has already been consumed into c.
4030 tok_state_after_doctype_name = ->
4031 c = txt.charAt(cur++)
4032 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4035 tok_state = tok_state_data
4039 tok_state = tok_state_data
4040 tok_cur_tag.flag 'force-quirks', true
4041 cur -= 1 # Reconsume
4044 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4046 tok_state = tok_state_after_doctype_public_keyword
4048 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4050 tok_state = tok_state_after_doctype_system_keyword
4053 tok_cur_tag.flag 'force-quirks', true
4054 tok_state = tok_state_bogus_doctype
4057 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Reads one character after the "public" keyword. Whitespace moves to the
# before-doctype-public-identifier state. Two branches reset
# tok_cur_tag.public_identifier to '' and move to the double- or single-quoted
# public identifier state. The remaining branches flag 'force-quirks' and move
# to the data state (one of them reconsuming the character) or to the
# bogus-doctype state.
4058 tok_state_after_doctype_public_keyword = ->
4059 c = txt.charAt(cur++)
4060 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4061 tok_state = tok_state_before_doctype_public_identifier
4065 tok_cur_tag.public_identifier = ''
4066 tok_state = tok_state_doctype_public_identifier_double_quoted
4070 tok_cur_tag.public_identifier = ''
4071 tok_state = tok_state_doctype_public_identifier_single_quoted
4075 tok_cur_tag.flag 'force-quirks', true
4076 tok_state = tok_state_data
4080 tok_state = tok_state_data
4081 tok_cur_tag.flag 'force-quirks', true
4082 cur -= 1 # Reconsume
4086 tok_cur_tag.flag 'force-quirks', true
4087 tok_state = tok_state_bogus_doctype
4090 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Reads one character before the public identifier. Two branches reset
# tok_cur_tag.public_identifier to '' and move to the double- or single-quoted
# public identifier state. The remaining branches flag 'force-quirks' and move
# to the data state (one of them reconsuming the character) or to the
# bogus-doctype state.
4091 tok_state_before_doctype_public_identifier = ->
4092 c = txt.charAt(cur++)
4093 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4097 tok_cur_tag.public_identifier = ''
4098 tok_state = tok_state_doctype_public_identifier_double_quoted
4102 tok_cur_tag.public_identifier = ''
4103 tok_state = tok_state_doctype_public_identifier_single_quoted
4107 tok_cur_tag.flag 'force-quirks', true
4108 tok_state = tok_state_data
4112 tok_state = tok_state_data
4113 tok_cur_tag.flag 'force-quirks', true
4114 cur -= 1 # Reconsume
4118 tok_cur_tag.flag 'force-quirks', true
4119 tok_state = tok_state_bogus_doctype
4123 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Reads one character of a double-quoted public identifier. Visible outcomes:
# move to the after-doctype-public-identifier state; append U+FFFD or the
# character to tok_cur_tag.public_identifier; or flag 'force-quirks' and
# return to the data state (one branch reconsuming the character).
4124 tok_state_doctype_public_identifier_double_quoted = ->
4125 c = txt.charAt(cur++)
4127 tok_state = tok_state_after_doctype_public_identifier
4131 tok_cur_tag.public_identifier += "\ufffd"
4135 tok_cur_tag.flag 'force-quirks', true
4136 tok_state = tok_state_data
4140 tok_state = tok_state_data
4141 tok_cur_tag.flag 'force-quirks', true
4142 cur -= 1 # Reconsume
4145 tok_cur_tag.public_identifier += c
4148 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Reads one character of a single-quoted public identifier. Visible outcomes:
# move to the after-doctype-public-identifier state; append U+FFFD or the
# character to tok_cur_tag.public_identifier; or flag 'force-quirks' and
# return to the data state (one branch reconsuming the character).
4149 tok_state_doctype_public_identifier_single_quoted = ->
4150 c = txt.charAt(cur++)
4152 tok_state = tok_state_after_doctype_public_identifier
4156 tok_cur_tag.public_identifier += "\ufffd"
4160 tok_cur_tag.flag 'force-quirks', true
4161 tok_state = tok_state_data
4165 tok_state = tok_state_data
4166 tok_cur_tag.flag 'force-quirks', true
4167 cur -= 1 # Reconsume
4170 tok_cur_tag.public_identifier += c
4173 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Reads one character after the public identifier. Whitespace moves to the
# between-public-and-system-identifiers state. Other visible outcomes: move to
# the data state; reset tok_cur_tag.system_identifier to '' and move to the
# double- or single-quoted system identifier state; or flag 'force-quirks' and
# move to the data state (reconsuming the character) or the bogus-doctype
# state.
4174 tok_state_after_doctype_public_identifier = ->
4175 c = txt.charAt(cur++)
4176 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4177 tok_state = tok_state_between_doctype_public_and_system_identifiers
4180 tok_state = tok_state_data
4184 tok_cur_tag.system_identifier = ''
4185 tok_state = tok_state_doctype_system_identifier_double_quoted
4189 tok_cur_tag.system_identifier = ''
4190 tok_state = tok_state_doctype_system_identifier_single_quoted
4194 tok_state = tok_state_data
4195 tok_cur_tag.flag 'force-quirks', true
4196 cur -= 1 # Reconsume
4200 tok_cur_tag.flag 'force-quirks', true
4201 tok_state = tok_state_bogus_doctype
4204 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Reads one character between the public and system identifiers. Visible
# outcomes: move to the data state; reset tok_cur_tag.system_identifier to ''
# and move to the double- or single-quoted system identifier state; or flag
# 'force-quirks' and move to the data state (reconsuming the character) or the
# bogus-doctype state.
4205 tok_state_between_doctype_public_and_system_identifiers = ->
4206 c = txt.charAt(cur++)
4207 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4210 tok_state = tok_state_data
4214 tok_cur_tag.system_identifier = ''
4215 tok_state = tok_state_doctype_system_identifier_double_quoted
4219 tok_cur_tag.system_identifier = ''
4220 tok_state = tok_state_doctype_system_identifier_single_quoted
4224 tok_state = tok_state_data
4225 tok_cur_tag.flag 'force-quirks', true
4226 cur -= 1 # Reconsume
4230 tok_cur_tag.flag 'force-quirks', true
4231 tok_state = tok_state_bogus_doctype
4234 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Reads one character after the "system" keyword. Whitespace moves to the
# before-doctype-system-identifier state. Two branches reset
# tok_cur_tag.system_identifier to '' and move to the double- or single-quoted
# system identifier state. The remaining branches flag 'force-quirks' and move
# to the data state (one of them reconsuming the character) or to the
# bogus-doctype state.
4235 tok_state_after_doctype_system_keyword = ->
4236 c = txt.charAt(cur++)
4237 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4238 tok_state = tok_state_before_doctype_system_identifier
4242 tok_cur_tag.system_identifier = ''
4243 tok_state = tok_state_doctype_system_identifier_double_quoted
4247 tok_cur_tag.system_identifier = ''
4248 tok_state = tok_state_doctype_system_identifier_single_quoted
4252 tok_cur_tag.flag 'force-quirks', true
4253 tok_state = tok_state_data
4257 tok_state = tok_state_data
4258 tok_cur_tag.flag 'force-quirks', true
4259 cur -= 1 # Reconsume
4263 tok_cur_tag.flag 'force-quirks', true
4264 tok_state = tok_state_bogus_doctype
4267 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Reads one character before the system identifier. Two branches reset
# tok_cur_tag.system_identifier to '' and move to the double- or single-quoted
# system identifier state. The remaining branches flag 'force-quirks' and move
# to the data state (one of them reconsuming the character) or to the
# bogus-doctype state.
4268 tok_state_before_doctype_system_identifier = ->
4269 c = txt.charAt(cur++)
4270 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4273 tok_cur_tag.system_identifier = ''
4274 tok_state = tok_state_doctype_system_identifier_double_quoted
4277 tok_cur_tag.system_identifier = ''
4278 tok_state = tok_state_doctype_system_identifier_single_quoted
4282 tok_cur_tag.flag 'force-quirks', true
4283 tok_state = tok_state_data
4287 tok_state = tok_state_data
4288 tok_cur_tag.flag 'force-quirks', true
4289 cur -= 1 # Reconsume
4293 tok_cur_tag.flag 'force-quirks', true
4294 tok_state = tok_state_bogus_doctype
4297 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Reads one character of a double-quoted system identifier. Visible outcomes:
# move to the after-doctype-system-identifier state; append U+FFFD or the
# character to tok_cur_tag.system_identifier; or flag 'force-quirks' and
# return to the data state (one branch reconsuming the character).
4298 tok_state_doctype_system_identifier_double_quoted = ->
4299 c = txt.charAt(cur++)
4301 tok_state = tok_state_after_doctype_system_identifier
4305 tok_cur_tag.system_identifier += "\ufffd"
4309 tok_cur_tag.flag 'force-quirks', true
4310 tok_state = tok_state_data
4314 tok_state = tok_state_data
4315 tok_cur_tag.flag 'force-quirks', true
4316 cur -= 1 # Reconsume
4319 tok_cur_tag.system_identifier += c
4322 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Reads one character of a single-quoted system identifier. Visible outcomes:
# move to the after-doctype-system-identifier state; append U+FFFD or the
# character to tok_cur_tag.system_identifier; or flag 'force-quirks' and
# return to the data state (one branch reconsuming the character).
4323 tok_state_doctype_system_identifier_single_quoted = ->
4324 c = txt.charAt(cur++)
4326 tok_state = tok_state_after_doctype_system_identifier
4330 tok_cur_tag.system_identifier += "\ufffd"
4334 tok_cur_tag.flag 'force-quirks', true
4335 tok_state = tok_state_data
4339 tok_state = tok_state_data
4340 tok_cur_tag.flag 'force-quirks', true
4341 cur -= 1 # Reconsume
4344 tok_cur_tag.system_identifier += c
4347 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Reads one character after the system identifier. Visible outcomes: move to
# the data state; flag 'force-quirks' and return to the data state with the
# character reconsumed; or move to the bogus-doctype state, deliberately
# WITHOUT setting 'force-quirks' (unlike the other doctype states).
4348 tok_state_after_doctype_system_identifier = ->
4349 c = txt.charAt(cur++)
4350 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4353 tok_state = tok_state_data
4357 tok_state = tok_state_data
4358 tok_cur_tag.flag 'force-quirks', true
4359 cur -= 1 # Reconsume
4363 # do _not_ tok_cur_tag.flag 'force-quirks', true
4364 tok_state = tok_state_bogus_doctype
4367 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Reads one character of a malformed doctype. Both visible branches return to
# the data state; the second also backs cur up by one so the character is
# processed there. No visible branch stores the character anywhere.
4368 tok_state_bogus_doctype = ->
4369 c = txt.charAt(cur++)
4371 tok_state = tok_state_data
4374 tok_state = tok_state_data
4375 cur -= 1 # Reconsume
4380 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Consumes a whole CDATA section in one step: switches back to the data state,
# looks for the terminating "]]>" at or after cur, and returns the section
# content as one character token - either the rest of the input or the text up
# to the terminator. Note that next_gt here holds the index of "]]>", not of a
# lone '>'.
4381 tok_state_cdata_section = ->
4382 tok_state = tok_state_data
4383 next_gt = txt.indexOf ']]>', cur
4385 val = txt.substr cur
4388 val = txt.substr cur, (next_gt - cur)
4390 return new_character_token val # fixfull split
4392 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4393 # Don't set this as a state, just call it
4394 # returns a string (NOT a text node)
# Decodes the character reference that starts at cur (just after the '&').
#
# allowed_char: an extra character which, when it is the next character, means
#               this is not a character reference (callers in attribute values
#               pass the closing quote or '>'); defaults to null.
# in_attr:      true when called while reading an attribute value; defaults to
#               false. It enables the attribute-specific exceptions for
#               legacy references that lack a ';'.
# Handles numeric references (with a hexadecimal form introduced by 'x'/'X'),
# named references terminated by ';' (via decode_named_char_ref), and legacy
# named references without a ';' (via legacy_char_refs).
4395 parse_character_reference = (allowed_char = null, in_attr = false) ->
4396 if cur >= txt.length
4398 switch c = txt.charAt(cur)
4399 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4400 # explicitly not a parse error
4403 # there has to be "one or more" alnums between & and ; to be a parse error
4406 if cur + 1 >= txt.length
4408 if txt.charAt(cur + 1).toLowerCase() is 'x'
# scan the run of characters that belong to charset, starting at start
4417 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4422 if txt.charAt(start + i) is ';'
4426 code_point = txt.substr(start, i)
# drop leading zeros (keeping at least one digit) before parsing
4427 while code_point.charAt(0) is '0' and code_point.length > 1
4428 code_point = code_point.substr 1
4429 code_point = parseInt(code_point, base)
# code points listed in unicode_fixes map to a fixed replacement
4430 if unicode_fixes[code_point]?
4432 return unicode_fixes[code_point]
# surrogate range, or beyond the last Unicode code point
4434 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
# control characters, U+FDD0..U+FDEF, and the U+xFFFE / U+xFFFF pair of every plane
4438 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4440 return from_code_point code_point
# named reference: measure the run of alphanumerics after the '&'
4444 if alnum.indexOf(txt.charAt(cur + i)) is -1
4447 # exit early, because parse_error() below needs at least one alnum
4449 if txt.charAt(cur + i) is ';'
4450 i += 1 # include ';' terminator in value
4451 decoded = decode_named_char_ref txt.substr(cur, i)
4458 # no ';' terminator (only legacy char refs)
4460 for i in [2..max] # no prefix matches, so ok to check shortest first
4461 c = legacy_char_refs[txt.substr(cur, i)]
4464 if txt.charAt(cur + i) is '='
4465 # "because some legacy user agents will
4466 # misinterpret the markup in those cases"
4469 if alnum.indexOf(txt.charAt(cur + i)) > -1
4470 # this makes attributes forgiving about url args
4472 # ok, and besides the weird exceptions for attributes...
4473 # return the matching char
4474 cur += i # consume entity chars
4475 parse_error() # because no terminating ";"
4479 return # never reached
4481 # tree constructor initialization
4482 # see comments on TYPE_TAG/etc for the structure of this data
# Initial parser state. doc is the root node: a TYPE_TAG node named 'html' in
# the HTML namespace.
4485 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4487 afe = [] # active formatting elements
4488 template_ins_modes = []
4489 ins_mode = ins_mode_initial
4490 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to enabled unless the caller passes args.scripting
4491 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4492 flag_frameset_ok = true
4494 flag_foster_parenting = false
4495 form_element_pointer = null
4496 temporary_buffer = null
4497 pending_table_character_tokens = []
4498 head_element_pointer = null
4499 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4500 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4502 # tokenizer initialization
4503 tok_state = tok_state_data
4505 # text pre-processing
4506 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# NUL becomes U+FFFD and every CRLF / lone CR becomes LF before tokenizing.
# CRLF must be replaced before lone CR so a pair yields one newline, not two.
4507 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4508 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4509 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# NOTE(review): special-cases one input by its args.name; what the branch does
# is not visible here - confirm it is still needed.
4511 if args.name is "plain-text-unsafe.dat #4"
4514 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4519 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a serialized string for els by appending each element's own
# serialize(shallow, show_ids) output; shallow and show_ids are passed through
# unchanged. NOTE(review): the loop and return value are not visible here -
# presumably the accumulated string is returned; confirm.
4522 serialize_els = (els, shallow, show_ids) ->
4528 serialized += t.serialize shallow, show_ids
# Public interface: the parser entry point, the debug-log helpers, and the
# node-type and namespace constants.
4531 module.exports.parse_html = parse_html
4532 module.exports.debug_log_reset = debug_log_reset
4533 module.exports.debug_log_each = debug_log_each
4534 module.exports.TYPE_TAG = TYPE_TAG
4535 module.exports.TYPE_TEXT = TYPE_TEXT
4536 module.exports.TYPE_COMMENT = TYPE_COMMENT
4537 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4538 module.exports.NS_HTML = NS_HTML
4539 module.exports.NS_MATHML = NS_MATHML
4540 module.exports.NS_SVG = NS_SVG