1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WTAG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # the spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
92 debug_log_each = (cb) ->
93 for str in g_debug_log
98 constructor: (type, args = {}) ->
99 @type = type # one of the TYPE_* constants above
100 @name = args.name ? '' # tag name
101 @text = args.text ? '' # contents for text/comment nodes
102 @attrs = args.attrs ? {}
103 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104 @children = args.children ? []
105 @namespace = args.namespace ? NS_HTML
106 @parent = args.parent ? null
107 @token = args.token ? null
108 @flags = args.flags ? {}
112 @id = "#{++prev_node_id}"
113 acknowledge_self_closing: ->
115 @token.flag 'did_self_close'
117 @flag 'did_self_close', true
118 flag: (key, value = null) ->
123 serialize: (shallow = false, show_ids = false) -> # for unit tests
128 ret += JSON.stringify @name
143 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149 ret += c.serialize shallow, show_ids
153 ret += JSON.stringify @text
156 ret += JSON.stringify @text
158 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
161 when TYPE_AAA_BOOKMARK
162 ret += 'aaa_bookmark'
165 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
168 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) that carries only a tag name.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name: name})
# Build an end-tag token (TYPE_END_TAG) that carries only a tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name: name})
# Build a tree element node (TYPE_TAG) with the given tag name.
new_element = (name) ->
	new Node(TYPE_TAG, {name: name})
# Build a text node (TYPE_TEXT) holding `txt`.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are created by the very same function as text nodes.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose text is `txt`.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Build a doctype token (TYPE_DOCTYPE) with the given name.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name: name})
183 return new Node TYPE_EOF
185 return new Node TYPE_AFE_MARKER
# Build a placeholder node of type TYPE_AAA_BOOKMARK (adoption agency bookmark).
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character classes used by the tokenizer, stored as plain strings so that
# membership can be tested with indexOf.
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# letters (both cases) followed by digits
alnum = [lc_alpha, uc_alpha, digits].join ''
# decimal digits followed by hex letters in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True when `str` is exactly one character and that character is in uc_alpha.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1
# True when `str` is exactly one character and that character is in lc_alpha.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
200 # some SVG elements have dashes in them
201 tag_name_chars = alnum + "-"
203 # http://www.w3.org/TR/html5/infrastructure.html#space-character
204 space_chars = "\u0009\u000a\u000c\u000d\u0020"
206 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is exactly one
# character found in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	txt = t.text
	txt.length is 1 and space_chars.indexOf(txt) isnt -1
210 is_input_hidden_tok = (t) ->
211 return false unless t.type is TYPE_START_TAG
214 if a[1].toLowerCase() is 'hidden'
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
255 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
275 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
276 raw_text_elements = ['script', 'style']
277 escapable_raw_text_elements = ['textarea', 'title']
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
280 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
299 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305 'determinant', 'diff', 'divergence', 'divide', 'domain',
306 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326 'vectorproduct', 'xor'
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
333 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
345 menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
347 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
348 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
349 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
350 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
351 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
352 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
353 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
356 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
357 'annotation-xml':NS_MATHML,
360 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
363 formatting_elements = {
364 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
365 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
369 mathml_text_integration = {
370 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when the namespace registered for el.name in the
# mathml_text_integration table is the namespace `el` itself carries.
is_mathml_text_integration_point = (el) ->
	registered_ns = mathml_text_integration[el.name]
	registered_ns is el.namespace
374 is_html_integration = (el) -> # DON'T PASS A TOKEN
375 if el.namespace is NS_MATHML
376 if el.name is 'annotation-xml'
377 if el.attrs.encoding?
378 if el.attrs.encoding.toLowerCase() is 'text/html'
380 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
383 if el.namespace is NS_SVG
384 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
389 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
392 foster_parenting_targets = {
# True when special_elements maps the element's name to the element's own
# namespace (i.e. the name/namespace pair is listed as "special").
el_is_special = (e) ->
	listed_ns = special_elements[e.name]
	listed_ns is e.namespace
# address, div and p (HTML namespace): the entries excluded by
# el_is_special_not_adp below.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
# True when `el` is listed in special_elements under its own namespace and
# is not one of the adp_els entries.
el_is_special_not_adp = (el) ->
	return false if special_elements[el.name] isnt el.namespace
	adp_els[el.name] isnt el.namespace
422 altglyphdef: 'altGlyphDef'
423 altglyphitem: 'altGlyphItem'
424 animatecolor: 'animateColor'
425 animatemotion: 'animateMotion'
426 animatetransform: 'animateTransform'
429 fecolormatrix: 'feColorMatrix'
430 fecomponenttransfer: 'feComponentTransfer'
431 fecomposite: 'feComposite'
432 feconvolvematrix: 'feConvolveMatrix'
433 fediffuselighting: 'feDiffuseLighting'
434 fedisplacementmap: 'feDisplacementMap'
435 fedistantlight: 'feDistantLight'
436 fedropshadow: 'feDropShadow'
442 fegaussianblur: 'feGaussianBlur'
445 femergenode: 'feMergeNode'
446 femorphology: 'feMorphology'
448 fepointlight: 'fePointLight'
449 fespecularlighting: 'feSpecularLighting'
450 fespotlight: 'feSpotLight'
452 feturbulence: 'feTurbulence'
453 foreignobject: 'foreignObject'
455 lineargradient: 'linearGradient'
456 radialgradient: 'radialGradient'
459 svg_attribute_fixes = {
460 attributename: 'attributeName'
461 attributetype: 'attributeType'
462 basefrequency: 'baseFrequency'
463 baseprofile: 'baseProfile'
465 clippathunits: 'clipPathUnits'
466 contentscripttype: 'contentScriptType'
467 contentstyletype: 'contentStyleType'
468 diffuseconstant: 'diffuseConstant'
470 externalresourcesrequired: 'externalResourcesRequired'
471 filterres: 'filterRes'
472 filterunits: 'filterUnits'
474 gradienttransform: 'gradientTransform'
475 gradientunits: 'gradientUnits'
476 kernelmatrix: 'kernelMatrix'
477 kernelunitlength: 'kernelUnitLength'
478 keypoints: 'keyPoints'
479 keysplines: 'keySplines'
481 lengthadjust: 'lengthAdjust'
482 limitingconeangle: 'limitingConeAngle'
483 markerheight: 'markerHeight'
484 markerunits: 'markerUnits'
485 markerwidth: 'markerWidth'
486 maskcontentunits: 'maskContentUnits'
487 maskunits: 'maskUnits'
488 numoctaves: 'numOctaves'
489 pathlength: 'pathLength'
490 patterncontentunits: 'patternContentUnits'
491 patterntransform: 'patternTransform'
492 patternunits: 'patternUnits'
493 pointsatx: 'pointsAtX'
494 pointsaty: 'pointsAtY'
495 pointsatz: 'pointsAtZ'
496 preservealpha: 'preserveAlpha'
497 preserveaspectratio: 'preserveAspectRatio'
498 primitiveunits: 'primitiveUnits'
501 repeatcount: 'repeatCount'
502 repeatdur: 'repeatDur'
503 requiredextensions: 'requiredExtensions'
504 requiredfeatures: 'requiredFeatures'
505 specularconstant: 'specularConstant'
506 specularexponent: 'specularExponent'
507 spreadmethod: 'spreadMethod'
508 startoffset: 'startOffset'
509 stddeviation: 'stdDeviation'
510 stitchtiles: 'stitchTiles'
511 surfacescale: 'surfaceScale'
512 systemlanguage: 'systemLanguage'
513 tablevalues: 'tableValues'
516 textlength: 'textLength'
518 viewtarget: 'viewTarget'
519 xchannelselector: 'xChannelSelector'
520 ychannelselector: 'yChannelSelector'
521 zoomandpan: 'zoomAndPan'
523 adjust_mathml_attributes = (t) ->
525 if a[0] is 'definitionurl'
526 a[0] = 'definitionURL'
528 adjust_svg_attributes = (t) ->
530 if svg_attribute_fixes[a[0]]?
531 a[0] = svg_attribute_fixes[a[0]]
533 adjust_foreign_attributes = (t) ->
537 # decode_named_char_ref()
539 # The list of named character references is _huge_ so ask the browser to decode
540 # for us instead of wasting bandwidth/space on including the table here.
542 # Pass without the "&" but with the ";" examples:
543 # for "&amp;" pass "amp;"
544 # for "&#x2032;" pass "x2032;"
547 textarea: document.createElement('textarea')
549 # TODO test this in IE8
550 decode_named_char_ref = (txt) ->
552 decoded = g_dncr.cache[txt]
553 return decoded if decoded?
554 g_dncr.textarea.innerHTML = txt
555 decoded = g_dncr.textarea.value
556 return null if decoded is txt
557 return g_dncr.cache[txt] = decoded
559 parse_html = (args) ->
561 cur = null # index of next char in txt to be parsed
562 # declare doc and tokenizer variables so they're in scope below
564 open_els = null # stack of open elements
565 afe = null # active formatting elements
566 template_ins_modes = null
568 original_ins_mode = null
570 tok_cur_tag = null # partially parsed tag
571 flag_scripting = null
572 flag_frameset_ok = null
574 flag_foster_parenting = null
575 form_element_pointer = null
576 temporary_buffer = null
577 pending_table_character_tokens = null
578 head_element_pointer = null
579 flag_fragment_parsing = null
580 context_element = null
589 console.log "Parse error at character #{cur} of #{txt.length}"
591 afe_push = (new_el) ->
594 if el.name is new_el.name and el.namespace is new_el.namespace
596 continue unless new_el.attrs[k] is v
597 for k, v of new_el.attrs
598 continue unless el.attrs[k] is v
605 afe.unshift new_afe_marker()
607 # the functions below implement the Tree Construction algorithm
608 # http://www.w3.org/TR/html5/syntax.html#tree-construction
610 # But first... the helpers
611 template_tag_is_open = ->
613 if t.name is 'template' and t.namespace is NS_HTML
616 is_in_scope_x = (tag_name, scope, namespace) ->
618 if t.name is tag_name and (namespace is null or namespace is t.namespace)
620 if scope[t.name] is t.namespace
623 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
625 if t.name is tag_name and (namespace is null or namespace is t.namespace)
627 if scope[t.name] is t.namespace
629 if scope2[t.name] is t.namespace
633 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
634 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
635 template: NS_HTML, mi: NS_MATHML,
637 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
638 'annotation-xml': NS_MATHML,
640 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra name -> namespace tables used by the scope helpers below.
# button_scopers and li_scopers are passed alongside standard_scopers;
# table_scopers is passed on its own.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check against the standard_scopers table; delegates to is_in_scope_x.
# `namespace` defaults to null and is forwarded unchanged.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check against standard_scopers plus button_scopers; delegates to
# is_in_scope_x_y. `namespace` defaults to null and is forwarded unchanged.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check against the table_scopers table only; delegates to
# is_in_scope_x. `namespace` defaults to null and is forwarded unchanged.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope check against standard_scopers plus li_scopers; delegates to
# is_in_scope_x_y. `namespace` defaults to null and is forwarded unchanged.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
# http://www.w3.org/TR/html5/syntax.html#has-an-element-in-select-scope
#
# Walks open_els starting at index 0 (the current node).
# Returns true as soon as an element named `tag_name` is found; when
# `namespace` is not null the element's namespace must match it too.
# Returns false as soon as a scope boundary is reached, or when the stack
# is exhausted without a match.
#
# In select scope every element is a boundary EXCEPT optgroup and option in
# the HTML namespace. The previous condition
# (namespace isnt NS_HTML and name isnt 'optgroup' and name isnt 'option')
# let any other HTML element (e.g. div) be skipped instead of ending the scope.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		# only HTML <optgroup>/<option> are transparent to this scope
		transparent = t.namespace is NS_HTML and (t.name is 'optgroup' or t.name is 'option')
		return false unless transparent
	return false
661 # this checks for a particular element, not by name
662 # this requires a namespace match
663 el_is_in_scope = (needle) ->
667 if standard_scopers[el.name] is el.namespace
671 clear_to_table_stopers = {
676 clear_stack_to_table_context = ->
678 if clear_to_table_stopers[open_els[0].name]?
682 clear_to_table_body_stopers = {
689 clear_stack_to_table_body_context = ->
691 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
695 clear_to_table_row_stopers = {
700 clear_stack_to_table_row_context = ->
702 if clear_to_table_row_stopers[open_els[0].name]?
706 clear_afe_to_marker = ->
708 return unless afe.length > 0 # this happens in fragment case, ?spec error
710 if el.type is TYPE_AFE_MARKER
715 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
717 # 1. Let last be false.
719 # 2. Let node be the last node in the stack of open elements.
721 node = open_els[node_i]
722 # 3. Loop: If node is the first node in the stack of open elements,
723 # then set last to true, and, if the parser was originally created as
724 # part of the HTML fragment parsing algorithm (fragment case) set node
725 # to the context element.
727 if node_i is open_els.length - 1
729 # fixfull (fragment case)
731 # 4. If node is a select element, run these substeps:
732 if node.name is 'select' and node.namespace is NS_HTML
733 # 1. If last is true, jump to the step below labeled done.
735 # 2. Let ancestor be node.
738 # 3. Loop: If ancestor is the first node in the stack of
739 # open elements, jump to the step below labeled done.
741 if ancestor_i is open_els.length - 1
743 # 4. Let ancestor be the node before ancestor in the stack
746 ancestor = open_els[ancestor_i]
747 # 5. If ancestor is a template node, jump to the step below
749 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
751 # 6. If ancestor is a table node, switch the insertion mode
752 # to "in select in table" and abort these steps.
753 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
754 ins_mode = ins_mode_in_select_in_table
756 # 7. Jump back to the step labeled loop.
757 # 8. Done: Switch the insertion mode to "in select" and abort
759 ins_mode = ins_mode_in_select
761 # 5. If node is a td or th element and last is false, then switch
762 # the insertion mode to "in cell" and abort these steps.
763 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
764 ins_mode = ins_mode_in_cell
766 # 6. If node is a tr element, then switch the insertion mode to "in
767 # row" and abort these steps.
768 if node.name is 'tr' and node.namespace is NS_HTML
769 ins_mode = ins_mode_in_row
771 # 7. If node is a tbody, thead, or tfoot element, then switch the
772 # insertion mode to "in table body" and abort these steps.
773 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
774 ins_mode = ins_mode_in_table_body
776 # 8. If node is a caption element, then switch the insertion mode
777 # to "in caption" and abort these steps.
778 if node.name is 'caption' and node.namespace is NS_HTML
779 ins_mode = ins_mode_in_caption
781 # 9. If node is a colgroup element, then switch the insertion mode
782 # to "in column group" and abort these steps.
783 if node.name is 'colgroup' and node.namespace is NS_HTML
784 ins_mode = ins_mode_in_column_group
786 # 10. If node is a table element, then switch the insertion mode to
787 # "in table" and abort these steps.
788 if node.name is 'table' and node.namespace is NS_HTML
789 ins_mode = ins_mode_in_table
791 # 11. If node is a template element, then switch the insertion mode
792 # to the current template insertion mode and abort these steps.
793 if node.name is 'template' and node.namespace is NS_HTML
794 ins_mode = template_ins_modes[0]
796 # 12. If node is a head element and last is true, then switch the
797 # insertion mode to "in body" ("in body"! not "in head"!) and abort
798 # these steps. (fragment case)
799 if node.name is 'head' and node.namespace is NS_HTML and last
800 ins_mode = ins_mode_in_body
802 # 13. If node is a head element and last is false, then switch the
803 # insertion mode to "in head" and abort these steps.
804 if node.name is 'head' and node.namespace is NS_HTML and last is false
805 ins_mode = ins_mode_in_head
807 # 14. If node is a body element, then switch the insertion mode to
808 # "in body" and abort these steps.
809 if node.name is 'body' and node.namespace is NS_HTML
810 ins_mode = ins_mode_in_body
812 # 15. If node is a frameset element, then switch the insertion mode
813 # to "in frameset" and abort these steps. (fragment case)
814 if node.name is 'frameset' and node.namespace is NS_HTML
815 ins_mode = ins_mode_in_frameset
817 # 16. If node is an html element, run these substeps:
818 if node.name is 'html' and node.namespace is NS_HTML
819 # 1. If the head element pointer is null, switch the insertion
820 # mode to "before head" and abort these steps. (fragment case)
821 if head_element_pointer is null
822 ins_mode = ins_mode_before_head
824 # 2. Otherwise, the head element pointer is not null,
825 # switch the insertion mode to "after head" and abort these
827 ins_mode = ins_mode_after_head
829 # 17. If last is true, then switch the insertion mode to "in body"
830 # and abort these steps. (fragment case)
832 ins_mode = ins_mode_in_body
834 # 18. Let node now be the node before node in the stack of open
837 node = open_els[node_i]
838 # 19. Return to the step labeled loop.
842 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
843 adjusted_current_node = ->
844 if open_els.length is 1 and flag_fragment_parsing
845 return context_element
848 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
849 # this implementation is structured (mostly) as described at the link above.
850 # capitalized comments are the "labels" described at the link above.
852 return if afe.length is 0
853 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
858 if i is afe.length - 1
861 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
866 el = insert_html_element afe[i].token
871 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
872 # adoption agency algorithm
874 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
875 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
876 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
877 adoption_agency = (subject) ->
878 debug_log "adoption_agency()"
879 debug_log "tree: #{serialize_els doc.children, false, true}"
880 debug_log "open_els: #{serialize_els open_els, true, true}"
881 debug_log "afe: #{serialize_els afe, true, true}"
882 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
885 # remove it from the list of active formatting elements (if found)
890 debug_log "aaa: starting off with subject on top of stack, exiting"
897 # 5. Let formatting element be the last element in the list of
898 # active formatting elements that: is between the end of the list
899 # and the last scope marker in the list, if any, or the start of
900 # the list otherwise, and has the tag name subject.
902 for t, fe_of_afe in afe
903 if t.type is TYPE_AFE_MARKER
908 # If there is no such element, then abort these steps and instead
909 # act as described in the "any other end tag" entry above.
911 debug_log "aaa: fe not found in afe"
912 in_body_any_other_end_tag subject
914 # 6. If formatting element is not in the stack of open elements,
915 # then this is a parse error; remove the element from the list, and
918 for t, fe_of_open_els in open_els
923 debug_log "aaa: fe not found in open_els"
925 # "remove it from the list" must mean afe, since it's not in open_els
926 afe.splice fe_of_afe, 1
928 # 7. If formatting element is in the stack of open elements, but
929 # the element is not in scope, then this is a parse error; abort
931 unless el_is_in_scope fe
932 debug_log "aaa: fe not in scope"
935 # 8. If formatting element is not the current node, this is a parse
936 # error. (But do not abort these steps.)
937 unless open_els[0] is fe
940 # 9. Let furthest block be the topmost node in the stack of open
941 # elements that is lower in the stack than formatting element, and
942 # is an element in the special category. There might not be one.
944 fb_of_open_els = null
951 # and continue, to see if there's one that's more "topmost"
952 # 10. If there is no furthest block, then the UA must first pop all
953 # the nodes from the bottom of the stack of open elements, from the
954 # current node up to and including formatting element, then remove
955 # formatting element from the list of active formatting elements,
956 # and finally abort these steps.
958 debug_log "aaa: no fb"
962 afe.splice fe_of_afe, 1
964 # 11. Let common ancestor be the element immediately above
965 # formatting element in the stack of open elements.
966 ca = open_els[fe_of_open_els + 1] # common ancestor
968 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
969 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
970 bookmark = new_aaa_bookmark()
973 afe.splice i, 0, bookmark
975 node = last_node = fb
979 # 3. Let node be the element immediately above node in the
980 # stack of open elements, or if node is no longer in the stack
981 # of open elements (e.g. because it got removed by this
982 # algorithm), the element that was immediately above node in
983 # the stack of open elements before node was removed.
987 node_next = open_els[i + 1]
989 node = node_next ? node_above
990 debug_log "inner loop #{inner}"
991 debug_log "tree: #{serialize_els doc.children, false, true}"
992 debug_log "open_els: #{serialize_els open_els, true, true}"
993 debug_log "afe: #{serialize_els afe, true, true}"
994 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
995 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
996 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
997 debug_log "node: #{node.serialize true, true}"
998 # TODO make sure node_above gets re-set if/when node is removed from open_els
1000 # 4. If node is formatting element, then go to the next step in
1001 # the overall algorithm.
1004 debug_log "the meat"
1005 # 5. If inner loop counter is greater than three and node is in
1006 # the list of active formatting elements, then remove node from
1007 # the list of active formatting elements.
1013 debug_log "max out inner"
1018 # 6. If node is not in the list of active formatting elements,
1019 # then remove node from the stack of open elements and then go
1020 # back to the step labeled inner loop.
1022 debug_log "not in afe"
1023 for t, i in open_els
1025 node_above = open_els[i + 1]
1026 open_els.splice i, 1
1029 debug_log "the bones"
1030 # 7. create an element for the token for which the element node
1031 # was created, in the HTML namespace, with common ancestor as
1032 # the intended parent; replace the entry for node in the list
1033 # of active formatting elements with an entry for the new
1034 # element, replace the entry for node in the stack of open
1035 # elements with an entry for the new element, and let node be
1037 new_node = token_to_element node.token, NS_HTML, ca
1041 debug_log "replaced in afe"
1043 for t, i in open_els
1045 node_above = open_els[i + 1]
1046 open_els[i] = new_node
1047 debug_log "replaced in open_els"
1050 # 8. If last node is furthest block, then move the
1051 # aforementioned bookmark to be immediately after the new node
1052 # in the list of active formatting elements.
1057 debug_log "removed bookmark"
1061 # "after" means lower
1062 afe.splice i, 0, bookmark # "after as <-
1063 debug_log "placed bookmark after node"
1064 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1066 # 9. Insert last node into node, first removing it from its
1067 # previous parent node if any.
1068 if last_node.parent?
1069 debug_log "last_node has parent"
1070 for c, i in last_node.parent.children
1072 debug_log "removing last_node from parent"
1073 last_node.parent.children.splice i, 1
1075 node.children.push last_node
1076 last_node.parent = node
1077 # 10. Let last node be node.
1080 # 11. Return to the step labeled inner loop.
1081 # 14. Insert whatever last node ended up being in the previous step
1082 # at the appropriate place for inserting a node, but using common
1083 # ancestor as the override target.
1085 # In the case where fe is immediately followed by fb:
1086 # * inner loop exits out early (node==fe)
1088 # * last_node is still in the tree (not a duplicate)
1089 if last_node.parent?
1090 debug_log "FEFIRST? last_node has parent"
1091 for c, i in last_node.parent.children
1093 debug_log "removing last_node from parent"
1094 last_node.parent.children.splice i, 1
1097 debug_log "after aaa inner loop"
1098 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1099 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1100 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1101 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1102 debug_log "tree: #{serialize_els doc.children, false, true}"
1107 # can't use standard insert token thing, because it's already in
1108 # open_els and must stay at its current position in open_els
1109 dest = adjusted_insertion_location ca
1110 dest[0].children.splice dest[1], 0, last_node
1111 last_node.parent = dest[0]
1114 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1115 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1116 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1117 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1118 debug_log "tree: #{serialize_els doc.children, false, true}"
1120 # 15. Create an element for the token for which formatting element
1121 # was created, in the HTML namespace, with furthest block as the
1123 new_element = token_to_element fe.token, NS_HTML, fb
1124 # 16. Take all of the child nodes of furthest block and append them
1125 # to the element created in the last step.
1126 while fb.children.length
1127 t = fb.children.shift()
1128 t.parent = new_element
1129 new_element.children.push t
1130 # 17. Append that new element to furthest block.
1131 new_element.parent = fb
1132 fb.children.push new_element
1133 # 18. Remove formatting element from the list of active formatting
1134 # elements, and insert the new element into the list of active
1135 # formatting elements at the position of the aforementioned
1143 afe[i] = new_element
1145 # 19. Remove formatting element from the stack of open elements,
1146 # and insert the new element into the stack of open elements
1147 # immediately below the position of furthest block in that stack.
1148 for t, i in open_els
1150 open_els.splice i, 1
1152 for t, i in open_els
1154 open_els.splice i, 0, new_element
1156 # 20. Jump back to the step labeled outer loop.
1157 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1158 debug_log "tree: #{serialize_els doc.children, false, true}"
1159 debug_log "open_els: #{serialize_els open_els, true, true}"
1160 debug_log "afe: #{serialize_els afe, true, true}"
1161 debug_log "AAA DONE"
1163 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes a 'p' element: implied end tags are generated first (skipping 'p'
# itself), then entries are popped from the current-node end of open_els
# (index 0).
# NOTE(review): listing lines 1167 and 1171 are missing from this excerpt;
# presumably a parse_error() call and the loop exit -- confirm in the full file.
1164 close_p_element = ->
1165 generate_implied_end_tags 'p' # arg is exception
# true when the current node is still not an HTML 'p' after the step above
1166 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
# the length check means the last entry of open_els is never popped
1168 while open_els.length > 1 # just in case
1169 el = open_els.shift()
# test whether the element just popped was the HTML 'p'
1170 if el.name is 'p' and el.namespace is NS_HTML
# Checks whether an HTML 'p' element is in button scope.
# NOTE(review): the guarded action (listing line 1174) is missing from this
# excerpt; presumably it calls close_p_element() -- confirm in the full file.
1172 close_p_if_in_button_scope = ->
1173 if is_in_button_scope 'p', NS_HTML
1176 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1177 # aka insert_a_character = (t) ->
# Inserts character token t into the tree at the adjusted insertion location.
# NOTE(review): listing lines 1181, 1184-1185 and 1187 are missing from this
# excerpt; presumably they guard against dest[1] being 0 and merge t into a
# preceding text node -- confirm in the full file.
1178 insert_character = (t) ->
# dest is a pair: [parent node, index within parent.children]
1179 dest = adjusted_insertion_location()
1180 # fixfull check for Document node
# prev is the sibling immediately before the insertion index
1182 prev = dest[0].children[dest[1] - 1]
1183 if prev.type is TYPE_TEXT
# place t in the parent's child list at the insertion index
1186 dest[0].children.splice dest[1], 0, t
1189 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for a single token t: inspects the adjusted
# current node (acn) and the token type before reaching in_foreign_content.
# NOTE(review): the bodies of the checks below (listing lines between 1191
# and 1215) are missing from this excerpt; presumably each one runs the
# current insertion mode on t and returns, leaving in_foreign_content as the
# fallback -- confirm in the full file.
1190 process_token = (t) ->
1191 acn = adjusted_current_node()
# adjusted current node is an ordinary HTML element
1195 if acn.namespace is NS_HTML
# MathML text integration point: start tags other than mglyph/malignmark, and text
1198 if is_mathml_text_integration_point(acn)
1199 if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1202 if t.type is TYPE_TEXT
# <svg> start tag directly inside a MathML annotation-xml element
1205 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
# HTML integration point: start tags and text
1208 if is_html_integration acn
1209 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1212 if t.type is TYPE_EOF
1215 in_foreign_content t
1219 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1220 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes where the next node should be inserted.
# override_target: optional node to use instead of the current node
# (open_els[0]).
# Returns a two-element array [target, target_i]: the parent node and the
# index within target.children at which to insert.
# When flag_foster_parenting is set and the target's name/namespace match an
# entry of foster_parenting_targets, the location is chosen relative to the
# 'template' and 'table' entries found in open_els instead.
# NOTE(review): several listing lines (e.g. 1223-1224, 1242-1243, 1247-1248,
# 1251-1252, 1278, 1280-1281) are missing from this excerpt, so some branch
# headers/bodies are not visible here -- confirm against the full file.
1221 adjusted_insertion_location = (override_target = null) ->
1222 # 1. If there was an override target specified, then let target be the
1225 target = override_target
1226 else # Otherwise, let target be the current node.
1227 target = open_els[0]
1228 # 2. Determine the adjusted insertion location using the first matching
1229 # steps from the following list:
1231 # If foster parenting is enabled and target is a table, tbody, tfoot,
1232 # thead, or tr element Foster parenting happens when content is
1233 # misnested in tables.
1234 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1235 loop # once. this is here so we can ``break`` to "abort these substeps"
1236 # 1. Let last template be the last template element in the
1237 # stack of open elements, if any.
1238 last_template = null
1239 last_template_i = null
1240 for el, i in open_els
1241 if el.name is 'template' and el.namespace is NS_HTML
1245 # 2. Let last table be the last table element in the stack of
1246 # open elements, if any.
1249 for el, i in open_els
1250 if el.name is 'table' and el.namespace is NS_HTML
1254 # 3. If there is a last template and either there is no last
1255 # table, or there is one, but last template is lower (more
1256 # recently added) than last table in the stack of open
1257 # elements, then: let adjusted insertion location be inside
1258 # last template's template contents, after its last child (if
1259 # any), and abort these substeps.
# a smaller index in open_els means lower in the stack (more recently added)
1260 if last_template and (last_table is null or last_template_i < last_table_i)
1261 target = last_template # fixfull should be its contents
1262 target_i = target.children.length
1264 # 4. If there is no last table, then let adjusted insertion
1265 # location be inside the first element in the stack of open
1266 # elements (the html element), after its last child (if any),
1267 # and abort these substeps. (fragment case)
1268 if last_table is null
# the last array entry of open_els is the topmost ("first") element of the stack
1270 target = open_els[open_els.length - 1]
1271 target_i = target.children.length
1273 # 5. If last table has a parent element, then let adjusted
1274 # insertion location be inside last table's parent element,
1275 # immediately before last table, and abort these substeps.
1276 if last_table.parent?
1277 for c, i in last_table.parent.children
1279 target = last_table.parent
1283 # 6. Let previous element be the element immediately above last
1284 # table in the stack of open elements.
1286 # huh? how could it not have a parent?
# "above" in the stack is the next higher index in open_els
1287 previous_element = open_els[last_table_i + 1]
1288 # 7. Let adjusted insertion location be inside previous
1289 # element, after its last child (if any).
1290 target = previous_element
1291 target_i = target.children.length
1292 # Note: These steps are involved in part because it's possible
1293 # for elements, the table element in this case in particular,
1294 # to have been moved by a script around in the DOM, or indeed
1295 # removed from the DOM entirely, after the element was inserted
1297 break # don't really loop
1299 # Otherwise Let adjusted insertion location be inside target, after
1300 # its last child (if any).
1301 target_i = target.children.length
1303 # 3. If the adjusted insertion location is inside a template element,
1304 # let it instead be inside the template element's template contents,
1305 # after its last child (if any).
1306 # fixfull (template)
1308 # 4. Return the adjusted insertion location.
1309 return [target, target_i]
1311 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1312 # aka create_an_element_for_token
# Builds a TYPE_TAG Node for tag token t in the given namespace.
# t: tag token; its name is copied to the node and the token itself is kept
#    on the node as `token`.
# namespace: namespace stored on the new node.
# intended_parent: not referenced by any line visible in this excerpt.
# NOTE(review): listing lines 1315-1316 (presumably the attrs initialisation
# and the loop over the token's attributes) and the return statement are
# missing from this excerpt -- confirm in the full file.
1313 token_to_element = (t, namespace, intended_parent) ->
1314 # convert attributes into a hash
# each a is a [name, value] pair; a later duplicate overwrites an earlier one
1317 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1318 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1320 # TODO 2. If the newly created element has an xmlns attribute in the
1321 # XMLNS namespace whose value is not exactly the same as the element's
1322 # namespace, that is a parse error. Similarly, if the newly created
1323 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1324 # value is not the XLink Namespace, that is a parse error.
1326 # fixfull: the spec says stuff about form pointers and ownerDocument
1330 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the tree
# at the adjusted insertion location.
# NOTE(review): listing lines 1333-1334 (presumably ail_el = ail[0] and
# ail_i = ail[1]) and the trailing lines (presumably pushing el onto open_els
# and returning it) are missing from this excerpt -- confirm in the full file.
1331 insert_foreign_element = (token, namespace) ->
1332 ail = adjusted_insertion_location()
1335 el = token_to_element token, namespace, ail_el
1336 # TODO skip this next step if it's broken (eg ail_el is document with child already)
# insert el into the parent's child list at index ail_i
1338 ail_el.children.splice ail_i, 0, el
1341 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Convenience wrapper: inserts an element for `token` in the HTML namespace.
# The value of insert_foreign_element is passed through as the implicit
# return value.
1342 insert_html_element = (token) ->
1343 insert_foreign_element token, NS_HTML
1345 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1346 # position should be [node, index_within_children]
# Inserts comment node/token t into the tree.
# position: optional [node, index_within_children] pair; when omitted (null)
# the adjusted insertion location is used.
1347 insert_comment = (t, position = null) ->
1348 position ?= adjusted_insertion_location()
# splice t into the children of position[0] at index position[1]
1349 position[0].children.splice position[1], 0, t
1352 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: inserts an HTML element for token t,
# switches the tokenizer to the RAWTEXT state, remembers the current
# insertion mode in original_ins_mode and switches to the text insertion mode.
1353 parse_generic_raw_text = (t) ->
1354 insert_html_element t
1355 tok_state = tok_state_rawtext
# saved before ins_mode is overwritten on the next line
1356 original_ins_mode = ins_mode
1357 ins_mode = ins_mode_text
# Generic RCDATA element parsing: inserts an HTML element for token t,
# switches the tokenizer to the RCDATA state, remembers the current
# insertion mode in original_ins_mode and switches to the text insertion mode.
1358 parse_generic_rcdata_text = (t) ->
1359 insert_html_element t
1360 tok_state = tok_state_rcdata
# saved before ins_mode is overwritten on the next line
1361 original_ins_mode = ins_mode
1362 ins_mode = ins_mode_text
1364 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1365 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Generates implied end tags.
# except: optional tag name at which the loop stops (null = no exception).
# The loop continues while the current node (open_els[0]) has an entry in
# end_tag_implied matching its namespace and is not named `except`.
# NOTE(review): the loop body (listing line 1368) is missing from this
# excerpt; presumably it pops the current node with open_els.shift() --
# confirm in the full file.
1366 generate_implied_end_tags = (except = null) ->
1367 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1370 # 8.2.5.4 The rules for parsing tokens in HTML content
1371 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1373 # 8.2.5.4.1 The "initial" insertion mode
1374 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# "initial" insertion mode handler for token t. The visible assignments
# switch the parser to the "before html" insertion mode.
# NOTE(review): the branch bodies (listing lines 1376-1377, 1379-1381,
# 1384-1385, 1387-1388, 1391+) are missing from this excerpt; presumably they
# skip whitespace, insert comments/doctype and reprocess the token in the
# fallthrough case -- confirm in the full file.
1375 ins_mode_initial = (t) ->
1378 if t.type is TYPE_COMMENT
1382 if t.type is TYPE_DOCTYPE
1383 # FIXME check identifiers, set quirks, etc
1386 ins_mode = ins_mode_before_html
# no visible branch matched: switch modes here too
1389 #fixfull (iframe, quirks)
1390 ins_mode = ins_mode_before_html
1394 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# "before html" insertion mode handler for token t. Creates the root html
# element (from an explicit <html> start tag, or from a synthesized one),
# appends it to doc.children and switches to the "before head" mode.
# NOTE(review): several listing lines (1397-1398, 1400-1403, 1414-1417, 1421,
# 1424+) are missing from this excerpt; presumably they hold the parse-error /
# ignore / return paths, the push of the synthesized element onto open_els
# and the reprocessing of t -- confirm in the full file.
1395 ins_mode_before_html = (t) ->
1396 if t.type is TYPE_DOCTYPE
1399 if t.type is TYPE_COMMENT
1404 if t.type is TYPE_START_TAG and t.name is 'html'
1405 el = token_to_element t, NS_HTML, doc
1406 doc.children.push el
# the new root becomes the current node (index 0 of open_els)
1407 open_els.unshift(el)
1408 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1409 ins_mode = ins_mode_before_head
1411 if t.type is TYPE_END_TAG
1412 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1413 # fall through to "anything else"
# "anything else": synthesize an <html> start tag and create the root from it
1418 html_tok = new_open_tag 'html'
1419 el = token_to_element html_tok, NS_HTML, doc
1420 doc.children.push el
1422 # ?fixfull browsing context
1423 ins_mode = ins_mode_before_head
1427 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# "before head" insertion mode handler for token t. Inserts the head element
# (from an explicit <head> start tag, or from a synthesized one), records it
# in head_element_pointer and switches to the "in head" mode.
# NOTE(review): several listing lines (1429-1430, 1432-1433, 1435-1436,
# 1438-1439, 1444, 1448-1451, 1456+) are missing from this excerpt;
# presumably they hold the whitespace / comment / parse-error / return paths
# and the reprocessing of t -- confirm in the full file.
1428 ins_mode_before_head = (t) ->
1431 if t.type is TYPE_COMMENT
1434 if t.type is TYPE_DOCTYPE
1437 if t.type is TYPE_START_TAG and t.name is 'html'
1440 if t.type is TYPE_START_TAG and t.name is 'head'
1441 el = insert_html_element t
1442 head_element_pointer = el
1443 ins_mode = ins_mode_in_head
1445 if t.type is TYPE_END_TAG
1446 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1447 # fall through to Anything else below
# "anything else": synthesize a <head> start tag and insert it
1452 head_tok = new_open_tag 'head'
1453 el = insert_html_element head_tok
1454 head_element_pointer = el
1455 ins_mode = ins_mode_in_head
1458 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode: pops the current
# node and switches to the "after head" mode.
# NOTE(review): listing line 1462 is missing from this excerpt; presumably it
# reprocesses t in the new mode -- confirm in the full file.
1459 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1460 open_els.shift() # spec says this will be a 'head' node
1461 ins_mode = ins_mode_after_head
# "in head" insertion mode handler for token t.
# Fixes relative to the previous revision:
#   * listing line 1524: generate_implied_end_tags was referenced without
#     being called (a bare identifier is not an invocation in CoffeeScript),
#     so no implied end tags were generated before closing a template
#   * listing line 1499: the intended parent handed to token_to_element is
#     now the node of the adjusted insertion location (ail[0]) rather than
#     the whole [node, index] pair, matching insert_foreign_element
# NOTE(review): many listing lines (returns, parse_error() calls, pushes onto
# open_els, loop headers) are missing from this excerpt and are left exactly
# as they are in the full file.
1463 ins_mode_in_head = (t) ->
# whitespace character tokens
1464 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1467 if t.type is TYPE_COMMENT
1470 if t.type is TYPE_DOCTYPE
1473 if t.type is TYPE_START_TAG and t.name is 'html'
1476 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1477 el = insert_html_element t
1479 t.acknowledge_self_closing()
1481 if t.type is TYPE_START_TAG and t.name is 'meta'
1482 el = insert_html_element t
1484 t.acknowledge_self_closing()
1485 # fixfull encoding stuff
1487 if t.type is TYPE_START_TAG and t.name is 'title'
1488 parse_generic_rcdata_text t
1490 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1491 parse_generic_raw_text t
1493 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1494 insert_html_element t
1495 ins_mode = ins_mode_in_head_noscript
1497 if t.type is TYPE_START_TAG and t.name is 'script'
# ail is [parent node, index within parent.children]
1498 ail = adjusted_insertion_location()
1499 el = token_to_element t, NS_HTML, ail[0]
1500 el.flag 'parser-inserted', true
1501 # fixfull fragment case
1502 ail[0].children.splice ail[1], 0, el
1504 tok_state = tok_state_script_data
1505 original_ins_mode = ins_mode # make sure orig... is defined
1506 ins_mode = ins_mode_text
1508 if t.type is TYPE_END_TAG and t.name is 'head'
1509 open_els.shift() # will be a head element... spec says so
1510 ins_mode = ins_mode_after_head
1512 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1513 ins_mode_in_head_else t
1515 if t.type is TYPE_START_TAG and t.name is 'template'
1516 insert_html_element t
1518 flag_frameset_ok = false
1519 ins_mode = ins_mode_in_template
1520 template_ins_modes.unshift ins_mode_in_template
1522 if t.type is TYPE_END_TAG and t.name is 'template'
1523 if template_tag_is_open()
# parentheses are required: without them the function is only referenced
1524 generate_implied_end_tags()
1525 if open_els[0].name isnt 'template'
1528 el = open_els.shift()
1529 if el.name is 'template' and el.namespace is NS_HTML
1531 clear_afe_to_marker()
1532 template_ins_modes.shift()
1537 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1540 ins_mode_in_head_else t
1542 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" insertion mode: switches
# back to the "in head" mode.
# NOTE(review): listing lines 1544-1545 and 1547 are missing from this
# excerpt; presumably they report a parse error, pop the noscript element and
# reprocess t -- confirm in the full file.
1543 ins_mode_in_head_noscript_else = (t) ->
1546 ins_mode = ins_mode_in_head
# "in head noscript" insertion mode handler for token t.
# NOTE(review): most branch bodies (listing lines 1550-1551, 1553-1554, 1556,
# 1558, 1560-1561, 1564, 1566-1568, 1570+) are missing from this excerpt;
# presumably they hold the parse-error / delegate-to-other-mode / return
# paths -- confirm in the full file.
1548 ins_mode_in_head_noscript = (t) ->
1549 if t.type is TYPE_DOCTYPE
1552 if t.type is TYPE_START_TAG and t.name is 'html'
1555 if t.type is TYPE_END_TAG and t.name is 'noscript'
1557 ins_mode = ins_mode_in_head
# whitespace, comments and a fixed set of head-level start tags
1559 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
# </br> goes to the "anything else" handling
1562 if t.type is TYPE_END_TAG and t.name is 'br'
1563 ins_mode_in_head_noscript_else t
# <head>, <noscript> start tags and any other end tag
1565 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1569 ins_mode_in_head_noscript_else t
1574 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode: inserts a
# synthesized <body> element and switches to the "in body" mode.
# NOTE(review): listing line 1579 is missing from this excerpt; presumably it
# reprocesses t in the new mode -- confirm in the full file.
1575 ins_mode_after_head_else = (t) ->
1576 body_tok = new_open_tag 'body'
1577 insert_html_element body_tok
1578 ins_mode = ins_mode_in_body
# "after head" insertion mode handler for token t.
# Fix relative to the previous revision (listing line 1607): the scan of
# open_els used `for el, i of open_els`, which iterates keys/values, so `el`
# was an index string and never equal to head_element_pointer. The head
# element pushed at listing line 1605 was therefore never removed again.
# The loop now uses `in`, so `el` is the element and `i` its index.
# NOTE(review): many listing lines (returns, parse_error() calls, the
# delegation to the "in head" mode, the loop exit) are missing from this
# excerpt and are left exactly as they are in the full file.
1581 ins_mode_after_head = (t) ->
1585 if t.type is TYPE_COMMENT
1588 if t.type is TYPE_DOCTYPE
1591 if t.type is TYPE_START_TAG and t.name is 'html'
1594 if t.type is TYPE_START_TAG and t.name is 'body'
1595 insert_html_element t
1596 flag_frameset_ok = false
1597 ins_mode = ins_mode_in_body
1599 if t.type is TYPE_START_TAG and t.name is 'frameset'
1600 insert_html_element t
1601 ins_mode = ins_mode_in_frameset
1603 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# temporarily make the head element the current node again
1605 open_els.unshift head_element_pointer
# find the head element (it might not be the current node any more) and remove it
1607 for el, i in open_els
1608 if el is head_element_pointer
1609 open_els.splice i, 1
1611 console.log "warning: 23904 couldn't find head element in open_els"
1613 if t.type is TYPE_END_TAG and t.name is 'template'
1616 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1617 ins_mode_after_head_else t
1619 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1623 ins_mode_after_head_else t
1625 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling of the "in body" insertion mode.
# name: tag name of the end tag being processed.
# Walks open_els from the current node (index 0) upward looking for an HTML
# element called `name`; a special element encountered first is tested at
# listing line 1635.
# NOTE(review): listing lines 1631-1634 and 1636+ are missing from this
# excerpt; presumably they pop elements up to and including the match, and
# report a parse error / return when a special element is hit -- confirm in
# the full file.
1626 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1627 for el, i in open_els
1628 if el.name is name and el.namespace is NS_HTML
1629 generate_implied_end_tags name # arg is exception
# the match is expected to be the current node; anything else is a parse error
1630 parse_error() unless i is 0
1635 if special_elements[el.name] is el.namespace
1639 ins_mode_in_body = (t) ->
1640 if t.type is TYPE_TEXT and t.text is "\u0000"
1647 if t.type is TYPE_TEXT
1650 flag_frameset_ok = false
1652 if t.type is TYPE_COMMENT
1655 if t.type is TYPE_DOCTYPE
1658 if t.type is TYPE_START_TAG and t.name is 'html'
1660 return if template_tag_is_open()
1661 root_attrs = open_els[open_els.length - 1].attrs
1663 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1666 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1669 if t.type is TYPE_START_TAG and t.name is 'body'
1671 return if open_els.length < 2
1672 second = open_els[open_els.length - 2]
1673 return unless second.namespace is NS_HTML
1674 return unless second.name is 'body'
1675 return if template_tag_is_open()
1676 flag_frameset_ok = false
1678 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1680 if t.type is TYPE_START_TAG and t.name is 'frameset'
1682 return if open_els.length < 2
1683 second_i = open_els.length - 2
1684 second = open_els[second_i]
1685 return unless second.namespace is NS_HTML
1686 return unless second.name is 'body'
1687 if flag_frameset_ok is false
1690 for el, i in second.parent.children
1692 second.parent.children.splice i, 1
1694 open_els.splice second_i, 1
1695 # pop everything except the "root html element"
1696 while open_els.length > 1
1698 insert_html_element t
1699 ins_mode = ins_mode_in_frameset
1701 if t.type is TYPE_EOF
1703 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1704 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1705 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1708 unless ok_tags[t.name] is el.namespace
1711 if template_ins_modes.length > 0
1712 ins_mode_in_template t
1716 if t.type is TYPE_END_TAG and t.name is 'body'
1717 unless is_in_scope 'body', NS_HTML
1721 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1722 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1723 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1724 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1728 unless ok_tags[t.name] is el.namespace
1731 ins_mode = ins_mode_after_body
1733 if t.type is TYPE_END_TAG and t.name is 'html'
1734 unless is_in_scope 'body', NS_HTML
1738 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1739 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1740 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1741 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1745 unless ok_tags[t.name] is el.namespace
1748 ins_mode = ins_mode_after_body
1751 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1752 close_p_if_in_button_scope()
1753 insert_html_element t
1755 if t.type is TYPE_START_TAG and h_tags[t.name]?
1756 close_p_if_in_button_scope()
1757 if h_tags[open_els[0].name] is open_els[0].namespace
1760 insert_html_element t
1762 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1763 close_p_if_in_button_scope()
1764 insert_html_element t
1765 # spec: If the next token is a "LF" (U+000A) character token, then
1766 # ignore that token and move on to the next one. (Newlines at the
1767 # start of pre blocks are ignored as an authoring convenience.)
1768 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1770 flag_frameset_ok = false
1772 if t.type is TYPE_START_TAG and t.name is 'form'
1773 unless form_element_pointer is null or template_tag_is_open()
1776 close_p_if_in_button_scope()
1777 el = insert_html_element t
1778 unless template_tag_is_open()
1779 form_element_pointer = el
1781 if t.type is TYPE_START_TAG and t.name is 'li'
1782 flag_frameset_ok = false
1783 for node in open_els
1784 if node.name is 'li' and node.namespace is NS_HTML
1785 generate_implied_end_tags 'li' # arg is exception
1786 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1789 el = open_els.shift()
1790 if el.name is 'li' and el.namespace is NS_HTML
1793 if el_is_special_not_adp node
1795 close_p_if_in_button_scope()
1796 insert_html_element t
1798 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1799 flag_frameset_ok = false
1800 for node in open_els
1801 if node.name is 'dd' and node.namespace is NS_HTML
1802 generate_implied_end_tags 'dd' # arg is exception
1803 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1806 el = open_els.shift()
1807 if el.name is 'dd' and el.namespace is NS_HTML
1810 if node.name is 'dt' and node.namespace is NS_HTML
1811 generate_implied_end_tags 'dt' # arg is exception
1812 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1815 el = open_els.shift()
1816 if el.name is 'dt' and el.namespace is NS_HTML
1819 if el_is_special_not_adp node
1821 close_p_if_in_button_scope()
1822 insert_html_element t
1824 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1825 close_p_if_in_button_scope()
1826 insert_html_element t
1827 tok_state = tok_state_plaintext
1829 if t.type is TYPE_START_TAG and t.name is 'button'
1830 if is_in_scope 'button', NS_HTML
1832 generate_implied_end_tags()
1834 el = open_els.shift()
1835 if el.name is 'button' and el.namespace is NS_HTML
1838 insert_html_element t
1839 flag_frameset_ok = false
1841 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1842 unless is_in_scope t.name, NS_HTML
1845 generate_implied_end_tags()
1846 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1849 el = open_els.shift()
1850 if el.name is t.name and el.namespace is NS_HTML
1853 if t.type is TYPE_END_TAG and t.name is 'form'
1854 unless template_tag_is_open()
1855 node = form_element_pointer
1856 form_element_pointer = null
1857 if node is null or not el_is_in_scope node
1860 generate_implied_end_tags()
1861 if open_els[0] isnt node
1863 for el, i in open_els
1865 open_els.splice i, 1
1868 unless is_in_scope 'form', NS_HTML
1871 generate_implied_end_tags()
1872 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1875 el = open_els.shift()
1876 if el.name is 'form' and el.namespace is NS_HTML
1879 if t.type is TYPE_END_TAG and t.name is 'p'
1880 unless is_in_button_scope 'p', NS_HTML
1882 insert_html_element new_open_tag 'p'
1885 if t.type is TYPE_END_TAG and t.name is 'li'
1886 unless is_in_li_scope 'li', NS_HTML
1889 generate_implied_end_tags 'li' # arg is exception
1890 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1893 el = open_els.shift()
1894 if el.name is 'li' and el.namespace is NS_HTML
1897 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1898 unless is_in_scope t.name, NS_HTML
1901 generate_implied_end_tags t.name # arg is exception
1902 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1905 el = open_els.shift()
1906 if el.name is t.name and el.namespace is NS_HTML
1909 if t.type is TYPE_END_TAG and h_tags[t.name]?
1912 if h_tags[el.name] is el.namespace
1915 if standard_scopers[el.name] is el.namespace
1920 generate_implied_end_tags()
1921 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1924 el = open_els.shift()
1925 if h_tags[el.name] is el.namespace
1929 if t.type is TYPE_START_TAG and t.name is 'a'
1930 # If the list of active formatting elements contains an a element
1931 # between the end of the list and the last marker on the list (or
1932 # the start of the list if there is no marker on the list), then
1933 # this is a parse error; run the adoption agency algorithm for the
1934 # tag name "a", then remove that element from the list of active
1935 # formatting elements and the stack of open elements if the
1936 # adoption agency algorithm didn't already remove it (it might not
1937 # have if the element is not in table scope).
1940 if el.type is TYPE_AFE_MARKER
1942 if el.name is 'a' and el.namespace is NS_HTML
1950 for el, i in open_els
1952 open_els.splice i, 1
1954 el = insert_html_element t
1957 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1959 el = insert_html_element t
1962 if t.type is TYPE_START_TAG and t.name is 'nobr'
1964 el = insert_html_element t
1967 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1968 adoption_agency t.name
1970 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1972 insert_html_element t
1974 flag_frameset_ok = false
1976 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1977 unless is_in_scope t.name, NS_HTML
1980 generate_implied_end_tags()
1981 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1984 el = open_els.shift()
1985 if el.name is t.name and el.namespace is NS_HTML
1987 clear_afe_to_marker()
1989 if t.type is TYPE_START_TAG and t.name is 'table'
1990 close_p_if_in_button_scope() # fixfull quirksmode thing
1991 insert_html_element t
1992 flag_frameset_ok = false
1993 ins_mode = ins_mode_in_table
1995 if t.type is TYPE_END_TAG and t.name is 'br'
1997 t.type is TYPE_START_TAG
1999 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2001 insert_html_element t
2003 t.acknowledge_self_closing()
2004 flag_frameset_ok = false
2006 if t.type is TYPE_START_TAG and t.name is 'input'
2008 insert_html_element t
2010 t.acknowledge_self_closing()
2011 unless is_input_hidden_tok t
2012 flag_frameset_ok = false
2014 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2015 insert_html_element t
2017 t.acknowledge_self_closing()
2019 if t.type is TYPE_START_TAG and t.name is 'hr'
2020 close_p_if_in_button_scope()
2021 insert_html_element t
2023 t.acknowledge_self_closing()
2024 flag_frameset_ok = false
2026 if t.type is TYPE_START_TAG and t.name is 'image'
2031 if t.type is TYPE_START_TAG and t.name is 'isindex'
2033 if template_tag_is_open() is false and form_element_pointer isnt null
2035 t.acknowledge_self_closing()
2036 flag_frameset_ok = false
2037 close_p_if_in_button_scope()
2038 el = insert_html_element new_open_tag 'form'
2039 unless template_tag_is_open()
2040 form_element_pointer = el
2043 el.attrs['action'] = a[1]
2045 insert_html_element new_open_tag 'hr'
2048 insert_html_element new_open_tag 'label'
2049 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2050 input_el = new_open_tag 'input'
2055 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2056 input_el.attrs_a.push [a[0], a[1]]
2057 input_el.attrs_a.push ['name', 'isindex']
2058 # fixfull this next bit is in english... internationalize?
2059 prompt ?= "This is a searchable index. Enter search keywords: "
2060 insert_character new_character_token prompt # fixfull split
2061 # TODO submit typo "balue" in spec
2062 insert_html_element input_el
2064 # insert_character '' # you can put chars here if prompt attr missing
2066 insert_html_element new_open_tag 'hr'
2069 unless template_tag_is_open()
2070 form_element_pointer = null
2072 if t.type is TYPE_START_TAG and t.name is 'textarea'
2073 insert_html_element t
2074 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2076 tok_state = tok_state_rcdata
2077 original_ins_mode = ins_mode
2078 flag_frameset_ok = false
2079 ins_mode = ins_mode_text
2081 if t.type is TYPE_START_TAG and t.name is 'xmp'
2082 close_p_if_in_button_scope()
2084 flag_frameset_ok = false
2085 parse_generic_raw_text t
2087 if t.type is TYPE_START_TAG and t.name is 'iframe'
2088 flag_frameset_ok = false
2089 parse_generic_raw_text t
2091 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2092 parse_generic_raw_text t
2094 if t.type is TYPE_START_TAG and t.name is 'select'
2096 insert_html_element t
2097 flag_frameset_ok = false
2098 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2099 ins_mode = ins_mode_in_select_in_table
2101 ins_mode = ins_mode_in_select
2103 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2104 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2107 insert_html_element t
2109 # this comment block implements the W3C spec
2110 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2111 # if is_in_scope 'ruby', NS_HTML
2112 # generate_implied_end_tags()
2113 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2115 # insert_html_element t
2117 # if t.type is TYPE_START_TAG and t.name is 'rt'
2118 # if is_in_scope 'ruby', NS_HTML
2119 # generate_implied_end_tags 'rtc' # arg is exception
2120 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2122 # insert_html_element t
# below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2125 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2126 if is_in_scope 'ruby', NS_HTML
2127 generate_implied_end_tags()
2128 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2130 insert_html_element t
2132 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2133 if is_in_scope 'ruby', NS_HTML
2134 generate_implied_end_tags 'rtc'
2135 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2137 insert_html_element t
2140 if t.type is TYPE_START_TAG and t.name is 'math'
2142 adjust_mathml_attributes t
2143 adjust_foreign_attributes t
2144 insert_foreign_element t, NS_MATHML
2145 if t.flag 'self-closing'
2147 t.acknowledge_self_closing()
2149 if t.type is TYPE_START_TAG and t.name is 'svg'
2151 adjust_svg_attributes t
2152 adjust_foreign_attributes t
2153 insert_foreign_element t, NS_SVG
2154 if t.flag 'self-closing'
2156 t.acknowledge_self_closing()
2158 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2161 if t.type is TYPE_START_TAG # any other start tag
2163 insert_html_element t
2165 if t.type is TYPE_END_TAG # any other end tag
2166 in_body_any_other_end_tag t.name
2170 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2171 ins_mode_text = (t) ->
2172 if t.type is TYPE_TEXT
2175 if t.type is TYPE_EOF
2177 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2178 open_els[0].flag 'already started', true
2180 ins_mode = original_ins_mode
2183 if t.type is TYPE_END_TAG and t.name is 'script'
2185 ins_mode = original_ins_mode
2186 # fixfull the spec seems to assume that I'm going to run the script
2187 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2189 if t.type is TYPE_END_TAG
2191 ins_mode = original_ins_mode
2193 console.log 'warning: end of ins_mode_text reached'
# the functions below implement the tokenizer states described here:
2196 # http://www.w3.org/TR/html5/syntax.html#tokenization
2198 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2199 ins_mode_in_table_else = (t) ->
2201 flag_foster_parenting = true
2203 flag_foster_parenting = false
2205 ins_mode_in_table = (t) ->
2208 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2209 pending_table_character_tokens = []
2210 original_ins_mode = ins_mode
2211 ins_mode = ins_mode_in_table_text
2214 ins_mode_in_table_else t
2222 clear_stack_to_table_context()
2224 insert_html_element t
2225 ins_mode = ins_mode_in_caption
2227 clear_stack_to_table_context()
2228 insert_html_element t
2229 ins_mode = ins_mode_in_column_group
2231 clear_stack_to_table_context()
2232 insert_html_element new_open_tag 'colgroup'
2233 ins_mode = ins_mode_in_column_group
2235 when 'tbody', 'tfoot', 'thead'
2236 clear_stack_to_table_context()
2237 insert_html_element t
2238 ins_mode = ins_mode_in_table_body
2239 when 'td', 'th', 'tr'
2240 clear_stack_to_table_context()
2241 insert_html_element new_open_tag 'tbody'
2242 ins_mode = ins_mode_in_table_body
2246 if is_in_table_scope 'table', NS_HTML
2248 el = open_els.shift()
2249 if el.name is 'table' and el.namespace is NS_HTML
2253 when 'style', 'script', 'template'
2256 unless is_input_hidden_tok t
2257 ins_mode_in_table_else t
2260 el = insert_html_element t
2262 t.acknowledge_self_closing()
2265 if form_element_pointer?
2267 if template_tag_is_open()
2269 form_element_pointer = insert_html_element t
2272 ins_mode_in_table_else t
2276 if is_in_table_scope 'table', NS_HTML
2278 el = open_els.shift()
2279 if el.name is 'table' and el.namespace is NS_HTML
2284 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2289 ins_mode_in_table_else t
2293 ins_mode_in_table_else t
2296 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2297 ins_mode_in_table_text = (t) ->
2298 if t.type is TYPE_TEXT and t.text is "\u0000"
2302 if t.type is TYPE_TEXT
2303 pending_table_character_tokens.push t
2307 for old in pending_table_character_tokens
2308 unless is_space_tok old
2312 for old in pending_table_character_tokens
2313 insert_character old
2315 for old in pending_table_character_tokens
2316 ins_mode_in_table_else old
2317 pending_table_character_tokens = []
2318 ins_mode = original_ins_mode
2321 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2322 ins_mode_in_caption = (t) ->
2323 if t.type is TYPE_END_TAG and t.name is 'caption'
2324 if is_in_table_scope 'caption', NS_HTML
2325 generate_implied_end_tags()
2326 if open_els[0].name isnt 'caption'
2329 el = open_els.shift()
2330 if el.name is 'caption' and el.namespace is NS_HTML
2332 clear_afe_to_marker()
2333 ins_mode = ins_mode_in_table
2338 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2340 if is_in_table_scope 'caption', NS_HTML
2342 el = open_els.shift()
2343 if el.name is 'caption' and el.namespace is NS_HTML
2345 clear_afe_to_marker()
2346 ins_mode = ins_mode_in_table
2348 # else fragment case
2350 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2356 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2357 ins_mode_in_column_group = (t) ->
2361 if t.type is TYPE_COMMENT
2364 if t.type is TYPE_DOCTYPE
2367 if t.type is TYPE_START_TAG and t.name is 'html'
2370 if t.type is TYPE_START_TAG and t.name is 'col'
2371 el = insert_html_element t
2373 t.acknowledge_self_closing()
2375 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# both the name and the namespace are read from the current node (open_els[0])
if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2378 ins_mode = ins_mode_in_table
2382 if t.type is TYPE_END_TAG and t.name is 'col'
2385 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2388 if t.type is TYPE_EOF
2392 if open_els[0].name isnt 'colgroup'
2396 ins_mode = ins_mode_in_table
2400 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2401 ins_mode_in_table_body = (t) ->
2402 if t.type is TYPE_START_TAG and t.name is 'tr'
2403 clear_stack_to_table_body_context()
2404 insert_html_element t
2405 ins_mode = ins_mode_in_row
2407 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2409 clear_stack_to_table_body_context()
2410 insert_html_element new_open_tag 'tr'
2411 ins_mode = ins_mode_in_row
2414 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2415 unless is_in_table_scope t.name, NS_HTML
2418 clear_stack_to_table_body_context()
2420 ins_mode = ins_mode_in_table
2422 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2425 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2428 if table_scopers[el.name] is el.namespace
2433 clear_stack_to_table_body_context()
2435 ins_mode = ins_mode_in_table
2438 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2444 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2445 ins_mode_in_row = (t) ->
2446 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2447 clear_stack_to_table_row_context()
2448 insert_html_element t
2449 ins_mode = ins_mode_in_cell
2452 if t.type is TYPE_END_TAG and t.name is 'tr'
2453 if is_in_table_scope 'tr', NS_HTML
2454 clear_stack_to_table_row_context()
2456 ins_mode = ins_mode_in_table_body
2460 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2461 if is_in_table_scope 'tr', NS_HTML
2462 clear_stack_to_table_row_context()
2464 ins_mode = ins_mode_in_table_body
2469 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2470 if is_in_table_scope t.name, NS_HTML
2471 if is_in_table_scope 'tr', NS_HTML
2472 clear_stack_to_table_row_context()
2474 ins_mode = ins_mode_in_table_body
2479 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2485 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2487 generate_implied_end_tags()
# after implied end tags are generated, the current node should be an HTML td or th
unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2491 el = open_els.shift()
2492 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2494 clear_afe_to_marker()
2495 ins_mode = ins_mode_in_row
2497 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2498 ins_mode_in_cell = (t) ->
2499 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2500 if is_in_table_scope t.name, NS_HTML
2501 generate_implied_end_tags()
2502 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2505 el = open_els.shift()
2506 if el.name is t.name and el.namespace is NS_HTML
2508 clear_afe_to_marker()
2509 ins_mode = ins_mode_in_row
2513 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2516 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2519 if table_scopers[el.name] is el.namespace
2527 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2530 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2531 if is_in_table_scope t.name, NS_HTML
2540 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2541 ins_mode_in_select = (t) ->
2542 if t.type is TYPE_TEXT and t.text is "\u0000"
2545 if t.type is TYPE_TEXT
2548 if t.type is TYPE_COMMENT
2551 if t.type is TYPE_DOCTYPE
2554 if t.type is TYPE_START_TAG and t.name is 'html'
2557 if t.type is TYPE_START_TAG and t.name is 'option'
2558 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2560 insert_html_element t
2562 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2563 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2565 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2567 insert_html_element t
2569 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# current node is an HTML option element (namespace compared for equality)
if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# open_els[1] is the node immediately before the current node; check its own name and namespace
if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2573 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2578 if t.type is TYPE_END_TAG and t.name is 'option'
2579 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2584 if t.type is TYPE_END_TAG and t.name is 'select'
2585 if is_in_select_scope 'select', NS_HTML
2587 el = open_els.shift()
2588 if el.name is 'select' and el.namespace is NS_HTML
2594 if t.type is TYPE_START_TAG and t.name is 'select'
2597 el = open_els.shift()
2598 if el.name is 'select' and el.namespace is NS_HTML
2601 # spec says that this is the same as </select> but it doesn't say
2602 # to check scope first
2604 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2606 if is_in_select_scope 'select', NS_HTML
2609 el = open_els.shift()
2610 if el.name is 'select' and el.namespace is NS_HTML
2615 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2618 if t.type is TYPE_EOF
2625 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2626 ins_mode_in_select_in_table = (t) ->
2627 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2630 el = open_els.shift()
2631 if el.name is 'select' and el.namespace is NS_HTML
2636 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2638 unless is_in_table_scope t.name, NS_HTML
2641 el = open_els.shift()
2642 if el.name is 'select' and el.namespace is NS_HTML
2648 ins_mode_in_select t
2651 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2652 ins_mode_in_template = (t) ->
2653 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2656 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2659 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2660 template_ins_modes.shift()
2661 template_ins_modes.unshift ins_mode_in_table
2662 ins_mode = ins_mode_in_table
2665 if t.type is TYPE_START_TAG and t.name is 'col'
2666 template_ins_modes.shift()
2667 template_ins_modes.unshift ins_mode_in_column_group
2668 ins_mode = ins_mode_in_column_group
2671 if t.type is TYPE_START_TAG and t.name is 'tr'
2672 template_ins_modes.shift()
2673 template_ins_modes.unshift ins_mode_in_table_body
2674 ins_mode = ins_mode_in_table_body
2677 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2678 template_ins_modes.shift()
2679 template_ins_modes.unshift ins_mode_in_row
2680 ins_mode = ins_mode_in_row
2683 if t.type is TYPE_START_TAG
2684 template_ins_modes.shift()
2685 template_ins_modes.unshift ins_mode_in_body
2686 ins_mode = ins_mode_in_body
2689 if t.type is TYPE_END_TAG
2692 if t.type is TYPE_EOF
2693 unless template_tag_is_open()
2698 el = open_els.shift()
2699 if el.name is 'template' and el.namespace is NS_HTML
2701 clear_afe_to_marker()
2702 template_ins_modes.shift()
2706 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2707 ins_mode_after_body = (t) ->
2711 if t.type is TYPE_COMMENT
# append the comment to the first element pushed onto the stack of open
# elements; the stack keeps the current node at index 0, so the first
# element is the one at the highest index
insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2714 if t.type is TYPE_DOCTYPE
2717 if t.type is TYPE_START_TAG and t.name is 'html'
2720 if t.type is TYPE_END_TAG and t.name is 'html'
2721 # fixfull fragment case
2722 ins_mode = ins_mode_after_after_body
2724 if t.type is TYPE_EOF
2729 ins_mode = ins_mode_in_body
2732 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2733 ins_mode_in_frameset = (t) ->
2737 if t.type is TYPE_COMMENT
2740 if t.type is TYPE_DOCTYPE
2743 if t.type is TYPE_START_TAG and t.name is 'html'
2746 if t.type is TYPE_START_TAG and t.name is 'frameset'
2747 insert_html_element t
2749 if t.type is TYPE_END_TAG and t.name is 'frameset'
2750 if open_els.length is 1
2752 return # fragment case
2754 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2755 ins_mode = ins_mode_after_frameset
2757 if t.type is TYPE_START_TAG and t.name is 'frame'
2758 insert_html_element t
2760 t.acknowledge_self_closing()
2762 if t.type is TYPE_START_TAG and t.name is 'noframes'
2765 if t.type is TYPE_EOF
2766 if open_els.length isnt 1
2774 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2775 ins_mode_after_frameset = (t) ->
2779 if t.type is TYPE_COMMENT
2782 if t.type is TYPE_DOCTYPE
2785 if t.type is TYPE_START_TAG and t.name is 'html'
2788 if t.type is TYPE_END_TAG and t.name is 'html'
# ins_mode is the insertion-mode variable that the other handlers assign
ins_mode = ins_mode_after_after_frameset
2791 if t.type is TYPE_START_TAG and t.name is 'noframes'
2794 if t.type is TYPE_EOF
2801 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2802 ins_mode_after_after_body = (t) ->
2803 if t.type is TYPE_COMMENT
2804 insert_comment t, [doc, doc.children.length]
2806 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2809 if t.type is TYPE_EOF
2814 ins_mode = ins_mode_in_body
2817 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2818 ins_mode_after_after_frameset = (t) ->
2819 if t.type is TYPE_COMMENT
2820 insert_comment t, [doc, doc.children.length]
2822 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2825 if t.type is TYPE_EOF
2828 if t.type is TYPE_START_TAG and t.name is 'noframes'
2835 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2836 has_color_face_or_size = (t) ->
2838 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2841 in_foreign_content_end_script = ->
2845 in_foreign_content_other_start = (t) ->
2846 acn = adjusted_current_node()
2847 if acn.namespace is NS_MATHML
2848 adjust_mathml_attributes t
2849 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2850 t.name = svg_name_fixes[t.name]
2851 if acn.namespace is NS_SVG
2852 adjust_svg_attributes t
2853 adjust_foreign_attributes t
2854 insert_foreign_element t, acn.namespace
2855 if t.flag 'self-closing'
2856 if t.name is 'script'
2857 t.acknowledge_self_closing()
2858 in_foreign_content_end_script()
2861 t.acknowledge_self_closing()
2863 in_foreign_content = (t) ->
2864 if t.type is TYPE_TEXT and t.text is "\u0000"
2866 insert_character new_character_token "\ufffd"
2871 if t.type is TYPE_TEXT
2872 flag_frameset_ok = false
2875 if t.type is TYPE_COMMENT
2878 if t.type is TYPE_DOCTYPE
2881 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2883 if flag_fragment_parsing
2884 in_foreign_content_other_start t
2886 loop # is this safe?
2889 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2893 if t.type is TYPE_START_TAG
2894 in_foreign_content_other_start t
2896 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2897 in_foreign_content_end_script()
2899 if t.type is TYPE_END_TAG
2900 if open_els[0].name.toLowerCase() isnt t.name
2902 for node in open_els
2903 if node is open_els[open_els.length - 1]
2905 if node.name.toLowerCase() is t.name
2907 el = open_els.shift()
2910 if node.namespace is NS_HTML
2912 ins_mode t # explicitly call HTML insertion mode
2915 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2917 switch c = txt.charAt(cur++)
2919 return new_text_node parse_character_reference()
2921 tok_state = tok_state_tag_open
2924 return new_text_node "\ufffd"
2926 return new_eof_token()
2928 return new_text_node c
2931 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2932 # not needed: tok_state_character_reference_in_data = ->
2933 # just call parse_character_reference()
2935 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2936 tok_state_rcdata = ->
2937 switch c = txt.charAt(cur++)
2939 return new_text_node parse_character_reference()
2941 tok_state = tok_state_rcdata_less_than_sign
2944 return new_character_token "\ufffd"
2946 return new_eof_token()
2948 return new_character_token c
2951 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2952 # not needed: tok_state_character_reference_in_rcdata = ->
2953 # just call parse_character_reference()
2955 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2956 tok_state_rawtext = ->
2957 switch c = txt.charAt(cur++)
2959 tok_state = tok_state_rawtext_less_than_sign
2962 return new_character_token "\ufffd"
2964 return new_eof_token()
2966 return new_character_token c
2969 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2970 tok_state_script_data = ->
2971 switch c = txt.charAt(cur++)
2973 tok_state = tok_state_script_data_less_than_sign
2976 return new_character_token "\ufffd"
2978 return new_eof_token()
2980 return new_character_token c
2983 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2984 tok_state_plaintext = ->
2985 switch c = txt.charAt(cur++)
2988 return new_character_token "\ufffd"
2990 return new_eof_token()
2992 return new_character_token c
2996 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2997 tok_state_tag_open = ->
2998 switch c = txt.charAt(cur++)
3000 tok_state = tok_state_markup_declaration_open
3002 tok_state = tok_state_end_tag_open
3005 tok_cur_tag = new_comment_token '?'
3006 tok_state = tok_state_bogus_comment
3009 tok_cur_tag = new_open_tag c
3010 tok_state = tok_state_tag_name
3011 else if is_uc_alpha(c)
3012 tok_cur_tag = new_open_tag c.toLowerCase()
3013 tok_state = tok_state_tag_name
3016 tok_state = tok_state_data
3017 cur -= 1 # we didn't parse/handle the char after <
3018 return new_text_node '<'
3021 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3022 tok_state_end_tag_open = ->
3023 switch c = txt.charAt(cur++)
3026 tok_state = tok_state_data
3029 tok_state = tok_state_data
3030 return new_text_node '</'
3033 tok_cur_tag = new_end_tag c.toLowerCase()
3034 tok_state = tok_state_tag_name
3035 else if is_lc_alpha(c)
3036 tok_cur_tag = new_end_tag c
3037 tok_state = tok_state_tag_name
3040 tok_cur_tag = new_comment_token '/'
3041 tok_state = tok_state_bogus_comment
3044 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3045 tok_state_tag_name = ->
3046 switch c = txt.charAt(cur++)
3047 when "\t", "\n", "\u000c", ' '
3048 tok_state = tok_state_before_attribute_name
3050 tok_state = tok_state_self_closing_start_tag
3052 tok_state = tok_state_data
3058 tok_cur_tag.name += "\ufffd"
3061 tok_state = tok_state_data
3064 tok_cur_tag.name += c.toLowerCase()
3066 tok_cur_tag.name += c
3069 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3070 tok_state_rcdata_less_than_sign = ->
3071 c = txt.charAt(cur++)
3073 temporary_buffer = ''
3074 tok_state = tok_state_rcdata_end_tag_open
3077 tok_state = tok_state_rcdata
3078 cur -= 1 # reconsume the input character
3079 return new_character_token '<'
3081 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3082 tok_state_rcdata_end_tag_open = ->
3083 c = txt.charAt(cur++)
3085 tok_cur_tag = new_end_tag c.toLowerCase()
3086 temporary_buffer += c
3087 tok_state = tok_state_rcdata_end_tag_name
3090 tok_cur_tag = new_end_tag c
3091 temporary_buffer += c
3092 tok_state = tok_state_rcdata_end_tag_name
3095 tok_state = tok_state_rcdata
3096 cur -= 1 # reconsume the input character
3097 return new_character_token "</" # fixfull separate these
3099 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
is_appropriate_end_tag = (t) ->
	# The spec defines an "appropriate end tag" by comparing against the tag
	# name of the last start tag emitted by the tokenizer. This is only
	# called from the various "raw" tokenizer states, so the name of the
	# current node (open_els[0]) is used as a stand-in. TODO: verify this
	# after the script data states are implemented
	debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
	# anything that is not an end tag can never be appropriate
	return false unless t.type is TYPE_END_TAG
	return t.name is open_els[0].name
3109 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3110 tok_state_rcdata_end_tag_name = ->
3111 c = txt.charAt(cur++)
3112 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3113 if is_appropriate_end_tag tok_cur_tag
3114 tok_state = tok_state_before_attribute_name
3116 # else fall through to "Anything else"
3118 if is_appropriate_end_tag tok_cur_tag
3119 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3121 # else fall through to "Anything else"
3123 if is_appropriate_end_tag tok_cur_tag
3124 tok_state = tok_state_data
3126 # else fall through to "Anything else"
3128 tok_cur_tag.name += c.toLowerCase()
3129 temporary_buffer += c
3132 tok_cur_tag.name += c
3133 temporary_buffer += c
3136 tok_state = tok_state_rcdata
3137 cur -= 1 # reconsume the input character
3138 return new_character_token '</' + temporary_buffer # fixfull separate these
3140 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3141 tok_state_rawtext_less_than_sign = ->
3142 c = txt.charAt(cur++)
3144 temporary_buffer = ''
3145 tok_state = tok_state_rawtext_end_tag_open
3148 tok_state = tok_state_rawtext
3149 cur -= 1 # reconsume the input character
3150 return new_character_token '<'
3152 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3153 tok_state_rawtext_end_tag_open = ->
3154 c = txt.charAt(cur++)
3156 tok_cur_tag = new_end_tag c.toLowerCase()
3157 temporary_buffer += c
3158 tok_state = tok_state_rawtext_end_tag_name
3161 tok_cur_tag = new_end_tag c
3162 temporary_buffer += c
3163 tok_state = tok_state_rawtext_end_tag_name
3166 tok_state = tok_state_rawtext
3167 cur -= 1 # reconsume the input character
3168 return new_character_token "</" # fixfull separate these
3170 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3171 tok_state_rawtext_end_tag_name = ->
3172 c = txt.charAt(cur++)
3173 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3174 if is_appropriate_end_tag tok_cur_tag
3175 tok_state = tok_state_before_attribute_name
3177 # else fall through to "Anything else"
3179 if is_appropriate_end_tag tok_cur_tag
3180 tok_state = tok_state_self_closing_start_tag
3182 # else fall through to "Anything else"
3184 if is_appropriate_end_tag tok_cur_tag
3185 tok_state = tok_state_data
3187 # else fall through to "Anything else"
3189 tok_cur_tag.name += c.toLowerCase()
3190 temporary_buffer += c
3193 tok_cur_tag.name += c
3194 temporary_buffer += c
3197 tok_state = tok_state_rawtext
3198 cur -= 1 # reconsume the input character
3199 return new_character_token '</' + temporary_buffer # fixfull separate these
3201 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3202 tok_state_script_data_less_than_sign = ->
3203 c = txt.charAt(cur++)
3205 temporary_buffer = ''
3206 tok_state = tok_state_script_data_end_tag_open
3209 tok_state = tok_state_script_data_escape_start
3210 return new_character_token '<!' # fixfull split
3212 tok_state = tok_state_script_data
3213 cur -= 1 # Reconsume
3214 return new_character_token '<'
3216 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3217 tok_state_script_data_end_tag_open = ->
3218 c = txt.charAt(cur++)
3220 tok_cur_tag = new_end_tag c.toLowerCase()
3221 temporary_buffer += c
3222 tok_state = tok_state_script_data_end_tag_name
3225 tok_cur_tag = new_end_tag c
3226 temporary_buffer += c
3227 tok_state = tok_state_script_data_end_tag_name
3230 tok_state = tok_state_script_data
3231 cur -= 1 # Reconsume
3232 return new_character_token '</'
# 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3235 tok_state_script_data_end_tag_name = ->
3236 c = txt.charAt(cur++)
3237 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3238 if is_appropriate_end_tag tok_cur_tag
3239 tok_state = tok_state_before_attribute_name
3243 if is_appropriate_end_tag tok_cur_tag
3244 tok_state = tok_state_self_closing_start_tag
3248 if is_appropriate_end_tag tok_cur_tag
3249 tok_state = tok_state_data
3253 tok_cur_tag.name += c.toLowerCase()
3254 temporary_buffer += c
3257 tok_cur_tag.name += c
3258 temporary_buffer += c
3261 tok_state = tok_state_script_data
3262 cur -= 1 # Reconsume
3263 return new_character_token "</#{temporary_buffer}" # fixfull split
3265 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3266 tok_state_script_data_escape_start = ->
3267 c = txt.charAt(cur++)
3269 tok_state = tok_state_script_data_escape_start_dash
3270 return new_character_token '-'
3272 tok_state = tok_state_script_data
3273 cur -= 1 # Reconsume
3276 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3277 tok_state_script_data_escape_start_dash = ->
3278 c = txt.charAt(cur++)
3280 tok_state = tok_state_script_data_escaped_dash_dash
3281 return new_character_token '-'
3283 tok_state = tok_state_script_data
3284 cur -= 1 # Reconsume
3287 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3288 tok_state_script_data_escaped = ->
3289 c = txt.charAt(cur++)
3291 tok_state = tok_state_script_data_escaped_dash
3292 return new_character_token '-'
3294 tok_state = tok_state_script_data_escaped_less_than_sign
3298 return new_character_token "\ufffd"
3300 tok_state = tok_state_data
3302 cur -= 1 # Reconsume
3305 return new_character_token c
3307 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Handler for the script-data-escaped-dash tokenizer state (one '-' already seen).
# Reads one character from txt (advancing cur). Visible outcomes, in order:
#   * enter escaped-dash-dash and return a '-' character token
#   * enter the escaped-less-than-sign state
#   * return to the escaped state with a U+FFFD character token
#   * switch to the data state and reconsume (presumably end of input -- confirm)
#   * otherwise return to the escaped state and return the character as a token.
3308 tok_state_script_data_escaped_dash = ->
3309 c = txt.charAt(cur++)
3311 tok_state = tok_state_script_data_escaped_dash_dash
3312 return new_character_token '-'
3314 tok_state = tok_state_script_data_escaped_less_than_sign
3318 tok_state = tok_state_script_data_escaped
3319 return new_character_token "\ufffd"
3321 tok_state = tok_state_data
3323 cur -= 1 # Reconsume
3326 tok_state = tok_state_script_data_escaped
3327 return new_character_token c
3329 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Handler for the script-data-escaped-dash-dash tokenizer state ('--' already seen).
# Reads one character from txt (advancing cur). Visible outcomes, in order:
#   * return a '-' character token without changing state
#   * enter the escaped-less-than-sign state
#   * leave escaping: switch to script-data and return a '>' character token
#   * return to the escaped state with a U+FFFD character token
#   * switch to the data state and reconsume (presumably end of input -- confirm)
#   * otherwise return to the escaped state and return the character as a token.
3330 tok_state_script_data_escaped_dash_dash = ->
3331 c = txt.charAt(cur++)
3333 return new_character_token '-'
3335 tok_state = tok_state_script_data_escaped_less_than_sign
3338 tok_state = tok_state_script_data
3339 return new_character_token '>'
3342 tok_state = tok_state_script_data_escaped
3343 return new_character_token "\ufffd"
3346 tok_state = tok_state_data
3347 cur -= 1 # Reconsume
3350 tok_state = tok_state_script_data_escaped
3351 return new_character_token c
3353 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Handler for the script-data-escaped-less-than-sign tokenizer state.
# Entered after a '<' was consumed inside escaped script data; that '<' has not
# been emitted yet, so every exit from this state is responsible for it.
# Reads one character from txt (advancing cur). Visible outcomes, in order:
#   * clear temporary_buffer and enter escaped-end-tag-open
#   * letter branches: seed temporary_buffer (lower-cased for the first branch),
#     enter double-escape-start and return "<" plus the character as one token
#   * fallback: return to the escaped state, reconsume the character, and emit
#     the pending '<'.
# Fix: the fallback used to return a token for c while also stepping cur back,
# so c was emitted twice and the '<' was lost. The spec says to emit '<' here.
3354 tok_state_script_data_escaped_less_than_sign = ->
3355 c = txt.charAt(cur++)
3357 temporary_buffer = ''
3358 tok_state = tok_state_script_data_escaped_end_tag_open
3361 temporary_buffer = c.toLowerCase() # yes, really
3362 tok_state = tok_state_script_data_double_escape_start
3363 return new_character_token "<#{c}" # fixfull split
3365 temporary_buffer = c
3366 tok_state = tok_state_script_data_double_escape_start
3367 return new_character_token "<#{c}" # fixfull split
3369 tok_state = tok_state_script_data_escaped
3370 cur -= 1 # Reconsume
3371 return new_character_token '<' # c itself is re-read by the escaped state
3373 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Handler for the script-data-escaped-end-tag-open tokenizer state ("</" seen).
# Reads one character from txt (advancing cur). Visible outcomes:
#   * letter branches: start a new end tag token in tok_cur_tag (name lower-cased
#     in the first branch), append the raw character to temporary_buffer and
#     enter the escaped-end-tag-name state
#   * fallback: return to the escaped state, reconsume, and return '</' as text.
3374 tok_state_script_data_escaped_end_tag_open = ->
3375 c = txt.charAt(cur++)
3377 tok_cur_tag = new_end_tag c.toLowerCase()
3378 temporary_buffer += c
3379 tok_state = tok_state_script_data_escaped_end_tag_name
3382 tok_cur_tag = new_end_tag c
3383 temporary_buffer += c
3384 tok_state = tok_state_script_data_escaped_end_tag_name
3387 tok_state = tok_state_script_data_escaped
3388 cur -= 1 # Reconsume
3389 return new_character_token '</' # fixfull split
3391 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Handler for the script-data-escaped-end-tag-name tokenizer state.
# Reads one character from txt (advancing cur).
#   * whitespace / the two following branches: when tok_cur_tag is an
#     appropriate end tag, move on to before-attribute-name,
#     self-closing-start-tag or data respectively
#   * letter branches: extend the tag name (lower-cased) and temporary_buffer
#   * fallback: the candidate was not an end tag after all; return to the
#     escaped state, reconsume, and emit "</" + temporary_buffer as text.
# Fix: temporary_buffer is echoed back as character data by the fallback, so it
# must hold the characters exactly as written. It used to be lower-cased here,
# which rewrote e.g. "</SCRipx" to "</Scripx" in the output and disagreed with
# the end-tag-open state, which stores the first character unchanged.
3392 tok_state_script_data_escaped_end_tag_name = ->
3393 c = txt.charAt(cur++)
3394 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3395 if is_appropriate_end_tag tok_cur_tag
3396 tok_state = tok_state_before_attribute_name
3400 if is_appropriate_end_tag tok_cur_tag
3401 tok_state = tok_state_self_closing_start_tag
3405 if is_appropriate_end_tag tok_cur_tag
3406 tok_state = tok_state_data
3410 tok_cur_tag.name += c.toLowerCase()
3411 temporary_buffer += c
3414 tok_cur_tag.name += c
3415 temporary_buffer += c
3418 tok_state = tok_state_script_data_escaped
3419 cur -= 1 # Reconsume
3420 return new_character_token "</#{temporary_buffer}" # fixfull split
3422 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Handler for the script-data-double-escape-start tokenizer state.
# Reads one character from txt (advancing cur).
#   * on whitespace, '/' or '>': if temporary_buffer spells 'script' enter the
#     double-escaped state, otherwise the escaped state; the character is
#     returned as a token either way
#   * letter branches: append the character to temporary_buffer (lower-cased in
#     the first branch) and return it as a token
#   * fallback: return to the escaped state and reconsume the character.
3423 tok_state_script_data_double_escape_start = ->
3424 c = txt.charAt(cur++)
3425 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3426 if temporary_buffer is 'script'
3427 tok_state = tok_state_script_data_double_escaped
3429 tok_state = tok_state_script_data_escaped
3430 return new_character_token c
3432 temporary_buffer += c.toLowerCase() # yes, really lowercase
3433 return new_character_token c
3435 temporary_buffer += c
3436 return new_character_token c
3438 tok_state = tok_state_script_data_escaped
3439 cur -= 1 # Reconsume
3442 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Handler for the script-data-double-escaped tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes, in order:
#   * enter double-escaped-dash and return a '-' character token
#   * enter double-escaped-less-than-sign and return a '<' character token
#   * return a U+FFFD character token
#   * switch to the data state and reconsume (presumably end of input -- confirm)
#   * otherwise return the character itself as a token.
3443 tok_state_script_data_double_escaped = ->
3444 c = txt.charAt(cur++)
3446 tok_state = tok_state_script_data_double_escaped_dash
3447 return new_character_token '-'
3449 tok_state = tok_state_script_data_double_escaped_less_than_sign
3450 return new_character_token '<'
3453 return new_character_token "\ufffd"
3456 tok_state = tok_state_data
3457 cur -= 1 # Reconsume
3460 return new_character_token c
3462 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Handler for the script-data-double-escaped-dash tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes, in order:
#   * enter double-escaped-dash-dash and return a '-' character token
#   * enter double-escaped-less-than-sign and return a '<' character token
#   * return to double-escaped with a U+FFFD character token
#   * switch to the data state and reconsume (presumably end of input -- confirm)
#   * otherwise return to double-escaped and return the character as a token.
3463 tok_state_script_data_double_escaped_dash = ->
3464 c = txt.charAt(cur++)
3466 tok_state = tok_state_script_data_double_escaped_dash_dash
3467 return new_character_token '-'
3469 tok_state = tok_state_script_data_double_escaped_less_than_sign
3470 return new_character_token '<'
3473 tok_state = tok_state_script_data_double_escaped
3474 return new_character_token "\ufffd"
3477 tok_state = tok_state_data
3478 cur -= 1 # Reconsume
3481 tok_state = tok_state_script_data_double_escaped
3482 return new_character_token c
3484 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Handler for the script-data-double-escaped-dash-dash tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes, in order:
#   * return a '-' character token without changing state
#   * enter double-escaped-less-than-sign and return a '<' character token
#   * switch to script-data and return a '>' character token
#   * return to double-escaped with a U+FFFD character token
#   * switch to the data state and reconsume (presumably end of input -- confirm)
#   * otherwise return to double-escaped and return the character as a token.
3485 tok_state_script_data_double_escaped_dash_dash = ->
3486 c = txt.charAt(cur++)
3488 return new_character_token '-'
3490 tok_state = tok_state_script_data_double_escaped_less_than_sign
3491 return new_character_token '<'
3493 tok_state = tok_state_script_data
3494 return new_character_token '>'
3497 tok_state = tok_state_script_data_double_escaped
3498 return new_character_token "\ufffd"
3501 tok_state = tok_state_data
3502 cur -= 1 # Reconsume
3505 tok_state = tok_state_script_data_double_escaped
3506 return new_character_token c
3508 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Handler for the script-data-double-escaped-less-than-sign tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes:
#   * clear temporary_buffer, enter double-escape-end and return a '/' token
#     (presumably when c is '/' -- confirm against the linked spec section)
#   * otherwise return to the double-escaped state and reconsume the character.
3509 tok_state_script_data_double_escaped_less_than_sign = ->
3510 c = txt.charAt(cur++)
3512 temporary_buffer = ''
3513 tok_state = tok_state_script_data_double_escape_end
3514 return new_character_token '/'
3516 tok_state = tok_state_script_data_double_escaped
3517 cur -= 1 # Reconsume
3520 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Handler for the script-data-double-escape-end tokenizer state.
# Mirror image of the double-escape-start handler:
#   * on whitespace, '/' or '>': if temporary_buffer spells 'script' drop back
#     to the (single) escaped state, otherwise stay double-escaped; the
#     character is returned as a token either way
#   * letter branches: append the character to temporary_buffer (lower-cased in
#     the first branch) and return it as a token
#   * fallback: return to the double-escaped state and reconsume the character.
3521 tok_state_script_data_double_escape_end = ->
3522 c = txt.charAt(cur++)
3523 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3524 if temporary_buffer is 'script'
3525 tok_state = tok_state_script_data_escaped
3527 tok_state = tok_state_script_data_double_escaped
3528 return new_character_token c
3530 temporary_buffer += c.toLowerCase() # yes, really lowercase
3531 return new_character_token c
3533 temporary_buffer += c
3534 return new_character_token c
3536 tok_state = tok_state_script_data_double_escaped
3537 cur -= 1 # Reconsume
3540 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Handler for the before-attribute-name tokenizer state.
# Switches on the next character of txt (advancing cur). Whitespace has its own
# branch; other visible branches move to self-closing-start-tag or to data,
# or pick the first character of a new attribute name (U+FFFD in one branch,
# the lower-cased character in another).
# A new attribute is stored as a [name, value] pair at the FRONT of
# tok_cur_tag.attrs_a, so attrs_a[0] is always the attribute being built.
3541 tok_state_before_attribute_name = ->
3543 switch c = txt.charAt(cur++)
3544 when "\t", "\n", "\u000c", ' '
3547 tok_state = tok_state_self_closing_start_tag
3550 tok_state = tok_state_data
3556 attr_name = "\ufffd"
3557 when '"', "'", '<', '='
3562 tok_state = tok_state_data
3565 attr_name = c.toLowerCase()
3569 tok_cur_tag.attrs_a.unshift [attr_name, '']
3570 tok_state = tok_state_attribute_name
3573 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Handler for the attribute-name tokenizer state.
# Switches on the next character of txt (advancing cur). Whitespace moves to
# after-attribute-name; other visible branches move to self-closing-start-tag,
# before-attribute-value or data, or append to the current attribute's name.
# tok_cur_tag.attrs_a[0][0] is the name of the attribute being built; it is
# extended with U+FFFD, the raw character, or the lower-cased character
# depending on the branch.
3574 tok_state_attribute_name = ->
3575 switch c = txt.charAt(cur++)
3576 when "\t", "\n", "\u000c", ' '
3577 tok_state = tok_state_after_attribute_name
3579 tok_state = tok_state_self_closing_start_tag
3581 tok_state = tok_state_before_attribute_value
3583 tok_state = tok_state_data
3589 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3592 tok_cur_tag.attrs_a[0][0] += c
3595 tok_state = tok_state_data
3598 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3600 tok_cur_tag.attrs_a[0][0] += c
3603 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Handler for the after-attribute-name tokenizer state.
# Reads one character from txt (advancing cur). Whitespace has its own branch;
# other visible branches move to self-closing-start-tag, before-attribute-value
# or data, or start a NEW attribute by unshifting a [name, ''] pair onto
# tok_cur_tag.attrs_a (name is the lower-cased character, U+FFFD, or the raw
# character) and entering the attribute-name state.
# One branch switches to data and reconsumes (presumably end of input -- confirm).
3604 tok_state_after_attribute_name = ->
3605 c = txt.charAt(cur++)
3606 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3609 tok_state = tok_state_self_closing_start_tag
3612 tok_state = tok_state_before_attribute_value
3615 tok_state = tok_state_data
3618 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3619 tok_state = tok_state_attribute_name
3623 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3624 tok_state = tok_state_attribute_name
3628 tok_state = tok_state_data
3629 cur -= 1 # reconsume
3631 if c is '"' or c is "'" or c is '<'
3633 # fall through to Anything else
3635 tok_cur_tag.attrs_a.unshift [c, '']
3636 tok_state = tok_state_attribute_name
3638 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Handler for the before-attribute-value tokenizer state.
# Switches on the next character of txt (advancing cur). Whitespace has its own
# branch; other visible branches select the double-quoted, unquoted or
# single-quoted value state, or switch to data.
# tok_cur_tag.attrs_a[0][1] is the value of the attribute being built; two
# branches seed it (with U+FFFD or with the character) before entering the
# unquoted-value state.
3639 tok_state_before_attribute_value = ->
3640 switch c = txt.charAt(cur++)
3641 when "\t", "\n", "\u000c", ' '
3644 tok_state = tok_state_attribute_value_double_quoted
3646 tok_state = tok_state_attribute_value_unquoted
3649 tok_state = tok_state_attribute_value_single_quoted
3652 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3653 tok_state = tok_state_attribute_value_unquoted
3656 tok_state = tok_state_data
3662 tok_state = tok_state_data
3664 tok_cur_tag.attrs_a[0][1] += c
3665 tok_state = tok_state_attribute_value_unquoted
3668 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Handler for the attribute-value-(double-quoted) tokenizer state.
# Switches on the next character of txt (advancing cur). Visible branches:
# move to after-attribute-value-(quoted); append the result of
# parse_character_reference (called with '"' as the additional allowed
# character and in_attr = true); append U+FFFD; switch to data; or append the
# character itself. Appends go to attrs_a[0][1], the current attribute's value.
3669 tok_state_attribute_value_double_quoted = ->
3670 switch c = txt.charAt(cur++)
3672 tok_state = tok_state_after_attribute_value_quoted
3674 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3677 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3680 tok_state = tok_state_data
3682 tok_cur_tag.attrs_a[0][1] += c
3685 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Handler for the attribute-value-(single-quoted) tokenizer state.
# Same shape as the double-quoted handler, except parse_character_reference is
# called with "'" as the additional allowed character. Appends go to
# attrs_a[0][1], the value of the attribute being built.
3686 tok_state_attribute_value_single_quoted = ->
3687 switch c = txt.charAt(cur++)
3689 tok_state = tok_state_after_attribute_value_quoted
3691 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3694 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3697 tok_state = tok_state_data
3699 tok_cur_tag.attrs_a[0][1] += c
3702 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Handler for the attribute-value-(unquoted) tokenizer state.
# Switches on the next character of txt (advancing cur). Whitespace ends the
# value and returns to before-attribute-name. Other visible branches append the
# result of parse_character_reference (additional allowed character '>',
# in_attr = true), switch to data, append U+FFFD, or append the character
# itself to attrs_a[0][1].
3703 tok_state_attribute_value_unquoted = ->
3704 switch c = txt.charAt(cur++)
3705 when "\t", "\n", "\u000c", ' '
3706 tok_state = tok_state_before_attribute_name
3708 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3710 tok_state = tok_state_data
3715 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3718 tok_state = tok_state_data
3720 # Parse Error if ', <, = or ` (backtick)
3721 tok_cur_tag.attrs_a[0][1] += c
3724 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Handler for the after-attribute-value-(quoted) tokenizer state.
# Switches on the next character of txt (advancing cur). Whitespace returns to
# before-attribute-name; other visible branches move to self-closing-start-tag
# or data. The last visible branch returns to before-attribute-name and steps
# cur back so the unexpected character is processed there.
3725 tok_state_after_attribute_value_quoted = ->
3726 switch c = txt.charAt(cur++)
3727 when "\t", "\n", "\u000c", ' '
3728 tok_state = tok_state_before_attribute_name
3730 tok_state = tok_state_self_closing_start_tag
3732 tok_state = tok_state_data
3738 tok_state = tok_state_data
3741 tok_state = tok_state_before_attribute_name
3742 cur -= 1 # we didn't handle that char
3745 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Handler for the self-closing-start-tag tokenizer state (a '/' inside a tag).
# Reads one character from txt (advancing cur). Visible outcomes:
#   * set the 'self-closing' flag on tok_cur_tag and switch to data
#     (presumably when c is '>' -- confirm against the linked spec section)
#   * switch to data and reconsume (presumably end of input -- confirm)
#   * otherwise return to before-attribute-name and reconsume the character.
3746 tok_state_self_closing_start_tag = ->
3747 c = txt.charAt(cur++)
3749 tok_cur_tag.flag 'self-closing'
3750 tok_state = tok_state_data
3754 tok_state = tok_state_data
3755 cur -= 1 # Reconsume
3759 tok_state = tok_state_before_attribute_name
3760 cur -= 1 # Reconsume
3763 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3764 # WARNING: put a comment token in tok_cur_tag before setting this state
# Handler for the bogus-comment tokenizer state.
# Unlike the per-character handlers, this consumes in bulk: it looks for the
# next '>' at or after cur and takes either the rest of txt or the text up to
# that '>' as the comment body. U+0000 characters in the body are replaced by
# U+FFFD, the body is appended to tok_cur_tag.text (the caller must already
# have put a comment token in tok_cur_tag), and the tokenizer returns to data.
3765 tok_state_bogus_comment = ->
3766 next_gt = txt.indexOf '>', cur
3768 val = txt.substr cur
3771 val = txt.substr cur, (next_gt - cur)
3773 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3774 tok_cur_tag.text += val
3775 tok_state = tok_state_data
3778 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Handler for the markup-declaration-open tokenizer state ("<!" seen).
# Looks ahead in txt from cur without consuming a character first:
#   * "--"                      -> start an empty comment token, comment-start state
#   * "doctype" (any case)      -> doctype state
#   * "[CDATA[" (exact case), only when the adjusted current node exists and is
#     not in the HTML namespace -> cdata-section state
#   * anything else             -> start an empty comment token, bogus-comment state
3779 tok_state_markup_declaration_open = ->
3780 if txt.substr(cur, 2) is '--'
3782 tok_cur_tag = new_comment_token ''
3783 tok_state = tok_state_comment_start
3785 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3787 tok_state = tok_state_doctype
3789 acn = adjusted_current_node()
3790 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3792 tok_state = tok_state_cdata_section
3796 tok_cur_tag = new_comment_token ''
3797 tok_state = tok_state_bogus_comment
3800 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Handler for the comment-start tokenizer state (just after "<!--").
# Switches on the next character of txt (advancing cur). Visible outcomes:
#   * enter the comment-start-dash state
#   * append U+FFFD to the comment text and enter the comment state
#   * switch to data (two branches; the second also reconsumes)
#   * otherwise append the character to the comment text and enter comment.
# Fix: the U+FFFD branch used to return a stand-alone character token, which
# would put the replacement character into the document as text and leave it
# out of the comment. It now appends to tok_cur_tag.text, in the same
# statement order as the equivalent branch of the comment-start-dash handler.
# (Input pre-processing currently rewrites U+0000 before tokenizing, so this
# branch is latent.)
3801 tok_state_comment_start = ->
3802 switch c = txt.charAt(cur++)
3804 tok_state = tok_state_comment_start_dash
3807 tok_cur_tag.text += "\ufffd"
3808 tok_state = tok_state_comment
3811 tok_state = tok_state_data
3815 tok_state = tok_state_data
3816 cur -= 1 # Reconsume
3819 tok_cur_tag.text += c
3820 tok_state = tok_state_comment
3823 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Handler for the comment-start-dash tokenizer state ("<!---" seen).
# Switches on the next character of txt (advancing cur). Visible outcomes:
#   * enter the comment-end state
#   * append "-" plus U+FFFD to the comment text and enter the comment state
#   * switch to data (two branches; the second also reconsumes)
#   * otherwise append "-" plus the character and enter the comment state.
# The leading "-" in the appended text is the dash consumed by the previous state.
3824 tok_state_comment_start_dash = ->
3825 switch c = txt.charAt(cur++)
3827 tok_state = tok_state_comment_end
3830 tok_cur_tag.text += "-\ufffd"
3831 tok_state = tok_state_comment
3834 tok_state = tok_state_data
3838 tok_state = tok_state_data
3839 cur -= 1 # Reconsume
3842 tok_cur_tag.text += "-#{c}"
3843 tok_state = tok_state_comment
3846 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Handler for the comment tokenizer state (inside a comment body).
# Switches on the next character of txt (advancing cur). Visible outcomes:
#   * enter the comment-end-dash state
#   * append U+FFFD to the comment text
#   * switch to data and reconsume (presumably end of input -- confirm)
#   * otherwise append the character to the comment text.
3847 tok_state_comment = ->
3848 switch c = txt.charAt(cur++)
3850 tok_state = tok_state_comment_end_dash
3853 tok_cur_tag.text += "\ufffd"
3856 tok_state = tok_state_data
3857 cur -= 1 # Reconsume
3860 tok_cur_tag.text += c
3863 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Handler for the comment-end-dash tokenizer state (one '-' seen in a comment).
# Switches on the next character of txt (advancing cur). Visible outcomes:
#   * enter the comment-end state
#   * append "-" plus U+FFFD and return to the comment state
#   * switch to data and reconsume (presumably end of input -- confirm)
#   * otherwise append "-" plus the character and return to the comment state.
# The "-" prefix re-inserts the dash that led into this state.
3864 tok_state_comment_end_dash = ->
3865 switch c = txt.charAt(cur++)
3867 tok_state = tok_state_comment_end
3870 tok_cur_tag.text += "-\ufffd"
3871 tok_state = tok_state_comment
3874 tok_state = tok_state_data
3875 cur -= 1 # Reconsume
3878 tok_cur_tag.text += "-#{c}"
3879 tok_state = tok_state_comment
3882 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Handler for the comment-end tokenizer state ("--" seen in a comment).
# Switches on the next character of txt (advancing cur). Visible outcomes:
#   * switch to data (the comment is finished)
#   * append "--" plus U+FFFD and return to the comment state
#   * enter the comment-end-bang state
#   * append a single '-' and stay in this state
#   * switch to data and reconsume (presumably end of input -- confirm)
#   * otherwise append "--" plus the character and return to the comment state.
3883 tok_state_comment_end = ->
3884 switch c = txt.charAt(cur++)
3886 tok_state = tok_state_data
3890 tok_cur_tag.text += "--\ufffd"
3891 tok_state = tok_state_comment
3894 tok_state = tok_state_comment_end_bang
3897 tok_cur_tag.text += '-'
3900 tok_state = tok_state_data
3901 cur -= 1 # Reconsume
3905 tok_cur_tag.text += "--#{c}"
3906 tok_state = tok_state_comment
3909 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Handler for the comment-end-bang tokenizer state ("--!" seen in a comment).
# Switches on the next character of txt (advancing cur). Visible outcomes:
#   * append "--!" and enter the comment-end-dash state
#   * switch to data (the comment is finished)
#   * append "--!" plus U+FFFD and return to the comment state
#   * switch to data and reconsume (presumably end of input -- confirm)
#   * otherwise append "--!" plus the character and return to the comment state.
# Fix: the branch that hands over to comment-end-dash used to append
# "--!#{c}", i.e. it also stored the dash just read. The comment-end-dash
# handler already re-inserts that dash itself ("-" + next char), so the
# comment text gained a spurious extra '-'. Only "--!" is appended now, as the
# spec section linked above prescribes.
3910 tok_state_comment_end_bang = ->
3911 switch c = txt.charAt(cur++)
3913 tok_cur_tag.text += "--!"
3914 tok_state = tok_state_comment_end_dash
3916 tok_state = tok_state_data
3920 tok_cur_tag.text += "--!\ufffd"
3921 tok_state = tok_state_comment
3924 tok_state = tok_state_data
3925 cur -= 1 # Reconsume
3928 tok_cur_tag.text += "--!#{c}"
3929 tok_state = tok_state_comment
3932 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Handler for the doctype tokenizer state (just after "<!doctype").
# Switches on the next character of txt (advancing cur). Visible outcomes:
#   * whitespace -> before-doctype-name state
#   * switch to data, build an empty doctype token flagged 'force-quirks', and
#     reconsume (presumably end of input -- confirm)
#   * otherwise go to before-doctype-name and reconsume the character.
3933 tok_state_doctype = ->
3934 switch c = txt.charAt(cur++)
3935 when "\t", "\u000a", "\u000c", ' '
3936 tok_state = tok_state_before_doctype_name
3939 tok_state = tok_state_data
3940 el = new_doctype_token ''
3941 el.flag 'force-quirks', true
3942 cur -= 1 # Reconsume
3946 tok_state = tok_state_before_doctype_name
3947 cur -= 1 # Reconsume
3950 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Handler for the before-doctype-name tokenizer state.
# Reads one character from txt (advancing cur). Whitespace has its own branch.
# Visible outcomes for other characters:
#   * start a doctype token in tok_cur_tag whose name begins with the
#     lower-cased character, U+FFFD, or the raw character, and enter doctype-name
#   * two branches build an empty doctype token flagged 'force-quirks' and
#     switch to data; the second of them also reconsumes.
3951 tok_state_before_doctype_name = ->
3952 c = txt.charAt(cur++)
3953 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3956 tok_cur_tag = new_doctype_token c.toLowerCase()
3957 tok_state = tok_state_doctype_name
3961 tok_cur_tag = new_doctype_token "\ufffd"
3962 tok_state = tok_state_doctype_name
3966 el = new_doctype_token ''
3967 el.flag 'force-quirks', true
3968 tok_state = tok_state_data
3972 tok_state = tok_state_data
3973 el = new_doctype_token ''
3974 el.flag 'force-quirks', true
3975 cur -= 1 # Reconsume
3978 tok_cur_tag = new_doctype_token c
3979 tok_state = tok_state_doctype_name
3982 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Handler for the doctype-name tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes:
#   * whitespace -> after-doctype-name state
#   * switch to data
#   * append the lower-cased character, U+FFFD, or the raw character to
#     tok_cur_tag.name
#   * switch to data, flag 'force-quirks', and reconsume (presumably end of
#     input -- confirm).
3983 tok_state_doctype_name = ->
3984 c = txt.charAt(cur++)
3985 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3986 tok_state = tok_state_after_doctype_name
3989 tok_state = tok_state_data
3992 tok_cur_tag.name += c.toLowerCase()
3996 tok_cur_tag.name += "\ufffd"
4000 tok_state = tok_state_data
4001 tok_cur_tag.flag 'force-quirks', true
4002 cur -= 1 # Reconsume
4005 tok_cur_tag.name += c
4008 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Handler for the after-doctype-name tokenizer state.
# Reads one character from txt (advancing cur). Whitespace has its own branch;
# two branches switch to data (the second flags 'force-quirks' and reconsumes).
# Otherwise the six characters starting at the one just read (cur - 1) are
# compared case-insensitively with 'public' and 'system' to choose the matching
# keyword state; if neither matches the token is flagged 'force-quirks' and the
# tokenizer enters bogus-doctype.
4009 tok_state_after_doctype_name = ->
4010 c = txt.charAt(cur++)
4011 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4014 tok_state = tok_state_data
4018 tok_state = tok_state_data
4019 tok_cur_tag.flag 'force-quirks', true
4020 cur -= 1 # Reconsume
4023 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4025 tok_state = tok_state_after_doctype_public_keyword
4027 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4029 tok_state = tok_state_after_doctype_system_keyword
4032 tok_cur_tag.flag 'force-quirks', true
4033 tok_state = tok_state_bogus_doctype
4036 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Handler for the after-doctype-public-keyword tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes:
#   * whitespace -> before-doctype-public-identifier
#   * reset public_identifier to '' and enter the double- or single-quoted
#     public identifier state
#   * flag 'force-quirks' and switch to data (two branches; the second reconsumes)
#   * otherwise flag 'force-quirks' and enter bogus-doctype.
4037 tok_state_after_doctype_public_keyword = ->
4038 c = txt.charAt(cur++)
4039 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4040 tok_state = tok_state_before_doctype_public_identifier
4044 tok_cur_tag.public_identifier = ''
4045 tok_state = tok_state_doctype_public_identifier_double_quoted
4049 tok_cur_tag.public_identifier = ''
4050 tok_state = tok_state_doctype_public_identifier_single_quoted
4054 tok_cur_tag.flag 'force-quirks', true
4055 tok_state = tok_state_data
4059 tok_state = tok_state_data
4060 tok_cur_tag.flag 'force-quirks', true
4061 cur -= 1 # Reconsume
4065 tok_cur_tag.flag 'force-quirks', true
4066 tok_state = tok_state_bogus_doctype
4069 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Handler for the before-doctype-public-identifier tokenizer state.
# Reads one character from txt (advancing cur). Whitespace has its own branch.
# Other visible outcomes:
#   * reset public_identifier to '' and enter the double- or single-quoted
#     public identifier state
#   * flag 'force-quirks' and switch to data (two branches; the second reconsumes)
#   * otherwise flag 'force-quirks' and enter bogus-doctype.
4070 tok_state_before_doctype_public_identifier = ->
4071 c = txt.charAt(cur++)
4072 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4076 tok_cur_tag.public_identifier = ''
4077 tok_state = tok_state_doctype_public_identifier_double_quoted
4081 tok_cur_tag.public_identifier = ''
4082 tok_state = tok_state_doctype_public_identifier_single_quoted
4086 tok_cur_tag.flag 'force-quirks', true
4087 tok_state = tok_state_data
4091 tok_state = tok_state_data
4092 tok_cur_tag.flag 'force-quirks', true
4093 cur -= 1 # Reconsume
4097 tok_cur_tag.flag 'force-quirks', true
4098 tok_state = tok_state_bogus_doctype
4102 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Handler for the doctype-public-identifier-(double-quoted) tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes:
#   * enter after-doctype-public-identifier (presumably on the closing quote)
#   * append U+FFFD to public_identifier
#   * flag 'force-quirks' and switch to data (two branches; the second reconsumes)
#   * otherwise append the character to public_identifier.
4103 tok_state_doctype_public_identifier_double_quoted = ->
4104 c = txt.charAt(cur++)
4106 tok_state = tok_state_after_doctype_public_identifier
4110 tok_cur_tag.public_identifier += "\ufffd"
4114 tok_cur_tag.flag 'force-quirks', true
4115 tok_state = tok_state_data
4119 tok_state = tok_state_data
4120 tok_cur_tag.flag 'force-quirks', true
4121 cur -= 1 # Reconsume
4124 tok_cur_tag.public_identifier += c
4127 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Handler for the doctype-public-identifier-(single-quoted) tokenizer state.
# Same visible shape as the double-quoted handler:
#   * enter after-doctype-public-identifier (presumably on the closing quote)
#   * append U+FFFD to public_identifier
#   * flag 'force-quirks' and switch to data (two branches; the second reconsumes)
#   * otherwise append the character to public_identifier.
4128 tok_state_doctype_public_identifier_single_quoted = ->
4129 c = txt.charAt(cur++)
4131 tok_state = tok_state_after_doctype_public_identifier
4135 tok_cur_tag.public_identifier += "\ufffd"
4139 tok_cur_tag.flag 'force-quirks', true
4140 tok_state = tok_state_data
4144 tok_state = tok_state_data
4145 tok_cur_tag.flag 'force-quirks', true
4146 cur -= 1 # Reconsume
4149 tok_cur_tag.public_identifier += c
4152 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Handler for the after-doctype-public-identifier tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes:
#   * whitespace -> between-doctype-public-and-system-identifiers
#   * switch to data
#   * reset system_identifier to '' and enter the double- or single-quoted
#     system identifier state
#   * switch to data, flag 'force-quirks', and reconsume
#   * otherwise flag 'force-quirks' and enter bogus-doctype.
4153 tok_state_after_doctype_public_identifier = ->
4154 c = txt.charAt(cur++)
4155 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4156 tok_state = tok_state_between_doctype_public_and_system_identifiers
4159 tok_state = tok_state_data
4163 tok_cur_tag.system_identifier = ''
4164 tok_state = tok_state_doctype_system_identifier_double_quoted
4168 tok_cur_tag.system_identifier = ''
4169 tok_state = tok_state_doctype_system_identifier_single_quoted
4173 tok_state = tok_state_data
4174 tok_cur_tag.flag 'force-quirks', true
4175 cur -= 1 # Reconsume
4179 tok_cur_tag.flag 'force-quirks', true
4180 tok_state = tok_state_bogus_doctype
4183 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Handler for the between-doctype-public-and-system-identifiers tokenizer state.
# Reads one character from txt (advancing cur). Whitespace has its own branch.
# Other visible outcomes:
#   * switch to data
#   * reset system_identifier to '' and enter the double- or single-quoted
#     system identifier state
#   * switch to data, flag 'force-quirks', and reconsume
#   * otherwise flag 'force-quirks' and enter bogus-doctype.
4184 tok_state_between_doctype_public_and_system_identifiers = ->
4185 c = txt.charAt(cur++)
4186 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4189 tok_state = tok_state_data
4193 tok_cur_tag.system_identifier = ''
4194 tok_state = tok_state_doctype_system_identifier_double_quoted
4198 tok_cur_tag.system_identifier = ''
4199 tok_state = tok_state_doctype_system_identifier_single_quoted
4203 tok_state = tok_state_data
4204 tok_cur_tag.flag 'force-quirks', true
4205 cur -= 1 # Reconsume
4209 tok_cur_tag.flag 'force-quirks', true
4210 tok_state = tok_state_bogus_doctype
4213 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Handler for the after-doctype-system-keyword tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes:
#   * whitespace -> before-doctype-system-identifier
#   * reset system_identifier to '' and enter the double- or single-quoted
#     system identifier state
#   * flag 'force-quirks' and switch to data (two branches; the second reconsumes)
#   * otherwise flag 'force-quirks' and enter bogus-doctype.
4214 tok_state_after_doctype_system_keyword = ->
4215 c = txt.charAt(cur++)
4216 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4217 tok_state = tok_state_before_doctype_system_identifier
4221 tok_cur_tag.system_identifier = ''
4222 tok_state = tok_state_doctype_system_identifier_double_quoted
4226 tok_cur_tag.system_identifier = ''
4227 tok_state = tok_state_doctype_system_identifier_single_quoted
4231 tok_cur_tag.flag 'force-quirks', true
4232 tok_state = tok_state_data
4236 tok_state = tok_state_data
4237 tok_cur_tag.flag 'force-quirks', true
4238 cur -= 1 # Reconsume
4242 tok_cur_tag.flag 'force-quirks', true
4243 tok_state = tok_state_bogus_doctype
4246 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Handler for the before-doctype-system-identifier tokenizer state.
# Reads one character from txt (advancing cur). Whitespace has its own branch.
# Other visible outcomes:
#   * reset system_identifier to '' and enter the double- or single-quoted
#     system identifier state
#   * flag 'force-quirks' and switch to data (two branches; the second reconsumes)
#   * otherwise flag 'force-quirks' and enter bogus-doctype.
4247 tok_state_before_doctype_system_identifier = ->
4248 c = txt.charAt(cur++)
4249 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4252 tok_cur_tag.system_identifier = ''
4253 tok_state = tok_state_doctype_system_identifier_double_quoted
4256 tok_cur_tag.system_identifier = ''
4257 tok_state = tok_state_doctype_system_identifier_single_quoted
4261 tok_cur_tag.flag 'force-quirks', true
4262 tok_state = tok_state_data
4266 tok_state = tok_state_data
4267 tok_cur_tag.flag 'force-quirks', true
4268 cur -= 1 # Reconsume
4272 tok_cur_tag.flag 'force-quirks', true
4273 tok_state = tok_state_bogus_doctype
4276 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Handler for the doctype-system-identifier-(double-quoted) tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes:
#   * enter after-doctype-system-identifier (presumably on the closing quote)
#   * append U+FFFD to system_identifier
#   * flag 'force-quirks' and switch to data (two branches; the second reconsumes)
#   * otherwise append the character to system_identifier.
4277 tok_state_doctype_system_identifier_double_quoted = ->
4278 c = txt.charAt(cur++)
4280 tok_state = tok_state_after_doctype_system_identifier
4284 tok_cur_tag.system_identifier += "\ufffd"
4288 tok_cur_tag.flag 'force-quirks', true
4289 tok_state = tok_state_data
4293 tok_state = tok_state_data
4294 tok_cur_tag.flag 'force-quirks', true
4295 cur -= 1 # Reconsume
4298 tok_cur_tag.system_identifier += c
4301 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Handler for the doctype-system-identifier-(single-quoted) tokenizer state.
# Same visible shape as the double-quoted handler:
#   * enter after-doctype-system-identifier (presumably on the closing quote)
#   * append U+FFFD to system_identifier
#   * flag 'force-quirks' and switch to data (two branches; the second reconsumes)
#   * otherwise append the character to system_identifier.
4302 tok_state_doctype_system_identifier_single_quoted = ->
4303 c = txt.charAt(cur++)
4305 tok_state = tok_state_after_doctype_system_identifier
4309 tok_cur_tag.system_identifier += "\ufffd"
4313 tok_cur_tag.flag 'force-quirks', true
4314 tok_state = tok_state_data
4318 tok_state = tok_state_data
4319 tok_cur_tag.flag 'force-quirks', true
4320 cur -= 1 # Reconsume
4323 tok_cur_tag.system_identifier += c
4326 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Handler for the after-doctype-system-identifier tokenizer state.
# Reads one character from txt (advancing cur). Whitespace has its own branch.
# Other visible outcomes:
#   * switch to data
#   * switch to data, flag 'force-quirks', and reconsume
#   * otherwise enter bogus-doctype WITHOUT setting 'force-quirks' (deliberate,
#     see the inline note).
4327 tok_state_after_doctype_system_identifier = ->
4328 c = txt.charAt(cur++)
4329 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4332 tok_state = tok_state_data
4336 tok_state = tok_state_data
4337 tok_cur_tag.flag 'force-quirks', true
4338 cur -= 1 # Reconsume
4342 # do _not_ tok_cur_tag.flag 'force-quirks', true
4343 tok_state = tok_state_bogus_doctype
4346 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Handler for the bogus-doctype tokenizer state.
# Reads one character from txt (advancing cur). Visible outcomes:
#   * switch to data
#   * switch to data and reconsume (presumably end of input -- confirm)
# No visible line stores the character anywhere, i.e. the rest of a malformed
# doctype is skipped.
4347 tok_state_bogus_doctype = ->
4348 c = txt.charAt(cur++)
4350 tok_state = tok_state_data
4353 tok_state = tok_state_data
4354 cur -= 1 # Reconsume
4359 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Handler for the cdata-section tokenizer state.
# Consumes in bulk: switches straight back to the data state, searches txt for
# the "]]>" terminator at or after cur, and takes either the rest of txt or the
# text before the terminator. That text is returned as ONE character token
# (see the "fixfull split" marker).
4360 tok_state_cdata_section = ->
4361 tok_state = tok_state_data
4362 next_gt = txt.indexOf ']]>', cur
4364 val = txt.substr cur
4367 val = txt.substr cur, (next_gt - cur)
4369 return new_character_token val # fixfull split
4371 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4372 # Don't set this as a state, just call it
4373 # returns a string (NOT a text node)
# Consume a character reference starting at cur (the '&' is presumably already
# consumed by the caller -- confirm).
#
# allowed_char: an extra character that, when it is the next character, means
#               "this is not a character reference" (callers in attribute
#               value states pass the closing quote or '>'); default null.
# in_attr:      true when called while tokenizing an attribute value; enables
#               the '=' / alphanumeric exceptions for legacy (no ';') named
#               references further down; default false.
#
# Visible processing steps:
#   1. Bail out at end of input, or when the next character is whitespace,
#      '<', '&', the empty string, or allowed_char.
#   2. Numeric references: an 'x'/'X' after the marker selects another digit
#      set; digits are collected from `charset`, a trailing ';' is looked for,
#      leading zeros are stripped and the rest parsed with parseInt in `base`.
#      The code point is remapped through unicode_fixes when listed there,
#      checked against the surrogate range / > 0x10FFFF, checked against the
#      control-character and non-character list, and finally converted with
#      from_code_point.
#   3. Named references terminated by ';' are decoded with
#      decode_named_char_ref.
#   4. Named references without ';' are matched against legacy_char_refs,
#      trying candidate lengths from 2 up to `max`.
4374 parse_character_reference = (allowed_char = null, in_attr = false) ->
4375 if cur >= txt.length
4377 switch c = txt.charAt(cur)
4378 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4379 # explicitly not a parse error
4382 # there has to be "one or more" alnums between & and ; to be a parse error
4385 if cur + 1 >= txt.length
4387 if txt.charAt(cur + 1).toLowerCase() is 'x'
4396 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4401 if txt.charAt(start + i) is ';'
4405 code_point = txt.substr(start, i)
4406 while code_point.charAt(0) is '0' and code_point.length > 1
4407 code_point = code_point.substr 1
4408 code_point = parseInt(code_point, base)
4409 if unicode_fixes[code_point]?
4411 return unicode_fixes[code_point]
4413 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4417 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4419 return from_code_point code_point
4423 if alnum.indexOf(txt.charAt(cur + i)) is -1
4426 # exit early, because parse_error() below needs at least one alnum
4428 if txt.charAt(cur + i) is ';'
4429 i += 1 # include ';' terminator in value
4430 decoded = decode_named_char_ref txt.substr(cur, i)
4437 # no ';' terminator (only legacy char refs)
4439 for i in [2..max] # no prefix matches, so ok to check shortest first
4440 c = legacy_char_refs[txt.substr(cur, i)]
4443 if txt.charAt(cur + i) is '='
4444 # "because some legacy user agents will
4445 # misinterpret the markup in those cases"
4448 if alnum.indexOf(txt.charAt(cur + i)) > -1
4449 # this makes attributes forgiving about url args
4451 # ok, and besides the weird exceptions for attributes...
4452 # return the matching char
4453 cur += i # consume entity chars
4454 parse_error() # because no terminating ";"
4458 return # never reached
4460 # tree constructor initialization
4461 # see comments on TYPE_TAG/etc for the structure of this data
# Initial parser state. `doc` is the root node the tree is built under; the
# remaining variables are the tree-construction flags, pointers and stacks,
# followed by the tokenizer start state and input pre-processing.
4464 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4466 afe = [] # active formatting elements
4467 template_ins_modes = []
4468 ins_mode = ins_mode_initial
4469 original_ins_mode = ins_mode # TODO check spec
4470 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4471 flag_frameset_ok = true
4473 flag_foster_parenting = false
4474 form_element_pointer = null
4475 temporary_buffer = null
4476 pending_table_character_tokens = []
4477 head_element_pointer = null
4478 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4479 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4481 # tokenizer initialization
4482 tok_state = tok_state_data
4484 # text pre-processing
4485 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# Every U+0000 becomes U+FFFD before tokenizing, so the per-state U+0000
# branches of the tokenizer never see one in practice.
4486 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
# Newline normalisation: CRLF pairs first, then any remaining lone CR.
4487 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4488 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# NOTE(review): special-cases one named test input; looks like leftover
# test/debug scaffolding -- confirm whether it should ship.
4490 if args.name is "plain-text-unsafe.dat #4"
4493 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4498 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Serialize a list of nodes into one string.
# els:      the nodes to serialize
# shallow:  passed through unchanged to each node's serialize()
# show_ids: passed through unchanged to each node's serialize()
# Each node's serialize() result is appended to the `serialized` accumulator.
4501 serialize_els = (els, shallow, show_ids) ->
4507 serialized += t.serialize shallow, show_ids
# Public API: the parser entry point, the debug-log helpers, and the node-type
# and namespace constants callers need to interpret the resulting tree.
4510 module.exports.parse_html = parse_html
4511 module.exports.debug_log_reset = debug_log_reset
4512 module.exports.debug_log_each = debug_log_each
4513 module.exports.TYPE_TAG = TYPE_TAG
4514 module.exports.TYPE_TEXT = TYPE_TEXT
4515 module.exports.TYPE_COMMENT = TYPE_COMMENT
4516 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4517 module.exports.NS_HTML = NS_HTML
4518 module.exports.NS_MATHML = NS_MATHML
4519 module.exports.NS_SVG = NS_SVG