1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WTAG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
# The spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
# section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
# Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
92 debug_log_each = (cb) ->
93 for str in g_debug_log
98 constructor: (type, args = {}) ->
99 @type = type # one of the TYPE_* constants above
100 @name = args.name ? '' # tag name
101 @text = args.text ? '' # contents for text/comment nodes
102 @attrs = args.attrs ? {}
103 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104 @children = args.children ? []
105 @namespace = args.namespace ? NS_HTML
106 @parent = args.parent ? null
107 @token = args.token ? null
108 @flags = args.flags ? {}
112 @id = "#{++prev_node_id}"
113 acknowledge_self_closing: ->
115 @token.flag 'did_self_close'
117 @flag 'did_self_close', true
118 flag: (key, value = null) ->
123 serialize: (shallow = false, show_ids = false) -> # for unit tests
128 ret += JSON.stringify @name
143 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149 ret += c.serialize shallow, show_ids
153 ret += JSON.stringify @text
156 ret += JSON.stringify @text
158 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
161 when TYPE_AAA_BOOKMARK
162 ret += 'aaa_bookmark'
165 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
# Node factory helpers. Each one takes only the arguments that are normally
# known at the moment the parser creates that kind of node.

# start-tag token carrying the tag name
new_open_tag = (name) -> new Node TYPE_START_TAG, {name}

# end-tag token carrying the tag name
new_end_tag = (name) -> new Node TYPE_END_TAG, {name}

# element node carrying the tag name
new_element = (name) -> new Node TYPE_TAG, {name}

# text node; character tokens are built by the very same function
new_text_node = (txt) -> new Node TYPE_TEXT, text: txt
new_character_token = new_text_node

# comment token whose text is txt
new_comment_token = (txt) -> new Node TYPE_COMMENT, text: txt

# doctype token carrying the doctype name
new_doctype_token = (name) -> new Node TYPE_DOCTYPE, {name}
183 return new Node TYPE_EOF
185 return new Node TYPE_AFE_MARKER
# Builds a fresh TYPE_AAA_BOOKMARK node (placeholder used by the adoption
# agency algorithm to remember a position in the afe list).
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
# Character classes, kept as plain strings so membership can be tested with
# indexOf.
digits = "0123456789"
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# true only when str is exactly one character in A-Z
is_uc_alpha = (str) ->
	str.length is 1 and uc_alpha.indexOf(str) isnt -1

# true only when str is exactly one character in a-z
is_lc_alpha = (str) ->
	str.length is 1 and lc_alpha.indexOf(str) isnt -1
# Characters accepted in tag names: alphanumerics plus "-", because some SVG
# element names contain dashes.
tag_name_chars = "#{alnum}-"

# http://www.w3.org/TR/html5/infrastructure.html#space-character
# tab (U+0009), LF (U+000A), FF (U+000C), CR (U+000D), space (U+0020)
space_chars = "\t\n\f\r "
206 return txt.length is 1 and space_chars.indexOf(txt) > -1
# true when t is a TYPE_TEXT node/token whose text is exactly one character
# from space_chars
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	txt = t.text
	txt.length is 1 and space_chars.indexOf(txt) isnt -1
210 is_input_hidden_tok = (t) ->
211 return false unless t.type is TYPE_START_TAG
214 if a[1].toLowerCase() is 'hidden'
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
255 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists for three element categories (void, raw text, escapable raw
# text), stored as arrays of lowercase names.
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
raw_text_elements = 'script style'.split ' '
escapable_raw_text_elements = 'textarea title'.split ' '
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
280 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
299 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305 'determinant', 'diff', 'divergence', 'divide', 'domain',
306 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326 'vectorproduct', 'xor'
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
333 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
344 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
345 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
346 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
347 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
348 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
349 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
350 wbr:NS_HTML, xmp:NS_HTML,
353 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
354 'annotation-xml':NS_MATHML,
357 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
360 formatting_elements = {
361 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
362 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
366 mathml_text_integration = {
367 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# true when el's name is listed in mathml_text_integration and el's namespace
# is the one recorded there for that name
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	expected_ns is el.namespace
371 is_html_integration = (el) -> # DON'T PASS A TOKEN
372 if el.namespace is NS_MATHML
373 if el.name is 'annotation-xml'
374 if el.attrs.encoding?
375 if el.attrs.encoding.toLowerCase() is 'text/html'
377 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
380 if el.namespace is NS_SVG
381 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
386 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
389 foster_parenting_targets = {
# true when special_elements maps e's tag name to e's own namespace
el_is_special = (e) ->
	{name, namespace} = e
	special_elements[name] is namespace
# address, div and p (HTML namespace): the elements excluded by
# el_is_special_not_adp
adp_els = {address: NS_HTML, div: NS_HTML, p: NS_HTML}

# Same lookup as el_is_special, except that anything matching adp_els is
# rejected first.
el_is_special_not_adp = (el) ->
	return false if adp_els[el.name] is el.namespace
	special_elements[el.name] is el.namespace
419 altglyphdef: 'altGlyphDef'
420 altglyphitem: 'altGlyphItem'
421 animatecolor: 'animateColor'
422 animatemotion: 'animateMotion'
423 animatetransform: 'animateTransform'
426 fecolormatrix: 'feColorMatrix'
427 fecomponenttransfer: 'feComponentTransfer'
428 fecomposite: 'feComposite'
429 feconvolvematrix: 'feConvolveMatrix'
430 fediffuselighting: 'feDiffuseLighting'
431 fedisplacementmap: 'feDisplacementMap'
432 fedistantlight: 'feDistantLight'
433 fedropshadow: 'feDropShadow'
439 fegaussianblur: 'feGaussianBlur'
442 femergenode: 'feMergeNode'
443 femorphology: 'feMorphology'
445 fepointlight: 'fePointLight'
446 fespecularlighting: 'feSpecularLighting'
447 fespotlight: 'feSpotLight'
449 feturbulence: 'feTurbulence'
450 foreignobject: 'foreignObject'
452 lineargradient: 'linearGradient'
453 radialgradient: 'radialGradient'
456 svg_attribute_fixes = {
457 attributename: 'attributeName'
458 attributetype: 'attributeType'
459 basefrequency: 'baseFrequency'
460 baseprofile: 'baseProfile'
462 clippathunits: 'clipPathUnits'
463 contentscripttype: 'contentScriptType'
464 contentstyletype: 'contentStyleType'
465 diffuseconstant: 'diffuseConstant'
467 externalresourcesrequired: 'externalResourcesRequired'
468 filterres: 'filterRes'
469 filterunits: 'filterUnits'
471 gradienttransform: 'gradientTransform'
472 gradientunits: 'gradientUnits'
473 kernelmatrix: 'kernelMatrix'
474 kernelunitlength: 'kernelUnitLength'
475 keypoints: 'keyPoints'
476 keysplines: 'keySplines'
478 lengthadjust: 'lengthAdjust'
479 limitingconeangle: 'limitingConeAngle'
480 markerheight: 'markerHeight'
481 markerunits: 'markerUnits'
482 markerwidth: 'markerWidth'
483 maskcontentunits: 'maskContentUnits'
484 maskunits: 'maskUnits'
485 numoctaves: 'numOctaves'
486 pathlength: 'pathLength'
487 patterncontentunits: 'patternContentUnits'
488 patterntransform: 'patternTransform'
489 patternunits: 'patternUnits'
490 pointsatx: 'pointsAtX'
491 pointsaty: 'pointsAtY'
492 pointsatz: 'pointsAtZ'
493 preservealpha: 'preserveAlpha'
494 preserveaspectratio: 'preserveAspectRatio'
495 primitiveunits: 'primitiveUnits'
498 repeatcount: 'repeatCount'
499 repeatdur: 'repeatDur'
500 requiredextensions: 'requiredExtensions'
501 requiredfeatures: 'requiredFeatures'
502 specularconstant: 'specularConstant'
503 specularexponent: 'specularExponent'
504 spreadmethod: 'spreadMethod'
505 startoffset: 'startOffset'
506 stddeviation: 'stdDeviation'
507 stitchtiles: 'stitchTiles'
508 surfacescale: 'surfaceScale'
509 systemlanguage: 'systemLanguage'
510 tablevalues: 'tableValues'
513 textlength: 'textLength'
515 viewtarget: 'viewTarget'
516 xchannelselector: 'xChannelSelector'
517 ychannelselector: 'yChannelSelector'
518 zoomandpan: 'zoomAndPan'
520 adjust_mathml_attributes = (t) ->
522 if a[0] is 'definitionurl'
523 a[0] = 'definitionURL'
525 adjust_svg_attributes = (t) ->
527 if svg_attribute_fixes[a[0]]?
528 a[0] = svg_attribute_fixes[a[0]]
530 adjust_foreign_attributes = (t) ->
534 # decode_named_char_ref()
536 # The list of named character references is _huge_ so ask the browser to decode
537 # for us instead of wasting bandwidth/space on including the table here.
539 # Pass without the "&" but with the ";" examples:
# for "&amp;" pass "amp;"
# for "&#x2032;" pass "x2032;"
544 textarea: document.createElement('textarea')
546 # TODO test this in IE8
547 decode_named_char_ref = (txt) ->
549 decoded = g_dncr.cache[txt]
550 return decoded if decoded?
551 g_dncr.textarea.innerHTML = txt
552 decoded = g_dncr.textarea.value
553 return null if decoded is txt
554 return g_dncr.cache[txt] = decoded
556 parse_html = (args) ->
558 cur = null # index of next char in txt to be parsed
559 # declare doc and tokenizer variables so they're in scope below
561 open_els = null # stack of open elements
562 afe = null # active formatting elements
563 template_ins_modes = null
565 original_ins_mode = null
567 tok_cur_tag = null # partially parsed tag
568 flag_scripting = null
569 flag_frameset_ok = null
571 flag_foster_parenting = null
572 form_element_pointer = null
573 temporary_buffer = null
574 pending_table_character_tokens = null
575 head_element_pointer = null
576 flag_fragment_parsing = null
577 context_element = null
586 console.log "Parse error at character #{cur} of #{txt.length}"
588 afe_push = (new_el) ->
591 if el.name is new_el.name and el.namespace is new_el.namespace
593 continue unless new_el.attrs[k] is v
594 for k, v of new_el.attrs
595 continue unless el.attrs[k] is v
602 afe.unshift new_afe_marker()
# the functions below implement the Tree Construction algorithm
605 # http://www.w3.org/TR/html5/syntax.html#tree-construction
607 # But first... the helpers
608 template_tag_is_open = ->
610 if t.name is 'template' and t.namespace is NS_HTML
613 is_in_scope_x = (tag_name, scope, namespace) ->
615 if t.name is tag_name and (namespace is null or namespace is t.namespace)
617 if scope[t.name] is t.namespace
620 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
622 if t.name is tag_name and (namespace is null or namespace is t.namespace)
624 if scope[t.name] is t.namespace
626 if scope2[t.name] is t.namespace
630 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
631 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
632 template: NS_HTML, mi: NS_MATHML,
634 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
635 'annotation-xml': NS_MATHML,
637 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Additional tag-name -> namespace tables. They are handed to is_in_scope_x /
# is_in_scope_x_y as the scope argument(s), where scope[t.name] is compared
# against t.namespace for each open element.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}

# checks against standard_scopers only
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, standard_scopers, namespace

# checks against standard_scopers plus button_scopers
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace

# checks against table_scopers only
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, table_scopers, namespace

# aka is_in_list_item_scope: checks against standard_scopers plus li_scopers
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
# http://www.w3.org/TR/html5/syntax.html#has-an-element-in-select-scope
#
# Walks open_els starting at index 0 (the current node). Returns true on
# reaching an element named tag_name (and in `namespace`, when one is given).
# Select scope treats every element EXCEPT an HTML-namespace optgroup/option
# as a boundary, so reaching any other element first returns false.
#
# NOTE(review): the loop header and the return lines are not visible in this
# view of the file; they are written here to mirror is_in_scope_x -- confirm
# against the full source.
is_in_select_scope = (tag_name, namespace = null) ->
	for t in open_els
		if t.name is tag_name and (namespace is null or namespace is t.namespace)
			return true
		# The previous test (`namespace isnt NS_HTML and name isnt 'optgroup'
		# and name isnt 'option'`) only stopped at non-HTML elements, so HTML
		# elements such as div were walked through instead of ending the scope.
		is_opt = t.name is 'optgroup' or t.name is 'option'
		unless t.namespace is NS_HTML and is_opt
			return false
	return false
658 # this checks for a particular element, not by name
659 # this requires a namespace match
660 el_is_in_scope = (needle) ->
664 if standard_scopers[el.name] is el.namespace
668 clear_to_table_stopers = {
673 clear_stack_to_table_context = ->
675 if clear_to_table_stopers[open_els[0].name]?
679 clear_to_table_body_stopers = {
686 clear_stack_to_table_body_context = ->
688 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
692 clear_to_table_row_stopers = {
697 clear_stack_to_table_row_context = ->
699 if clear_to_table_row_stopers[open_els[0].name]?
703 clear_afe_to_marker = ->
705 return unless afe.length > 0 # this happens in fragment case, ?spec error
707 if el.type is TYPE_AFE_MARKER
712 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
714 # 1. Let last be false.
716 # 2. Let node be the last node in the stack of open elements.
718 node = open_els[node_i]
719 # 3. Loop: If node is the first node in the stack of open elements,
720 # then set last to true, and, if the parser was originally created as
721 # part of the HTML fragment parsing algorithm (fragment case) set node
722 # to the context element.
724 if node_i is open_els.length - 1
726 # fixfull (fragment case)
728 # 4. If node is a select element, run these substeps:
729 if node.name is 'select' and node.namespace is NS_HTML
730 # 1. If last is true, jump to the step below labeled done.
732 # 2. Let ancestor be node.
735 # 3. Loop: If ancestor is the first node in the stack of
736 # open elements, jump to the step below labeled done.
738 if ancestor_i is open_els.length - 1
740 # 4. Let ancestor be the node before ancestor in the stack
743 ancestor = open_els[ancestor_i]
744 # 5. If ancestor is a template node, jump to the step below
746 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
748 # 6. If ancestor is a table node, switch the insertion mode
749 # to "in select in table" and abort these steps.
750 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
751 ins_mode = ins_mode_in_select_in_table
753 # 7. Jump back to the step labeled loop.
754 # 8. Done: Switch the insertion mode to "in select" and abort
756 ins_mode = ins_mode_in_select
758 # 5. If node is a td or th element and last is false, then switch
759 # the insertion mode to "in cell" and abort these steps.
760 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
761 ins_mode = ins_mode_in_cell
763 # 6. If node is a tr element, then switch the insertion mode to "in
764 # row" and abort these steps.
765 if node.name is 'tr' and node.namespace is NS_HTML
766 ins_mode = ins_mode_in_row
768 # 7. If node is a tbody, thead, or tfoot element, then switch the
769 # insertion mode to "in table body" and abort these steps.
770 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
771 ins_mode = ins_mode_in_table_body
773 # 8. If node is a caption element, then switch the insertion mode
774 # to "in caption" and abort these steps.
775 if node.name is 'caption' and node.namespace is NS_HTML
776 ins_mode = ins_mode_in_caption
778 # 9. If node is a colgroup element, then switch the insertion mode
779 # to "in column group" and abort these steps.
780 if node.name is 'colgroup' and node.namespace is NS_HTML
781 ins_mode = ins_mode_in_column_group
783 # 10. If node is a table element, then switch the insertion mode to
784 # "in table" and abort these steps.
785 if node.name is 'table' and node.namespace is NS_HTML
786 ins_mode = ins_mode_in_table
788 # 11. If node is a template element, then switch the insertion mode
789 # to the current template insertion mode and abort these steps.
790 if node.name is 'template' and node.namespace is NS_HTML
791 ins_mode = template_ins_modes[0]
793 # 12. If node is a head element and last is true, then switch the
794 # insertion mode to "in body" ("in body"! not "in head"!) and abort
795 # these steps. (fragment case)
796 if node.name is 'head' and node.namespace is NS_HTML and last
797 ins_mode = ins_mode_in_body
799 # 13. If node is a head element and last is false, then switch the
800 # insertion mode to "in head" and abort these steps.
801 if node.name is 'head' and node.namespace is NS_HTML and last is false
802 ins_mode = ins_mode_in_head
804 # 14. If node is a body element, then switch the insertion mode to
805 # "in body" and abort these steps.
806 if node.name is 'body' and node.namespace is NS_HTML
807 ins_mode = ins_mode_in_body
809 # 15. If node is a frameset element, then switch the insertion mode
810 # to "in frameset" and abort these steps. (fragment case)
811 if node.name is 'frameset' and node.namespace is NS_HTML
812 ins_mode = ins_mode_in_frameset
814 # 16. If node is an html element, run these substeps:
815 if node.name is 'html' and node.namespace is NS_HTML
816 # 1. If the head element pointer is null, switch the insertion
817 # mode to "before head" and abort these steps. (fragment case)
818 if head_element_pointer is null
819 ins_mode = ins_mode_before_head
821 # 2. Otherwise, the head element pointer is not null,
822 # switch the insertion mode to "after head" and abort these
824 ins_mode = ins_mode_after_head
826 # 17. If last is true, then switch the insertion mode to "in body"
827 # and abort these steps. (fragment case)
829 ins_mode = ins_mode_in_body
831 # 18. Let node now be the node before node in the stack of open
834 node = open_els[node_i]
835 # 19. Return to the step labeled loop.
839 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
840 adjusted_current_node = ->
841 if open_els.length is 1 and flag_fragment_parsing
842 return context_element
845 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
846 # this implementation is structured (mostly) as described at the link above.
847 # capitalized comments are the "labels" described at the link above.
849 return if afe.length is 0
850 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
855 if i is afe.length - 1
858 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
863 el = insert_html_element afe[i].token
868 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
869 # adoption agency algorithm
871 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
872 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
873 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
874 adoption_agency = (subject) ->
875 debug_log "adoption_agency()"
876 debug_log "tree: #{serialize_els doc.children, false, true}"
877 debug_log "open_els: #{serialize_els open_els, true, true}"
878 debug_log "afe: #{serialize_els afe, true, true}"
879 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
882 # remove it from the list of active formatting elements (if found)
887 debug_log "aaa: starting off with subject on top of stack, exiting"
894 # 5. Let formatting element be the last element in the list of
895 # active formatting elements that: is between the end of the list
896 # and the last scope marker in the list, if any, or the start of
897 # the list otherwise, and has the tag name subject.
899 for t, fe_of_afe in afe
900 if t.type is TYPE_AFE_MARKER
905 # If there is no such element, then abort these steps and instead
906 # act as described in the "any other end tag" entry above.
908 debug_log "aaa: fe not found in afe"
909 in_body_any_other_end_tag subject
911 # 6. If formatting element is not in the stack of open elements,
912 # then this is a parse error; remove the element from the list, and
915 for t, fe_of_open_els in open_els
920 debug_log "aaa: fe not found in open_els"
922 # "remove it from the list" must mean afe, since it's not in open_els
923 afe.splice fe_of_afe, 1
925 # 7. If formatting element is in the stack of open elements, but
926 # the element is not in scope, then this is a parse error; abort
928 unless el_is_in_scope fe
929 debug_log "aaa: fe not in scope"
932 # 8. If formatting element is not the current node, this is a parse
933 # error. (But do not abort these steps.)
934 unless open_els[0] is fe
937 # 9. Let furthest block be the topmost node in the stack of open
938 # elements that is lower in the stack than formatting element, and
939 # is an element in the special category. There might not be one.
941 fb_of_open_els = null
948 # and continue, to see if there's one that's more "topmost"
949 # 10. If there is no furthest block, then the UA must first pop all
950 # the nodes from the bottom of the stack of open elements, from the
951 # current node up to and including formatting element, then remove
952 # formatting element from the list of active formatting elements,
953 # and finally abort these steps.
955 debug_log "aaa: no fb"
959 afe.splice fe_of_afe, 1
961 # 11. Let common ancestor be the element immediately above
962 # formatting element in the stack of open elements.
963 ca = open_els[fe_of_open_els + 1] # common ancestor
965 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
966 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
967 bookmark = new_aaa_bookmark()
970 afe.splice i, 0, bookmark
972 node = last_node = fb
976 # 3. Let node be the element immediately above node in the
977 # stack of open elements, or if node is no longer in the stack
978 # of open elements (e.g. because it got removed by this
979 # algorithm), the element that was immediately above node in
980 # the stack of open elements before node was removed.
984 node_next = open_els[i + 1]
986 node = node_next ? node_above
987 debug_log "inner loop #{inner}"
988 debug_log "tree: #{serialize_els doc.children, false, true}"
989 debug_log "open_els: #{serialize_els open_els, true, true}"
990 debug_log "afe: #{serialize_els afe, true, true}"
991 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
992 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
993 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
994 debug_log "node: #{node.serialize true, true}"
995 # TODO make sure node_above gets re-set if/when node is removed from open_els
997 # 4. If node is formatting element, then go to the next step in
998 # the overall algorithm.
1001 debug_log "the meat"
1002 # 5. If inner loop counter is greater than three and node is in
1003 # the list of active formatting elements, then remove node from
1004 # the list of active formatting elements.
1010 debug_log "max out inner"
1015 # 6. If node is not in the list of active formatting elements,
1016 # then remove node from the stack of open elements and then go
1017 # back to the step labeled inner loop.
1019 debug_log "not in afe"
1020 for t, i in open_els
1022 node_above = open_els[i + 1]
1023 open_els.splice i, 1
1026 debug_log "the bones"
1027 # 7. create an element for the token for which the element node
1028 # was created, in the HTML namespace, with common ancestor as
1029 # the intended parent; replace the entry for node in the list
1030 # of active formatting elements with an entry for the new
1031 # element, replace the entry for node in the stack of open
1032 # elements with an entry for the new element, and let node be
1034 new_node = token_to_element node.token, NS_HTML, ca
1038 debug_log "replaced in afe"
1040 for t, i in open_els
1042 node_above = open_els[i + 1]
1043 open_els[i] = new_node
1044 debug_log "replaced in open_els"
1047 # 8. If last node is furthest block, then move the
1048 # aforementioned bookmark to be immediately after the new node
1049 # in the list of active formatting elements.
1054 debug_log "removed bookmark"
1058 # "after" means lower
1059 afe.splice i, 0, bookmark # "after as <-
1060 debug_log "placed bookmark after node"
1061 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1063 # 9. Insert last node into node, first removing it from its
1064 # previous parent node if any.
1065 if last_node.parent?
1066 debug_log "last_node has parent"
1067 for c, i in last_node.parent.children
1069 debug_log "removing last_node from parent"
1070 last_node.parent.children.splice i, 1
1072 node.children.push last_node
1073 last_node.parent = node
1074 # 10. Let last node be node.
1077 # 11. Return to the step labeled inner loop.
1078 # 14. Insert whatever last node ended up being in the previous step
1079 # at the appropriate place for inserting a node, but using common
1080 # ancestor as the override target.
1082 # In the case where fe is immediately followed by fb:
1083 # * inner loop exits out early (node==fe)
1085 # * last_node is still in the tree (not a duplicate)
1086 if last_node.parent?
1087 debug_log "FEFIRST? last_node has parent"
1088 for c, i in last_node.parent.children
1090 debug_log "removing last_node from parent"
1091 last_node.parent.children.splice i, 1
1094 debug_log "after aaa inner loop"
1095 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1096 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1097 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1098 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1099 debug_log "tree: #{serialize_els doc.children, false, true}"
1104 # can't use standard insert token thing, because it's already in
1105 # open_els and must stay at its current position in open_els
1106 dest = adjusted_insertion_location ca
1107 dest[0].children.splice dest[1], 0, last_node
1108 last_node.parent = dest[0]
1111 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1112 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1113 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1114 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1115 debug_log "tree: #{serialize_els doc.children, false, true}"
1117 # 15. Create an element for the token for which formatting element
1118 # was created, in the HTML namespace, with furthest block as the
1120 new_element = token_to_element fe.token, NS_HTML, fb
1121 # 16. Take all of the child nodes of furthest block and append them
1122 # to the element created in the last step.
1123 while fb.children.length
1124 t = fb.children.shift()
1125 t.parent = new_element
1126 new_element.children.push t
1127 # 17. Append that new element to furthest block.
1128 new_element.parent = fb
1129 fb.children.push new_element
1130 # 18. Remove formatting element from the list of active formatting
1131 # elements, and insert the new element into the list of active
1132 # formatting elements at the position of the aforementioned
1140 afe[i] = new_element
1142 # 19. Remove formatting element from the stack of open elements,
1143 # and insert the new element into the stack of open elements
1144 # immediately below the position of furthest block in that stack.
1145 for t, i in open_els
1147 open_els.splice i, 1
1149 for t, i in open_els
1151 open_els.splice i, 0, new_element
1153 # 20. Jump back to the step labeled outer loop.
1154 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1155 debug_log "tree: #{serialize_els doc.children, false, true}"
1156 debug_log "open_els: #{serialize_els open_els, true, true}"
1157 debug_log "afe: #{serialize_els afe, true, true}"
1158 debug_log "AAA DONE"
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Close the innermost open HTML <p>: generate implied end tags (p itself is
# exempt), report a parse error if the current node is then not a p, and pop
# the stack of open elements up to and including the first p found.
close_p_element = ->
  generate_implied_end_tags 'p' # arg is exception
  unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
    parse_error()
  # the last entry is never popped, just in case no p is actually open
  while open_els.length > 1
    el = open_els.shift()
    if el.name is 'p' and el.namespace is NS_HTML
      return
  return
# Close the open <p>, but only when an HTML p element is in button scope.
close_p_if_in_button_scope = ->
  if is_in_button_scope 'p', NS_HTML
    close_p_element()
  return
# http://www.w3.org/TR/html5/syntax.html#insert-a-character
# aka insert_a_character = (t) ->
#
# Insert the character token t at the adjusted insertion location. When the
# node immediately before that location is a text node, t.text is appended
# to it instead, so adjacent characters end up in a single text node.
insert_character = (t) ->
  dest = adjusted_insertion_location()
  # fixfull check for Document node
  # only look for a preceding sibling when one can exist (index 0 has none)
  if dest[1] > 0
    prev = dest[0].children[dest[1] - 1]
    if prev.type is TYPE_TEXT
      prev.text += t.text
      return
  dest[0].children.splice dest[1], 0, t
  return
# 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
#
# Tree construction dispatcher: hand token t to the current insertion mode
# when the adjusted current node calls for HTML rules, otherwise to the
# rules for foreign content.
process_token = (t) ->
  acn = adjusted_current_node()
  # no adjusted current node at all
  unless acn?
    ins_mode t
    return
  if acn.namespace is NS_HTML
    ins_mode t
    return
  if is_mathml_text_integration_point(acn)
    # spec: a start tag whose name is NEITHER "mglyph" NOR "malignmark"
    if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
      ins_mode t
      return
    if t.type is TYPE_TEXT
      ins_mode t
      return
  if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
    ins_mode t
    return
  if is_html_integration acn
    if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
      ins_mode t
      return
  if t.type is TYPE_EOF
    ins_mode t
    return
  # everything else is handled by the foreign content rules
  in_foreign_content t
  return
# http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
# http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
#
# Work out where the next node should be inserted.
#
# override_target: optional node to use in place of the current node
# returns: [parent_node, index_within_parent_children]
adjusted_insertion_location = (override_target = null) ->
  # 1. If there was an override target specified, then let target be the
  # override target.
  if override_target?
    target = override_target
  else # Otherwise, let target be the current node.
    target = open_els[0]
  # 2. Determine the adjusted insertion location using the first matching
  # steps from the following list:
  #
  # If foster parenting is enabled and target is a table, tbody, tfoot,
  # thead, or tr element Foster parenting happens when content is
  # misnested in tables.
  if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
    loop # once. this is here so we can ``break`` to "abort these substeps"
      # 1. Let last template be the last template element in the
      # stack of open elements, if any.
      last_template = null
      last_template_i = null
      for el, i in open_els
        if el.name is 'template' and el.namespace is NS_HTML
          last_template = el
          last_template_i = i
          break
      # 2. Let last table be the last table element in the stack of
      # open elements, if any.
      last_table = null
      last_table_i = null
      for el, i in open_els
        if el.name is 'table' and el.namespace is NS_HTML
          last_table = el
          last_table_i = i
          break
      # 3. If there is a last template and either there is no last
      # table, or there is one, but last template is lower (more
      # recently added) than last table in the stack of open
      # elements, then: let adjusted insertion location be inside
      # last template's template contents, after its last child (if
      # any), and abort these substeps.
      # (index 0 is the current node, so "lower" means a smaller index)
      if last_template and (last_table is null or last_template_i < last_table_i)
        target = last_template # fixfull should be its contents
        target_i = target.children.length
        break
      # 4. If there is no last table, then let adjusted insertion
      # location be inside the first element in the stack of open
      # elements (the html element), after its last child (if any),
      # and abort these substeps. (fragment case)
      if last_table is null
        target = open_els[open_els.length - 1]
        target_i = target.children.length
        break
      # 5. If last table has a parent element, then let adjusted
      # insertion location be inside last table's parent element,
      # immediately before last table, and abort these substeps.
      if last_table.parent?
        for c, i in last_table.parent.children
          if c is last_table
            target = last_table.parent
            target_i = i
            break
        break
      # 6. Let previous element be the element immediately above last
      # table in the stack of open elements.
      #
      # huh? how could it not have a parent?
      previous_element = open_els[last_table_i + 1]
      # 7. Let adjusted insertion location be inside previous
      # element, after its last child (if any).
      target = previous_element
      target_i = target.children.length
      # Note: These steps are involved in part because it's possible
      # for elements, the table element in this case in particular,
      # to have been moved by a script around in the DOM, or indeed
      # removed from the DOM entirely, after the element was inserted
      # by the parser.
      break # don't really loop
  else
    # Otherwise Let adjusted insertion location be inside target, after
    # its last child (if any).
    target_i = target.children.length

  # 3. If the adjusted insertion location is inside a template element,
  # let it instead be inside the template element's template contents,
  # after its last child (if any).
  # fixfull (template)

  # 4. Return the adjusted insertion location.
  return [target, target_i]
# http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
# aka create_an_element_for_token
#
# Build a TYPE_TAG Node from tag token t.
#
# t: tag token; its name and attrs_a (list of [name, value] pairs) are read
# namespace: namespace recorded on the new element
# intended_parent: accepted for parity with the spec, not used yet (see the
#                  fixfull note below)
# returns: the new Node (not yet inserted anywhere)
token_to_element = (t, namespace, intended_parent) ->
  # convert attributes into a hash
  attrs = {}
  for a in t.attrs_a
    # When a name is repeated the first occurrence wins, matching the
    # tokenizer rule that later duplicates are dropped. The own-property
    # test keeps names such as "constructor" from looking already set.
    unless Object::hasOwnProperty.call attrs, a[0]
      attrs[a[0]] = a[1]
  el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t

  # TODO 2. If the newly created element has an xmlns attribute in the
  # XMLNS namespace whose value is not exactly the same as the element's
  # namespace, that is a parse error. Similarly, if the newly created
  # element has an xmlns:xlink attribute in the XMLNS namespace whose
  # value is not the XLink Namespace, that is a parse error.

  # fixfull: the spec says stuff about form pointers and ownerDocument

  return el
# http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
#
# Create an element for token in the given namespace, insert it at the
# adjusted insertion location and push it onto the stack of open elements.
#
# returns: the newly created element
insert_foreign_element = (token, namespace) ->
  ail = adjusted_insertion_location()
  ail_el = ail[0] # parent node
  ail_i = ail[1]  # index within the parent's children
  el = token_to_element token, namespace, ail_el
  # TODO skip this next step if it's broken (eg ail_el is document with child already)
  el.parent = ail_el
  ail_el.children.splice ail_i, 0, el
  # index 0 of open_els is the current node
  open_els.unshift el
  return el
# http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Insert an element for token in the HTML namespace; hands back whatever
# insert_foreign_element returns.
insert_html_element = (token) ->
  return insert_foreign_element(token, NS_HTML)
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
#
# Insert comment token t into the tree.
#
# position: optional [node, index_within_children]; defaults to the
#           adjusted insertion location
insert_comment = (t, position = null) ->
  position ?= adjusted_insertion_location()
  position[0].children.splice position[1], 0, t
  return
# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Insert an HTML element for t, switch the tokenizer to the RAWTEXT state,
# remember the insertion mode that was active, and continue in "text" mode.
parse_generic_raw_text = (t) ->
  insert_html_element(t)
  [tok_state, original_ins_mode] = [tok_state_rawtext, ins_mode]
  ins_mode = ins_mode_text
# Generic RCDATA element parsing: insert an HTML element for t, switch the
# tokenizer to the RCDATA state, remember the insertion mode that was
# active, and continue in "text" mode.
parse_generic_rcdata_text = (t) ->
  insert_html_element(t)
  [tok_state, original_ins_mode] = [tok_state_rcdata, ins_mode]
  ins_mode = ins_mode_text
# 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
# http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
#
# Pop the current node for as long as it is listed in end_tag_implied for
# its namespace.
#
# except: optional tag name that must not be popped, even if it is listed
generate_implied_end_tags = (except = null) ->
  # the length test keeps an emptied stack from being dereferenced
  while open_els.length > 0 and end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
    open_els.shift()
  return
1367 # 8.2.5.4 The rules for parsing tokens in HTML content
1368 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
# 8.2.5.4.1 The "initial" insertion mode
# http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
#
# Whitespace is ignored, comments and the doctype become children of the
# document, and anything else is reprocessed in "before html" mode.
ins_mode_initial = (t) ->
  if is_space_tok t
    return
  if t.type is TYPE_COMMENT
    doc.children.push t
    return
  if t.type is TYPE_DOCTYPE
    # FIXME check identifiers, set quirks, etc
    doc.children.push t
    ins_mode = ins_mode_before_html
    return
  # Anything else
  #fixfull (iframe, quirks)
  ins_mode = ins_mode_before_html
  process_token t # reprocess the token in the new mode
  return
# 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
#
# Handles tokens until the root element exists. An <html> start tag creates
# the root from that token; any other token that is not ignored creates an
# implied root and is then reprocessed in "before head" mode.
ins_mode_before_html = (t) ->
  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if t.type is TYPE_COMMENT
    doc.children.push t
    return
  if is_space_tok t
    return
  if t.type is TYPE_START_TAG and t.name is 'html'
    el = token_to_element t, NS_HTML, doc
    doc.children.push el
    open_els.unshift(el)
    # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
    ins_mode = ins_mode_before_head
    return
  if t.type is TYPE_END_TAG
    unless t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
      # any other end tag is a parse error and is ignored
      parse_error()
      return
    # head, body, html and br end tags fall through to "anything else"
  # Anything else: create the html element from a synthetic start tag
  html_tok = new_open_tag 'html'
  el = token_to_element html_tok, NS_HTML, doc
  doc.children.push el
  open_els.unshift el
  # ?fixfull browsing context
  ins_mode = ins_mode_before_head
  process_token t # reprocess the token in the new mode
  return
# 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
#
# Handles tokens between the root element and the head element. A <head>
# start tag inserts the head; any other token that is not ignored inserts
# an implied head and is then reprocessed in "in head" mode. Either way the
# inserted element is remembered in head_element_pointer.
ins_mode_before_head = (t) ->
  if is_space_tok t
    return
  if t.type is TYPE_COMMENT
    insert_comment t
    return
  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if t.type is TYPE_START_TAG and t.name is 'html'
    ins_mode_in_body t
    return
  if t.type is TYPE_START_TAG and t.name is 'head'
    el = insert_html_element t
    head_element_pointer = el
    ins_mode = ins_mode_in_head
    return
  if t.type is TYPE_END_TAG
    unless t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
      # any other end tag is a parse error and is ignored
      parse_error()
      return
    # head, body, html and br end tags fall through to "anything else"
  # Anything else: insert a head element from a synthetic start tag
  head_tok = new_open_tag 'head'
  el = insert_html_element head_tok
  head_element_pointer = el
  ins_mode = ins_mode_in_head
  process_token t # reprocess the token in the new mode
  return
# 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
#
# "Anything else" branch of the "in head" mode: pop the current node,
# switch to "after head" and reprocess the token there.
ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
  open_els.shift() # spec says this will be a 'head' node
  ins_mode = ins_mode_after_head
  process_token t
  return
# The "in head" insertion mode: handle token t while the head element is
# the current node. Each branch below matches one case of the spec, in the
# spec's order; tokens matching no branch go to ins_mode_in_head_else.
ins_mode_in_head = (t) ->
  if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
    insert_character t
    return
  if t.type is TYPE_COMMENT
    insert_comment t
    return
  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if t.type is TYPE_START_TAG and t.name is 'html'
    ins_mode_in_body t
    return
  if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
    el = insert_html_element t
    open_els.shift() # these elements never stay on the stack
    t.acknowledge_self_closing()
    return
  if t.type is TYPE_START_TAG and t.name is 'meta'
    el = insert_html_element t
    open_els.shift() # popped again immediately, as above
    t.acknowledge_self_closing()
    # fixfull encoding stuff
    return
  if t.type is TYPE_START_TAG and t.name is 'title'
    parse_generic_rcdata_text t
    return
  if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
    parse_generic_raw_text t
    return
  if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
    insert_html_element t
    ins_mode = ins_mode_in_head_noscript
    return
  if t.type is TYPE_START_TAG and t.name is 'script'
    ail = adjusted_insertion_location()
    # ail is [node, index]; the intended parent is the node
    el = token_to_element t, NS_HTML, ail[0]
    el.flag 'parser-inserted', true
    # fixfull fragment case
    ail[0].children.splice ail[1], 0, el
    open_els.unshift el
    tok_state = tok_state_script_data
    original_ins_mode = ins_mode # make sure orig... is defined
    ins_mode = ins_mode_text
    return
  if t.type is TYPE_END_TAG and t.name is 'head'
    open_els.shift() # will be a head element... spec says so
    ins_mode = ins_mode_after_head
    return
  if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
    ins_mode_in_head_else t
    return
  if t.type is TYPE_START_TAG and t.name is 'template'
    insert_html_element t
    # spec: insert a marker at the end of the list of active formatting elements
    # NOTE(review): afe_push_marker is defined outside this excerpt — confirm the name
    afe_push_marker()
    flag_frameset_ok = false
    ins_mode = ins_mode_in_template
    template_ins_modes.unshift ins_mode_in_template
    return
  if t.type is TYPE_END_TAG and t.name is 'template'
    if template_tag_is_open()
      # parentheses are required: a bare name would only reference the
      # function without calling it
      generate_implied_end_tags()
      if open_els[0].name isnt 'template'
        parse_error()
      # pop up to and including the template element
      loop
        el = open_els.shift()
        if el.name is 'template' and el.namespace is NS_HTML
          break
      clear_afe_to_marker()
      template_ins_modes.shift()
      # spec: reset the insertion mode appropriately
      # NOTE(review): reset_ins_mode is defined outside this excerpt — confirm the name
      reset_ins_mode()
    else
      parse_error()
    return
  if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
    parse_error()
    return
  # Anything else
  ins_mode_in_head_else t
  return
# 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
#
# "Anything else" branch of the "in head noscript" mode: report a parse
# error, pop the current node (the noscript element, per the spec), switch
# back to "in head" and reprocess the token there.
ins_mode_in_head_noscript_else = (t) ->
  parse_error()
  open_els.shift()
  ins_mode = ins_mode_in_head
  process_token t
  return
# The "in head noscript" insertion mode: handle token t while inside a
# <noscript> in the head (this mode is entered when flag_scripting is false).
ins_mode_in_head_noscript = (t) ->
  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if t.type is TYPE_START_TAG and t.name is 'html'
    ins_mode_in_body t
    return
  if t.type is TYPE_END_TAG and t.name is 'noscript'
    open_els.shift() # the noscript element
    ins_mode = ins_mode_in_head
    return
  # whitespace, comments and these start tags are handled exactly as in "in head"
  if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
    ins_mode_in_head t
    return
  if t.type is TYPE_END_TAG and t.name is 'br'
    ins_mode_in_head_noscript_else t
    return
  if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
    parse_error()
    return
  # Anything else
  ins_mode_in_head_noscript_else t
  return
# 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
#
# "Anything else" branch of the "after head" mode: insert an implied body
# element, switch to "in body" and reprocess the token there.
ins_mode_after_head_else = (t) ->
  body_tok = new_open_tag 'body'
  insert_html_element body_tok
  ins_mode = ins_mode_in_body
  process_token t
  return
# The "after head" insertion mode: handle token t between the end of the
# head element and the start of body/frameset.
ins_mode_after_head = (t) ->
  if is_space_tok t
    insert_character t
    return
  if t.type is TYPE_COMMENT
    insert_comment t
    return
  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if t.type is TYPE_START_TAG and t.name is 'html'
    ins_mode_in_body t
    return
  if t.type is TYPE_START_TAG and t.name is 'body'
    insert_html_element t
    flag_frameset_ok = false
    ins_mode = ins_mode_in_body
    return
  if t.type is TYPE_START_TAG and t.name is 'frameset'
    insert_html_element t
    ins_mode = ins_mode_in_frameset
    return
  if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
    parse_error()
    # temporarily re-open the head so the token lands inside it
    open_els.unshift head_element_pointer
    ins_mode_in_head t
    # Remove the head again; it may no longer be the current node.
    # This must be `in` (array elements): `of` would iterate the keys and
    # never match head_element_pointer.
    for el, i in open_els
      if el is head_element_pointer
        open_els.splice i, 1
        return
    console.log "warning: 23904 couldn't find head element in open_els"
    return
  if t.type is TYPE_END_TAG and t.name is 'template'
    ins_mode_in_head t
    return
  if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
    ins_mode_after_head_else t
    return
  if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
    parse_error()
    return
  # Anything else
  ins_mode_after_head_else t
  return
# 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
#
# "Any other end tag" steps of the "in body" mode. Walks the stack of open
# elements starting at the current node:
#  * on the first HTML element called `name`: generate implied end tags
#    (except for `name`), report a parse error if that element is not the
#    current node afterwards, then pop up to and including it;
#  * on a special element before any match: parse error, token ignored.
in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
  for el in open_els
    if el.name is name and el.namespace is NS_HTML
      generate_implied_end_tags name # arg is exception
      # compare against the current node only now: implied end tags may
      # have popped elements, so a position noted earlier would be stale
      parse_error() unless open_els[0] is el
      # el is never popped by the call above (its name is the exception),
      # so this loop always reaches it
      loop
        break if open_els.shift() is el
      return
    if special_elements[el.name] is el.namespace
      parse_error()
      return
  return
1636 ins_mode_in_body = (t) ->
1637 if t.type is TYPE_TEXT and t.text is "\u0000"
1644 if t.type is TYPE_TEXT
1647 flag_frameset_ok = false
1649 if t.type is TYPE_COMMENT
1652 if t.type is TYPE_DOCTYPE
1655 if t.type is TYPE_START_TAG and t.name is 'html'
1657 return if template_tag_is_open()
1658 root_attrs = open_els[open_els.length - 1].attrs
1660 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1663 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1666 if t.type is TYPE_START_TAG and t.name is 'body'
1668 return if open_els.length < 2
1669 second = open_els[open_els.length - 2]
1670 return unless second.namespace is NS_HTML
1671 return unless second.name is 'body'
1672 return if template_tag_is_open()
1673 flag_frameset_ok = false
1675 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1677 if t.type is TYPE_START_TAG and t.name is 'frameset'
1679 return if open_els.length < 2
1680 second_i = open_els.length - 2
1681 second = open_els[second_i]
1682 return unless second.namespace is NS_HTML
1683 return unless second.name is 'body'
1684 if flag_frameset_ok is false
1687 for el, i in second.parent.children
1689 second.parent.children.splice i, 1
1691 open_els.splice second_i, 1
1692 # pop everything except the "root html element"
1693 while open_els.length > 1
1695 insert_html_element t
1696 ins_mode = ins_mode_in_frameset
1698 if t.type is TYPE_EOF
1700 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1701 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1702 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1705 unless ok_tags[t.name] is el.namespace
1708 if template_ins_modes.length > 0
1709 ins_mode_in_template t
1713 if t.type is TYPE_END_TAG and t.name is 'body'
1714 unless is_in_scope 'body', NS_HTML
1718 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1719 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1720 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1721 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1725 unless ok_tags[t.name] is el.namespace
1728 ins_mode = ins_mode_after_body
1730 if t.type is TYPE_END_TAG and t.name is 'html'
1731 unless is_in_scope 'body', NS_HTML
1735 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1736 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1737 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1738 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1742 unless ok_tags[t.name] is el.namespace
1745 ins_mode = ins_mode_after_body
1748 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1749 close_p_if_in_button_scope()
1750 insert_html_element t
1752 if t.type is TYPE_START_TAG and h_tags[t.name]?
1753 close_p_if_in_button_scope()
1754 if h_tags[open_els[0].name] is open_els[0].namespace
1757 insert_html_element t
1759 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1760 close_p_if_in_button_scope()
1761 insert_html_element t
1762 # spec: If the next token is a "LF" (U+000A) character token, then
1763 # ignore that token and move on to the next one. (Newlines at the
1764 # start of pre blocks are ignored as an authoring convenience.)
1765 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1767 flag_frameset_ok = false
1769 if t.type is TYPE_START_TAG and t.name is 'form'
1770 unless form_element_pointer is null or template_tag_is_open()
1773 close_p_if_in_button_scope()
1774 el = insert_html_element t
1775 unless template_tag_is_open()
1776 form_element_pointer = el
1778 if t.type is TYPE_START_TAG and t.name is 'li'
1779 flag_frameset_ok = false
1780 for node in open_els
1781 if node.name is 'li' and node.namespace is NS_HTML
1782 generate_implied_end_tags 'li' # arg is exception
1783 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1786 el = open_els.shift()
1787 if el.name is 'li' and el.namespace is NS_HTML
1790 if el_is_special_not_adp node
1792 close_p_if_in_button_scope()
1793 insert_html_element t
1795 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1796 flag_frameset_ok = false
1797 for node in open_els
1798 if node.name is 'dd' and node.namespace is NS_HTML
1799 generate_implied_end_tags 'dd' # arg is exception
1800 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1803 el = open_els.shift()
1804 if el.name is 'dd' and el.namespace is NS_HTML
1807 if node.name is 'dt' and node.namespace is NS_HTML
1808 generate_implied_end_tags 'dt' # arg is exception
1809 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1812 el = open_els.shift()
1813 if el.name is 'dt' and el.namespace is NS_HTML
1816 if el_is_special_not_adp node
1818 close_p_if_in_button_scope()
1819 insert_html_element t
1821 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1822 close_p_if_in_button_scope()
1823 insert_html_element t
1824 tok_state = tok_state_plaintext
1826 if t.type is TYPE_START_TAG and t.name is 'button'
1827 if is_in_scope 'button', NS_HTML
1829 generate_implied_end_tags()
1831 el = open_els.shift()
1832 if el.name is 'button' and el.namespace is NS_HTML
1835 insert_html_element t
1836 flag_frameset_ok = false
1838 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1839 unless is_in_scope t.name, NS_HTML
1842 generate_implied_end_tags()
1843 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1846 el = open_els.shift()
1847 if el.name is t.name and el.namespace is NS_HTML
1850 if t.type is TYPE_END_TAG and t.name is 'form'
1851 unless template_tag_is_open()
1852 node = form_element_pointer
1853 form_element_pointer = null
1854 if node is null or not el_is_in_scope node
1857 generate_implied_end_tags()
1858 if open_els[0] isnt node
1860 for el, i in open_els
1862 open_els.splice i, 1
1865 unless is_in_scope 'form', NS_HTML
1868 generate_implied_end_tags()
1869 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1872 el = open_els.shift()
1873 if el.name is 'form' and el.namespace is NS_HTML
1876 if t.type is TYPE_END_TAG and t.name is 'p'
1877 unless is_in_button_scope 'p', NS_HTML
1879 insert_html_element new_open_tag 'p'
1882 if t.type is TYPE_END_TAG and t.name is 'li'
1883 unless is_in_li_scope 'li', NS_HTML
1886 generate_implied_end_tags 'li' # arg is exception
1887 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1890 el = open_els.shift()
1891 if el.name is 'li' and el.namespace is NS_HTML
1894 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1895 unless is_in_scope t.name, NS_HTML
1898 generate_implied_end_tags t.name # arg is exception
1899 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1902 el = open_els.shift()
1903 if el.name is t.name and el.namespace is NS_HTML
1906 if t.type is TYPE_END_TAG and h_tags[t.name]?
1909 if h_tags[el.name] is el.namespace
1912 if standard_scopers[el.name] is el.namespace
1917 generate_implied_end_tags()
1918 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1921 el = open_els.shift()
1922 if h_tags[el.name] is el.namespace
1926 if t.type is TYPE_START_TAG and t.name is 'a'
1927 # If the list of active formatting elements contains an a element
1928 # between the end of the list and the last marker on the list (or
1929 # the start of the list if there is no marker on the list), then
1930 # this is a parse error; run the adoption agency algorithm for the
1931 # tag name "a", then remove that element from the list of active
1932 # formatting elements and the stack of open elements if the
1933 # adoption agency algorithm didn't already remove it (it might not
1934 # have if the element is not in table scope).
1937 if el.type is TYPE_AFE_MARKER
1939 if el.name is 'a' and el.namespace is NS_HTML
1947 for el, i in open_els
1949 open_els.splice i, 1
1951 el = insert_html_element t
1954 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1956 el = insert_html_element t
1959 if t.type is TYPE_START_TAG and t.name is 'nobr'
1961 el = insert_html_element t
1964 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1965 adoption_agency t.name
1967 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1969 insert_html_element t
1971 flag_frameset_ok = false
1973 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1974 unless is_in_scope t.name, NS_HTML
1977 generate_implied_end_tags()
1978 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1981 el = open_els.shift()
1982 if el.name is t.name and el.namespace is NS_HTML
1984 clear_afe_to_marker()
1986 if t.type is TYPE_START_TAG and t.name is 'table'
1987 close_p_if_in_button_scope() # fixfull quirksmode thing
1988 insert_html_element t
1989 flag_frameset_ok = false
1990 ins_mode = ins_mode_in_table
1992 if t.type is TYPE_END_TAG and t.name is 'br'
1994 t.type is TYPE_START_TAG
1996 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1998 insert_html_element t
2000 t.acknowledge_self_closing()
2001 flag_frameset_ok = false
2003 if t.type is TYPE_START_TAG and t.name is 'input'
2005 insert_html_element t
2007 t.acknowledge_self_closing()
2008 unless is_input_hidden_tok t
2009 flag_frameset_ok = false
2011 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2012 insert_html_element t
2014 t.acknowledge_self_closing()
2016 if t.type is TYPE_START_TAG and t.name is 'hr'
2017 close_p_if_in_button_scope()
2018 insert_html_element t
2020 t.acknowledge_self_closing()
2021 flag_frameset_ok = false
2023 if t.type is TYPE_START_TAG and t.name is 'image'
2028 if t.type is TYPE_START_TAG and t.name is 'isindex'
2030 if template_tag_is_open() is false and form_element_pointer isnt null
2032 t.acknowledge_self_closing()
2033 flag_frameset_ok = false
2034 close_p_if_in_button_scope()
2035 el = insert_html_element new_open_tag 'form'
2036 unless template_tag_is_open()
2037 form_element_pointer = el
2040 el.attrs['action'] = a[1]
2042 insert_html_element new_open_tag 'hr'
2045 insert_html_element new_open_tag 'label'
2046 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2047 input_el = new_open_tag 'input'
2052 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2053 input_el.attrs_a.push [a[0], a[1]]
2054 input_el.attrs_a.push ['name', 'isindex']
2055 # fixfull this next bit is in english... internationalize?
2056 prompt ?= "This is a searchable index. Enter search keywords: "
2057 insert_character new_character_token prompt # fixfull split
2058 # TODO submit typo "balue" in spec
2059 insert_html_element input_el
2061 # insert_character '' # you can put chars here if prompt attr missing
2063 insert_html_element new_open_tag 'hr'
2066 unless template_tag_is_open()
2067 form_element_pointer = null
2069 if t.type is TYPE_START_TAG and t.name is 'textarea'
2070 insert_html_element t
2071 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2073 tok_state = tok_state_rcdata
2074 original_ins_mode = ins_mode
2075 flag_frameset_ok = false
2076 ins_mode = ins_mode_text
2078 if t.type is TYPE_START_TAG and t.name is 'xmp'
2079 close_p_if_in_button_scope()
2081 flag_frameset_ok = false
2082 parse_generic_raw_text t
2084 if t.type is TYPE_START_TAG and t.name is 'iframe'
2085 flag_frameset_ok = false
2086 parse_generic_raw_text t
2088 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2089 parse_generic_raw_text t
2091 if t.type is TYPE_START_TAG and t.name is 'select'
2093 insert_html_element t
2094 flag_frameset_ok = false
2095 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2096 ins_mode = ins_mode_in_select_in_table
2098 ins_mode = ins_mode_in_select
2100 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2101 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2104 insert_html_element t
2106 # this comment block implements the W3C spec
2107 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2108 # if is_in_scope 'ruby', NS_HTML
2109 # generate_implied_end_tags()
2110 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2112 # insert_html_element t
2114 # if t.type is TYPE_START_TAG and t.name is 'rt'
2115 # if is_in_scope 'ruby', NS_HTML
2116 # generate_implied_end_tags 'rtc' # arg is exception
2117 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2119 # insert_html_element t
2121 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2122 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2123 if is_in_scope 'ruby', NS_HTML
2124 generate_implied_end_tags()
2125 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2127 insert_html_element t
2129 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2130 if is_in_scope 'ruby', NS_HTML
2131 generate_implied_end_tags 'rtc'
2132 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2134 insert_html_element t
2137 if t.type is TYPE_START_TAG and t.name is 'math'
2139 adjust_mathml_attributes t
2140 adjust_foreign_attributes t
2141 insert_foreign_element t, NS_MATHML
2142 if t.flag 'self-closing'
2144 t.acknowledge_self_closing()
2146 if t.type is TYPE_START_TAG and t.name is 'svg'
2148 adjust_svg_attributes t
2149 adjust_foreign_attributes t
2150 insert_foreign_element t, NS_SVG
2151 if t.flag 'self-closing'
2153 t.acknowledge_self_closing()
2155 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2158 if t.type is TYPE_START_TAG # any other start tag
2160 insert_html_element t
2162 if t.type is TYPE_END_TAG # any other end tag
2163 in_body_any_other_end_tag t.name
2167 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2168 ins_mode_text = (t) ->
2169 if t.type is TYPE_TEXT
2172 if t.type is TYPE_EOF
2174 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2175 open_els[0].flag 'already started', true
2177 ins_mode = original_ins_mode
2180 if t.type is TYPE_END_TAG and t.name is 'script'
2182 ins_mode = original_ins_mode
2183 # fixfull the spec seems to assume that I'm going to run the script
2184 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2186 if t.type is TYPE_END_TAG
2188 ins_mode = original_ins_mode
2190 console.log 'warning: end of ins_mode_text reached'
2192 # the functions below implement the tokenizer states described here:
2193 # http://www.w3.org/TR/html5/syntax.html#tokenization
2195 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2196 ins_mode_in_table_else = (t) ->
2198 flag_foster_parenting = true
2200 flag_foster_parenting = false
2202 ins_mode_in_table = (t) ->
2205 if t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr'
2206 original_ins_mode = ins_mode
2207 ins_mode = ins_mode_in_table_text
2210 ins_mode_in_table_else t
2218 clear_stack_to_table_context()
2220 insert_html_element t
2221 ins_mode = ins_mode_in_caption
2223 clear_stack_to_table_context()
2224 insert_html_element t
2225 ins_mode = ins_mode_in_column_group
2227 clear_stack_to_table_context()
2228 insert_html_element new_open_tag 'colgroup'
2229 ins_mode = ins_mode_in_column_group
2231 when 'tbody', 'tfoot', 'thead'
2232 clear_stack_to_table_context()
2233 insert_html_element t
2234 ins_mode = ins_mode_in_table_body
2235 when 'td', 'th', 'tr'
2236 clear_stack_to_table_context()
2237 insert_html_element new_open_tag 'tbody'
2238 ins_mode = ins_mode_in_table_body
2242 if is_in_table_scope 'table', NS_HTML
2244 el = open_els.shift()
2245 if el.name is 'table' and el.namespace is NS_HTML
2249 when 'style', 'script', 'template'
2252 unless is_input_hidden_tok t
2253 ins_mode_in_table_else t
2256 el = insert_html_element t
2258 t.acknowledge_self_closing()
2261 if form_element_pointer?
2263 if template_tag_is_open()
2265 form_element_pointer = insert_html_element t
2268 ins_mode_in_table_else t
2272 if is_in_table_scope 'table', NS_HTML
2274 el = open_els.shift()
2275 if el.name is 'table' and el.namespace is NS_HTML
2280 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2285 ins_mode_in_table_else t
2289 ins_mode_in_table_else t
2292 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# "in table text" insertion mode: character tokens are collected in
# pending_table_character_tokens; the collected tokens are later either
# inserted as-is or re-routed through the in-table "anything else" handler,
# after which the list is reset and the saved insertion mode is restored.
2293 ins_mode_in_table_text = (t) ->
2294 if t.type is TYPE_TEXT and t.text is "\u0000"
2295 # huh? I thought the tokenizer didn't emit these
2298 if t.type is TYPE_TEXT
2299 pending_table_character_tokens.push t
2303 for old in pending_table_character_tokens
2304 unless is_space_tok old
2308 for old in pending_table_character_tokens
2309 insert_character old
2311 for old in pending_table_character_tokens
# fixed: the in-table "anything else" handler is ins_mode_in_table_else;
# the previous name (ins_mode_table_else) is not defined anywhere
2312 ins_mode_in_table_else old
2313 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2314 ins_mode = original_ins_mode
2317 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2318 ins_mode_in_caption = (t) ->
2319 if t.type is TYPE_END_TAG and t.name is 'caption'
2320 if is_in_table_scope 'caption', NS_HTML
2321 generate_implied_end_tags()
2322 if open_els[0].name isnt 'caption'
2325 el = open_els.shift()
2326 if el.name is 'caption' and el.namespace is NS_HTML
2328 clear_afe_to_marker()
2329 ins_mode = ins_mode_in_table
2334 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2336 if is_in_table_scope 'caption', NS_HTML
2338 el = open_els.shift()
2339 if el.name is 'caption' and el.namespace is NS_HTML
2341 clear_afe_to_marker()
2342 ins_mode = ins_mode_in_table
2344 # else fragment case
2346 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2352 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# "in column group" insertion mode: handles tokens while a <colgroup> is the
# current node. t is the token being processed.
2353 ins_mode_in_column_group = (t) ->
2357 if t.type is TYPE_COMMENT
2360 if t.type is TYPE_DOCTYPE
2363 if t.type is TYPE_START_TAG and t.name is 'html'
2366 if t.type is TYPE_START_TAG and t.name is 'col'
2367 el = insert_html_element t
2369 t.acknowledge_self_closing()
2371 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# fixed: namespace is a property of the current node (open_els[0]), not of
# the open_els array itself, so the old test could never be true
2372 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2374 ins_mode = ins_mode_in_table
2378 if t.type is TYPE_END_TAG and t.name is 'col'
2381 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2384 if t.type is TYPE_EOF
2388 if open_els[0].name isnt 'colgroup'
2392 ins_mode = ins_mode_in_table
2396 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2397 ins_mode_in_table_body = (t) ->
2398 if t.type is TYPE_START_TAG and t.name is 'tr'
2399 clear_stack_to_table_body_context()
2400 insert_html_element t
2401 ins_mode = ins_mode_in_row
2403 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2405 clear_stack_to_table_body_context()
2406 insert_html_element new_open_tag 'tr'
2407 ins_mode = ins_mode_in_row
2410 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2411 unless is_in_table_scope t.name, NS_HTML
2414 clear_stack_to_table_body_context()
2416 ins_mode = ins_mode_in_table
2418 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2421 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2424 if table_scopers[el.name] is el.namespace
2429 clear_stack_to_table_body_context()
2431 ins_mode = ins_mode_in_table
2434 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2440 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2441 ins_mode_in_row = (t) ->
2442 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2443 clear_stack_to_table_row_context()
2444 insert_html_element t
2445 ins_mode = ins_mode_in_cell
2448 if t.type is TYPE_END_TAG and t.name is 'tr'
2449 if is_in_table_scope 'tr', NS_HTML
2450 clear_stack_to_table_row_context()
2452 ins_mode = ins_mode_in_table_body
2456 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2457 if is_in_table_scope 'tr', NS_HTML
2458 clear_stack_to_table_row_context()
2460 ins_mode = ins_mode_in_table_body
2465 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2466 if is_in_table_scope t.name, NS_HTML
2467 if is_in_table_scope 'tr', NS_HTML
2468 clear_stack_to_table_row_context()
2470 ins_mode = ins_mode_in_table_body
2475 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2481 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2483 generate_implied_end_tags()
# fixed: compare the current node's *name* with 'th'; comparing the element
# object itself with a string is always false, so an open <th> was wrongly
# treated as a mismatch
2484 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
# pop the stack of open elements until a td/th in the HTML namespace has been popped
2487 el = open_els.shift()
2488 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2490 clear_afe_to_marker()
2491 ins_mode = ins_mode_in_row
2493 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2494 ins_mode_in_cell = (t) ->
2495 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2496 if is_in_table_scope t.name, NS_HTML
2497 generate_implied_end_tags()
2498 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2501 el = open_els.shift()
2502 if el.name is t.name and el.namespace is NS_HTML
2504 clear_afe_to_marker()
2505 ins_mode = ins_mode_in_row
2509 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2512 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2515 if table_scopers[el.name] is el.namespace
2523 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2526 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2527 if is_in_table_scope t.name, NS_HTML
2536 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# "in select" insertion mode: handles tokens while a <select> is open.
# t is the token being processed.
2537 ins_mode_in_select = (t) ->
2538 if t.type is TYPE_TEXT and t.text is "\u0000"
2541 if t.type is TYPE_TEXT
2544 if t.type is TYPE_COMMENT
2547 if t.type is TYPE_DOCTYPE
2550 if t.type is TYPE_START_TAG and t.name is 'html'
2553 if t.type is TYPE_START_TAG and t.name is 'option'
2554 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2556 insert_html_element t
2558 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2559 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2561 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2563 insert_html_element t
2565 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# fixed: namespaces are compared with `is` everywhere else in this file;
# `in` performed a membership test against NS_HTML instead of an equality test
2566 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# fixed: the node immediately above the current node is open_els[1], so both
# its name and its namespace must be read from open_els[1]
2567 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2569 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2574 if t.type is TYPE_END_TAG and t.name is 'option'
2575 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2580 if t.type is TYPE_END_TAG and t.name is 'select'
2581 if is_in_select_scope 'select', NS_HTML
2583 el = open_els.shift()
2584 if el.name is 'select' and el.namespace is NS_HTML
2590 if t.type is TYPE_START_TAG and t.name is 'select'
2593 el = open_els.shift()
2594 if el.name is 'select' and el.namespace is NS_HTML
2597 # spec says that this is the same as </select> but it doesn't say
2598 # to check scope first
2600 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2602 if is_in_select_scope 'select', NS_HTML
2605 el = open_els.shift()
2606 if el.name is 'select' and el.namespace is NS_HTML
2611 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2614 if t.type is TYPE_EOF
2621 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2622 ins_mode_in_select_in_table = (t) ->
2623 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2626 el = open_els.shift()
2627 if el.name is 'select' and el.namespace is NS_HTML
2632 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2634 unless is_in_table_scope t.name, NS_HTML
2637 el = open_els.shift()
2638 if el.name is 'select' and el.namespace is NS_HTML
2644 ins_mode_in_select t
2647 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2648 ins_mode_in_template = (t) ->
2649 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2652 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2655 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2656 template_ins_modes.shift()
2657 template_ins_modes.unshift ins_mode_in_table
2658 ins_mode = ins_mode_in_table
2661 if t.type is TYPE_START_TAG and t.name is 'col'
2662 template_ins_modes.shift()
2663 template_ins_modes.unshift ins_mode_in_column_group
2664 ins_mode = ins_mode_in_column_group
2667 if t.type is TYPE_START_TAG and t.name is 'tr'
2668 template_ins_modes.shift()
2669 template_ins_modes.unshift ins_mode_in_table_body
2670 ins_mode = ins_mode_in_table_body
2673 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2674 template_ins_modes.shift()
2675 template_ins_modes.unshift ins_mode_in_row
2676 ins_mode = ins_mode_in_row
2679 if t.type is TYPE_START_TAG
2680 template_ins_modes.shift()
2681 template_ins_modes.unshift ins_mode_in_body
2682 ins_mode = ins_mode_in_body
2685 if t.type is TYPE_END_TAG
2688 if t.type is TYPE_EOF
2689 unless template_tag_is_open()
2694 el = open_els.shift()
2695 if el.name is 'template' and el.namespace is NS_HTML
2697 clear_afe_to_marker()
2698 template_ins_modes.shift()
2702 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# "after body" insertion mode: handles tokens seen after </body>.
# t is the token being processed.
2703 ins_mode_after_body = (t) ->
2707 if t.type is TYPE_COMMENT
# fixed: the spec attaches this comment to the *first* element in the stack
# of open elements (the html element). In this file index 0 is the current
# node and the first/topmost element is the last array entry.
2708 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2710 if t.type is TYPE_DOCTYPE
2713 if t.type is TYPE_START_TAG and t.name is 'html'
2716 if t.type is TYPE_END_TAG and t.name is 'html'
2717 # fixfull fragment case
2718 ins_mode = ins_mode_after_after_body
2720 if t.type is TYPE_EOF
2725 ins_mode = ins_mode_in_body
2728 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2729 ins_mode_in_frameset = (t) ->
2733 if t.type is TYPE_COMMENT
2736 if t.type is TYPE_DOCTYPE
2739 if t.type is TYPE_START_TAG and t.name is 'html'
2742 if t.type is TYPE_START_TAG and t.name is 'frameset'
2743 insert_html_element t
2745 if t.type is TYPE_END_TAG and t.name is 'frameset'
2746 if open_els.length is 1
2748 return # fragment case
2750 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2751 ins_mode = ins_mode_after_frameset
2753 if t.type is TYPE_START_TAG and t.name is 'frame'
2754 insert_html_element t
2756 t.acknowledge_self_closing()
2758 if t.type is TYPE_START_TAG and t.name is 'noframes'
2761 if t.type is TYPE_EOF
2762 if open_els.length isnt 1
2770 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# "after frameset" insertion mode: handles tokens seen after </frameset>.
# t is the token being processed.
2771 ins_mode_after_frameset = (t) ->
2775 if t.type is TYPE_COMMENT
2778 if t.type is TYPE_DOCTYPE
2781 if t.type is TYPE_START_TAG and t.name is 'html'
2784 if t.type is TYPE_END_TAG and t.name is 'html'
# fixed: the parser's insertion-mode variable is ins_mode; assigning to
# insert_mode only created an unused variable, so the mode never switched
2785 ins_mode = ins_mode_after_after_frameset
2787 if t.type is TYPE_START_TAG and t.name is 'noframes'
2790 if t.type is TYPE_EOF
2797 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2798 ins_mode_after_after_body = (t) ->
2799 if t.type is TYPE_COMMENT
2800 insert_comment t, [doc, doc.children.length]
2802 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2805 if t.type is TYPE_EOF
2810 ins_mode = ins_mode_in_body
2813 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2814 ins_mode_after_after_frameset = (t) ->
2815 if t.type is TYPE_COMMENT
2816 insert_comment t, [doc, doc.children.length]
2818 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2821 if t.type is TYPE_EOF
2824 if t.type is TYPE_START_TAG and t.name is 'noframes'
2831 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2832 has_color_face_or_size = (t) ->
2834 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2837 in_foreign_content_end_script = ->
2841 in_foreign_content_other_start = (t) ->
2842 acn = adjusted_current_node()
2843 if acn.namespace is NS_MATHML
2844 adjust_mathml_attributes t
2845 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2846 t.name = svg_name_fixes[t.name]
2847 if acn.namespace is NS_SVG
2848 adjust_svg_attributes t
2849 adjust_foreign_attributes t
2850 insert_foreign_element t, acn.namespace
2851 if t.flag 'self-closing'
2852 if t.name is 'script'
2853 t.acknowledge_self_closing()
2854 in_foreign_content_end_script()
2857 t.acknowledge_self_closing()
2859 in_foreign_content = (t) ->
2860 if t.type is TYPE_TEXT and t.text is "\u0000"
2862 insert_character new_character_token "\ufffd"
2867 if t.type is TYPE_TEXT
2868 flag_frameset_ok = false
2871 if t.type is TYPE_COMMENT
2874 if t.type is TYPE_DOCTYPE
2877 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2879 if flag_fragment_parsing
2880 in_foreign_content_other_start t
2882 loop # is this safe?
2885 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2889 if t.type is TYPE_START_TAG
2890 in_foreign_content_other_start t
2892 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2893 in_foreign_content_end_script()
2895 if t.type is TYPE_END_TAG
2896 if open_els[0].name.toLowerCase() isnt t.name
2898 for node in open_els
2899 if node is open_els[open_els.length - 1]
2901 if node.name.toLowerCase() is t.name
2903 el = open_els.shift()
2906 if node.namespace is NS_HTML
2908 ins_mode t # explicitly call HTML insertion mode
2911 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2913 switch c = txt.charAt(cur++)
2915 return new_text_node parse_character_reference()
2917 tok_state = tok_state_tag_open
2920 return new_text_node "\ufffd"
2922 return new_eof_token()
2924 return new_text_node c
2927 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2928 # not needed: tok_state_character_reference_in_data = ->
2929 # just call parse_character_reference()
2931 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2932 tok_state_rcdata = ->
2933 switch c = txt.charAt(cur++)
2935 return new_text_node parse_character_reference()
2937 tok_state = tok_state_rcdata_less_than_sign
2940 return new_character_token "\ufffd"
2942 return new_eof_token()
2944 return new_character_token c
2947 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2948 # not needed: tok_state_character_reference_in_rcdata = ->
2949 # just call parse_character_reference()
2951 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2952 tok_state_rawtext = ->
2953 switch c = txt.charAt(cur++)
2955 tok_state = tok_state_rawtext_less_than_sign
2958 return new_character_token "\ufffd"
2960 return new_eof_token()
2962 return new_character_token c
2965 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2966 tok_state_script_data = ->
2967 switch c = txt.charAt(cur++)
2969 tok_state = tok_state_script_data_less_than_sign
2972 return new_character_token "\ufffd"
2974 return new_eof_token()
2976 return new_character_token c
2979 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2980 tok_state_plaintext = ->
2981 switch c = txt.charAt(cur++)
2984 return new_character_token "\ufffd"
2986 return new_eof_token()
2988 return new_character_token c
2992 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2993 tok_state_tag_open = ->
2994 switch c = txt.charAt(cur++)
2996 tok_state = tok_state_markup_declaration_open
2998 tok_state = tok_state_end_tag_open
3001 tok_cur_tag = new_comment_token '?'
3002 tok_state = tok_state_bogus_comment
3005 tok_cur_tag = new_open_tag c
3006 tok_state = tok_state_tag_name
3007 else if is_uc_alpha(c)
3008 tok_cur_tag = new_open_tag c.toLowerCase()
3009 tok_state = tok_state_tag_name
3012 tok_state = tok_state_data
3013 cur -= 1 # we didn't parse/handle the char after <
3014 return new_text_node '<'
3017 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3018 tok_state_end_tag_open = ->
3019 switch c = txt.charAt(cur++)
3022 tok_state = tok_state_data
3025 tok_state = tok_state_data
3026 return new_text_node '</'
3029 tok_cur_tag = new_end_tag c.toLowerCase()
3030 tok_state = tok_state_tag_name
3031 else if is_lc_alpha(c)
3032 tok_cur_tag = new_end_tag c
3033 tok_state = tok_state_tag_name
3036 tok_cur_tag = new_comment_token '/'
3037 tok_state = tok_state_bogus_comment
3040 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3041 tok_state_tag_name = ->
3042 switch c = txt.charAt(cur++)
3043 when "\t", "\n", "\u000c", ' '
3044 tok_state = tok_state_before_attribute_name
3046 tok_state = tok_state_self_closing_start_tag
3048 tok_state = tok_state_data
3054 tok_cur_tag.name += "\ufffd"
3057 tok_state = tok_state_data
3060 tok_cur_tag.name += c.toLowerCase()
3062 tok_cur_tag.name += c
3065 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3066 tok_state_rcdata_less_than_sign = ->
3067 c = txt.charAt(cur++)
3069 temporary_buffer = ''
3070 tok_state = tok_state_rcdata_end_tag_open
3073 tok_state = tok_state_rcdata
3074 cur -= 1 # reconsume the input character
3075 return new_character_token '<'
3077 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3078 tok_state_rcdata_end_tag_open = ->
3079 c = txt.charAt(cur++)
3081 tok_cur_tag = new_end_tag c.toLowerCase()
3082 temporary_buffer += c
3083 tok_state = tok_state_rcdata_end_tag_name
3086 tok_cur_tag = new_end_tag c
3087 temporary_buffer += c
3088 tok_state = tok_state_rcdata_end_tag_name
3091 tok_state = tok_state_rcdata
3092 cur -= 1 # reconsume the input character
3093 return new_character_token "</" # fixfull separate these
3095 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Decide whether token t may close the element whose raw-text content is
# currently being tokenized.
#   t: the end-tag token under construction (reads t.type and t.name)
#   returns: true only when t is an end tag whose name equals the name of
#            the current node, open_els[0]
3096 is_appropriate_end_tag = (t) ->
3097 # spec says to check against "the tag name of the last start tag to
3098 # have been emitted from this tokenizer", but this is only called from
3099 # the various "raw" states, so it's hopefully ok to assume that
3100 # open_els[0].name will work instead TODO: verify this after the script
3101 # data states are implemented
# trace the candidate token together with the serialized stack of open elements
3102 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
# NOTE(review): only the name is compared, not the namespace, and open_els[0]
# is assumed to exist — confirm callers never reach here with an empty stack
3103 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3105 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3106 tok_state_rcdata_end_tag_name = ->
3107 c = txt.charAt(cur++)
3108 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3109 if is_appropriate_end_tag tok_cur_tag
3110 tok_state = tok_state_before_attribute_name
3112 # else fall through to "Anything else"
3114 if is_appropriate_end_tag tok_cur_tag
3115 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3117 # else fall through to "Anything else"
3119 if is_appropriate_end_tag tok_cur_tag
3120 tok_state = tok_state_data
3122 # else fall through to "Anything else"
3124 tok_cur_tag.name += c.toLowerCase()
3125 temporary_buffer += c
3128 tok_cur_tag.name += c
3129 temporary_buffer += c
3132 tok_state = tok_state_rcdata
3133 cur -= 1 # reconsume the input character
3134 return new_character_token '</' + temporary_buffer # fixfull separate these
3136 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3137 tok_state_rawtext_less_than_sign = ->
3138 c = txt.charAt(cur++)
3140 temporary_buffer = ''
3141 tok_state = tok_state_rawtext_end_tag_open
3144 tok_state = tok_state_rawtext
3145 cur -= 1 # reconsume the input character
3146 return new_character_token '<'
3148 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3149 tok_state_rawtext_end_tag_open = ->
3150 c = txt.charAt(cur++)
3152 tok_cur_tag = new_end_tag c.toLowerCase()
3153 temporary_buffer += c
3154 tok_state = tok_state_rawtext_end_tag_name
3157 tok_cur_tag = new_end_tag c
3158 temporary_buffer += c
3159 tok_state = tok_state_rawtext_end_tag_name
3162 tok_state = tok_state_rawtext
3163 cur -= 1 # reconsume the input character
3164 return new_character_token "</" # fixfull separate these
3166 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3167 tok_state_rawtext_end_tag_name = ->
3168 c = txt.charAt(cur++)
3169 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3170 if is_appropriate_end_tag tok_cur_tag
3171 tok_state = tok_state_before_attribute_name
3173 # else fall through to "Anything else"
3175 if is_appropriate_end_tag tok_cur_tag
3176 tok_state = tok_state_self_closing_start_tag
3178 # else fall through to "Anything else"
3180 if is_appropriate_end_tag tok_cur_tag
3181 tok_state = tok_state_data
3183 # else fall through to "Anything else"
3185 tok_cur_tag.name += c.toLowerCase()
3186 temporary_buffer += c
3189 tok_cur_tag.name += c
3190 temporary_buffer += c
3193 tok_state = tok_state_rawtext
3194 cur -= 1 # reconsume the input character
3195 return new_character_token '</' + temporary_buffer # fixfull separate these
3197 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3198 tok_state_script_data_less_than_sign = ->
3199 c = txt.charAt(cur++)
3201 temporary_buffer = ''
3202 tok_state = tok_state_script_data_end_tag_open
3205 tok_state = tok_state_script_data_escape_start
3206 return new_character_token '<!' # fixfull split
3208 tok_state = tok_state_script_data
3209 cur -= 1 # Reconsume
3210 return new_character_token '<'
3212 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3213 tok_state_script_data_end_tag_open = ->
3214 c = txt.charAt(cur++)
3216 tok_cur_tag = new_end_tag c.toLowerCase()
3217 temporary_buffer += c
3218 tok_state = tok_state_script_data_end_tag_name
3221 tok_cur_tag = new_end_tag c
3222 temporary_buffer += c
3223 tok_state = tok_state_script_data_end_tag_name
3226 tok_state = tok_state_script_data
3227 cur -= 1 # Reconsume
3228 return new_character_token '</'
3230 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3231 tok_state_script_data_end_tag_name = ->
3232 c = txt.charAt(cur++)
3233 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3234 if is_appropriate_end_tag tok_cur_tag
3235 tok_state = tok_state_before_attribute_name
3239 if is_appropriate_end_tag tok_cur_tag
3240 tok_state = tok_state_self_closing_start_tag
3244 if is_appropriate_end_tag tok_cur_tag
3245 tok_state = tok_state_data
3249 tok_cur_tag.name += c.toLowerCase()
3250 temporary_buffer += c
3253 tok_cur_tag.name += c
3254 temporary_buffer += c
3257 tok_state = tok_state_script_data
3258 cur -= 1 # Reconsume
3259 return new_character_token "</#{temporary_buffer}" # fixfull split
3261 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3262 tok_state_script_data_escape_start = ->
3263 c = txt.charAt(cur++)
3265 tok_state = tok_state_script_data_escape_start_dash
3266 return new_character_token '-'
3268 tok_state = tok_state_script_data
3269 cur -= 1 # Reconsume
3272 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3273 tok_state_script_data_escape_start_dash = ->
3274 c = txt.charAt(cur++)
3276 tok_state = tok_state_script_data_escaped_dash_dash
3277 return new_character_token '-'
3279 tok_state = tok_state_script_data
3280 cur -= 1 # Reconsume
3283 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3284 tok_state_script_data_escaped = ->
3285 c = txt.charAt(cur++)
3287 tok_state = tok_state_script_data_escaped_dash
3288 return new_character_token '-'
3290 tok_state = tok_state_script_data_escaped_less_than_sign
3294 return new_character_token "\ufffd"
3296 tok_state = tok_state_data
3298 cur -= 1 # Reconsume
3301 return new_character_token c
3303 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3304 tok_state_script_data_escaped_dash = ->
3305 c = txt.charAt(cur++)
3307 tok_state = tok_state_script_data_escaped_dash_dash
3308 return new_character_token '-'
3310 tok_state = tok_state_script_data_escaped_less_than_sign
3314 tok_state = tok_state_script_data_escaped
3315 return new_character_token "\ufffd"
3317 tok_state = tok_state_data
3319 cur -= 1 # Reconsume
3322 tok_state = tok_state_script_data_escaped
3323 return new_character_token c
3325 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# return '-' without a state change, go to the escaped-less-than-sign state,
# return '>' while moving to tok_state_script_data, return U+FFFD or the char
# itself while moving to tok_state_script_data_escaped, or go to tok_state_data
# with cur stepped back.
# NOTE(review): the branch guards sit on lines missing from this listing.
3326 tok_state_script_data_escaped_dash_dash = ->
3327 c = txt.charAt(cur++)
3329 return new_character_token '-'
3331 tok_state = tok_state_script_data_escaped_less_than_sign
3334 tok_state = tok_state_script_data
3335 return new_character_token '>'
3338 tok_state = tok_state_script_data_escaped
3339 return new_character_token "\ufffd"
3342 tok_state = tok_state_data
3343 cur -= 1 # Reconsume
3346 tok_state = tok_state_script_data_escaped
3347 return new_character_token c
3349 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# State handler entered after a '<' was consumed (without being output) in the
# escaped script data states. Takes one char from txt (post-incrementing cur).
# Visible outcomes: clear temporary_buffer and go to the escaped-end-tag-open
# state; seed temporary_buffer with the char (lower-cased in one branch) and go
# to the double-escape-start state returning "<" plus the char; or fall back to
# tok_state_script_data_escaped with cur stepped back.
# NOTE(review): the branch guards sit on lines missing from this listing.
3350 tok_state_script_data_escaped_less_than_sign = ->
3351 c = txt.charAt(cur++)
3353 temporary_buffer = ''
3354 tok_state = tok_state_script_data_escaped_end_tag_open
3357 temporary_buffer = c.toLowerCase() # yes, really
3358 tok_state = tok_state_script_data_double_escape_start
3359 return new_character_token "<#{c}" # fixfull split
3361 temporary_buffer = c
3362 tok_state = tok_state_script_data_double_escape_start
3363 return new_character_token "<#{c}" # fixfull split
3365 tok_state = tok_state_script_data_escaped
3366 cur -= 1 # Reconsume
# Output the pending '<' here. The char just read is reconsumed by the next
# state, so returning c would drop the '<' and deliver c twice.
3367 return new_character_token '<'
3369 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# start a new end tag token in tok_cur_tag (name lower-cased in one branch),
# append the char to temporary_buffer and go to the escaped-end-tag-name state;
# or fall back to tok_state_script_data_escaped, step cur back and return '</'.
# NOTE(review): the branch guards sit on lines missing from this listing.
3370 tok_state_script_data_escaped_end_tag_open = ->
3371 c = txt.charAt(cur++)
3373 tok_cur_tag = new_end_tag c.toLowerCase()
3374 temporary_buffer += c
3375 tok_state = tok_state_script_data_escaped_end_tag_name
3378 tok_cur_tag = new_end_tag c
3379 temporary_buffer += c
3380 tok_state = tok_state_script_data_escaped_end_tag_name
3383 tok_state = tok_state_script_data_escaped
3384 cur -= 1 # Reconsume
3385 return new_character_token '</' # fixfull split
3387 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# State handler: takes one char from txt (post-incrementing cur). When
# is_appropriate_end_tag accepts tok_cur_tag, the visible branches move to the
# before-attribute-name, self-closing-start-tag or data state. Otherwise the
# char is appended to tok_cur_tag.name and temporary_buffer, or the handler
# falls back to tok_state_script_data_escaped, steps cur back and returns the
# buffered text as "</" + temporary_buffer.
# NOTE(review): several branch guards sit on lines missing from this listing.
3388 tok_state_script_data_escaped_end_tag_name = ->
3389 c = txt.charAt(cur++)
3390 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3391 if is_appropriate_end_tag tok_cur_tag
3392 tok_state = tok_state_before_attribute_name
3396 if is_appropriate_end_tag tok_cur_tag
3397 tok_state = tok_state_self_closing_start_tag
3401 if is_appropriate_end_tag tok_cur_tag
3402 tok_state = tok_state_data
3406 tok_cur_tag.name += c.toLowerCase()
# Only the tag name is case-folded. temporary_buffer keeps the source spelling
# so the "</..." replay at the end of this handler reproduces the input.
3407 temporary_buffer += c
3410 tok_cur_tag.name += c
3411 temporary_buffer += c
3414 tok_state = tok_state_script_data_escaped
3415 cur -= 1 # Reconsume
3416 return new_character_token "</#{temporary_buffer}" # fixfull split
3418 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# State handler: takes one char from txt (post-incrementing cur). On whitespace,
# '/' or '>' it checks temporary_buffer against 'script' to pick between the
# double-escaped and escaped states, and returns the char as a character token.
# Other visible branches append the char (lower-cased in one of them) to
# temporary_buffer and return it, or fall back to tok_state_script_data_escaped
# with cur stepped back.
# NOTE(review): some branch guards sit on lines missing from this listing.
3419 tok_state_script_data_double_escape_start = ->
3420 c = txt.charAt(cur++)
3421 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3422 if temporary_buffer is 'script'
3423 tok_state = tok_state_script_data_double_escaped
3425 tok_state = tok_state_script_data_escaped
3426 return new_character_token c
3428 temporary_buffer += c.toLowerCase() # yes, really lowercase
3429 return new_character_token c
3431 temporary_buffer += c
3432 return new_character_token c
3434 tok_state = tok_state_script_data_escaped
3435 cur -= 1 # Reconsume
3438 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# go to the double-escaped-dash state returning '-', go to the
# double-escaped-less-than-sign state returning '<', return U+FFFD, go to
# tok_state_data with cur stepped back, or return the char itself.
# NOTE(review): the branch guards sit on lines missing from this listing.
3439 tok_state_script_data_double_escaped = ->
3440 c = txt.charAt(cur++)
3442 tok_state = tok_state_script_data_double_escaped_dash
3443 return new_character_token '-'
3445 tok_state = tok_state_script_data_double_escaped_less_than_sign
3446 return new_character_token '<'
3449 return new_character_token "\ufffd"
3452 tok_state = tok_state_data
3453 cur -= 1 # Reconsume
3456 return new_character_token c
3458 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# go to the double-escaped-dash-dash state returning '-', go to the
# double-escaped-less-than-sign state returning '<', return U+FFFD or the char
# itself while moving to tok_state_script_data_double_escaped, or go to
# tok_state_data with cur stepped back.
# NOTE(review): the branch guards sit on lines missing from this listing.
3459 tok_state_script_data_double_escaped_dash = ->
3460 c = txt.charAt(cur++)
3462 tok_state = tok_state_script_data_double_escaped_dash_dash
3463 return new_character_token '-'
3465 tok_state = tok_state_script_data_double_escaped_less_than_sign
3466 return new_character_token '<'
3469 tok_state = tok_state_script_data_double_escaped
3470 return new_character_token "\ufffd"
3473 tok_state = tok_state_data
3474 cur -= 1 # Reconsume
3477 tok_state = tok_state_script_data_double_escaped
3478 return new_character_token c
3480 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# return '-' without a state change, go to the double-escaped-less-than-sign
# state returning '<', go to tok_state_script_data returning '>', return U+FFFD
# or the char itself while moving to tok_state_script_data_double_escaped, or
# go to tok_state_data with cur stepped back.
# NOTE(review): the branch guards sit on lines missing from this listing.
3481 tok_state_script_data_double_escaped_dash_dash = ->
3482 c = txt.charAt(cur++)
3484 return new_character_token '-'
3486 tok_state = tok_state_script_data_double_escaped_less_than_sign
3487 return new_character_token '<'
3489 tok_state = tok_state_script_data
3490 return new_character_token '>'
3493 tok_state = tok_state_script_data_double_escaped
3494 return new_character_token "\ufffd"
3497 tok_state = tok_state_data
3498 cur -= 1 # Reconsume
3501 tok_state = tok_state_script_data_double_escaped
3502 return new_character_token c
3504 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# clear temporary_buffer, go to the double-escape-end state and return '/'; or
# fall back to tok_state_script_data_double_escaped with cur stepped back.
# NOTE(review): the branch guards sit on lines missing from this listing.
3505 tok_state_script_data_double_escaped_less_than_sign = ->
3506 c = txt.charAt(cur++)
3508 temporary_buffer = ''
3509 tok_state = tok_state_script_data_double_escape_end
3510 return new_character_token '/'
3512 tok_state = tok_state_script_data_double_escaped
3513 cur -= 1 # Reconsume
3516 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# State handler: takes one char from txt (post-incrementing cur). On whitespace,
# '/' or '>' it checks temporary_buffer against 'script' to pick between the
# escaped and double-escaped states, and returns the char as a character token.
# Other visible branches append the char (lower-cased in one of them) to
# temporary_buffer and return it, or fall back to
# tok_state_script_data_double_escaped with cur stepped back.
# NOTE(review): some branch guards sit on lines missing from this listing.
3517 tok_state_script_data_double_escape_end = ->
3518 c = txt.charAt(cur++)
3519 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3520 if temporary_buffer is 'script'
3521 tok_state = tok_state_script_data_escaped
3523 tok_state = tok_state_script_data_double_escaped
3524 return new_character_token c
3526 temporary_buffer += c.toLowerCase() # yes, really lowercase
3527 return new_character_token c
3529 temporary_buffer += c
3530 return new_character_token c
3532 tok_state = tok_state_script_data_double_escaped
3533 cur -= 1 # Reconsume
3536 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: move to the self-closing-start-tag or data state, or pick an
# attr_name (U+FFFD, or the lower-cased char), push [attr_name, ''] onto the
# front of tok_cur_tag.attrs_a and go to tok_state_attribute_name.
# NOTE(review): most `when` guards sit on lines missing from this listing.
3537 tok_state_before_attribute_name = ->
3539 switch c = txt.charAt(cur++)
3540 when "\t", "\n", "\u000c", ' '
3543 tok_state = tok_state_self_closing_start_tag
3546 tok_state = tok_state_data
3552 attr_name = "\ufffd"
3553 when '"', "'", '<', '='
3558 tok_state = tok_state_data
3561 attr_name = c.toLowerCase()
3565 tok_cur_tag.attrs_a.unshift [attr_name, '']
3566 tok_state = tok_state_attribute_name
3569 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# State handler: switches on the next char of txt (post-incrementing cur).
# The attribute being built is tok_cur_tag.attrs_a[0]; index [0] of that pair
# is its name. Whitespace moves to tok_state_after_attribute_name. Other visible
# branches move to the self-closing-start-tag, before-attribute-value or data
# state, or extend the name with U+FFFD, the char, or the lower-cased char.
# NOTE(review): most `when` guards sit on lines missing from this listing.
3570 tok_state_attribute_name = ->
3571 switch c = txt.charAt(cur++)
3572 when "\t", "\n", "\u000c", ' '
3573 tok_state = tok_state_after_attribute_name
3575 tok_state = tok_state_self_closing_start_tag
3577 tok_state = tok_state_before_attribute_value
3579 tok_state = tok_state_data
3585 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3588 tok_cur_tag.attrs_a[0][0] += c
3591 tok_state = tok_state_data
3594 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3596 tok_cur_tag.attrs_a[0][0] += c
3599 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# move to the self-closing-start-tag, before-attribute-value or data state
# (one data-state branch also steps cur back), or start a new attribute by
# pushing a [name, ''] pair onto the front of tok_cur_tag.attrs_a (name is the
# lower-cased char, U+FFFD, or the char) and go to tok_state_attribute_name.
# NOTE(review): most branch guards sit on lines missing from this listing.
3600 tok_state_after_attribute_name = ->
3601 c = txt.charAt(cur++)
3602 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3605 tok_state = tok_state_self_closing_start_tag
3608 tok_state = tok_state_before_attribute_value
3611 tok_state = tok_state_data
3614 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3615 tok_state = tok_state_attribute_name
3619 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3620 tok_state = tok_state_attribute_name
3624 tok_state = tok_state_data
3625 cur -= 1 # reconsume
3627 if c is '"' or c is "'" or c is '<'
3629 # fall through to Anything else
3631 tok_cur_tag.attrs_a.unshift [c, '']
3632 tok_state = tok_state_attribute_name
3634 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: move to the double-quoted, single-quoted or unquoted
# attribute-value state, move to tok_state_data, or append U+FFFD or the char to
# the current attribute's value (tok_cur_tag.attrs_a[0][1]) before moving to the
# unquoted state.
# NOTE(review): most `when` guards sit on lines missing from this listing.
3635 tok_state_before_attribute_value = ->
3636 switch c = txt.charAt(cur++)
3637 when "\t", "\n", "\u000c", ' '
3640 tok_state = tok_state_attribute_value_double_quoted
3642 tok_state = tok_state_attribute_value_unquoted
3645 tok_state = tok_state_attribute_value_single_quoted
3648 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3649 tok_state = tok_state_attribute_value_unquoted
3652 tok_state = tok_state_data
3658 tok_state = tok_state_data
3660 tok_cur_tag.attrs_a[0][1] += c
3661 tok_state = tok_state_attribute_value_unquoted
3664 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: move to tok_state_after_attribute_value_quoted, append the
# result of parse_character_reference('"', true) to the current attribute value
# (tok_cur_tag.attrs_a[0][1]), append U+FFFD, move to tok_state_data, or append
# the char itself.
# NOTE(review): the `when` guards sit on lines missing from this listing.
3665 tok_state_attribute_value_double_quoted = ->
3666 switch c = txt.charAt(cur++)
3668 tok_state = tok_state_after_attribute_value_quoted
3670 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3673 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3676 tok_state = tok_state_data
3678 tok_cur_tag.attrs_a[0][1] += c
3681 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: move to tok_state_after_attribute_value_quoted, append the
# result of parse_character_reference("'", true) to the current attribute value
# (tok_cur_tag.attrs_a[0][1]), append U+FFFD, move to tok_state_data, or append
# the char itself.
# NOTE(review): the `when` guards sit on lines missing from this listing.
3682 tok_state_attribute_value_single_quoted = ->
3683 switch c = txt.charAt(cur++)
3685 tok_state = tok_state_after_attribute_value_quoted
3687 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3690 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3693 tok_state = tok_state_data
3695 tok_cur_tag.attrs_a[0][1] += c
3698 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Whitespace moves to tok_state_before_attribute_name. Other visible branches
# append the result of parse_character_reference('>', true) to the current
# attribute value (tok_cur_tag.attrs_a[0][1]), append U+FFFD or the char itself,
# or move to tok_state_data.
# NOTE(review): most `when` guards sit on lines missing from this listing.
3699 tok_state_attribute_value_unquoted = ->
3700 switch c = txt.charAt(cur++)
3701 when "\t", "\n", "\u000c", ' '
3702 tok_state = tok_state_before_attribute_name
3704 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3706 tok_state = tok_state_data
3711 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3714 tok_state = tok_state_data
3716 # Parse Error if ', <, = or ` (backtick)
3717 tok_cur_tag.attrs_a[0][1] += c
3720 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Whitespace moves to tok_state_before_attribute_name. Other visible branches
# move to the self-closing-start-tag or data state; the last one moves to
# tok_state_before_attribute_name and steps cur back so the char is handled there.
# NOTE(review): most `when` guards sit on lines missing from this listing.
3721 tok_state_after_attribute_value_quoted = ->
3722 switch c = txt.charAt(cur++)
3723 when "\t", "\n", "\u000c", ' '
3724 tok_state = tok_state_before_attribute_name
3726 tok_state = tok_state_self_closing_start_tag
3728 tok_state = tok_state_data
3734 tok_state = tok_state_data
3737 tok_state = tok_state_before_attribute_name
3738 cur -= 1 # we didn't handle that char
3741 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# mark tok_cur_tag as self-closing and go to tok_state_data; go to
# tok_state_data with cur stepped back; or go to
# tok_state_before_attribute_name with cur stepped back.
# NOTE(review): the branch guards sit on lines missing from this listing.
3742 tok_state_self_closing_start_tag = ->
3743 c = txt.charAt(cur++)
# Pass the value explicitly, as every other flag call in this file section does
# ('force-quirks', true). NOTE(review): the definition of flag() is not visible
# here; confirm that a one-argument call was not already acting as a setter.
3745 tok_cur_tag.flag 'self-closing', true
3746 tok_state = tok_state_data
3750 tok_state = tok_state_data
3751 cur -= 1 # Reconsume
3755 tok_state = tok_state_before_attribute_name
3756 cur -= 1 # Reconsume
3759 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3760 # WARNING: put a comment token in tok_cur_tag before setting this state
# State handler: looks for the next '>' at or after cur and takes either the
# rest of txt or the text up to that '>' as val. U+0000 chars in val are
# replaced with U+FFFD, val is appended to tok_cur_tag.text, and the tokenizer
# returns to tok_state_data.
# NOTE(review): the guard choosing between the two substr calls, and any update
# of cur, sit on lines missing from this listing.
3761 tok_state_bogus_comment = ->
3762 next_gt = txt.indexOf '>', cur
3764 val = txt.substr cur
3767 val = txt.substr cur, (next_gt - cur)
3769 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3770 tok_cur_tag.text += val
3771 tok_state = tok_state_data
3774 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# State handler: inspects txt at cur without consuming a single char first.
# '--' starts an empty comment token and goes to tok_state_comment_start.
# 'doctype' (case-insensitive) goes to tok_state_doctype.
# '[CDATA[' goes to tok_state_cdata_section, but only when the adjusted current
# node exists and is not in the HTML namespace.
# Otherwise an empty comment token is created and tok_state_bogus_comment runs.
# NOTE(review): the lines that advance cur past the matched text are missing
# from this listing.
3775 tok_state_markup_declaration_open = ->
3776 if txt.substr(cur, 2) is '--'
3778 tok_cur_tag = new_comment_token ''
3779 tok_state = tok_state_comment_start
3781 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3783 tok_state = tok_state_doctype
3785 acn = adjusted_current_node()
3786 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3788 tok_state = tok_state_cdata_section
3792 tok_cur_tag = new_comment_token ''
3793 tok_state = tok_state_bogus_comment
3796 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: go to tok_state_comment_start_dash; append U+FFFD to the
# comment text and go to tok_state_comment; go to tok_state_data (one branch
# also steps cur back); or append the char to the comment text and go to
# tok_state_comment.
# NOTE(review): the `when` guards sit on lines missing from this listing.
3797 tok_state_comment_start = ->
3798 switch c = txt.charAt(cur++)
3800 tok_state = tok_state_comment_start_dash
# The replacement character belongs to the comment being built, as in the other
# comment states, so append it to tok_cur_tag.text instead of returning it as a
# character token.
3803 tok_cur_tag.text += "\ufffd"
3804 tok_state = tok_state_comment
3807 tok_state = tok_state_data
3811 tok_state = tok_state_data
3812 cur -= 1 # Reconsume
3815 tok_cur_tag.text += c
3816 tok_state = tok_state_comment
3819 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: go to tok_state_comment_end; append "-" plus U+FFFD, or "-"
# plus the char, to the comment text and go to tok_state_comment; or go to
# tok_state_data (one branch also steps cur back).
# NOTE(review): the `when` guards sit on lines missing from this listing.
3820 tok_state_comment_start_dash = ->
3821 switch c = txt.charAt(cur++)
3823 tok_state = tok_state_comment_end
3826 tok_cur_tag.text += "-\ufffd"
3827 tok_state = tok_state_comment
3830 tok_state = tok_state_data
3834 tok_state = tok_state_data
3835 cur -= 1 # Reconsume
3838 tok_cur_tag.text += "-#{c}"
3839 tok_state = tok_state_comment
3842 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: go to tok_state_comment_end_dash, append U+FFFD or the char
# to the comment text, or go to tok_state_data with cur stepped back.
# NOTE(review): the `when` guards sit on lines missing from this listing.
3843 tok_state_comment = ->
3844 switch c = txt.charAt(cur++)
3846 tok_state = tok_state_comment_end_dash
3849 tok_cur_tag.text += "\ufffd"
3852 tok_state = tok_state_data
3853 cur -= 1 # Reconsume
3856 tok_cur_tag.text += c
3859 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: go to tok_state_comment_end; append "-" plus U+FFFD, or "-"
# plus the char, to the comment text and go to tok_state_comment; or go to
# tok_state_data with cur stepped back.
# NOTE(review): the `when` guards sit on lines missing from this listing.
3860 tok_state_comment_end_dash = ->
3861 switch c = txt.charAt(cur++)
3863 tok_state = tok_state_comment_end
3866 tok_cur_tag.text += "-\ufffd"
3867 tok_state = tok_state_comment
3870 tok_state = tok_state_data
3871 cur -= 1 # Reconsume
3874 tok_cur_tag.text += "-#{c}"
3875 tok_state = tok_state_comment
3878 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: go to tok_state_data (one branch also steps cur back);
# append "--" plus U+FFFD, or "--" plus the char, to the comment text and go to
# tok_state_comment; go to tok_state_comment_end_bang; or append a single '-'
# and stay in this state.
# NOTE(review): the `when` guards sit on lines missing from this listing.
3879 tok_state_comment_end = ->
3880 switch c = txt.charAt(cur++)
3882 tok_state = tok_state_data
3886 tok_cur_tag.text += "--\ufffd"
3887 tok_state = tok_state_comment
3890 tok_state = tok_state_comment_end_bang
3893 tok_cur_tag.text += '-'
3896 tok_state = tok_state_data
3897 cur -= 1 # Reconsume
3901 tok_cur_tag.text += "--#{c}"
3902 tok_state = tok_state_comment
3905 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Visible outcomes: append "--!" and go to tok_state_comment_end_dash; go to
# tok_state_data (one branch also steps cur back); or append "--!" plus U+FFFD,
# or "--!" plus the char, to the comment text and go to tok_state_comment.
# NOTE(review): the `when` guards sit on lines missing from this listing.
3906 tok_state_comment_end_bang = ->
3907 switch c = txt.charAt(cur++)
# Only "--!" is appended here. tok_state_comment_end_dash adds the '-' itself
# when it appends "-" plus the next char, so appending c here would double it.
3909 tok_cur_tag.text += "--!"
3910 tok_state = tok_state_comment_end_dash
3912 tok_state = tok_state_data
3916 tok_cur_tag.text += "--!\ufffd"
3917 tok_state = tok_state_comment
3920 tok_state = tok_state_data
3921 cur -= 1 # Reconsume
3924 tok_cur_tag.text += "--!#{c}"
3925 tok_state = tok_state_comment
3928 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# State handler: switches on the next char of txt (post-incrementing cur).
# Whitespace moves to tok_state_before_doctype_name. One visible branch goes to
# tok_state_data, builds an empty doctype token with 'force-quirks' set and
# steps cur back; the last one moves to tok_state_before_doctype_name with cur
# stepped back.
# NOTE(review): the other `when` guards, and what is done with `el`, sit on
# lines missing from this listing.
3929 tok_state_doctype = ->
3930 switch c = txt.charAt(cur++)
3931 when "\t", "\u000a", "\u000c", ' '
3932 tok_state = tok_state_before_doctype_name
3935 tok_state = tok_state_data
3936 el = new_doctype_token ''
3937 el.flag 'force-quirks', true
3938 cur -= 1 # Reconsume
3942 tok_state = tok_state_before_doctype_name
3943 cur -= 1 # Reconsume
3946 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# start a doctype token in tok_cur_tag named with the lower-cased char, U+FFFD,
# or the char, and go to tok_state_doctype_name; or build an empty doctype token
# with 'force-quirks' set and go to tok_state_data (one branch also steps cur
# back).
# NOTE(review): most branch guards, and what is done with `el`, sit on lines
# missing from this listing.
3947 tok_state_before_doctype_name = ->
3948 c = txt.charAt(cur++)
3949 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3952 tok_cur_tag = new_doctype_token c.toLowerCase()
3953 tok_state = tok_state_doctype_name
3957 tok_cur_tag = new_doctype_token "\ufffd"
3958 tok_state = tok_state_doctype_name
3962 el = new_doctype_token ''
3963 el.flag 'force-quirks', true
3964 tok_state = tok_state_data
3968 tok_state = tok_state_data
3969 el = new_doctype_token ''
3970 el.flag 'force-quirks', true
3971 cur -= 1 # Reconsume
3974 tok_cur_tag = new_doctype_token c
3975 tok_state = tok_state_doctype_name
3978 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# State handler: takes one char from txt (post-incrementing cur). Whitespace
# moves to tok_state_after_doctype_name. Other visible branches go to
# tok_state_data (one sets 'force-quirks' and steps cur back), or extend
# tok_cur_tag.name with the lower-cased char, U+FFFD, or the char.
# NOTE(review): most branch guards sit on lines missing from this listing.
3979 tok_state_doctype_name = ->
3980 c = txt.charAt(cur++)
3981 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3982 tok_state = tok_state_after_doctype_name
3985 tok_state = tok_state_data
3988 tok_cur_tag.name += c.toLowerCase()
3992 tok_cur_tag.name += "\ufffd"
3996 tok_state = tok_state_data
3997 tok_cur_tag.flag 'force-quirks', true
3998 cur -= 1 # Reconsume
4001 tok_cur_tag.name += c
4004 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# State handler: takes one char from txt (post-incrementing cur). Visible
# branches go to tok_state_data (one sets 'force-quirks' and steps cur back).
# The six chars starting at the one just read (cur - 1) are then compared
# case-insensitively with 'public' and 'system' to choose the matching
# after-keyword state. Otherwise 'force-quirks' is set and
# tok_state_bogus_doctype takes over.
# NOTE(review): several guards, and any advance of cur past the keyword, sit on
# lines missing from this listing.
4005 tok_state_after_doctype_name = ->
4006 c = txt.charAt(cur++)
4007 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4010 tok_state = tok_state_data
4014 tok_state = tok_state_data
4015 tok_cur_tag.flag 'force-quirks', true
4016 cur -= 1 # Reconsume
4019 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4021 tok_state = tok_state_after_doctype_public_keyword
4023 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4025 tok_state = tok_state_after_doctype_system_keyword
4028 tok_cur_tag.flag 'force-quirks', true
4029 tok_state = tok_state_bogus_doctype
4032 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# State handler: takes one char from txt (post-incrementing cur). Whitespace
# moves to tok_state_before_doctype_public_identifier. Other visible branches
# reset tok_cur_tag.public_identifier to '' and enter the double- or
# single-quoted public identifier state, or set 'force-quirks' and go to
# tok_state_data (one also steps cur back) or tok_state_bogus_doctype.
# NOTE(review): most branch guards sit on lines missing from this listing.
4033 tok_state_after_doctype_public_keyword = ->
4034 c = txt.charAt(cur++)
4035 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4036 tok_state = tok_state_before_doctype_public_identifier
4040 tok_cur_tag.public_identifier = ''
4041 tok_state = tok_state_doctype_public_identifier_double_quoted
4045 tok_cur_tag.public_identifier = ''
4046 tok_state = tok_state_doctype_public_identifier_single_quoted
4050 tok_cur_tag.flag 'force-quirks', true
4051 tok_state = tok_state_data
4055 tok_state = tok_state_data
4056 tok_cur_tag.flag 'force-quirks', true
4057 cur -= 1 # Reconsume
4061 tok_cur_tag.flag 'force-quirks', true
4062 tok_state = tok_state_bogus_doctype
4065 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# State handler: takes one char from txt (post-incrementing cur). Visible
# branches reset tok_cur_tag.public_identifier to '' and enter the double- or
# single-quoted public identifier state, or set 'force-quirks' and go to
# tok_state_data (one also steps cur back) or tok_state_bogus_doctype.
# NOTE(review): most branch guards, and the body of the whitespace branch, sit
# on lines missing from this listing.
4066 tok_state_before_doctype_public_identifier = ->
4067 c = txt.charAt(cur++)
4068 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4072 tok_cur_tag.public_identifier = ''
4073 tok_state = tok_state_doctype_public_identifier_double_quoted
4077 tok_cur_tag.public_identifier = ''
4078 tok_state = tok_state_doctype_public_identifier_single_quoted
4082 tok_cur_tag.flag 'force-quirks', true
4083 tok_state = tok_state_data
4087 tok_state = tok_state_data
4088 tok_cur_tag.flag 'force-quirks', true
4089 cur -= 1 # Reconsume
4093 tok_cur_tag.flag 'force-quirks', true
4094 tok_state = tok_state_bogus_doctype
4098 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# go to tok_state_after_doctype_public_identifier, append U+FFFD or the char to
# tok_cur_tag.public_identifier, or set 'force-quirks' and go to tok_state_data
# (one branch also steps cur back).
# NOTE(review): the branch guards sit on lines missing from this listing.
4099 tok_state_doctype_public_identifier_double_quoted = ->
4100 c = txt.charAt(cur++)
4102 tok_state = tok_state_after_doctype_public_identifier
4106 tok_cur_tag.public_identifier += "\ufffd"
4110 tok_cur_tag.flag 'force-quirks', true
4111 tok_state = tok_state_data
4115 tok_state = tok_state_data
4116 tok_cur_tag.flag 'force-quirks', true
4117 cur -= 1 # Reconsume
4120 tok_cur_tag.public_identifier += c
4123 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# go to tok_state_after_doctype_public_identifier, append U+FFFD or the char to
# tok_cur_tag.public_identifier, or set 'force-quirks' and go to tok_state_data
# (one branch also steps cur back).
# NOTE(review): the branch guards sit on lines missing from this listing.
4124 tok_state_doctype_public_identifier_single_quoted = ->
4125 c = txt.charAt(cur++)
4127 tok_state = tok_state_after_doctype_public_identifier
4131 tok_cur_tag.public_identifier += "\ufffd"
4135 tok_cur_tag.flag 'force-quirks', true
4136 tok_state = tok_state_data
4140 tok_state = tok_state_data
4141 tok_cur_tag.flag 'force-quirks', true
4142 cur -= 1 # Reconsume
4145 tok_cur_tag.public_identifier += c
4148 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# State handler: takes one char from txt (post-incrementing cur). Whitespace
# moves to tok_state_between_doctype_public_and_system_identifiers. Other
# visible branches go to tok_state_data (one sets 'force-quirks' and steps cur
# back), reset tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state, or set 'force-quirks' and go to
# tok_state_bogus_doctype.
# NOTE(review): most branch guards sit on lines missing from this listing.
4149 tok_state_after_doctype_public_identifier = ->
4150 c = txt.charAt(cur++)
4151 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4152 tok_state = tok_state_between_doctype_public_and_system_identifiers
4155 tok_state = tok_state_data
4159 tok_cur_tag.system_identifier = ''
4160 tok_state = tok_state_doctype_system_identifier_double_quoted
4164 tok_cur_tag.system_identifier = ''
4165 tok_state = tok_state_doctype_system_identifier_single_quoted
4169 tok_state = tok_state_data
4170 tok_cur_tag.flag 'force-quirks', true
4171 cur -= 1 # Reconsume
4175 tok_cur_tag.flag 'force-quirks', true
4176 tok_state = tok_state_bogus_doctype
4179 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# State handler: takes one char from txt (post-incrementing cur). Visible
# branches go to tok_state_data (one sets 'force-quirks' and steps cur back),
# reset tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state, or set 'force-quirks' and go to
# tok_state_bogus_doctype.
# NOTE(review): most branch guards, and the body of the whitespace branch, sit
# on lines missing from this listing.
4180 tok_state_between_doctype_public_and_system_identifiers = ->
4181 c = txt.charAt(cur++)
4182 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4185 tok_state = tok_state_data
4189 tok_cur_tag.system_identifier = ''
4190 tok_state = tok_state_doctype_system_identifier_double_quoted
4194 tok_cur_tag.system_identifier = ''
4195 tok_state = tok_state_doctype_system_identifier_single_quoted
4199 tok_state = tok_state_data
4200 tok_cur_tag.flag 'force-quirks', true
4201 cur -= 1 # Reconsume
4205 tok_cur_tag.flag 'force-quirks', true
4206 tok_state = tok_state_bogus_doctype
4209 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# State handler: takes one char from txt (post-incrementing cur). Whitespace
# moves to tok_state_before_doctype_system_identifier. Other visible branches
# reset tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state, or set 'force-quirks' and go to
# tok_state_data (one also steps cur back) or tok_state_bogus_doctype.
# NOTE(review): most branch guards sit on lines missing from this listing.
4210 tok_state_after_doctype_system_keyword = ->
4211 c = txt.charAt(cur++)
4212 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4213 tok_state = tok_state_before_doctype_system_identifier
4217 tok_cur_tag.system_identifier = ''
4218 tok_state = tok_state_doctype_system_identifier_double_quoted
4222 tok_cur_tag.system_identifier = ''
4223 tok_state = tok_state_doctype_system_identifier_single_quoted
4227 tok_cur_tag.flag 'force-quirks', true
4228 tok_state = tok_state_data
4232 tok_state = tok_state_data
4233 tok_cur_tag.flag 'force-quirks', true
4234 cur -= 1 # Reconsume
4238 tok_cur_tag.flag 'force-quirks', true
4239 tok_state = tok_state_bogus_doctype
4242 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# State handler: takes one char from txt (post-incrementing cur). Visible
# branches reset tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state, or set 'force-quirks' and go to
# tok_state_data (one also steps cur back) or tok_state_bogus_doctype.
# NOTE(review): most branch guards, and the body of the whitespace branch, sit
# on lines missing from this listing.
4243 tok_state_before_doctype_system_identifier = ->
4244 c = txt.charAt(cur++)
4245 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4248 tok_cur_tag.system_identifier = ''
4249 tok_state = tok_state_doctype_system_identifier_double_quoted
4252 tok_cur_tag.system_identifier = ''
4253 tok_state = tok_state_doctype_system_identifier_single_quoted
4257 tok_cur_tag.flag 'force-quirks', true
4258 tok_state = tok_state_data
4262 tok_state = tok_state_data
4263 tok_cur_tag.flag 'force-quirks', true
4264 cur -= 1 # Reconsume
4268 tok_cur_tag.flag 'force-quirks', true
4269 tok_state = tok_state_bogus_doctype
4272 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# go to tok_state_after_doctype_system_identifier, append U+FFFD or the char to
# tok_cur_tag.system_identifier, or set 'force-quirks' and go to tok_state_data
# (one branch also steps cur back).
# NOTE(review): the branch guards sit on lines missing from this listing.
4273 tok_state_doctype_system_identifier_double_quoted = ->
4274 c = txt.charAt(cur++)
4276 tok_state = tok_state_after_doctype_system_identifier
4280 tok_cur_tag.system_identifier += "\ufffd"
4284 tok_cur_tag.flag 'force-quirks', true
4285 tok_state = tok_state_data
4289 tok_state = tok_state_data
4290 tok_cur_tag.flag 'force-quirks', true
4291 cur -= 1 # Reconsume
4294 tok_cur_tag.system_identifier += c
4297 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# State handler: takes one char from txt (post-incrementing cur). Visible outcomes:
# go to tok_state_after_doctype_system_identifier, append U+FFFD or the char to
# tok_cur_tag.system_identifier, or set 'force-quirks' and go to tok_state_data
# (one branch also steps cur back).
# NOTE(review): the branch guards sit on lines missing from this listing.
4298 tok_state_doctype_system_identifier_single_quoted = ->
4299 c = txt.charAt(cur++)
4301 tok_state = tok_state_after_doctype_system_identifier
4305 tok_cur_tag.system_identifier += "\ufffd"
4309 tok_cur_tag.flag 'force-quirks', true
4310 tok_state = tok_state_data
4314 tok_state = tok_state_data
4315 tok_cur_tag.flag 'force-quirks', true
4316 cur -= 1 # Reconsume
4319 tok_cur_tag.system_identifier += c
4322 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# State handler: takes one char from txt (post-incrementing cur). Visible
# branches go to tok_state_data (one sets 'force-quirks' and steps cur back).
# The final branch goes to tok_state_bogus_doctype and deliberately leaves
# 'force-quirks' untouched.
# NOTE(review): most branch guards, and the body of the whitespace branch, sit
# on lines missing from this listing.
4323 tok_state_after_doctype_system_identifier = ->
4324 c = txt.charAt(cur++)
4325 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4328 tok_state = tok_state_data
4332 tok_state = tok_state_data
4333 tok_cur_tag.flag 'force-quirks', true
4334 cur -= 1 # Reconsume
4338 # do _not_ tok_cur_tag.flag 'force-quirks', true
4339 tok_state = tok_state_bogus_doctype
4342 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# State handler: takes one char from txt (post-incrementing cur). Both visible
# branches return to tok_state_data; the second also steps cur back so the char
# is reconsumed.
# NOTE(review): the branch guards sit on lines missing from this listing.
4343 tok_state_bogus_doctype = ->
4344 c = txt.charAt(cur++)
4346 tok_state = tok_state_data
4349 tok_state = tok_state_data
4350 cur -= 1 # Reconsume
4355 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# State handler: returns to tok_state_data straight away, then looks for ']]>'
# at or after cur. val is either the rest of txt or the text up to that
# terminator, and is returned as one character token.
# NOTE(review): the guard choosing between the two substr calls, and any update
# of cur, sit on lines missing from this listing. `next_gt` here holds the
# index of ']]>', not of a lone '>'.
4356 tok_state_cdata_section = ->
4357 tok_state = tok_state_data
4358 next_gt = txt.indexOf ']]>', cur
4360 val = txt.substr cur
4363 val = txt.substr cur, (next_gt - cur)
4365 return new_character_token val # fixfull split
4367 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4368 # Don't set this as a state, just call it
4369 # returns a string (NOT a text node)
# Helper called by the state handlers after an '&'; it is not a state itself.
# Parameters:
#   allowed_char - extra char that, like whitespace, '<', '&' and end of input,
#                  means "not a character reference" (default null)
#   in_attr      - true when called from an attribute value state (default false)
# Visible logic, in order:
#   * numeric references: an 'x' after the next char is checked for, digits are
#     scanned against `charset`, leading zeros are stripped, and the number is
#     parsed with parseInt in `base`. unicode_fixes overrides take precedence;
#     surrogates and values above 0x10ffff are tested, then the long list of
#     control and noncharacter code points; the result comes from
#     from_code_point.
#   * named references ending in ';' are decoded with decode_named_char_ref.
#   * names without ';' are matched against legacy_char_refs, shortest first,
#     with special handling when the next char is '=' or alphanumeric.
# NOTE(review): many guards and return statements sit on lines missing from
# this listing, including what is returned when no reference is recognised and
# how in_attr is tested.
4370 parse_character_reference = (allowed_char = null, in_attr = false) ->
4371 if cur >= txt.length
4373 switch c = txt.charAt(cur)
4374 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4375 # explicitly not a parse error
4378 # there has to be "one or more" alnums between & and ; to be a parse error
4381 if cur + 1 >= txt.length
4383 if txt.charAt(cur + 1).toLowerCase() is 'x'
4392 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4397 if txt.charAt(start + i) is ';'
4401 code_point = txt.substr(start, i)
4402 while code_point.charAt(0) is '0' and code_point.length > 1
4403 code_point = code_point.substr 1
4404 code_point = parseInt(code_point, base)
4405 if unicode_fixes[code_point]?
4407 return unicode_fixes[code_point]
4409 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4413 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4415 return from_code_point code_point
4419 if alnum.indexOf(txt.charAt(cur + i)) is -1
4422 # exit early, because parse_error() below needs at least one alnum
4424 if txt.charAt(cur + i) is ';'
4425 i += 1 # include ';' terminator in value
4426 decoded = decode_named_char_ref txt.substr(cur, i)
4433 # no ';' terminator (only legacy char refs)
4435 for i in [2..max] # no prefix matches, so ok to check shortest first
4436 c = legacy_char_refs[txt.substr(cur, i)]
4439 if txt.charAt(cur + i) is '='
4440 # "because some legacy user agents will
4441 # misinterpret the markup in those cases"
4444 if alnum.indexOf(txt.charAt(cur + i)) > -1
4445 # this makes attributes forgiving about url args
4447 # ok, and besides the weird exceptions for attributes...
4448 # return the matching char
4449 cur += i # consume entity chars
4450 parse_error() # because no terminating ";"
4454 return # never reached
4456 # tree constructor initialization
4457 # see comments on TYPE_TAG/etc for the structure of this data
# Parser state set up before tokenizing starts: the root 'html' node, the
# active-formatting list, insertion-mode bookkeeping, tree-construction flags
# and element pointers. The tokenizer then starts in tok_state_data and txt is
# normalised.
# NOTE(review): a few assignments sit on lines missing from this listing.
4460 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4462 afe = [] # active formatting elements
4463 template_ins_modes = []
4464 ins_mode = ins_mode_initial
4465 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to true when args.scripting is null or undefined
4466 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4467 flag_frameset_ok = true
4469 flag_foster_parenting = false
4470 form_element_pointer = null
4471 temporary_buffer = null
4472 pending_table_character_tokens = []
4473 head_element_pointer = null
4474 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4475 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4477 # tokenizer initialization
4478 tok_state = tok_state_data
4480 # text pre-processing
4481 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# Every U+0000 becomes U+FFFD and CRLF / lone CR become LF before tokenizing,
# so the tokenizer states above never see a U+0000 or a CR in txt.
4482 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4483 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4484 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# NOTE(review): special case keyed on one test name; its body is not visible
# in this listing. Looks like leftover debugging; confirm before shipping.
4486 if args.name is "plain-text-unsafe.dat #4"
4489 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4494 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds a string in `serialized` by appending t.serialize(shallow, show_ids)
# for each item; shallow and show_ids are passed through unchanged.
# NOTE(review): the loop header, the initial value of `serialized` and the
# return sit on lines missing from this listing; presumably t iterates over els.
4497 serialize_els = (els, shallow, show_ids) ->
4503 serialized += t.serialize shallow, show_ids
# Public API: the parser entry point, two debug-log helpers, and the node-type
# and namespace constants, all attached to module.exports.
4506 module.exports.parse_html = parse_html
4507 module.exports.debug_log_reset = debug_log_reset
4508 module.exports.debug_log_each = debug_log_each
4509 module.exports.TYPE_TAG = TYPE_TAG
4510 module.exports.TYPE_TEXT = TYPE_TEXT
4511 module.exports.TYPE_COMMENT = TYPE_COMMENT
4512 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4513 module.exports.NS_HTML = NS_HTML
4514 module.exports.NS_MATHML = NS_MATHML
4515 module.exports.NS_SVG = NS_SVG