1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WTAG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
# The spec uses many different words to indicate which ends of lists/stacks
# it is talking about (and relative movement within the lists/stacks). This
# section explains them. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
# Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
92 debug_log_each = (cb) ->
93 for str in g_debug_log
98 constructor: (type, args = {}) ->
99 @type = type # one of the TYPE_* constants above
100 @name = args.name ? '' # tag name
101 @text = args.text ? '' # contents for text/comment nodes
102 @attrs = args.attrs ? {}
103 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104 @children = args.children ? []
105 @namespace = args.namespace ? NS_HTML
106 @parent = args.parent ? null
107 @token = args.token ? null
108 @flags = args.flags ? {}
112 @id = "#{++prev_node_id}"
113 acknowledge_self_closing: ->
115 @token.flag 'did_self_close'
117 @flag 'did_self_close', true
118 flag: (key, value = null) ->
123 serialize: (shallow = false, show_ids = false) -> # for unit tests
128 ret += JSON.stringify @name
143 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149 ret += c.serialize shallow, show_ids
153 ret += JSON.stringify @text
156 ret += JSON.stringify @text
158 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
161 when TYPE_AAA_BOOKMARK
162 ret += 'aaa_bookmark'
165 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
168 # helpers: (only take args that are normally known when parser creates nodes)
# Factory: start-tag token (TYPE_START_TAG). Only the tag name is supplied;
# every other Node field falls back to the constructor's default.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name: name})
# Factory: end-tag token (TYPE_END_TAG) for the given tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name: name})
# Factory: element node (TYPE_TAG) with the given tag name.
new_element = (name) ->
	new Node(TYPE_TAG, {name: name})
# Factory: text node (TYPE_TEXT) holding txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are built exactly like text nodes, so this is an alias.
new_character_token = new_text_node
# Factory: comment token (TYPE_COMMENT) whose text is txt.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Factory: doctype token (TYPE_DOCTYPE) with the given name.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name: name})
183 return new Node TYPE_EOF
185 return new Node TYPE_AFE_MARKER
# Factory: placeholder node of type TYPE_AAA_BOOKMARK.
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character classes, kept as plain strings so that membership can be tested
# with indexOf().
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# True only for a single upper-case ASCII letter.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	return uc_alpha.indexOf(str) isnt -1
# True only for a single lower-case ASCII letter.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	return lc_alpha.indexOf(str) isnt -1
200 # some SVG elements have dashes in them
201 tag_name_chars = alnum + "-"
203 # http://www.w3.org/TR/html5/infrastructure.html#space-character
204 space_chars = "\u0009\u000a\u000c\u000d\u0020"
206 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a text token whose text is exactly one character from
# space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	return space_chars.indexOf(t.text) isnt -1
210 is_input_hidden_tok = (t) ->
211 return false unless t.type is TYPE_START_TAG
214 if a[1].toLowerCase() is 'hidden'
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
255 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element-name lists, written as space-separated words and split into arrays.
void_elements = 'area base br col embed hr img input keygen link meta param source track wbr'.split ' '
raw_text_elements = 'script style'.split ' '
escapable_raw_text_elements = 'textarea title'.split ' '
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
280 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
299 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305 'determinant', 'diff', 'divergence', 'divide', 'domain',
306 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326 'vectorproduct', 'xor'
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
333 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
344 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
345 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
346 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
347 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
348 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
349 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
350 wbr:NS_HTML, xmp:NS_HTML,
353 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
354 'annotation-xml':NS_MATHML,
357 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
360 formatting_elements = {
361 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
362 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
366 mathml_text_integration = {
367 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el's name is a key of mathml_text_integration and the namespace
# stored there is el's own namespace.
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	return expected_ns is el.namespace
371 is_html_integration = (el) -> # DON'T PASS A TOKEN
372 if el.namespace is NS_MATHML
373 if el.name is 'annotation-xml'
374 if el.attrs.encoding?
375 if el.attrs.encoding.toLowerCase() is 'text/html'
377 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
380 if el.namespace is NS_SVG
381 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
386 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
389 foster_parenting_targets = {
# True when e's tag name is listed in special_elements under e's own namespace.
el_is_special = (e) ->
	listed_ns = special_elements[e.name]
	return listed_ns is e.namespace
# address, div and p: the entries that el_is_special_not_adp() excludes.
adp_els = {address: NS_HTML, div: NS_HTML, p: NS_HTML}
# True when el is listed in special_elements (name and namespace both match)
# and is not one of the adp_els entries.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	return adp_els[el.name] isnt el.namespace
419 altglyphdef: 'altGlyphDef'
420 altglyphitem: 'altGlyphItem'
421 animatecolor: 'animateColor'
422 animatemotion: 'animateMotion'
423 animatetransform: 'animateTransform'
426 fecolormatrix: 'feColorMatrix'
427 fecomponenttransfer: 'feComponentTransfer'
428 fecomposite: 'feComposite'
429 feconvolvematrix: 'feConvolveMatrix'
430 fediffuselighting: 'feDiffuseLighting'
431 fedisplacementmap: 'feDisplacementMap'
432 fedistantlight: 'feDistantLight'
433 fedropshadow: 'feDropShadow'
439 fegaussianblur: 'feGaussianBlur'
442 femergenode: 'feMergeNode'
443 femorphology: 'feMorphology'
445 fepointlight: 'fePointLight'
446 fespecularlighting: 'feSpecularLighting'
447 fespotlight: 'feSpotLight'
449 feturbulence: 'feTurbulence'
450 foreignobject: 'foreignObject'
452 lineargradient: 'linearGradient'
453 radialgradient: 'radialGradient'
456 svg_attribute_fixes = {
457 attributename: 'attributeName'
458 attributetype: 'attributeType'
459 basefrequency: 'baseFrequency'
460 baseprofile: 'baseProfile'
462 clippathunits: 'clipPathUnits'
463 contentscripttype: 'contentScriptType'
464 contentstyletype: 'contentStyleType'
465 diffuseconstant: 'diffuseConstant'
467 externalresourcesrequired: 'externalResourcesRequired'
468 filterres: 'filterRes'
469 filterunits: 'filterUnits'
471 gradienttransform: 'gradientTransform'
472 gradientunits: 'gradientUnits'
473 kernelmatrix: 'kernelMatrix'
474 kernelunitlength: 'kernelUnitLength'
475 keypoints: 'keyPoints'
476 keysplines: 'keySplines'
478 lengthadjust: 'lengthAdjust'
479 limitingconeangle: 'limitingConeAngle'
480 markerheight: 'markerHeight'
481 markerunits: 'markerUnits'
482 markerwidth: 'markerWidth'
483 maskcontentunits: 'maskContentUnits'
484 maskunits: 'maskUnits'
485 numoctaves: 'numOctaves'
486 pathlength: 'pathLength'
487 patterncontentunits: 'patternContentUnits'
488 patterntransform: 'patternTransform'
489 patternunits: 'patternUnits'
490 pointsatx: 'pointsAtX'
491 pointsaty: 'pointsAtY'
492 pointsatz: 'pointsAtZ'
493 preservealpha: 'preserveAlpha'
494 preserveaspectratio: 'preserveAspectRatio'
495 primitiveunits: 'primitiveUnits'
498 repeatcount: 'repeatCount'
499 repeatdur: 'repeatDur'
500 requiredextensions: 'requiredExtensions'
501 requiredfeatures: 'requiredFeatures'
502 specularconstant: 'specularConstant'
503 specularexponent: 'specularExponent'
504 spreadmethod: 'spreadMethod'
505 startoffset: 'startOffset'
506 stddeviation: 'stdDeviation'
507 stitchtiles: 'stitchTiles'
508 surfacescale: 'surfaceScale'
509 systemlanguage: 'systemLanguage'
510 tablevalues: 'tableValues'
513 textlength: 'textLength'
515 viewtarget: 'viewTarget'
516 xchannelselector: 'xChannelSelector'
517 ychannelselector: 'yChannelSelector'
518 zoomandpan: 'zoomAndPan'
520 adjust_mathml_attributes = (t) ->
522 if a[0] is 'definitionurl'
523 a[0] = 'definitionURL'
525 adjust_svg_attributes = (t) ->
527 if svg_attribute_fixes[a[0]]?
528 a[0] = svg_attribute_fixes[a[0]]
530 adjust_foreign_attributes = (t) ->
534 # decode_named_char_ref()
536 # The list of named character references is _huge_ so ask the browser to decode
537 # for us instead of wasting bandwidth/space on including the table here.
539 # Pass without the "&" but with the ";" examples:
#    for "&amp;" pass "amp;"
541 # for "′" pass "x2032;"
544 textarea: document.createElement('textarea')
546 # TODO test this in IE8
547 decode_named_char_ref = (txt) ->
549 decoded = g_dncr.cache[txt]
550 return decoded if decoded?
551 g_dncr.textarea.innerHTML = txt
552 decoded = g_dncr.textarea.value
553 return null if decoded is txt
554 return g_dncr.cache[txt] = decoded
556 parse_html = (args) ->
558 cur = null # index of next char in txt to be parsed
559 # declare doc and tokenizer variables so they're in scope below
561 open_els = null # stack of open elements
562 afe = null # active formatting elements
563 template_ins_modes = null
565 original_ins_mode = null
567 tok_cur_tag = null # partially parsed tag
568 flag_scripting = null
569 flag_frameset_ok = null
571 flag_foster_parenting = null
572 form_element_pointer = null
573 temporary_buffer = null
574 pending_table_character_tokens = null
575 head_element_pointer = null
576 flag_fragment_parsing = null
577 context_element = null
586 console.log "Parse error at character #{cur} of #{txt.length}"
588 afe_push = (new_el) ->
591 if el.name is new_el.name and el.namespace is new_el.namespace
593 continue unless new_el.attrs[k] is v
594 for k, v of new_el.attrs
595 continue unless el.attrs[k] is v
602 afe.unshift new_afe_marker()
# the functions below implement the Tree Construction algorithm
605 # http://www.w3.org/TR/html5/syntax.html#tree-construction
607 # But first... the helpers
608 template_tag_is_open = ->
610 if t.name is 'template' and t.namespace is NS_HTML
613 is_in_scope_x = (tag_name, scope, namespace) ->
615 if t.name is tag_name and (namespace is null or namespace is t.namespace)
617 if scope[t.name] is t.namespace
620 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
622 if t.name is tag_name and (namespace is null or namespace is t.namespace)
624 if scope[t.name] is t.namespace
626 if scope2[t.name] is t.namespace
630 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
631 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
632 template: NS_HTML, mi: NS_MATHML,
634 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
635 'annotation-xml': NS_MATHML,
637 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope tables (element name -> namespace) that the is_in_*_scope helpers
# pass to is_in_scope_x / is_in_scope_x_y.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check against the standard scoping elements. Delegates to
# is_in_scope_x; a null namespace is forwarded unchanged.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check against the standard scoping elements plus button_scopers.
# Delegates to is_in_scope_x_y.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check that uses only table_scopers as the scoping set. Delegates to
# is_in_scope_x.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope check against the standard scoping elements plus li_scopers.
# Delegates to is_in_scope_x_y.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
651 is_in_select_scope = (tag_name, namespace = null) ->
653 if t.name is tag_name and (namespace is null or namespace is t.namespace)
655 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
658 # this checks for a particular element, not by name
659 # this requires a namespace match
660 el_is_in_scope = (needle) ->
664 if standard_scopers[el.name] is el.namespace
668 clear_to_table_stopers = {
673 clear_stack_to_table_context = ->
675 if clear_to_table_stopers[open_els[0].name]?
679 clear_to_table_body_stopers = {
686 clear_stack_to_table_body_context = ->
688 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
692 clear_to_table_row_stopers = {
697 clear_stack_to_table_row_context = ->
699 if clear_to_table_row_stopers[open_els[0].name]?
703 clear_afe_to_marker = ->
705 return unless afe.length > 0 # this happens in fragment case, ?spec error
707 if el.type is TYPE_AFE_MARKER
712 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
714 # 1. Let last be false.
716 # 2. Let node be the last node in the stack of open elements.
718 node = open_els[node_i]
719 # 3. Loop: If node is the first node in the stack of open elements,
720 # then set last to true, and, if the parser was originally created as
721 # part of the HTML fragment parsing algorithm (fragment case) set node
722 # to the context element.
724 if node_i is open_els.length - 1
726 # fixfull (fragment case)
728 # 4. If node is a select element, run these substeps:
729 if node.name is 'select' and node.namespace is NS_HTML
730 # 1. If last is true, jump to the step below labeled done.
732 # 2. Let ancestor be node.
735 # 3. Loop: If ancestor is the first node in the stack of
736 # open elements, jump to the step below labeled done.
738 if ancestor_i is open_els.length - 1
740 # 4. Let ancestor be the node before ancestor in the stack
743 ancestor = open_els[ancestor_i]
744 # 5. If ancestor is a template node, jump to the step below
746 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
748 # 6. If ancestor is a table node, switch the insertion mode
749 # to "in select in table" and abort these steps.
750 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
751 ins_mode = ins_mode_in_select_in_table
753 # 7. Jump back to the step labeled loop.
754 # 8. Done: Switch the insertion mode to "in select" and abort
756 ins_mode = ins_mode_in_select
758 # 5. If node is a td or th element and last is false, then switch
759 # the insertion mode to "in cell" and abort these steps.
760 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
761 ins_mode = ins_mode_in_cell
763 # 6. If node is a tr element, then switch the insertion mode to "in
764 # row" and abort these steps.
765 if node.name is 'tr' and node.namespace is NS_HTML
766 ins_mode = ins_mode_in_row
768 # 7. If node is a tbody, thead, or tfoot element, then switch the
769 # insertion mode to "in table body" and abort these steps.
770 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
771 ins_mode = ins_mode_in_table_body
773 # 8. If node is a caption element, then switch the insertion mode
774 # to "in caption" and abort these steps.
775 if node.name is 'caption' and node.namespace is NS_HTML
776 ins_mode = ins_mode_in_caption
778 # 9. If node is a colgroup element, then switch the insertion mode
779 # to "in column group" and abort these steps.
780 if node.name is 'colgroup' and node.namespace is NS_HTML
781 ins_mode = ins_mode_in_column_group
783 # 10. If node is a table element, then switch the insertion mode to
784 # "in table" and abort these steps.
785 if node.name is 'table' and node.namespace is NS_HTML
786 ins_mode = ins_mode_in_table
788 # 11. If node is a template element, then switch the insertion mode
789 # to the current template insertion mode and abort these steps.
790 if node.name is 'template' and node.namespace is NS_HTML
791 ins_mode = template_ins_modes[0]
793 # 12. If node is a head element and last is true, then switch the
794 # insertion mode to "in body" ("in body"! not "in head"!) and abort
795 # these steps. (fragment case)
796 if node.name is 'head' and node.namespace is NS_HTML and last
797 ins_mode = ins_mode_in_body
799 # 13. If node is a head element and last is false, then switch the
800 # insertion mode to "in head" and abort these steps.
801 if node.name is 'head' and node.namespace is NS_HTML and last is false
802 ins_mode = ins_mode_in_head
804 # 14. If node is a body element, then switch the insertion mode to
805 # "in body" and abort these steps.
806 if node.name is 'body' and node.namespace is NS_HTML
807 ins_mode = ins_mode_in_body
809 # 15. If node is a frameset element, then switch the insertion mode
810 # to "in frameset" and abort these steps. (fragment case)
811 if node.name is 'frameset' and node.namespace is NS_HTML
812 ins_mode = ins_mode_in_frameset
814 # 16. If node is an html element, run these substeps:
815 if node.name is 'html' and node.namespace is NS_HTML
816 # 1. If the head element pointer is null, switch the insertion
817 # mode to "before head" and abort these steps. (fragment case)
818 if head_element_pointer is null
819 ins_mode = ins_mode_before_head
821 # 2. Otherwise, the head element pointer is not null,
822 # switch the insertion mode to "after head" and abort these
824 ins_mode = ins_mode_after_head
826 # 17. If last is true, then switch the insertion mode to "in body"
827 # and abort these steps. (fragment case)
829 ins_mode = ins_mode_in_body
831 # 18. Let node now be the node before node in the stack of open
834 node = open_els[node_i]
835 # 19. Return to the step labeled loop.
839 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
840 adjusted_current_node = ->
841 if open_els.length is 1 and flag_fragment_parsing
842 return context_element
845 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
846 # this implementation is structured (mostly) as described at the link above.
847 # capitalized comments are the "labels" described at the link above.
849 return if afe.length is 0
850 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
855 if i is afe.length - 1
858 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
863 el = insert_html_element afe[i].token
868 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
869 # adoption agency algorithm
871 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
872 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
873 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
874 adoption_agency = (subject) ->
875 debug_log "adoption_agency()"
876 debug_log "tree: #{serialize_els doc.children, false, true}"
877 debug_log "open_els: #{serialize_els open_els, true, true}"
878 debug_log "afe: #{serialize_els afe, true, true}"
879 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
882 # remove it from the list of active formatting elements (if found)
887 debug_log "aaa: starting off with subject on top of stack, exiting"
894 # 5. Let formatting element be the last element in the list of
895 # active formatting elements that: is between the end of the list
896 # and the last scope marker in the list, if any, or the start of
897 # the list otherwise, and has the tag name subject.
899 for t, fe_of_afe in afe
900 if t.type is TYPE_AFE_MARKER
905 # If there is no such element, then abort these steps and instead
906 # act as described in the "any other end tag" entry above.
908 debug_log "aaa: fe not found in afe"
909 in_body_any_other_end_tag subject
911 # 6. If formatting element is not in the stack of open elements,
912 # then this is a parse error; remove the element from the list, and
915 for t, fe_of_open_els in open_els
920 debug_log "aaa: fe not found in open_els"
922 # "remove it from the list" must mean afe, since it's not in open_els
923 afe.splice fe_of_afe, 1
925 # 7. If formatting element is in the stack of open elements, but
926 # the element is not in scope, then this is a parse error; abort
928 unless el_is_in_scope fe
929 debug_log "aaa: fe not in scope"
932 # 8. If formatting element is not the current node, this is a parse
933 # error. (But do not abort these steps.)
934 unless open_els[0] is fe
937 # 9. Let furthest block be the topmost node in the stack of open
938 # elements that is lower in the stack than formatting element, and
939 # is an element in the special category. There might not be one.
941 fb_of_open_els = null
948 # and continue, to see if there's one that's more "topmost"
949 # 10. If there is no furthest block, then the UA must first pop all
950 # the nodes from the bottom of the stack of open elements, from the
951 # current node up to and including formatting element, then remove
952 # formatting element from the list of active formatting elements,
953 # and finally abort these steps.
955 debug_log "aaa: no fb"
959 afe.splice fe_of_afe, 1
961 # 11. Let common ancestor be the element immediately above
962 # formatting element in the stack of open elements.
963 ca = open_els[fe_of_open_els + 1] # common ancestor
965 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
966 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
967 bookmark = new_aaa_bookmark()
970 afe.splice i, 0, bookmark
972 node = last_node = fb
976 # 3. Let node be the element immediately above node in the
977 # stack of open elements, or if node is no longer in the stack
978 # of open elements (e.g. because it got removed by this
979 # algorithm), the element that was immediately above node in
980 # the stack of open elements before node was removed.
984 node_next = open_els[i + 1]
986 node = node_next ? node_above
987 debug_log "inner loop #{inner}"
988 debug_log "tree: #{serialize_els doc.children, false, true}"
989 debug_log "open_els: #{serialize_els open_els, true, true}"
990 debug_log "afe: #{serialize_els afe, true, true}"
991 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
992 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
993 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
994 debug_log "node: #{node.serialize true, true}"
995 # TODO make sure node_above gets re-set if/when node is removed from open_els
997 # 4. If node is formatting element, then go to the next step in
998 # the overall algorithm.
1001 debug_log "the meat"
1002 # 5. If inner loop counter is greater than three and node is in
1003 # the list of active formatting elements, then remove node from
1004 # the list of active formatting elements.
1010 debug_log "max out inner"
1015 # 6. If node is not in the list of active formatting elements,
1016 # then remove node from the stack of open elements and then go
1017 # back to the step labeled inner loop.
1019 debug_log "not in afe"
1020 for t, i in open_els
1022 node_above = open_els[i + 1]
1023 open_els.splice i, 1
1026 debug_log "the bones"
1027 # 7. create an element for the token for which the element node
1028 # was created, in the HTML namespace, with common ancestor as
1029 # the intended parent; replace the entry for node in the list
1030 # of active formatting elements with an entry for the new
1031 # element, replace the entry for node in the stack of open
1032 # elements with an entry for the new element, and let node be
1034 new_node = token_to_element node.token, NS_HTML, ca
1038 debug_log "replaced in afe"
1040 for t, i in open_els
1042 node_above = open_els[i + 1]
1043 open_els[i] = new_node
1044 debug_log "replaced in open_els"
1047 # 8. If last node is furthest block, then move the
1048 # aforementioned bookmark to be immediately after the new node
1049 # in the list of active formatting elements.
1054 debug_log "removed bookmark"
1058 # "after" means lower
1059 afe.splice i, 0, bookmark # "after as <-
1060 debug_log "placed bookmark after node"
1061 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1063 # 9. Insert last node into node, first removing it from its
1064 # previous parent node if any.
1065 if last_node.parent?
1066 debug_log "last_node has parent"
1067 for c, i in last_node.parent.children
1069 debug_log "removing last_node from parent"
1070 last_node.parent.children.splice i, 1
1072 node.children.push last_node
1073 last_node.parent = node
1074 # 10. Let last node be node.
1077 # 11. Return to the step labeled inner loop.
1078 # 14. Insert whatever last node ended up being in the previous step
1079 # at the appropriate place for inserting a node, but using common
1080 # ancestor as the override target.
1082 # In the case where fe is immediately followed by fb:
1083 # * inner loop exits out early (node==fe)
1085 # * last_node is still in the tree (not a duplicate)
1086 if last_node.parent?
1087 debug_log "FEFIRST? last_node has parent"
1088 for c, i in last_node.parent.children
1090 debug_log "removing last_node from parent"
1091 last_node.parent.children.splice i, 1
1094 debug_log "after aaa inner loop"
1095 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1096 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1097 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1098 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1099 debug_log "tree: #{serialize_els doc.children, false, true}"
1104 # can't use standard insert token thing, because it's already in
1105 # open_els and must stay at its current position in open_els
1106 dest = adjusted_insertion_location ca
1107 dest[0].children.splice dest[1], 0, last_node
1108 last_node.parent = dest[0]
1111 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1112 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1113 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1114 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1115 debug_log "tree: #{serialize_els doc.children, false, true}"
1117 # 15. Create an element for the token for which formatting element
1118 # was created, in the HTML namespace, with furthest block as the
1120 new_element = token_to_element fe.token, NS_HTML, fb
1121 # 16. Take all of the child nodes of furthest block and append them
1122 # to the element created in the last step.
1123 while fb.children.length
1124 t = fb.children.shift()
1125 t.parent = new_element
1126 new_element.children.push t
1127 # 17. Append that new element to furthest block.
1128 new_element.parent = fb
1129 fb.children.push new_element
1130 # 18. Remove formatting element from the list of active formatting
1131 # elements, and insert the new element into the list of active
1132 # formatting elements at the position of the aforementioned
1140 afe[i] = new_element
1142 # 19. Remove formatting element from the stack of open elements,
1143 # and insert the new element into the stack of open elements
1144 # immediately below the position of furthest block in that stack.
1145 for t, i in open_els
1147 open_els.splice i, 1
1149 for t, i in open_els
1151 open_els.splice i, 0, new_element
1153 # 20. Jump back to the step labeled outer loop.
1154 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1155 debug_log "tree: #{serialize_els doc.children, false, true}"
1156 debug_log "open_els: #{serialize_els open_els, true, true}"
1157 debug_log "afe: #{serialize_els afe, true, true}"
1158 debug_log "AAA DONE"
# http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Generates implied end tags (all but p), calls parse_error() when the current
# node is then not an HTML p element, and pops the stack of open elements up
# to and including the first HTML p element it meets.
close_p_element = ->
  generate_implied_end_tags 'p' # 'p' is the exception: it is not auto-closed
  current = open_els[0]
  parse_error() if current.name isnt 'p' or current.namespace isnt NS_HTML
  # just in case: the last entry on the stack is never popped
  while open_els.length > 1
    popped = open_els.shift()
    return if popped.name is 'p' and popped.namespace is NS_HTML
# Runs close_p_element() only when is_in_button_scope reports an HTML p
# element; otherwise does nothing.
close_p_if_in_button_scope = ->
  close_p_element() if is_in_button_scope 'p', NS_HTML
# http://www.w3.org/TR/html5/syntax.html#insert-a-character
# aka insert_a_character = (t) ->
# Inserts text token t at the adjusted insertion location. When the node just
# before that location is a text node, t.text is appended to it instead of
# adding a new child.
insert_character = (t) ->
  [parent, index] = adjusted_insertion_location()
  # fixfull check for Document node
  siblings = parent.children
  if index > 0 and siblings[index - 1].type is TYPE_TEXT
    # merge with the preceding text node
    siblings[index - 1].text += t.text
  else
    siblings.splice index, 0, t
  return
# 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree construction dispatcher. Hands token t to the current insertion mode
# (ins_mode) when any of the spec's "HTML content" conditions holds for the
# adjusted current node, and to in_foreign_content otherwise.
process_token = (t) ->
  acn = adjusted_current_node()
  # no adjusted current node
  unless acn?
    ins_mode t
    return
  # adjusted current node is an element in the HTML namespace
  if acn.namespace is NS_HTML
    ins_mode t
    return
  if is_mathml_text_integration_point(acn)
    # spec: "a start tag whose tag name is neither mglyph nor malignmark"
    # (those two stay in foreign content)
    if t.type is TYPE_START_TAG and t.name isnt 'mglyph' and t.name isnt 'malignmark'
      ins_mode t
      return
    if t.type is TYPE_TEXT
      ins_mode t
      return
  # svg start tag directly inside a MathML annotation-xml element
  if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
    ins_mode t
    return
  if is_html_integration acn
    if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
      ins_mode t
      return
  if t.type is TYPE_EOF
    ins_mode t
    return
  # everything else follows the rules for foreign content
  in_foreign_content t
  return
# http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
# http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
#
# Returns [node, index]: the node to insert into and the position within
# node.children at which to insert.
adjusted_insertion_location = (override_target = null) ->
  # 1. If there was an override target specified, then let target be the
  # override target. Otherwise, let target be the current node.
  target = override_target ? open_els[0]

  # 2. Determine the adjusted insertion location using the first matching
  # steps from the following list:
  #
  # Unless foster parenting is enabled and target is a table, tbody, tfoot,
  # thead, or tr element: let adjusted insertion location be inside target,
  # after its last child (if any).
  unless flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
    return [target, target.children.length]

  # Foster parenting happens when content is misnested in tables.

  # index in open_els of the lowest (most recently added) HTML element with
  # the given tag name, or null when there is none
  lowest_open = (tag) ->
    for el, idx in open_els
      return idx if el.name is tag and el.namespace is NS_HTML
    null

  # 1. Let last template be the last template element in the stack of open
  # elements, if any.
  template_i = lowest_open 'template'
  # 2. Let last table be the last table element in the stack of open
  # elements, if any.
  table_i = lowest_open 'table'

  # 3. If there is a last template and either there is no last table, or
  # there is one, but last template is lower (more recently added) than last
  # table in the stack of open elements, then: let adjusted insertion
  # location be inside last template's template contents, after its last
  # child (if any), and abort these substeps.
  if template_i? and (not table_i? or template_i < table_i)
    last_template = open_els[template_i] # fixfull should be its contents
    return [last_template, last_template.children.length]

  # 4. If there is no last table, then let adjusted insertion location be
  # inside the first element in the stack of open elements (the html
  # element), after its last child (if any), and abort these substeps.
  # (fragment case)
  unless table_i?
    root = open_els[open_els.length - 1]
    return [root, root.children.length]

  # 5. If last table has a parent element, then let adjusted insertion
  # location be inside last table's parent element, immediately before last
  # table, and abort these substeps.
  last_table = open_els[table_i]
  if last_table.parent?
    for child, at in last_table.parent.children
      return [last_table.parent, at] if child is last_table
    # last table is missing from its parent's children: target is left as it
    # was and no index was ever chosen
    return [target, undefined]

  # 6. Let previous element be the element immediately above last table in
  # the stack of open elements.
  #
  # huh? how could it not have a parent?
  previous_element = open_els[table_i + 1]
  # 7. Let adjusted insertion location be inside previous element, after its
  # last child (if any).
  # Note: These steps are involved in part because it's possible for
  # elements, the table element in this case in particular, to have been
  # moved by a script around in the DOM, or indeed removed from the DOM
  # entirely, after the element was inserted by the parser.

  # 3. If the adjusted insertion location is inside a template element, let
  # it instead be inside the template element's template contents, after its
  # last child (if any).
  # fixfull (template)

  # 4. Return the adjusted insertion location.
  [previous_element, previous_element.children.length]
# http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
# aka create_an_element_for_token
#
# Builds a TYPE_TAG Node for start tag token t in the given namespace and
# returns it. The node keeps a reference to t (token: t).
# intended_parent is accepted for parity with the spec; nothing visible in
# this function reads it.
token_to_element = (t, namespace, intended_parent) ->
  # convert attributes into a hash
  attrs = {}
  for a in t.attrs_a
    # Duplicate attribute names: the first occurrence wins, matching the
    # spec's tokenizer rule that later duplicates are dropped. The
    # own-property check keeps names such as "constructor" or "toString"
    # from being mistaken for duplicates.
    attrs[a[0]] = a[1] unless Object::hasOwnProperty.call attrs, a[0]
  el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t

  # TODO 2. If the newly created element has an xmlns attribute in the
  # XMLNS namespace whose value is not exactly the same as the element's
  # namespace, that is a parse error. Similarly, if the newly created
  # element has an xmlns:xlink attribute in the XMLNS namespace whose
  # value is not the XLink Namespace, that is a parse error.

  # fixfull: the spec says stuff about form pointers and ownerDocument
  return el
# http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for token in the given namespace, inserts it at the
# adjusted insertion location, pushes it onto the stack of open elements
# (it becomes open_els[0]) and returns it.
insert_foreign_element = (token, namespace) ->
  [parent, index] = adjusted_insertion_location()
  el = token_to_element token, namespace, parent
  # TODO skip this next step if it's broken (eg parent is document with child already)
  el.parent = parent
  parent.children.splice index, 0, el
  open_els.unshift el
  el
# http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Same as insert_foreign_element, with the namespace fixed to NS_HTML.
# Returns whatever insert_foreign_element returns.
insert_html_element = (token) -> insert_foreign_element token, NS_HTML
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# position should be [node, index_within_children]; when it is omitted (or
# null) the adjusted insertion location is used.
insert_comment = (t, position = null) ->
  [parent, index] = position ? adjusted_insertion_location()
  parent.children.splice index, 0, t
  return
# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an HTML element for t, switches the tokenizer to the RAWTEXT state,
# remembers the current insertion mode in original_ins_mode and then switches
# the insertion mode to "text".
parse_generic_raw_text = (t) ->
  insert_html_element t
  original_ins_mode = ins_mode # must be saved before ins_mode is replaced
  tok_state = tok_state_rawtext
  ins_mode = ins_mode_text
# Generic RCDATA element parsing algorithm: same steps as
# parse_generic_raw_text, except the tokenizer is switched to the RCDATA state.
parse_generic_rcdata_text = (t) ->
  insert_html_element t
  original_ins_mode = ins_mode # must be saved before ins_mode is replaced
  tok_state = tok_state_rcdata
  ins_mode = ins_mode_text
# 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
# http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Pops the current node for as long as it is listed in end_tag_implied (for
# its own namespace) and its name differs from `except`.
generate_implied_end_tags = (except = null) ->
  closes_implicitly = (el) ->
    end_tag_implied[el.name] is el.namespace and el.name isnt except
  open_els.shift() while closes_implicitly open_els[0]
1367 # 8.2.5.4 The rules for parsing tokens in HTML content
1368 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
# 8.2.5.4.1 The "initial" insertion mode
# http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Whitespace is ignored, comments and doctypes become children of doc, and
# any other token is reprocessed in the "before html" mode.
ins_mode_initial = (t) ->
  return if is_space_tok t
  switch t.type
    when TYPE_COMMENT
      # ?fixfull
      doc.children.push t
    when TYPE_DOCTYPE
      # FIXME check identifiers, set quirks, etc
      doc.children.push t
      ins_mode = ins_mode_before_html
    else
      # Anything else
      #fixfull (iframe, quirks)
      ins_mode = ins_mode_before_html
      process_token t # reprocess in the new mode
  return
# 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Creates the root html element, either from an explicit <html> start tag or
# from a synthesized one, and moves on to the "before head" mode.
ins_mode_before_html = (t) ->
  # create an html element for tok, append it to doc and open it
  open_root = (tok) ->
    root = token_to_element tok, NS_HTML, doc
    doc.children.push root
    open_els.unshift root
    return

  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if t.type is TYPE_COMMENT
    doc.children.push t
    return
  return if is_space_tok t
  if t.type is TYPE_START_TAG and t.name is 'html'
    open_root t
    # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
    ins_mode = ins_mode_before_head
    return
  # end tags other than head/body/html/br are errors and are ignored; those
  # four fall through to "anything else"
  if t.type is TYPE_END_TAG and t.name not in ['head', 'body', 'html', 'br']
    parse_error()
    return
  # Anything else
  open_root new_open_tag 'html'
  # ?fixfull browsing context
  ins_mode = ins_mode_before_head
  process_token t # reprocess in the new mode
  return
# 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Opens the head element (explicit or synthesized), records it in
# head_element_pointer and switches to the "in head" mode.
ins_mode_before_head = (t) ->
  return if is_space_tok t
  switch t.type
    when TYPE_COMMENT
      insert_comment t
      return
    when TYPE_DOCTYPE
      parse_error()
      return
    when TYPE_START_TAG
      if t.name is 'html'
        ins_mode_in_body t
        return
      if t.name is 'head'
        head_element_pointer = insert_html_element t
        ins_mode = ins_mode_in_head
        return
    when TYPE_END_TAG
      # head/body/html/br fall through to "Anything else" below
      unless t.name in ['head', 'body', 'html', 'br']
        parse_error()
        return
  # Anything else
  head_element_pointer = insert_html_element new_open_tag 'head'
  ins_mode = ins_mode_in_head
  process_token t # reprocess in the new mode
  return
# 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" mode, factored out so several
# branches can share it: pop the current node, switch to "after head" and
# reprocess the token there.
ins_mode_in_head_else = (t) ->
  open_els.shift() # spec says this will be a 'head' node
  ins_mode = ins_mode_after_head
  process_token t
# The "in head" insertion mode: handles token t while the head element is open.
ins_mode_in_head = (t) ->
  if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
    insert_character t
    return
  if t.type is TYPE_COMMENT
    insert_comment t
    return
  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if t.type is TYPE_START_TAG and t.name is 'html'
    ins_mode_in_body t
    return
  if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
    # void element: insert it, then immediately pop it off the stack
    insert_html_element t
    open_els.shift()
    t.acknowledge_self_closing()
    return
  if t.type is TYPE_START_TAG and t.name is 'meta'
    insert_html_element t
    open_els.shift()
    t.acknowledge_self_closing()
    # fixfull encoding stuff
    return
  if t.type is TYPE_START_TAG and t.name is 'title'
    parse_generic_rcdata_text t
    return
  if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
    parse_generic_raw_text t
    return
  if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
    insert_html_element t
    ins_mode = ins_mode_in_head_noscript
    return
  if t.type is TYPE_START_TAG and t.name is 'script'
    ail = adjusted_insertion_location()
    # the intended parent is the node half of the [node, index] pair
    el = token_to_element t, NS_HTML, ail[0]
    el.flag 'parser-inserted', true
    # fixfull fragment case
    ail[0].children.splice ail[1], 0, el
    open_els.unshift el
    tok_state = tok_state_script_data
    original_ins_mode = ins_mode # make sure orig... is defined
    ins_mode = ins_mode_text
    return
  if t.type is TYPE_END_TAG and t.name is 'head'
    open_els.shift() # will be a head element... spec says so
    ins_mode = ins_mode_after_head
    return
  if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
    ins_mode_in_head_else t
    return
  if t.type is TYPE_START_TAG and t.name is 'template'
    insert_html_element t
    # NOTE(review): marker helper is defined outside this block; name
    # assumed to be afe_push_marker -- confirm
    afe_push_marker()
    flag_frameset_ok = false
    ins_mode = ins_mode_in_template
    template_ins_modes.unshift ins_mode_in_template
    return
  if t.type is TYPE_END_TAG and t.name is 'template'
    if template_tag_is_open()
      # the parentheses matter: a bare `generate_implied_end_tags` only
      # names the function and never calls it
      generate_implied_end_tags()
      if open_els[0].name isnt 'template'
        parse_error()
      # pop up to and including the template element
      loop
        el = open_els.shift()
        if el.name is 'template' and el.namespace is NS_HTML
          break
      clear_afe_to_marker()
      template_ins_modes.shift()
      # NOTE(review): "reset the insertion mode appropriately"; helper is
      # defined outside this block, name assumed to be reset_ins_mode -- confirm
      reset_ins_mode()
    else
      parse_error()
    return
  if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
    parse_error()
    return
  # Anything else
  ins_mode_in_head_else t
  return
# 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" mode: parse error, pop the
# current node, switch back to "in head" and reprocess the token there.
ins_mode_in_head_noscript_else = (t) ->
  parse_error()
  open_els.shift()
  ins_mode = ins_mode_in_head
  process_token t
# The "in head noscript" insertion mode: handles token t while a noscript
# element is open inside head (only entered when flag_scripting is false).
ins_mode_in_head_noscript = (t) ->
  start = t.type is TYPE_START_TAG
  end = t.type is TYPE_END_TAG
  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if start and t.name is 'html'
    ins_mode_in_body t
    return
  if end and t.name is 'noscript'
    open_els.shift()
    ins_mode = ins_mode_in_head
    return
  # whitespace, comments and these start tags are handled as "in head"
  if is_space_tok(t) or t.type is TYPE_COMMENT or (start and t.name in ['basefont', 'bgsound', 'link', 'meta', 'noframes', 'style'])
    ins_mode_in_head t
    return
  if end and t.name is 'br'
    ins_mode_in_head_noscript_else t
    return
  # nested head/noscript start tags and every other end tag are ignored
  if (start and (t.name is 'head' or t.name is 'noscript')) or end
    parse_error()
    return
  # Anything else
  ins_mode_in_head_noscript_else t
  return
# 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" mode: insert a synthesized body
# element, switch to "in body" and reprocess the token there.
ins_mode_after_head_else = (t) ->
  insert_html_element new_open_tag 'body'
  ins_mode = ins_mode_in_body
  process_token t
  return
# The "after head" insertion mode: handles token t between the end of head
# and the start of body/frameset.
ins_mode_after_head = (t) ->
  if is_space_tok t
    insert_character t
    return
  if t.type is TYPE_COMMENT
    insert_comment t
    return
  if t.type is TYPE_DOCTYPE
    parse_error()
    return
  if t.type is TYPE_START_TAG and t.name is 'html'
    ins_mode_in_body t
    return
  if t.type is TYPE_START_TAG and t.name is 'body'
    insert_html_element t
    flag_frameset_ok = false
    ins_mode = ins_mode_in_body
    return
  if t.type is TYPE_START_TAG and t.name is 'frameset'
    insert_html_element t
    ins_mode = ins_mode_in_frameset
    return
  if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
    parse_error()
    # temporarily re-open head so the token is processed as "in head"
    open_els.unshift head_element_pointer
    ins_mode_in_head t
    # Remove the head element again; it need not be the current node any
    # more. This must be `for ... in` (array elements): `for ... of` walks
    # the keys, so el would be an index string and never match.
    for el, i in open_els
      if el is head_element_pointer
        open_els.splice i, 1
        return
    console.log "warning: 23904 couldn't find head element in open_els"
    return
  if t.type is TYPE_END_TAG and t.name is 'template'
    ins_mode_in_head t
    return
  if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
    ins_mode_after_head_else t
    return
  if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
    parse_error()
    return
  # Anything else
  ins_mode_after_head_else t
  return
1622 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" insertion mode.
# Factored out because the adoption agency algorithm calls it too.
#
# Walks the stack of open elements from the current node upwards:
# * at the first HTML element called `name`: generate implied end tags
#   (except for `name`), call parse_error() if that element is then not the
#   current node, and pop the stack up to and including it;
# * at a special element before any match: parse_error() and ignore the tag.
in_body_any_other_end_tag = (name) ->
  for el in open_els
    if el.name is name and el.namespace is NS_HTML
      generate_implied_end_tags name # arg is exception
      # generate_implied_end_tags may have popped nodes, so an index taken
      # before the call would be stale; compare against the stack as it is
      # now and pop by identity
      parse_error() unless el is open_els[0]
      while open_els.length > 0
        break if open_els.shift() is el
      return
    if special_elements[el.name] is el.namespace
      parse_error()
      return
  return
1636 ins_mode_in_body = (t) ->
1637 if t.type is TYPE_TEXT and t.text is "\u0000"
1644 if t.type is TYPE_TEXT
1647 flag_frameset_ok = false
1649 if t.type is TYPE_COMMENT
1652 if t.type is TYPE_DOCTYPE
1655 if t.type is TYPE_START_TAG and t.name is 'html'
1657 return if template_tag_is_open()
1658 root_attrs = open_els[open_els.length - 1].attrs
1660 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1663 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1666 if t.type is TYPE_START_TAG and t.name is 'body'
1668 return if open_els.length < 2
1669 second = open_els[open_els.length - 2]
1670 return unless second.namespace is NS_HTML
1671 return unless second.name is 'body'
1672 return if template_tag_is_open()
1673 flag_frameset_ok = false
1675 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1677 if t.type is TYPE_START_TAG and t.name is 'frameset'
1679 return if open_els.length < 2
1680 second_i = open_els.length - 2
1681 second = open_els[second_i]
1682 return unless second.namespace is NS_HTML
1683 return unless second.name is 'body'
1684 if flag_frameset_ok is false
1687 for el, i in second.parent.children
1689 second.parent.children.splice i, 1
1691 open_els.splice second_i, 1
1692 # pop everything except the "root html element"
1693 while open_els.length > 1
1695 insert_html_element t
1696 ins_mode = ins_mode_in_frameset
1698 if t.type is TYPE_EOF
1700 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1701 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1702 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1705 unless ok_tags[t.name] is el.namespace
1708 if template_ins_modes.length > 0
1709 ins_mode_in_template t
1713 if t.type is TYPE_END_TAG and t.name is 'body'
1714 unless is_in_scope 'body', NS_HTML
1718 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1719 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1720 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1721 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1725 unless ok_tags[t.name] is el.namespace
1728 ins_mode = ins_mode_after_body
1730 if t.type is TYPE_END_TAG and t.name is 'html'
1731 unless is_in_scope 'body', NS_HTML
1735 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1736 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1737 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1738 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1742 unless ok_tags[t.name] is el.namespace
1745 ins_mode = ins_mode_after_body
1748 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1749 close_p_if_in_button_scope()
1750 insert_html_element t
1752 if t.type is TYPE_START_TAG and h_tags[t.name]?
1753 close_p_if_in_button_scope()
1754 if h_tags[open_els[0].name] is open_els[0].namespace
1757 insert_html_element t
1759 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1760 close_p_if_in_button_scope()
1761 insert_html_element t
1762 # spec: If the next token is a "LF" (U+000A) character token, then
1763 # ignore that token and move on to the next one. (Newlines at the
1764 # start of pre blocks are ignored as an authoring convenience.)
1765 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1767 flag_frameset_ok = false
1769 if t.type is TYPE_START_TAG and t.name is 'form'
1770 unless form_element_pointer is null or template_tag_is_open()
1773 close_p_if_in_button_scope()
1774 el = insert_html_element t
1775 unless template_tag_is_open()
1776 form_element_pointer = el
1778 if t.type is TYPE_START_TAG and t.name is 'li'
1779 flag_frameset_ok = false
1780 for node in open_els
1781 if node.name is 'li' and node.namespace is NS_HTML
1782 generate_implied_end_tags 'li' # arg is exception
1783 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1786 el = open_els.shift()
1787 if el.name is 'li' and el.namespace is NS_HTML
1790 if el_is_special_not_adp node
1792 close_p_if_in_button_scope()
1793 insert_html_element t
1795 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1796 flag_frameset_ok = false
1797 for node in open_els
1798 if node.name is 'dd' and node.namespace is NS_HTML
1799 generate_implied_end_tags 'dd' # arg is exception
1800 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1803 el = open_els.shift()
1804 if el.name is 'dd' and el.namespace is NS_HTML
1807 if node.name is 'dt' and node.namespace is NS_HTML
1808 generate_implied_end_tags 'dt' # arg is exception
1809 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1812 el = open_els.shift()
1813 if el.name is 'dt' and el.namespace is NS_HTML
1816 if el_is_special_not_adp node
1818 close_p_if_in_button_scope()
1819 insert_html_element t
1821 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1822 close_p_if_in_button_scope()
1823 insert_html_element t
1824 tok_state = tok_state_plaintext
1826 if t.type is TYPE_START_TAG and t.name is 'button'
1827 if is_in_scope 'button', NS_HTML
1829 generate_implied_end_tags()
1831 el = open_els.shift()
1832 if el.name is 'button' and el.namespace is NS_HTML
1835 insert_html_element t
1836 flag_frameset_ok = false
1838 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1839 unless is_in_scope t.name, NS_HTML
1842 generate_implied_end_tags()
1843 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1846 el = open_els.shift()
1847 if el.name is t.name and el.namespace is NS_HTML
1850 if t.type is TYPE_END_TAG and t.name is 'form'
1851 unless template_tag_is_open()
1852 node = form_element_pointer
1853 form_element_pointer = null
1854 if node is null or not el_is_in_scope node
1857 generate_implied_end_tags()
1858 if open_els[0] isnt node
1860 for el, i in open_els
1862 open_els.splice i, 1
1865 unless is_in_scope 'form', NS_HTML
1868 generate_implied_end_tags()
1869 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1872 el = open_els.shift()
1873 if el.name is 'form' and el.namespace is NS_HTML
1876 if t.type is TYPE_END_TAG and t.name is 'p'
1877 unless is_in_button_scope 'p', NS_HTML
1879 insert_html_element new_open_tag 'p'
1882 if t.type is TYPE_END_TAG and t.name is 'li'
1883 unless is_in_li_scope 'li', NS_HTML
1886 generate_implied_end_tags 'li' # arg is exception
1887 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1890 el = open_els.shift()
1891 if el.name is 'li' and el.namespace is NS_HTML
1894 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1895 unless is_in_scope t.name, NS_HTML
1898 generate_implied_end_tags t.name # arg is exception
1899 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1902 el = open_els.shift()
1903 if el.name is t.name and el.namespace is NS_HTML
1906 if t.type is TYPE_END_TAG and h_tags[t.name]?
1909 if h_tags[el.name] is el.namespace
1912 if standard_scopers[el.name] is el.namespace
1917 generate_implied_end_tags()
1918 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1921 el = open_els.shift()
1922 if h_tags[el.name] is el.namespace
1926 if t.type is TYPE_START_TAG and t.name is 'a'
1927 # If the list of active formatting elements contains an a element
1928 # between the end of the list and the last marker on the list (or
1929 # the start of the list if there is no marker on the list), then
1930 # this is a parse error; run the adoption agency algorithm for the
1931 # tag name "a", then remove that element from the list of active
1932 # formatting elements and the stack of open elements if the
1933 # adoption agency algorithm didn't already remove it (it might not
1934 # have if the element is not in table scope).
1937 if el.type is TYPE_AFE_MARKER
1939 if el.name is 'a' and el.namespace is NS_HTML
1947 for el, i in open_els
1949 open_els.splice i, 1
1951 el = insert_html_element t
1954 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1956 el = insert_html_element t
1959 if t.type is TYPE_START_TAG and t.name is 'nobr'
1961 el = insert_html_element t
1964 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1965 adoption_agency t.name
1967 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1969 insert_html_element t
1971 flag_frameset_ok = false
1973 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1974 unless is_in_scope t.name, NS_HTML
1977 generate_implied_end_tags()
1978 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1981 el = open_els.shift()
1982 if el.name is t.name and el.namespace is NS_HTML
1984 clear_afe_to_marker()
1986 if t.type is TYPE_START_TAG and t.name is 'table'
1987 close_p_if_in_button_scope() # fixfull quirksmode thing
1988 insert_html_element t
1989 flag_frameset_ok = false
1990 ins_mode = ins_mode_in_table
1992 if t.type is TYPE_END_TAG and t.name is 'br'
1994 t.type is TYPE_START_TAG
1996 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1998 insert_html_element t
2000 t.acknowledge_self_closing()
2001 flag_frameset_ok = false
2003 if t.type is TYPE_START_TAG and t.name is 'input'
2005 insert_html_element t
2007 t.acknowledge_self_closing()
2008 unless is_input_hidden_tok t
2009 flag_frameset_ok = false
2011 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2012 insert_html_element t
2014 t.acknowledge_self_closing()
2016 if t.type is TYPE_START_TAG and t.name is 'hr'
2017 close_p_if_in_button_scope()
2018 insert_html_element t
2020 t.acknowledge_self_closing()
2021 flag_frameset_ok = false
2023 if t.type is TYPE_START_TAG and t.name is 'image'
2028 if t.type is TYPE_START_TAG and t.name is 'isindex'
2030 if template_tag_is_open() is false and form_element_pointer isnt null
2032 t.acknowledge_self_closing()
2033 flag_frameset_ok = false
2034 close_p_if_in_button_scope()
2035 el = insert_html_element new_open_tag 'form'
2036 unless template_tag_is_open()
2037 form_element_pointer = el
2040 el.attrs['action'] = a[1]
2042 insert_html_element new_open_tag 'hr'
2045 insert_html_element new_open_tag 'label'
2046 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2047 input_el = new_open_tag 'input'
2052 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2053 input_el.attrs_a.push [a[0], a[1]]
2054 input_el.attrs_a.push ['name', 'isindex']
2055 # fixfull this next bit is in english... internationalize?
2056 prompt ?= "This is a searchable index. Enter search keywords: "
2057 insert_character new_character_token prompt # fixfull split
2058 # TODO submit typo "balue" in spec
2059 insert_html_element input_el
2061 # insert_character '' # you can put chars here if prompt attr missing
2063 insert_html_element new_open_tag 'hr'
2066 unless template_tag_is_open()
2067 form_element_pointer = null
2069 if t.type is TYPE_START_TAG and t.name is 'textarea'
2070 insert_html_element t
2071 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2073 tok_state = tok_state_rcdata
2074 original_ins_mode = ins_mode
2075 flag_frameset_ok = false
2076 ins_mode = ins_mode_text
2078 if t.type is TYPE_START_TAG and t.name is 'xmp'
2079 close_p_if_in_button_scope()
2081 flag_frameset_ok = false
2082 parse_generic_raw_text t
2084 if t.type is TYPE_START_TAG and t.name is 'iframe'
2085 flag_frameset_ok = false
2086 parse_generic_raw_text t
2088 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2089 parse_generic_raw_text t
2091 if t.type is TYPE_START_TAG and t.name is 'select'
2093 insert_html_element t
2094 flag_frameset_ok = false
2095 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2096 ins_mode = ins_mode_in_select_in_table
2098 ins_mode = ins_mode_in_select
2100 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2101 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2104 insert_html_element t
2106 # this comment block implements the W3C spec
2107 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2108 # if is_in_scope 'ruby', NS_HTML
2109 # generate_implied_end_tags()
2110 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2112 # insert_html_element t
2114 # if t.type is TYPE_START_TAG and t.name is 'rt'
2115 # if is_in_scope 'ruby', NS_HTML
2116 # generate_implied_end_tags 'rtc' # arg is exception
2117 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2119 # insert_html_element t
2121 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2122 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2123 if is_in_scope 'ruby', NS_HTML
2124 generate_implied_end_tags()
2125 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2127 insert_html_element t
2129 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2130 if is_in_scope 'ruby', NS_HTML
2131 generate_implied_end_tags 'rtc'
2132 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2134 insert_html_element t
2137 if t.type is TYPE_START_TAG and t.name is 'math'
2139 adjust_mathml_attributes t
2140 adjust_foreign_attributes t
2141 insert_foreign_element t, NS_MATHML
2142 if t.flag 'self-closing'
2144 t.acknowledge_self_closing()
2146 if t.type is TYPE_START_TAG and t.name is 'svg'
2148 adjust_svg_attributes t
2149 adjust_foreign_attributes t
2150 insert_foreign_element t, NS_SVG
2151 if t.flag 'self-closing'
2153 t.acknowledge_self_closing()
2155 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2158 if t.type is TYPE_START_TAG # any other start tag
2160 insert_html_element t
2162 if t.type is TYPE_END_TAG # any other end tag
2163 in_body_any_other_end_tag t.name
2167 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Tree-construction handler for the "text" insertion mode. Dispatches on the
# type of token t. The EOF branch and both end-tag branches visible here each
# restore ins_mode from original_ins_mode.
2168 ins_mode_text = (t) ->
2169 if t.type is TYPE_TEXT
2172 if t.type is TYPE_EOF
# open_els[0] is the current node (see the stack notes at the top of the file)
2174 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2175 open_els[0].flag 'already started', true
2177 ins_mode = original_ins_mode
2180 if t.type is TYPE_END_TAG and t.name is 'script'
2182 ins_mode = original_ins_mode
2183 # fixfull the spec seems to assume that I'm going to run the script
2184 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2186 if t.type is TYPE_END_TAG
2188 ins_mode = original_ins_mode
# NOTE(review): presumably only reached when none of the branches above
# handled the token -- confirm against the full file
2190 console.log 'warning: end of ins_mode_text reached'
2192 # the functions below implement the tokenizer states described here:
2193 # http://www.w3.org/TR/html5/syntax.html#tokenization
2195 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Fallback shared by the table-related handlers for tokens they have no
# special rule for. It sets flag_foster_parenting to true and afterwards back
# to false.
# NOTE(review): the statement(s) between the two assignments are not visible
# in this extract; presumably t is processed with the in-body rules while the
# flag is set -- confirm.
2196 ins_mode_in_table_else = (t) ->
2198 flag_foster_parenting = true
2200 flag_foster_parenting = false
# Tree-construction handler for the "in table" insertion mode. Tokens that get
# no special treatment are handed to ins_mode_in_table_else.
2202 ins_mode_in_table = (t) ->
2205 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
# start collecting table text: remember the current mode so that
# ins_mode_in_table_text can restore it later
2206 pending_table_character_tokens = []
2207 original_ins_mode = ins_mode
2208 ins_mode = ins_mode_in_table_text
2211 ins_mode_in_table_else t
2219 clear_stack_to_table_context()
2221 insert_html_element t
2222 ins_mode = ins_mode_in_caption
2224 clear_stack_to_table_context()
2225 insert_html_element t
2226 ins_mode = ins_mode_in_column_group
2228 clear_stack_to_table_context()
# a colgroup start tag is synthesized here instead of inserting t itself
2229 insert_html_element new_open_tag 'colgroup'
2230 ins_mode = ins_mode_in_column_group
2232 when 'tbody', 'tfoot', 'thead'
2233 clear_stack_to_table_context()
2234 insert_html_element t
2235 ins_mode = ins_mode_in_table_body
2236 when 'td', 'th', 'tr'
2237 clear_stack_to_table_context()
# a tbody start tag is synthesized for a cell/row that appears directly in the table
2238 insert_html_element new_open_tag 'tbody'
2239 ins_mode = ins_mode_in_table_body
2243 if is_in_table_scope 'table', NS_HTML
# open_els.shift() removes the current node (index 0)
2245 el = open_els.shift()
2246 if el.name is 'table' and el.namespace is NS_HTML
2250 when 'style', 'script', 'template'
2253 unless is_input_hidden_tok t
2254 ins_mode_in_table_else t
2257 el = insert_html_element t
2259 t.acknowledge_self_closing()
2262 if form_element_pointer?
2264 if template_tag_is_open()
# the inserted element becomes the form element pointer
2266 form_element_pointer = insert_html_element t
2269 ins_mode_in_table_else t
2273 if is_in_table_scope 'table', NS_HTML
2275 el = open_els.shift()
2276 if el.name is 'table' and el.namespace is NS_HTML
2281 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2286 ins_mode_in_table_else t
2290 ins_mode_in_table_else t
2293 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Handler for the "in table text" insertion mode. Text tokens are collected in
# pending_table_character_tokens. When the list is flushed, its tokens are
# either inserted with insert_character or each passed to
# ins_mode_in_table_else; the list is then emptied and ins_mode is restored
# from original_ins_mode.
2294 ins_mode_in_table_text = (t) ->
2295 if t.type is TYPE_TEXT and t.text is "\u0000"
2299 if t.type is TYPE_TEXT
2300 pending_table_character_tokens.push t
# scan the collected tokens for one that is not whitespace
# NOTE(review): presumably the result of this scan selects which of the two
# loops below runs -- the branching lines are not visible in this extract
2304 for old in pending_table_character_tokens
2305 unless is_space_tok old
2309 for old in pending_table_character_tokens
2310 insert_character old
2312 for old in pending_table_character_tokens
2313 ins_mode_in_table_else old
2314 pending_table_character_tokens = []
2315 ins_mode = original_ins_mode
2318 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Tree-construction handler for the "in caption" insertion mode. Both closing
# paths visible here shift elements off open_els, test each for an HTML
# caption, call clear_afe_to_marker() and switch to ins_mode_in_table.
2319 ins_mode_in_caption = (t) ->
2320 if t.type is TYPE_END_TAG and t.name is 'caption'
2321 if is_in_table_scope 'caption', NS_HTML
2322 generate_implied_end_tags()
2323 if open_els[0].name isnt 'caption'
2326 el = open_els.shift()
2327 if el.name is 'caption' and el.namespace is NS_HTML
2329 clear_afe_to_marker()
2330 ins_mode = ins_mode_in_table
# table-structure start tags and </table> also close the caption
2335 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2337 if is_in_table_scope 'caption', NS_HTML
2339 el = open_els.shift()
2340 if el.name is 'caption' and el.namespace is NS_HTML
2342 clear_afe_to_marker()
2343 ins_mode = ins_mode_in_table
2345 # else fragment case
2347 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2353 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Tree-construction handler for the "in column group" insertion mode.
# A 'col' start tag is inserted and its self-closing flag acknowledged; the
# </colgroup> branch and the final fallback both switch back to
# ins_mode_in_table.
2354 ins_mode_in_column_group = (t) ->
2358 if t.type is TYPE_COMMENT
2361 if t.type is TYPE_DOCTYPE
2364 if t.type is TYPE_START_TAG and t.name is 'html'
2367 if t.type is TYPE_START_TAG and t.name is 'col'
2368 el = insert_html_element t
2370 t.acknowledge_self_closing()
2372 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# Both checks must look at the current node, open_els[0]. Reading .namespace
# off the open_els array itself is always undefined, which made this test
# fail for every </colgroup>.
2373 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2375 ins_mode = ins_mode_in_table
2379 if t.type is TYPE_END_TAG and t.name is 'col'
2382 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2385 if t.type is TYPE_EOF
2389 if open_els[0].name isnt 'colgroup'
2393 ins_mode = ins_mode_in_table
2397 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Tree-construction handler for the "in table body" insertion mode
# (inside tbody/thead/tfoot).
2398 ins_mode_in_table_body = (t) ->
2399 if t.type is TYPE_START_TAG and t.name is 'tr'
2400 clear_stack_to_table_body_context()
2401 insert_html_element t
2402 ins_mode = ins_mode_in_row
2404 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2406 clear_stack_to_table_body_context()
# a tr start tag is synthesized for a cell that appears without a row
2407 insert_html_element new_open_tag 'tr'
2408 ins_mode = ins_mode_in_row
2411 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2412 unless is_in_table_scope t.name, NS_HTML
2415 clear_stack_to_table_body_context()
2417 ins_mode = ins_mode_in_table
2419 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# hand-written scope scan: look for an HTML tbody/tfoot/thead, and test each
# element against table_scopers
# NOTE(review): the loop header and the branch bodies are not visible here
2422 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2425 if table_scopers[el.name] is el.namespace
2430 clear_stack_to_table_body_context()
2432 ins_mode = ins_mode_in_table
2435 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2441 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Tree-construction handler for the "in row" insertion mode. A cell start tag
# is inserted and switches to ins_mode_in_cell; the three row-closing branches
# each clear the stack to a table row context and switch to
# ins_mode_in_table_body.
2442 ins_mode_in_row = (t) ->
2443 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2444 clear_stack_to_table_row_context()
2445 insert_html_element t
2446 ins_mode = ins_mode_in_cell
2449 if t.type is TYPE_END_TAG and t.name is 'tr'
2450 if is_in_table_scope 'tr', NS_HTML
2451 clear_stack_to_table_row_context()
2453 ins_mode = ins_mode_in_table_body
# note: `and` binds tighter than `or`, so the trailing clause reads as (end tag and name is 'table')
2457 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2458 if is_in_table_scope 'tr', NS_HTML
2459 clear_stack_to_table_row_context()
2461 ins_mode = ins_mode_in_table_body
2466 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# both the named section and a tr must be in table scope
2467 if is_in_table_scope t.name, NS_HTML
2468 if is_in_table_scope 'tr', NS_HTML
2469 clear_stack_to_table_row_context()
2471 ins_mode = ins_mode_in_table_body
2476 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2482 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps: generate implied end tags, shift elements off
# open_els while testing each for an HTML td/th, clear the active formatting
# elements up to the last marker, and switch to ins_mode_in_row.
# NOTE(review): the definition line of this helper is not visible in this extract.
2484 generate_implied_end_tags()
# Compare the current node's *name* with 'th'. The element object itself never
# equals a string, so every th cell used to fail this check.
2485 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2488 el = open_els.shift()
2489 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2491 clear_afe_to_marker()
2492 ins_mode = ins_mode_in_row
2494 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Tree-construction handler for the "in cell" insertion mode (inside td/th).
2495 ins_mode_in_cell = (t) ->
2496 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2497 if is_in_table_scope t.name, NS_HTML
2498 generate_implied_end_tags()
2499 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
# shift elements off the stack, testing each for the cell named by the end tag
2502 el = open_els.shift()
2503 if el.name is t.name and el.namespace is NS_HTML
2505 clear_afe_to_marker()
2506 ins_mode = ins_mode_in_row
2510 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# hand-written scope scan: look for an HTML td/th, and test each element
# against table_scopers
# NOTE(review): the loop header and the branch bodies are not visible here
2513 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2516 if table_scopers[el.name] is el.namespace
2524 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2527 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2528 if is_in_table_scope t.name, NS_HTML
2537 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Tree-construction handler for the "in select" insertion mode.
# option/optgroup start tags are inserted after checking whether the current
# node is an HTML option/optgroup; the select-closing paths shift elements off
# open_els while testing each for an HTML select.
2538 ins_mode_in_select = (t) ->
2539 if t.type is TYPE_TEXT and t.text is "\u0000"
2542 if t.type is TYPE_TEXT
2545 if t.type is TYPE_COMMENT
2548 if t.type is TYPE_DOCTYPE
2551 if t.type is TYPE_START_TAG and t.name is 'html'
2554 if t.type is TYPE_START_TAG and t.name is 'option'
2555 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2557 insert_html_element t
2559 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2560 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2562 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2564 insert_html_element t
2566 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# `is`, not `in`: NS_HTML is a single namespace value, not a collection to
# search, so the membership test could never succeed.
2567 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# The element just above the current node is open_els[1]; its own namespace
# has to be the one checked here.
2568 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2570 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2575 if t.type is TYPE_END_TAG and t.name is 'option'
2576 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2581 if t.type is TYPE_END_TAG and t.name is 'select'
2582 if is_in_select_scope 'select', NS_HTML
2584 el = open_els.shift()
2585 if el.name is 'select' and el.namespace is NS_HTML
2591 if t.type is TYPE_START_TAG and t.name is 'select'
2594 el = open_els.shift()
2595 if el.name is 'select' and el.namespace is NS_HTML
2598 # spec says that this is the same as </select> but it doesn't say
2599 # to check scope first
2601 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2603 if is_in_select_scope 'select', NS_HTML
2606 el = open_els.shift()
2607 if el.name is 'select' and el.namespace is NS_HTML
2612 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2615 if t.type is TYPE_EOF
2622 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Tree-construction handler for the "in select in table" insertion mode.
# Table-structure start and end tags shift elements off open_els while testing
# each for an HTML select; every other token is delegated to ins_mode_in_select.
2623 ins_mode_in_select_in_table = (t) ->
2624 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2627 el = open_els.shift()
2628 if el.name is 'select' and el.namespace is NS_HTML
2633 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
# the end tag is only considered when a matching element is in table scope
2635 unless is_in_table_scope t.name, NS_HTML
2638 el = open_els.shift()
2639 if el.name is 'select' and el.namespace is NS_HTML
2645 ins_mode_in_select t
2648 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Tree-construction handler for the "in template" insertion mode.
# For start tags, the entry at the front of template_ins_modes is replaced
# (shift, then unshift) with the mode suited to the tag, and ins_mode is set to
# that same mode.
2649 ins_mode_in_template = (t) ->
2650 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2653 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2656 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2657 template_ins_modes.shift()
2658 template_ins_modes.unshift ins_mode_in_table
2659 ins_mode = ins_mode_in_table
2662 if t.type is TYPE_START_TAG and t.name is 'col'
2663 template_ins_modes.shift()
2664 template_ins_modes.unshift ins_mode_in_column_group
2665 ins_mode = ins_mode_in_column_group
2668 if t.type is TYPE_START_TAG and t.name is 'tr'
2669 template_ins_modes.shift()
2670 template_ins_modes.unshift ins_mode_in_table_body
2671 ins_mode = ins_mode_in_table_body
2674 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2675 template_ins_modes.shift()
2676 template_ins_modes.unshift ins_mode_in_row
2677 ins_mode = ins_mode_in_row
# any other start tag: continue with the in-body rules
2680 if t.type is TYPE_START_TAG
2681 template_ins_modes.shift()
2682 template_ins_modes.unshift ins_mode_in_body
2683 ins_mode = ins_mode_in_body
2686 if t.type is TYPE_END_TAG
2689 if t.type is TYPE_EOF
2690 unless template_tag_is_open()
# EOF with a template still open: shift elements off the stack while testing
# each for an HTML template, then drop the marker-delimited formatting entries
# and this template's insertion mode
2695 el = open_els.shift()
2696 if el.name is 'template' and el.namespace is NS_HTML
2698 clear_afe_to_marker()
2699 template_ins_modes.shift()
2703 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Tree-construction handler for the "after body" insertion mode.
# Comments are appended to the root (html) element, </html> moves on to
# ins_mode_after_after_body, and the final fallback switches back to
# ins_mode_in_body.
2704 ins_mode_after_body = (t) ->
2708 if t.type is TYPE_COMMENT
# The spec inserts the comment as the last child of the *first* element in the
# stack of open elements (the html element). In this file the stack grows
# downward, so that is the highest index; open_els[0] is the current node
# (normally body), which is the wrong parent.
2709 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2711 if t.type is TYPE_DOCTYPE
2714 if t.type is TYPE_START_TAG and t.name is 'html'
2717 if t.type is TYPE_END_TAG and t.name is 'html'
2718 # fixfull fragment case
2719 ins_mode = ins_mode_after_after_body
2721 if t.type is TYPE_EOF
2726 ins_mode = ins_mode_in_body
2729 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Tree-construction handler for the "in frameset" insertion mode.
# frameset and frame start tags are inserted (frame also acknowledges its
# self-closing flag); </frameset> may switch to ins_mode_after_frameset.
2730 ins_mode_in_frameset = (t) ->
2734 if t.type is TYPE_COMMENT
2737 if t.type is TYPE_DOCTYPE
2740 if t.type is TYPE_START_TAG and t.name is 'html'
2743 if t.type is TYPE_START_TAG and t.name is 'frameset'
2744 insert_html_element t
2746 if t.type is TYPE_END_TAG and t.name is 'frameset'
# a single open element means only the root is left
2747 if open_els.length is 1
2749 return # fragment case
# leave frameset mode only when not parsing a fragment and the current node is
# no longer a frameset
2751 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2752 ins_mode = ins_mode_after_frameset
2754 if t.type is TYPE_START_TAG and t.name is 'frame'
2755 insert_html_element t
2757 t.acknowledge_self_closing()
2759 if t.type is TYPE_START_TAG and t.name is 'noframes'
2762 if t.type is TYPE_EOF
2763 if open_els.length isnt 1
2771 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Tree-construction handler for the "after frameset" insertion mode.
# </html> switches to ins_mode_after_after_frameset; the remaining branches
# dispatch on comment, doctype, <html>, <noframes> and EOF tokens.
2772 ins_mode_after_frameset = (t) ->
2776 if t.type is TYPE_COMMENT
2779 if t.type is TYPE_DOCTYPE
2782 if t.type is TYPE_START_TAG and t.name is 'html'
2785 if t.type is TYPE_END_TAG and t.name is 'html'
# The parser's mode variable is ins_mode. The former name `insert_mode` only
# created an unused local, so the mode never changed.
2786 ins_mode = ins_mode_after_after_frameset
2788 if t.type is TYPE_START_TAG and t.name is 'noframes'
2791 if t.type is TYPE_EOF
2798 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Tree-construction handler for the "after after body" insertion mode.
# Comments are appended to the document node itself; the final fallback
# switches back to ins_mode_in_body.
2799 ins_mode_after_after_body = (t) ->
2800 if t.type is TYPE_COMMENT
# insertion position: [parent, index], here the end of doc's children
2801 insert_comment t, [doc, doc.children.length]
2803 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2806 if t.type is TYPE_EOF
2811 ins_mode = ins_mode_in_body
2814 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Tree-construction handler for the "after after frameset" insertion mode.
# Comments are appended to the document node itself; the other branches
# dispatch on doctype/whitespace/<html>, EOF and <noframes>.
2815 ins_mode_after_after_frameset = (t) ->
2816 if t.type is TYPE_COMMENT
# insertion position: [parent, index], here the end of doc's children
2817 insert_comment t, [doc, doc.children.length]
2819 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2822 if t.type is TYPE_EOF
2825 if t.type is TYPE_START_TAG and t.name is 'noframes'
2832 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Tests the attributes of token t for one named 'color', 'face' or 'size'
# (a[0] is the attribute name). in_foreign_content uses it to decide whether a
# 'font' start tag is treated like the other listed HTML start tags.
# NOTE(review): the loop header and the return statements are not visible in
# this extract.
2833 has_color_face_or_size = (t) ->
2835 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Hook for the end of a foreign 'script' element. It is called for a
# self-closing 'script' start tag in foreign content and for a </script> whose
# current node is an SVG script element.
# NOTE(review): the body is not visible in this extract.
2838 in_foreign_content_end_script = ->
# "Any other start tag" handling for foreign (MathML/SVG) content: adjusts
# token t according to the namespace of the adjusted current node, inserts it
# as a foreign element in that namespace, then deals with a self-closing flag.
2842 in_foreign_content_other_start = (t) ->
2843 acn = adjusted_current_node()
2844 if acn.namespace is NS_MATHML
2845 adjust_mathml_attributes t
# svg_name_fixes maps a tag name to its replacement for SVG elements
2846 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2847 t.name = svg_name_fixes[t.name]
2848 if acn.namespace is NS_SVG
2849 adjust_svg_attributes t
2850 adjust_foreign_attributes t
2851 insert_foreign_element t, acn.namespace
2852 if t.flag 'self-closing'
# a self-closing script is acknowledged and then treated as ended right away
2853 if t.name is 'script'
2854 t.acknowledge_self_closing()
2855 in_foreign_content_end_script()
2858 t.acknowledge_self_closing()
# Tree-construction rules for tokens in foreign content (MathML/SVG).
# NUL text becomes U+FFFD, other text clears flag_frameset_ok, start tags not
# on the HTML list go to in_foreign_content_other_start, and end tags are
# matched case-insensitively against open_els.
2860 in_foreign_content = (t) ->
2861 if t.type is TYPE_TEXT and t.text is "\u0000"
2863 insert_character new_character_token "\ufffd"
2868 if t.type is TYPE_TEXT
2869 flag_frameset_ok = false
2872 if t.type is TYPE_COMMENT
2875 if t.type is TYPE_DOCTYPE
# HTML start tags (and font with color/face/size) that break out of foreign content
2878 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
# when parsing a fragment these tags are handled like any other start tag
2880 if flag_fragment_parsing
2881 in_foreign_content_other_start t
2883 loop # is this safe?
# the loop tests for an integration point or an HTML-namespace node
2886 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2890 if t.type is TYPE_START_TAG
2891 in_foreign_content_other_start t
2893 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2894 in_foreign_content_end_script()
2896 if t.type is TYPE_END_TAG
# element names may be mixed-case (e.g. after svg_name_fixes), so compare lowercased
2897 if open_els[0].name.toLowerCase() isnt t.name
2899 for node in open_els
# the last index is the topmost element of the stack
2900 if node is open_els[open_els.length - 1]
2902 if node.name.toLowerCase() is t.name
2904 el = open_els.shift()
2907 if node.namespace is NS_HTML
2909 ins_mode t # explicitly call HTML insertion mode
2912 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer step for the data state: consumes one character of txt at cur and
# either returns a token (text, U+FFFD replacement, EOF) or switches tok_state
# to tok_state_tag_open.
# NOTE(review): the definition line (presumably tok_state_data = ->) and the
# `when` labels are not visible in this extract.
2914 switch c = txt.charAt(cur++)
2916 return new_text_node parse_character_reference()
2918 tok_state = tok_state_tag_open
2921 return new_text_node "\ufffd"
2923 return new_eof_token()
2925 return new_text_node c
2928 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2929 # not needed: tok_state_character_reference_in_data = ->
2930 # just call parse_character_reference()
2932 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer step for the RCDATA state: consumes one character and returns a
# token (parsed character reference, U+FFFD, EOF or the character itself), or
# switches to tok_state_rcdata_less_than_sign.
2933 tok_state_rcdata = ->
2934 switch c = txt.charAt(cur++)
2936 return new_text_node parse_character_reference()
2938 tok_state = tok_state_rcdata_less_than_sign
2941 return new_character_token "\ufffd"
2943 return new_eof_token()
2945 return new_character_token c
2948 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2949 # not needed: tok_state_character_reference_in_rcdata = ->
2950 # just call parse_character_reference()
2952 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer step for the RAWTEXT state: consumes one character and returns a
# token (U+FFFD, EOF or the character itself), or switches to
# tok_state_rawtext_less_than_sign.
2953 tok_state_rawtext = ->
2954 switch c = txt.charAt(cur++)
2956 tok_state = tok_state_rawtext_less_than_sign
2959 return new_character_token "\ufffd"
2961 return new_eof_token()
2963 return new_character_token c
2966 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer step for the script data state: consumes one character and returns
# a token (U+FFFD, EOF or the character itself), or switches to
# tok_state_script_data_less_than_sign.
2967 tok_state_script_data = ->
2968 switch c = txt.charAt(cur++)
2970 tok_state = tok_state_script_data_less_than_sign
2973 return new_character_token "\ufffd"
2975 return new_eof_token()
2977 return new_character_token c
2980 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer step for the PLAINTEXT state: consumes one character and returns
# U+FFFD, an EOF token or the character itself. No transition to another state
# is visible here.
2981 tok_state_plaintext = ->
2982 switch c = txt.charAt(cur++)
2985 return new_character_token "\ufffd"
2987 return new_eof_token()
2989 return new_character_token c
2993 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer step for the tag open state (just after '<'): chooses between
# markup declaration, end tag, bogus comment and tag name states, or falls
# back to emitting '<' as text.
2994 tok_state_tag_open = ->
2995 switch c = txt.charAt(cur++)
2997 tok_state = tok_state_markup_declaration_open
2999 tok_state = tok_state_end_tag_open
# the bogus comment token is seeded with '?'
3002 tok_cur_tag = new_comment_token '?'
3003 tok_state = tok_state_bogus_comment
3006 tok_cur_tag = new_open_tag c
3007 tok_state = tok_state_tag_name
# tag names are stored lowercased
3008 else if is_uc_alpha(c)
3009 tok_cur_tag = new_open_tag c.toLowerCase()
3010 tok_state = tok_state_tag_name
3013 tok_state = tok_state_data
3014 cur -= 1 # we didn't parse/handle the char after <
3015 return new_text_node '<'
3018 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer step for the end tag open state (just after '</'): starts an end
# tag token for a letter, returns to the data state (in one branch emitting
# '</' as text), or starts a bogus comment.
3019 tok_state_end_tag_open = ->
3020 switch c = txt.charAt(cur++)
3023 tok_state = tok_state_data
3026 tok_state = tok_state_data
3027 return new_text_node '</'
# tag names are stored lowercased
3030 tok_cur_tag = new_end_tag c.toLowerCase()
3031 tok_state = tok_state_tag_name
3032 else if is_lc_alpha(c)
3033 tok_cur_tag = new_end_tag c
3034 tok_state = tok_state_tag_name
# the bogus comment token is seeded with '/'
3037 tok_cur_tag = new_comment_token '/'
3038 tok_state = tok_state_bogus_comment
3041 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer step for the tag name state: appends the consumed character
# (lowercased, or U+FFFD in one branch) to tok_cur_tag.name, or leaves for the
# before-attribute-name, self-closing or data states.
3042 tok_state_tag_name = ->
3043 switch c = txt.charAt(cur++)
# tab, line feed, form feed and space end the tag name
3044 when "\t", "\n", "\u000c", ' '
3045 tok_state = tok_state_before_attribute_name
3047 tok_state = tok_state_self_closing_start_tag
3049 tok_state = tok_state_data
3055 tok_cur_tag.name += "\ufffd"
3058 tok_state = tok_state_data
3061 tok_cur_tag.name += c.toLowerCase()
3063 tok_cur_tag.name += c
3066 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer step after '<' in RCDATA: either resets temporary_buffer and moves
# to tok_state_rcdata_end_tag_open, or un-consumes the character and emits '<'
# as a character token.
3067 tok_state_rcdata_less_than_sign = ->
3068 c = txt.charAt(cur++)
3070 temporary_buffer = ''
3071 tok_state = tok_state_rcdata_end_tag_open
3074 tok_state = tok_state_rcdata
3075 cur -= 1 # reconsume the input character
3076 return new_character_token '<'
3078 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer step after '</' in RCDATA: a letter starts an end tag token (name
# lowercased, raw character appended to temporary_buffer); anything else
# returns to tok_state_rcdata and emits "</" as text.
3079 tok_state_rcdata_end_tag_open = ->
3080 c = txt.charAt(cur++)
3082 tok_cur_tag = new_end_tag c.toLowerCase()
# temporary_buffer keeps the character as typed
3083 temporary_buffer += c
3084 tok_state = tok_state_rcdata_end_tag_name
3087 tok_cur_tag = new_end_tag c
3088 temporary_buffer += c
3089 tok_state = tok_state_rcdata_end_tag_name
3092 tok_state = tok_state_rcdata
3093 cur -= 1 # reconsume the input character
3094 return new_character_token "</" # fixfull separate these
3096 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token t is an end tag whose name equals the name of the
# current node (open_els[0]). Also writes the token and the open-element stack
# to the debug log.
3097 is_appropriate_end_tag = (t) ->
3098 # spec says to check against "the tag name of the last start tag to
3099 # have been emitted from this tokenizer", but this is only called from
3100 # the various "raw" states, so it's hopefully ok to assume that
3101 # open_els[0].name will work instead TODO: verify this after the script
3102 # data states are implemented
3103 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3104 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3106 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer step while reading an end tag name in RCDATA. Three transitions
# (before-attribute-name, self-closing, data) are each guarded by
# is_appropriate_end_tag; letters extend the tag name and temporary_buffer.
# The fallback returns to tok_state_rcdata and emits the buffered text.
3107 tok_state_rcdata_end_tag_name = ->
3108 c = txt.charAt(cur++)
3109 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3110 if is_appropriate_end_tag tok_cur_tag
3111 tok_state = tok_state_before_attribute_name
3113 # else fall through to "Anything else"
3115 if is_appropriate_end_tag tok_cur_tag
3116 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3118 # else fall through to "Anything else"
3120 if is_appropriate_end_tag tok_cur_tag
3121 tok_state = tok_state_data
3123 # else fall through to "Anything else"
3125 tok_cur_tag.name += c.toLowerCase()
3126 temporary_buffer += c
3129 tok_cur_tag.name += c
3130 temporary_buffer += c
# not an end tag after all: the consumed "</" plus buffered letters become text
3133 tok_state = tok_state_rcdata
3134 cur -= 1 # reconsume the input character
3135 return new_character_token '</' + temporary_buffer # fixfull separate these
3137 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer step after '<' in RAWTEXT: either resets temporary_buffer and moves
# to tok_state_rawtext_end_tag_open, or un-consumes the character and emits
# '<' as a character token.
3138 tok_state_rawtext_less_than_sign = ->
3139 c = txt.charAt(cur++)
3141 temporary_buffer = ''
3142 tok_state = tok_state_rawtext_end_tag_open
3145 tok_state = tok_state_rawtext
3146 cur -= 1 # reconsume the input character
3147 return new_character_token '<'
3149 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer step after '</' in RAWTEXT: a letter starts an end tag token (name
# lowercased, raw character appended to temporary_buffer); anything else
# returns to tok_state_rawtext and emits "</" as text.
3150 tok_state_rawtext_end_tag_open = ->
3151 c = txt.charAt(cur++)
3153 tok_cur_tag = new_end_tag c.toLowerCase()
3154 temporary_buffer += c
3155 tok_state = tok_state_rawtext_end_tag_name
3158 tok_cur_tag = new_end_tag c
3159 temporary_buffer += c
3160 tok_state = tok_state_rawtext_end_tag_name
3163 tok_state = tok_state_rawtext
3164 cur -= 1 # reconsume the input character
3165 return new_character_token "</" # fixfull separate these
3167 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer step while reading an end tag name in RAWTEXT. Three transitions
# (before-attribute-name, self-closing, data) are each guarded by
# is_appropriate_end_tag; letters extend the tag name and temporary_buffer.
# The fallback returns to tok_state_rawtext and emits the buffered text.
3168 tok_state_rawtext_end_tag_name = ->
3169 c = txt.charAt(cur++)
3170 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3171 if is_appropriate_end_tag tok_cur_tag
3172 tok_state = tok_state_before_attribute_name
3174 # else fall through to "Anything else"
3176 if is_appropriate_end_tag tok_cur_tag
3177 tok_state = tok_state_self_closing_start_tag
3179 # else fall through to "Anything else"
3181 if is_appropriate_end_tag tok_cur_tag
3182 tok_state = tok_state_data
3184 # else fall through to "Anything else"
3186 tok_cur_tag.name += c.toLowerCase()
3187 temporary_buffer += c
3190 tok_cur_tag.name += c
3191 temporary_buffer += c
# not an end tag after all: the consumed "</" plus buffered letters become text
3194 tok_state = tok_state_rawtext
3195 cur -= 1 # reconsume the input character
3196 return new_character_token '</' + temporary_buffer # fixfull separate these
3198 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer step after '<' in script data: moves to the end-tag-open state
# (resetting temporary_buffer), to the escape-start state (emitting '<!'), or
# back to script data (un-consuming the character and emitting '<').
3199 tok_state_script_data_less_than_sign = ->
3200 c = txt.charAt(cur++)
3202 temporary_buffer = ''
3203 tok_state = tok_state_script_data_end_tag_open
3206 tok_state = tok_state_script_data_escape_start
3207 return new_character_token '<!' # fixfull split
3209 tok_state = tok_state_script_data
3210 cur -= 1 # Reconsume
3211 return new_character_token '<'
3213 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Tokenizer step after '</' in script data: a letter starts an end tag token
# (name lowercased, raw character appended to temporary_buffer); anything else
# returns to tok_state_script_data and emits '</' as text.
3214 tok_state_script_data_end_tag_open = ->
3215 c = txt.charAt(cur++)
3217 tok_cur_tag = new_end_tag c.toLowerCase()
3218 temporary_buffer += c
3219 tok_state = tok_state_script_data_end_tag_name
3222 tok_cur_tag = new_end_tag c
3223 temporary_buffer += c
3224 tok_state = tok_state_script_data_end_tag_name
3227 tok_state = tok_state_script_data
3228 cur -= 1 # Reconsume
3229 return new_character_token '</'
3231 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Tokenizer step while reading an end tag name in script data. Three
# transitions (before-attribute-name, self-closing, data) are each guarded by
# is_appropriate_end_tag; letters extend the tag name and temporary_buffer.
# The fallback returns to tok_state_script_data and emits the buffered text.
3232 tok_state_script_data_end_tag_name = ->
3233 c = txt.charAt(cur++)
3234 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3235 if is_appropriate_end_tag tok_cur_tag
3236 tok_state = tok_state_before_attribute_name
3240 if is_appropriate_end_tag tok_cur_tag
3241 tok_state = tok_state_self_closing_start_tag
3245 if is_appropriate_end_tag tok_cur_tag
3246 tok_state = tok_state_data
3250 tok_cur_tag.name += c.toLowerCase()
3251 temporary_buffer += c
3254 tok_cur_tag.name += c
3255 temporary_buffer += c
# not an end tag after all: the consumed "</" plus buffered letters become text
3258 tok_state = tok_state_script_data
3259 cur -= 1 # Reconsume
3260 return new_character_token "</#{temporary_buffer}" # fixfull split
3262 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Tokenizer step for the script data escape start state: one branch emits '-'
# and moves to the escape-start-dash state; the other un-consumes the
# character and returns to script data.
3263 tok_state_script_data_escape_start = ->
3264 c = txt.charAt(cur++)
3266 tok_state = tok_state_script_data_escape_start_dash
3267 return new_character_token '-'
3269 tok_state = tok_state_script_data
3270 cur -= 1 # Reconsume
3273 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Tokenizer step for the script data escape start dash state: one branch emits
# '-' and moves to the escaped-dash-dash state; the other un-consumes the
# character and returns to script data.
3274 tok_state_script_data_escape_start_dash = ->
3275 c = txt.charAt(cur++)
3277 tok_state = tok_state_script_data_escaped_dash_dash
3278 return new_character_token '-'
3280 tok_state = tok_state_script_data
3281 cur -= 1 # Reconsume
3284 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Tokenizer step for the script data escaped state: emits '-', U+FFFD or the
# consumed character, or moves to the escaped-less-than-sign state; one branch
# un-consumes the character and falls back to the data state.
3285 tok_state_script_data_escaped = ->
3286 c = txt.charAt(cur++)
3288 tok_state = tok_state_script_data_escaped_dash
3289 return new_character_token '-'
3291 tok_state = tok_state_script_data_escaped_less_than_sign
3295 return new_character_token "\ufffd"
3297 tok_state = tok_state_data
3299 cur -= 1 # Reconsume
3302 return new_character_token c
3304 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Tokenizer step for the script data escaped dash state. Like the escaped
# state, except that the U+FFFD and ordinary-character branches explicitly
# switch back to tok_state_script_data_escaped.
3305 tok_state_script_data_escaped_dash = ->
3306 c = txt.charAt(cur++)
3308 tok_state = tok_state_script_data_escaped_dash_dash
3309 return new_character_token '-'
3311 tok_state = tok_state_script_data_escaped_less_than_sign
3315 tok_state = tok_state_script_data_escaped
3316 return new_character_token "\ufffd"
3318 tok_state = tok_state_data
3320 cur -= 1 # Reconsume
3323 tok_state = tok_state_script_data_escaped
3324 return new_character_token c
3326 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3327 tok_state_script_data_escaped_dash_dash = ->
3328 c = txt.charAt(cur++)
3330 return new_character_token '-'
3332 tok_state = tok_state_script_data_escaped_less_than_sign
3335 tok_state = tok_state_script_data
3336 return new_character_token '>'
3339 tok_state = tok_state_script_data_escaped
3340 return new_character_token "\ufffd"
3343 tok_state = tok_state_data
3344 cur -= 1 # Reconsume
3347 tok_state = tok_state_script_data_escaped
3348 return new_character_token c
3350 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer state for the character following a '<' inside escaped script
# data. Consumes one character from txt, chooses the next tokenizer state and,
# where the spec asks for it, returns the character token to emit.
3351 tok_state_script_data_escaped_less_than_sign = ->
3352 c = txt.charAt(cur++)
# possible end tag: start collecting its name from scratch
3354 temporary_buffer = ''
3355 tok_state = tok_state_script_data_escaped_end_tag_open
# temporary_buffer keeps the lowercased letter (it is later compared against
# 'script'), while the emitted text keeps the original case
3358 temporary_buffer = c.toLowerCase() # yes, really
3359 tok_state = tok_state_script_data_double_escape_start
3360 return new_character_token "<#{c}" # fixfull split
3362 temporary_buffer = c
3363 tok_state = tok_state_script_data_double_escape_start
3364 return new_character_token "<#{c}" # fixfull split
# Anything else: the '<' was plain text. Emit the '<' itself and hand the
# current character back to the escaped state. Emitting c here would drop the
# '<' and output c twice, because c is reconsumed.
3366 tok_state = tok_state_script_data_escaped
3367 cur -= 1 # Reconsume
3368 return new_character_token '<'
3370 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3371 tok_state_script_data_escaped_end_tag_open = ->
3372 c = txt.charAt(cur++)
3374 tok_cur_tag = new_end_tag c.toLowerCase()
3375 temporary_buffer += c
3376 tok_state = tok_state_script_data_escaped_end_tag_name
3379 tok_cur_tag = new_end_tag c
3380 temporary_buffer += c
3381 tok_state = tok_state_script_data_escaped_end_tag_name
3384 tok_state = tok_state_script_data_escaped
3385 cur -= 1 # Reconsume
3386 return new_character_token '</' # fixfull split
3388 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer state that collects the name of a possible end tag inside escaped
# script data. tok_cur_tag.name receives the lowercased name; temporary_buffer
# receives the characters exactly as written, because the fallback at the
# bottom re-emits them as text.
3389 tok_state_script_data_escaped_end_tag_name = ->
3390 c = txt.charAt(cur++)
3391 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3392 if is_appropriate_end_tag tok_cur_tag
3393 tok_state = tok_state_before_attribute_name
3397 if is_appropriate_end_tag tok_cur_tag
3398 tok_state = tok_state_self_closing_start_tag
3402 if is_appropriate_end_tag tok_cur_tag
3403 tok_state = tok_state_data
# tag name is normalised to lowercase, the buffer keeps the source spelling
3407 tok_cur_tag.name += c.toLowerCase()
3408 temporary_buffer += c
3411 tok_cur_tag.name += c
3412 temporary_buffer += c
# Anything else: not an end tag after all, so emit what was consumed as text
# (unchanged, hence the case-preserving buffer) and reconsume c
3415 tok_state = tok_state_script_data_escaped
3416 cur -= 1 # Reconsume
3417 return new_character_token "</#{temporary_buffer}" # fixfull split
3419 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3420 tok_state_script_data_double_escape_start = ->
3421 c = txt.charAt(cur++)
3422 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3423 if temporary_buffer is 'script'
3424 tok_state = tok_state_script_data_double_escaped
3426 tok_state = tok_state_script_data_escaped
3427 return new_character_token c
3429 temporary_buffer += c.toLowerCase() # yes, really lowercase
3430 return new_character_token c
3432 temporary_buffer += c
3433 return new_character_token c
3435 tok_state = tok_state_script_data_escaped
3436 cur -= 1 # Reconsume
3439 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3440 tok_state_script_data_double_escaped = ->
3441 c = txt.charAt(cur++)
3443 tok_state = tok_state_script_data_double_escaped_dash
3444 return new_character_token '-'
3446 tok_state = tok_state_script_data_double_escaped_less_than_sign
3447 return new_character_token '<'
3450 return new_character_token "\ufffd"
3453 tok_state = tok_state_data
3454 cur -= 1 # Reconsume
3457 return new_character_token c
3459 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3460 tok_state_script_data_double_escaped_dash = ->
3461 c = txt.charAt(cur++)
3463 tok_state = tok_state_script_data_double_escaped_dash_dash
3464 return new_character_token '-'
3466 tok_state = tok_state_script_data_double_escaped_less_than_sign
3467 return new_character_token '<'
3470 tok_state = tok_state_script_data_double_escaped
3471 return new_character_token "\ufffd"
3474 tok_state = tok_state_data
3475 cur -= 1 # Reconsume
3478 tok_state = tok_state_script_data_double_escaped
3479 return new_character_token c
3481 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3482 tok_state_script_data_double_escaped_dash_dash = ->
3483 c = txt.charAt(cur++)
3485 return new_character_token '-'
3487 tok_state = tok_state_script_data_double_escaped_less_than_sign
3488 return new_character_token '<'
3490 tok_state = tok_state_script_data
3491 return new_character_token '>'
3494 tok_state = tok_state_script_data_double_escaped
3495 return new_character_token "\ufffd"
3498 tok_state = tok_state_data
3499 cur -= 1 # Reconsume
3502 tok_state = tok_state_script_data_double_escaped
3503 return new_character_token c
3505 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3506 tok_state_script_data_double_escaped_less_than_sign = ->
3507 c = txt.charAt(cur++)
3509 temporary_buffer = ''
3510 tok_state = tok_state_script_data_double_escape_end
3511 return new_character_token '/'
3513 tok_state = tok_state_script_data_double_escaped
3514 cur -= 1 # Reconsume
3517 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3518 tok_state_script_data_double_escape_end = ->
3519 c = txt.charAt(cur++)
3520 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3521 if temporary_buffer is 'script'
3522 tok_state = tok_state_script_data_escaped
3524 tok_state = tok_state_script_data_double_escaped
3525 return new_character_token c
3527 temporary_buffer += c.toLowerCase() # yes, really lowercase
3528 return new_character_token c
3530 temporary_buffer += c
3531 return new_character_token c
3533 tok_state = tok_state_script_data_double_escaped
3534 cur -= 1 # Reconsume
3537 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3538 tok_state_before_attribute_name = ->
3540 switch c = txt.charAt(cur++)
3541 when "\t", "\n", "\u000c", ' '
3544 tok_state = tok_state_self_closing_start_tag
3547 tok_state = tok_state_data
3553 attr_name = "\ufffd"
3554 when '"', "'", '<', '='
3559 tok_state = tok_state_data
3562 attr_name = c.toLowerCase()
3566 tok_cur_tag.attrs_a.unshift [attr_name, '']
3567 tok_state = tok_state_attribute_name
3570 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3571 tok_state_attribute_name = ->
3572 switch c = txt.charAt(cur++)
3573 when "\t", "\n", "\u000c", ' '
3574 tok_state = tok_state_after_attribute_name
3576 tok_state = tok_state_self_closing_start_tag
3578 tok_state = tok_state_before_attribute_value
3580 tok_state = tok_state_data
3586 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3589 tok_cur_tag.attrs_a[0][0] += c
3592 tok_state = tok_state_data
3595 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3597 tok_cur_tag.attrs_a[0][0] += c
3600 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3601 tok_state_after_attribute_name = ->
3602 c = txt.charAt(cur++)
3603 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3606 tok_state = tok_state_self_closing_start_tag
3609 tok_state = tok_state_before_attribute_value
3612 tok_state = tok_state_data
3615 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3616 tok_state = tok_state_attribute_name
3620 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3621 tok_state = tok_state_attribute_name
3625 tok_state = tok_state_data
3626 cur -= 1 # reconsume
3628 if c is '"' or c is "'" or c is '<'
3630 # fall through to Anything else
3632 tok_cur_tag.attrs_a.unshift [c, '']
3633 tok_state = tok_state_attribute_name
3635 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3636 tok_state_before_attribute_value = ->
3637 switch c = txt.charAt(cur++)
3638 when "\t", "\n", "\u000c", ' '
3641 tok_state = tok_state_attribute_value_double_quoted
3643 tok_state = tok_state_attribute_value_unquoted
3646 tok_state = tok_state_attribute_value_single_quoted
3649 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3650 tok_state = tok_state_attribute_value_unquoted
3653 tok_state = tok_state_data
3659 tok_state = tok_state_data
3661 tok_cur_tag.attrs_a[0][1] += c
3662 tok_state = tok_state_attribute_value_unquoted
3665 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3666 tok_state_attribute_value_double_quoted = ->
3667 switch c = txt.charAt(cur++)
3669 tok_state = tok_state_after_attribute_value_quoted
3671 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3674 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3677 tok_state = tok_state_data
3679 tok_cur_tag.attrs_a[0][1] += c
3682 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3683 tok_state_attribute_value_single_quoted = ->
3684 switch c = txt.charAt(cur++)
3686 tok_state = tok_state_after_attribute_value_quoted
3688 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3691 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3694 tok_state = tok_state_data
3696 tok_cur_tag.attrs_a[0][1] += c
3699 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3700 tok_state_attribute_value_unquoted = ->
3701 switch c = txt.charAt(cur++)
3702 when "\t", "\n", "\u000c", ' '
3703 tok_state = tok_state_before_attribute_name
3705 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3707 tok_state = tok_state_data
3712 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3715 tok_state = tok_state_data
3717 # Parse Error if ", ', <, = or ` (backtick)
3718 tok_cur_tag.attrs_a[0][1] += c
3721 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3722 tok_state_after_attribute_value_quoted = ->
3723 switch c = txt.charAt(cur++)
3724 when "\t", "\n", "\u000c", ' '
3725 tok_state = tok_state_before_attribute_name
3727 tok_state = tok_state_self_closing_start_tag
3729 tok_state = tok_state_data
3735 tok_state = tok_state_data
3738 tok_state = tok_state_before_attribute_name
3739 cur -= 1 # we didn't handle that char
3742 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3743 tok_state_self_closing_start_tag = ->
3744 c = txt.charAt(cur++)
3746 tok_cur_tag.flag 'self-closing'
3747 tok_state = tok_state_data
3751 tok_state = tok_state_data
3752 cur -= 1 # Reconsume
3756 tok_state = tok_state_before_attribute_name
3757 cur -= 1 # Reconsume
3760 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3761 # WARNING: put a comment token in tok_cur_tag before setting this state
3762 tok_state_bogus_comment = ->
3763 next_gt = txt.indexOf '>', cur
3765 val = txt.substr cur
3768 val = txt.substr cur, (next_gt - cur)
3770 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3771 tok_cur_tag.text += val
3772 tok_state = tok_state_data
3775 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3776 tok_state_markup_declaration_open = ->
3777 if txt.substr(cur, 2) is '--'
3779 tok_cur_tag = new_comment_token ''
3780 tok_state = tok_state_comment_start
3782 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3784 tok_state = tok_state_doctype
3786 acn = adjusted_current_node()
3787 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3789 tok_state = tok_state_cdata_section
3793 tok_cur_tag = new_comment_token ''
3794 tok_state = tok_state_bogus_comment
3797 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state for the first character after "<!--". The comment token
# being built is tok_cur_tag; its data accumulates in tok_cur_tag.text.
3798 tok_state_comment_start = ->
3799 switch c = txt.charAt(cur++)
3801 tok_state = tok_state_comment_start_dash
# NUL: the replacement character belongs to the comment's data (as in the
# other comment states), it must not be emitted as a separate text token
3804 tok_state = tok_state_comment
3805 tok_cur_tag.text += "\ufffd"
3808 tok_state = tok_state_data
3812 tok_state = tok_state_data
3813 cur -= 1 # Reconsume
# Anything else: first character of the comment's data
3816 tok_cur_tag.text += c
3817 tok_state = tok_state_comment
3820 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3821 tok_state_comment_start_dash = ->
3822 switch c = txt.charAt(cur++)
3824 tok_state = tok_state_comment_end
3827 tok_cur_tag.text += "-\ufffd"
3828 tok_state = tok_state_comment
3831 tok_state = tok_state_data
3835 tok_state = tok_state_data
3836 cur -= 1 # Reconsume
3839 tok_cur_tag.text += "-#{c}"
3840 tok_state = tok_state_comment
3843 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3844 tok_state_comment = ->
3845 switch c = txt.charAt(cur++)
3847 tok_state = tok_state_comment_end_dash
3850 tok_cur_tag.text += "\ufffd"
3853 tok_state = tok_state_data
3854 cur -= 1 # Reconsume
3857 tok_cur_tag.text += c
3860 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3861 tok_state_comment_end_dash = ->
3862 switch c = txt.charAt(cur++)
3864 tok_state = tok_state_comment_end
3867 tok_cur_tag.text += "-\ufffd"
3868 tok_state = tok_state_comment
3871 tok_state = tok_state_data
3872 cur -= 1 # Reconsume
3875 tok_cur_tag.text += "-#{c}"
3876 tok_state = tok_state_comment
3879 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3880 tok_state_comment_end = ->
3881 switch c = txt.charAt(cur++)
3883 tok_state = tok_state_data
3887 tok_cur_tag.text += "--\ufffd"
3888 tok_state = tok_state_comment
3891 tok_state = tok_state_comment_end_bang
3894 tok_cur_tag.text += '-'
3897 tok_state = tok_state_data
3898 cur -= 1 # Reconsume
3902 tok_cur_tag.text += "--#{c}"
3903 tok_state = tok_state_comment
3906 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer state reached after "--!" inside a comment. Depending on the next
# character the comment either ends or the "--!" turns out to be ordinary
# comment data and is appended to tok_cur_tag.text.
3907 tok_state_comment_end_bang = ->
3908 switch c = txt.charAt(cur++)
# NOTE(review): this is the branch that hands off to the comment end dash
# state (the '-' case in the spec). Only "--!" is appended here; the dash
# just consumed is accounted for by the comment end dash state, so appending
# c as well would duplicate it.
3910 tok_cur_tag.text += "--!"
3911 tok_state = tok_state_comment_end_dash
3913 tok_state = tok_state_data
3917 tok_cur_tag.text += "--!\ufffd"
3918 tok_state = tok_state_comment
3921 tok_state = tok_state_data
3922 cur -= 1 # Reconsume
# Anything else: "--!" plus the current character are plain comment data
3925 tok_cur_tag.text += "--!#{c}"
3926 tok_state = tok_state_comment
3929 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3930 tok_state_doctype = ->
3931 switch c = txt.charAt(cur++)
3932 when "\t", "\u000a", "\u000c", ' '
3933 tok_state = tok_state_before_doctype_name
3936 tok_state = tok_state_data
3937 el = new_doctype_token ''
3938 el.flag 'force-quirks', true
3939 cur -= 1 # Reconsume
3943 tok_state = tok_state_before_doctype_name
3944 cur -= 1 # Reconsume
3947 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
3948 tok_state_before_doctype_name = ->
3949 c = txt.charAt(cur++)
3950 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3953 tok_cur_tag = new_doctype_token c.toLowerCase()
3954 tok_state = tok_state_doctype_name
3958 tok_cur_tag = new_doctype_token "\ufffd"
3959 tok_state = tok_state_doctype_name
3963 el = new_doctype_token ''
3964 el.flag 'force-quirks', true
3965 tok_state = tok_state_data
3969 tok_state = tok_state_data
3970 el = new_doctype_token ''
3971 el.flag 'force-quirks', true
3972 cur -= 1 # Reconsume
3975 tok_cur_tag = new_doctype_token c
3976 tok_state = tok_state_doctype_name
3979 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3980 tok_state_doctype_name = ->
3981 c = txt.charAt(cur++)
3982 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3983 tok_state = tok_state_after_doctype_name
3986 tok_state = tok_state_data
3989 tok_cur_tag.name += c.toLowerCase()
3993 tok_cur_tag.name += "\ufffd"
3997 tok_state = tok_state_data
3998 tok_cur_tag.flag 'force-quirks', true
3999 cur -= 1 # Reconsume
4002 tok_cur_tag.name += c
4005 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4006 tok_state_after_doctype_name = ->
4007 c = txt.charAt(cur++)
4008 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4011 tok_state = tok_state_data
4015 tok_state = tok_state_data
4016 tok_cur_tag.flag 'force-quirks', true
4017 cur -= 1 # Reconsume
4020 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4022 tok_state = tok_state_after_doctype_public_keyword
4024 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4026 tok_state = tok_state_after_doctype_system_keyword
4029 tok_cur_tag.flag 'force-quirks', true
4030 tok_state = tok_state_bogus_doctype
4033 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4034 tok_state_after_doctype_public_keyword = ->
4035 c = txt.charAt(cur++)
4036 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4037 tok_state = tok_state_before_doctype_public_identifier
4041 tok_cur_tag.public_identifier = ''
4042 tok_state = tok_state_doctype_public_identifier_double_quoted
4046 tok_cur_tag.public_identifier = ''
4047 tok_state = tok_state_doctype_public_identifier_single_quoted
4051 tok_cur_tag.flag 'force-quirks', true
4052 tok_state = tok_state_data
4056 tok_state = tok_state_data
4057 tok_cur_tag.flag 'force-quirks', true
4058 cur -= 1 # Reconsume
4062 tok_cur_tag.flag 'force-quirks', true
4063 tok_state = tok_state_bogus_doctype
4066 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4067 tok_state_before_doctype_public_identifier = ->
4068 c = txt.charAt(cur++)
4069 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4073 tok_cur_tag.public_identifier = ''
4074 tok_state = tok_state_doctype_public_identifier_double_quoted
4078 tok_cur_tag.public_identifier = ''
4079 tok_state = tok_state_doctype_public_identifier_single_quoted
4083 tok_cur_tag.flag 'force-quirks', true
4084 tok_state = tok_state_data
4088 tok_state = tok_state_data
4089 tok_cur_tag.flag 'force-quirks', true
4090 cur -= 1 # Reconsume
4094 tok_cur_tag.flag 'force-quirks', true
4095 tok_state = tok_state_bogus_doctype
4099 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4100 tok_state_doctype_public_identifier_double_quoted = ->
4101 c = txt.charAt(cur++)
4103 tok_state = tok_state_after_doctype_public_identifier
4107 tok_cur_tag.public_identifier += "\ufffd"
4111 tok_cur_tag.flag 'force-quirks', true
4112 tok_state = tok_state_data
4116 tok_state = tok_state_data
4117 tok_cur_tag.flag 'force-quirks', true
4118 cur -= 1 # Reconsume
4121 tok_cur_tag.public_identifier += c
4124 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4125 tok_state_doctype_public_identifier_single_quoted = ->
4126 c = txt.charAt(cur++)
4128 tok_state = tok_state_after_doctype_public_identifier
4132 tok_cur_tag.public_identifier += "\ufffd"
4136 tok_cur_tag.flag 'force-quirks', true
4137 tok_state = tok_state_data
4141 tok_state = tok_state_data
4142 tok_cur_tag.flag 'force-quirks', true
4143 cur -= 1 # Reconsume
4146 tok_cur_tag.public_identifier += c
4149 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4150 tok_state_after_doctype_public_identifier = ->
4151 c = txt.charAt(cur++)
4152 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4153 tok_state = tok_state_between_doctype_public_and_system_identifiers
4156 tok_state = tok_state_data
4160 tok_cur_tag.system_identifier = ''
4161 tok_state = tok_state_doctype_system_identifier_double_quoted
4165 tok_cur_tag.system_identifier = ''
4166 tok_state = tok_state_doctype_system_identifier_single_quoted
4170 tok_state = tok_state_data
4171 tok_cur_tag.flag 'force-quirks', true
4172 cur -= 1 # Reconsume
4176 tok_cur_tag.flag 'force-quirks', true
4177 tok_state = tok_state_bogus_doctype
4180 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4181 tok_state_between_doctype_public_and_system_identifiers = ->
4182 c = txt.charAt(cur++)
4183 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4186 tok_state = tok_state_data
4190 tok_cur_tag.system_identifier = ''
4191 tok_state = tok_state_doctype_system_identifier_double_quoted
4195 tok_cur_tag.system_identifier = ''
4196 tok_state = tok_state_doctype_system_identifier_single_quoted
4200 tok_state = tok_state_data
4201 tok_cur_tag.flag 'force-quirks', true
4202 cur -= 1 # Reconsume
4206 tok_cur_tag.flag 'force-quirks', true
4207 tok_state = tok_state_bogus_doctype
4210 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4211 tok_state_after_doctype_system_keyword = ->
4212 c = txt.charAt(cur++)
4213 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4214 tok_state = tok_state_before_doctype_system_identifier
4218 tok_cur_tag.system_identifier = ''
4219 tok_state = tok_state_doctype_system_identifier_double_quoted
4223 tok_cur_tag.system_identifier = ''
4224 tok_state = tok_state_doctype_system_identifier_single_quoted
4228 tok_cur_tag.flag 'force-quirks', true
4229 tok_state = tok_state_data
4233 tok_state = tok_state_data
4234 tok_cur_tag.flag 'force-quirks', true
4235 cur -= 1 # Reconsume
4239 tok_cur_tag.flag 'force-quirks', true
4240 tok_state = tok_state_bogus_doctype
4243 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4244 tok_state_before_doctype_system_identifier = ->
4245 c = txt.charAt(cur++)
4246 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4249 tok_cur_tag.system_identifier = ''
4250 tok_state = tok_state_doctype_system_identifier_double_quoted
4253 tok_cur_tag.system_identifier = ''
4254 tok_state = tok_state_doctype_system_identifier_single_quoted
4258 tok_cur_tag.flag 'force-quirks', true
4259 tok_state = tok_state_data
4263 tok_state = tok_state_data
4264 tok_cur_tag.flag 'force-quirks', true
4265 cur -= 1 # Reconsume
4269 tok_cur_tag.flag 'force-quirks', true
4270 tok_state = tok_state_bogus_doctype
4273 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4274 tok_state_doctype_system_identifier_double_quoted = ->
4275 c = txt.charAt(cur++)
4277 tok_state = tok_state_after_doctype_system_identifier
4281 tok_cur_tag.system_identifier += "\ufffd"
4285 tok_cur_tag.flag 'force-quirks', true
4286 tok_state = tok_state_data
4290 tok_state = tok_state_data
4291 tok_cur_tag.flag 'force-quirks', true
4292 cur -= 1 # Reconsume
4295 tok_cur_tag.system_identifier += c
4298 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4299 tok_state_doctype_system_identifier_single_quoted = ->
4300 c = txt.charAt(cur++)
4302 tok_state = tok_state_after_doctype_system_identifier
4306 tok_cur_tag.system_identifier += "\ufffd"
4310 tok_cur_tag.flag 'force-quirks', true
4311 tok_state = tok_state_data
4315 tok_state = tok_state_data
4316 tok_cur_tag.flag 'force-quirks', true
4317 cur -= 1 # Reconsume
4320 tok_cur_tag.system_identifier += c
4323 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4324 tok_state_after_doctype_system_identifier = ->
4325 c = txt.charAt(cur++)
4326 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4329 tok_state = tok_state_data
4333 tok_state = tok_state_data
4334 tok_cur_tag.flag 'force-quirks', true
4335 cur -= 1 # Reconsume
4339 # do _not_ tok_cur_tag.flag 'force-quirks', true
4340 tok_state = tok_state_bogus_doctype
4343 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4344 tok_state_bogus_doctype = ->
4345 c = txt.charAt(cur++)
4347 tok_state = tok_state_data
4350 tok_state = tok_state_data
4351 cur -= 1 # Reconsume
4356 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4357 tok_state_cdata_section = ->
4358 tok_state = tok_state_data
4359 next_gt = txt.indexOf ']]>', cur
4361 val = txt.substr cur
4364 val = txt.substr cur, (next_gt - cur)
4366 return new_character_token val # fixfull split
4368 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4369 # Don't set this as a state, just call it
4370 # returns a string (NOT a text node)
4371 parse_character_reference = (allowed_char = null, in_attr = false) ->
4372 if cur >= txt.length
4374 switch c = txt.charAt(cur)
4375 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4376 # explicitly not a parse error
4379 # there has to be "one or more" alnums between & and ; to be a parse error
4382 if cur + 1 >= txt.length
4384 if txt.charAt(cur + 1).toLowerCase() is 'x'
4393 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4398 if txt.charAt(start + i) is ';'
4402 code_point = txt.substr(start, i)
4403 while code_point.charAt(0) is '0' and code_point.length > 1
4404 code_point = code_point.substr 1
4405 code_point = parseInt(code_point, base)
4406 if unicode_fixes[code_point]?
4408 return unicode_fixes[code_point]
4410 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4414 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4416 return from_code_point code_point
4420 if alnum.indexOf(txt.charAt(cur + i)) is -1
4423 # exit early, because parse_error() below needs at least one alnum
4425 if txt.charAt(cur + i) is ';'
4426 i += 1 # include ';' terminator in value
4427 decoded = decode_named_char_ref txt.substr(cur, i)
4434 # no ';' terminator (only legacy char refs)
4436 for i in [2..max] # no prefix matches, so ok to check shortest first
4437 c = legacy_char_refs[txt.substr(cur, i)]
4440 if txt.charAt(cur + i) is '='
4441 # "because some legacy user agents will
4442 # misinterpret the markup in those cases"
4445 if alnum.indexOf(txt.charAt(cur + i)) > -1
4446 # this makes attributes forgiving about url args
4448 # ok, and besides the weird exceptions for attributes...
4449 # return the matching char
4450 cur += i # consume entity chars
4451 parse_error() # because no terminating ";"
4455 return # never reached
4457 # tree constructor initialization
4458 # see comments on TYPE_TAG/etc for the structure of this data
4461 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4463 afe = [] # active formatting elements
4464 template_ins_modes = []
4465 ins_mode = ins_mode_initial
4466 original_ins_mode = ins_mode # TODO check spec
4467 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4468 flag_frameset_ok = true
4470 flag_foster_parenting = false
4471 form_element_pointer = null
4472 temporary_buffer = null
4473 pending_table_character_tokens = []
4474 head_element_pointer = null
4475 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4476 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4478 # tokenizer initialization
4479 tok_state = tok_state_data
4481 # text pre-processing
4482 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4483 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4484 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4485 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4487 if args.name is "plain-text-unsafe.dat #4"
4490 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4495 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4498 serialize_els = (els, shallow, show_ids) ->
4504 serialized += t.serialize shallow, show_ids
4507 module.exports.parse_html = parse_html
4508 module.exports.debug_log_reset = debug_log_reset
4509 module.exports.debug_log_each = debug_log_each
4510 module.exports.TYPE_TAG = TYPE_TAG
4511 module.exports.TYPE_TEXT = TYPE_TEXT
4512 module.exports.TYPE_COMMENT = TYPE_COMMENT
4513 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4514 module.exports.NS_HTML = NS_HTML
4515 module.exports.NS_MATHML = NS_MATHML
4516 module.exports.NS_SVG = NS_SVG