1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WHATWG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # the spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
87 # quirks mode constants
97 debug_log_each = (cb) ->
98 for str in g_debug_log
103 constructor: (type, args = {}) ->
104 @type = type # one of the TYPE_* constants above
105 @name = args.name ? '' # tag name
106 @text = args.text ? '' # contents for text/comment nodes
107 @attrs = args.attrs ? {}
108 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
109 @children = args.children ? []
110 @namespace = args.namespace ? NS_HTML
111 @parent = args.parent ? null
112 @token = args.token ? null
113 @flags = args.flags ? {}
117 @id = "#{++prev_node_id}"
118 acknowledge_self_closing: ->
120 @token.flag 'did_self_close', true
122 @flag 'did_self_close', true
123 flag: (key, value = null) ->
128 serialize: (shallow = false, show_ids = false) -> # for unit tests
133 ret += JSON.stringify @name
148 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
154 ret += c.serialize shallow, show_ids
158 ret += JSON.stringify @text
161 ret += JSON.stringify @text
163 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
166 when TYPE_AAA_BOOKMARK
167 ret += 'aaa_bookmark'
170 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
173 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) carrying the given tag name.
new_open_tag = (name) ->
	opts = {name: name}
	new Node TYPE_START_TAG, opts
# Build an end-tag token (TYPE_END_TAG) carrying the given tag name.
new_end_tag = (name) ->
	opts = {name: name}
	new Node TYPE_END_TAG, opts
# Build an element node (TYPE_TAG) with the given tag name; every other
# Node field is left to the Node constructor's defaults.
new_element = (name) ->
	opts = {name: name}
	new Node TYPE_TAG, opts
# Build a text node (TYPE_TEXT) whose contents are `txt`.
new_text_node = (txt) ->
	opts = {text: txt}
	new Node TYPE_TEXT, opts
# Character tokens are represented exactly like text nodes, so the same
# factory is exposed under a second name.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose contents are `txt`.
new_comment_token = (txt) ->
	opts = {text: txt}
	new Node TYPE_COMMENT, opts
# Build a doctype token (TYPE_DOCTYPE); `name` is stored as the node's name.
new_doctype_token = (name) ->
	opts = {name: name}
	new Node TYPE_DOCTYPE, opts
188 return new Node TYPE_EOF
190 return new Node TYPE_AFE_MARKER
# Build a placeholder node of type TYPE_AAA_BOOKMARK (the bookmark used by
# the adoption agency algorithm); it carries no name, text or attributes.
new_aaa_bookmark = -> new Node(TYPE_AAA_BOOKMARK)
# ASCII character-class strings. Membership is tested elsewhere with
# indexOf(), so each is simply the concatenation of its member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
digits = "0123456789"
# letters (both cases) followed by digits
alnum = [lc_alpha, uc_alpha, digits].join ''
# digits plus a-f in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True only when `str` has length 1 and that character appears in uc_alpha.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1
# True only when `str` has length 1 and that character appears in lc_alpha.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
# Characters accepted in tag names: alphanumerics plus "-", because some SVG
# element names contain dashes.
tag_name_chars = "#{alnum}-"
# HTML "space characters": tab, line feed, form feed, carriage return, space.
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\t\n\f\r "
211 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is exactly one
# character from space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	txt = t.text
	txt.length is 1 and space_chars.indexOf(txt) isnt -1
215 is_input_hidden_tok = (t) ->
216 return false unless t.type is TYPE_START_TAG
219 if a[1].toLowerCase() is 'hidden'
# Unicode whitespace characters, concatenated into one string.
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = [
	"\u0009\u000a\u000b\u000c\u000d\u0020" # ASCII controls and space
	"\u0085\u00a0\u1680"
	"\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
	"\u2028\u2029\u202f\u205f\u3000"
].join ''
228 unicode_fixes[0x00] = "\uFFFD"
229 unicode_fixes[0x80] = "\u20AC"
230 unicode_fixes[0x82] = "\u201A"
231 unicode_fixes[0x83] = "\u0192"
232 unicode_fixes[0x84] = "\u201E"
233 unicode_fixes[0x85] = "\u2026"
234 unicode_fixes[0x86] = "\u2020"
235 unicode_fixes[0x87] = "\u2021"
236 unicode_fixes[0x88] = "\u02C6"
237 unicode_fixes[0x89] = "\u2030"
238 unicode_fixes[0x8A] = "\u0160"
239 unicode_fixes[0x8B] = "\u2039"
240 unicode_fixes[0x8C] = "\u0152"
241 unicode_fixes[0x8E] = "\u017D"
242 unicode_fixes[0x91] = "\u2018"
243 unicode_fixes[0x92] = "\u2019"
244 unicode_fixes[0x93] = "\u201C"
245 unicode_fixes[0x94] = "\u201D"
246 unicode_fixes[0x95] = "\u2022"
247 unicode_fixes[0x96] = "\u2013"
248 unicode_fixes[0x97] = "\u2014"
249 unicode_fixes[0x98] = "\u02DC"
250 unicode_fixes[0x99] = "\u2122"
251 unicode_fixes[0x9A] = "\u0161"
252 unicode_fixes[0x9B] = "\u203A"
253 unicode_fixes[0x9C] = "\u0153"
254 unicode_fixes[0x9E] = "\u017E"
255 unicode_fixes[0x9F] = "\u0178"
257 quirks_yes_pi_prefixes = [
258 "+//silmaril//dtd html pro v0r11 19970101//"
259 "-//as//dtd html 3.0 aswedit + extensions//"
260 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
261 "-//ietf//dtd html 2.0 level 1//"
262 "-//ietf//dtd html 2.0 level 2//"
263 "-//ietf//dtd html 2.0 strict level 1//"
264 "-//ietf//dtd html 2.0 strict level 2//"
265 "-//ietf//dtd html 2.0 strict//"
266 "-//ietf//dtd html 2.0//"
267 "-//ietf//dtd html 2.1e//"
268 "-//ietf//dtd html 3.0//"
269 "-//ietf//dtd html 3.2 final//"
270 "-//ietf//dtd html 3.2//"
271 "-//ietf//dtd html 3//"
272 "-//ietf//dtd html level 0//"
273 "-//ietf//dtd html level 1//"
274 "-//ietf//dtd html level 2//"
275 "-//ietf//dtd html level 3//"
276 "-//ietf//dtd html strict level 0//"
277 "-//ietf//dtd html strict level 1//"
278 "-//ietf//dtd html strict level 2//"
279 "-//ietf//dtd html strict level 3//"
280 "-//ietf//dtd html strict//"
281 "-//ietf//dtd html//"
282 "-//metrius//dtd metrius presentational//"
283 "-//microsoft//dtd internet explorer 2.0 html strict//"
284 "-//microsoft//dtd internet explorer 2.0 html//"
285 "-//microsoft//dtd internet explorer 2.0 tables//"
286 "-//microsoft//dtd internet explorer 3.0 html strict//"
287 "-//microsoft//dtd internet explorer 3.0 html//"
288 "-//microsoft//dtd internet explorer 3.0 tables//"
289 "-//netscape comm. corp.//dtd html//"
290 "-//netscape comm. corp.//dtd strict html//"
291 "-//o'reilly and associates//dtd html 2.0//"
292 "-//o'reilly and associates//dtd html extended 1.0//"
293 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
294 "-//sq//dtd html 2.0 hotmetal + extensions//"
295 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
296 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
297 "-//spyglass//dtd html 2.0 extended//"
298 "-//sun microsystems corp.//dtd hotjava html//"
299 "-//sun microsystems corp.//dtd hotjava strict html//"
300 "-//w3c//dtd html 3 1995-03-24//"
301 "-//w3c//dtd html 3.2 draft//"
302 "-//w3c//dtd html 3.2 final//"
303 "-//w3c//dtd html 3.2//"
304 "-//w3c//dtd html 3.2s draft//"
305 "-//w3c//dtd html 4.0 frameset//"
306 "-//w3c//dtd html 4.0 transitional//"
307 "-//w3c//dtd html experimental 19960712//"
308 "-//w3c//dtd html experimental 970421//"
309 "-//w3c//dtd w3 html//"
310 "-//w3o//dtd w3 html 3.0//"
311 "-//webtechs//dtd mozilla html 2.0//"
312 "-//webtechs//dtd mozilla html//"
315 # These are the character references that don't need a terminating semicolon
316 # min length: 2, max: 6, none are a prefix of any other.
318 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
319 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
320 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
321 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
322 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
323 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
324 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
325 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
326 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
327 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
328 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
329 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
330 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
331 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
332 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
333 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
334 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element-name lists grouped by content model.
# Names listed as void elements:
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input'
	'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
# Names listed as raw text elements:
raw_text_elements = [
	'script'
	'style'
]
# Names listed as escapable raw text elements:
escapable_raw_text_elements = [
	'textarea'
	'title'
]
341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
343 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
344 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
345 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
346 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
347 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
348 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
349 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
350 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
351 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
352 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
353 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
354 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
355 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
356 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
362 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
363 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
364 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
365 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
366 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
367 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
368 'determinant', 'diff', 'divergence', 'divide', 'domain',
369 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
370 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
371 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
372 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
373 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
374 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
375 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
376 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
377 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
378 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
379 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
380 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
381 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
382 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
383 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
384 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
385 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
386 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
387 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
388 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
389 'vectorproduct', 'xor'
391 # foreign_elements = [svg_elements..., mathml_elements...]
392 #normal_elements = All other allowed HTML elements are normal elements.
396 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
397 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
398 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
399 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
400 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
401 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
402 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
403 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
404 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
405 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
406 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
408 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
410 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
411 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
412 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
413 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
414 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
415 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
416 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
419 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
420 'annotation-xml':NS_MATHML,
423 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
426 formatting_elements = {
427 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
428 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
432 mathml_text_integration = {
433 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when `el`'s name is a key of mathml_text_integration and the namespace
# recorded there equals el.namespace.
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	expected_ns is el.namespace
437 is_html_integration = (el) -> # DON'T PASS A TOKEN
438 if el.namespace is NS_MATHML
439 if el.name is 'annotation-xml'
440 if el.attrs.encoding?
441 if el.attrs.encoding.toLowerCase() is 'text/html'
443 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
446 if el.namespace is NS_SVG
447 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
452 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
455 foster_parenting_targets = {
# True when special_elements maps e.name to exactly e's namespace, so the
# same tag name in a different namespace does not count.
el_is_special = (e) ->
	expected_ns = special_elements[e.name]
	expected_ns is e.namespace
# The three HTML elements (address, div, p) excluded by el_is_special_not_adp.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
# True when `el` matches special_elements (name and namespace) but does not
# match adp_els.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
485 altglyphdef: 'altGlyphDef'
486 altglyphitem: 'altGlyphItem'
487 animatecolor: 'animateColor'
488 animatemotion: 'animateMotion'
489 animatetransform: 'animateTransform'
492 fecolormatrix: 'feColorMatrix'
493 fecomponenttransfer: 'feComponentTransfer'
494 fecomposite: 'feComposite'
495 feconvolvematrix: 'feConvolveMatrix'
496 fediffuselighting: 'feDiffuseLighting'
497 fedisplacementmap: 'feDisplacementMap'
498 fedistantlight: 'feDistantLight'
499 fedropshadow: 'feDropShadow'
505 fegaussianblur: 'feGaussianBlur'
508 femergenode: 'feMergeNode'
509 femorphology: 'feMorphology'
511 fepointlight: 'fePointLight'
512 fespecularlighting: 'feSpecularLighting'
513 fespotlight: 'feSpotLight'
515 feturbulence: 'feTurbulence'
516 foreignobject: 'foreignObject'
518 lineargradient: 'linearGradient'
519 radialgradient: 'radialGradient'
522 svg_attribute_fixes = {
523 attributename: 'attributeName'
524 attributetype: 'attributeType'
525 basefrequency: 'baseFrequency'
526 baseprofile: 'baseProfile'
528 clippathunits: 'clipPathUnits'
529 contentscripttype: 'contentScriptType'
530 contentstyletype: 'contentStyleType'
531 diffuseconstant: 'diffuseConstant'
533 externalresourcesrequired: 'externalResourcesRequired'
534 # WHATWG removes this: filterres: 'filterRes'
535 filterunits: 'filterUnits'
537 gradienttransform: 'gradientTransform'
538 gradientunits: 'gradientUnits'
539 kernelmatrix: 'kernelMatrix'
540 kernelunitlength: 'kernelUnitLength'
541 keypoints: 'keyPoints'
542 keysplines: 'keySplines'
544 lengthadjust: 'lengthAdjust'
545 limitingconeangle: 'limitingConeAngle'
546 markerheight: 'markerHeight'
547 markerunits: 'markerUnits'
548 markerwidth: 'markerWidth'
549 maskcontentunits: 'maskContentUnits'
550 maskunits: 'maskUnits'
551 numoctaves: 'numOctaves'
552 pathlength: 'pathLength'
553 patterncontentunits: 'patternContentUnits'
554 patterntransform: 'patternTransform'
555 patternunits: 'patternUnits'
556 pointsatx: 'pointsAtX'
557 pointsaty: 'pointsAtY'
558 pointsatz: 'pointsAtZ'
559 preservealpha: 'preserveAlpha'
560 preserveaspectratio: 'preserveAspectRatio'
561 primitiveunits: 'primitiveUnits'
564 repeatcount: 'repeatCount'
565 repeatdur: 'repeatDur'
566 requiredextensions: 'requiredExtensions'
567 requiredfeatures: 'requiredFeatures'
568 specularconstant: 'specularConstant'
569 specularexponent: 'specularExponent'
570 spreadmethod: 'spreadMethod'
571 startoffset: 'startOffset'
572 stddeviation: 'stdDeviation'
573 stitchtiles: 'stitchTiles'
574 surfacescale: 'surfaceScale'
575 systemlanguage: 'systemLanguage'
576 tablevalues: 'tableValues'
579 textlength: 'textLength'
581 viewtarget: 'viewTarget'
582 xchannelselector: 'xChannelSelector'
583 ychannelselector: 'yChannelSelector'
584 zoomandpan: 'zoomAndPan'
586 foreign_attr_fixes = {
587 'xlink:actuate': 'xlink actuate'
588 'xlink:arcrole': 'xlink arcrole'
589 'xlink:href': 'xlink href'
590 'xlink:role': 'xlink role'
591 'xlink:show': 'xlink show'
592 'xlink:title': 'xlink title'
593 'xlink:type': 'xlink type'
594 'xml:base': 'xml base'
595 'xml:lang': 'xml lang'
596 'xml:space': 'xml space'
598 'xmlns:xlink': 'xmlns xlink'
600 adjust_mathml_attributes = (t) ->
602 if a[0] is 'definitionurl'
603 a[0] = 'definitionURL'
605 adjust_svg_attributes = (t) ->
607 if svg_attribute_fixes[a[0]]?
608 a[0] = svg_attribute_fixes[a[0]]
610 adjust_foreign_attributes = (t) ->
613 if foreign_attr_fixes[a[0]]?
614 a[0] = foreign_attr_fixes[a[0]]
617 # decode_named_char_ref()
619 # The list of named character references is _huge_ so ask the browser to decode
620 # for us instead of wasting bandwidth/space on including the table here.
622 # Pass without the "&" but with the ";" examples:
623 # for "&" pass "amp;"
624 # for "′" pass "x2032;"
627 textarea: document.createElement('textarea')
629 # TODO test this in IE8
630 decode_named_char_ref = (txt) ->
632 decoded = g_dncr.cache[txt]
633 return decoded if decoded?
634 g_dncr.textarea.innerHTML = txt
635 decoded = g_dncr.textarea.value
636 return null if decoded is txt
637 return g_dncr.cache[txt] = decoded
639 parse_html = (args) ->
641 cur = null # index of next char in txt to be parsed
642 # declare doc and tokenizer variables so they're in scope below
644 open_els = null # stack of open elements
645 afe = null # active formatting elements
646 template_ins_modes = null
648 original_ins_mode = null
650 tok_cur_tag = null # partially parsed tag
651 flag_scripting = null
652 flag_frameset_ok = null
654 flag_foster_parenting = null
655 form_element_pointer = null
656 temporary_buffer = null
657 pending_table_character_tokens = null
658 head_element_pointer = null
659 flag_fragment_parsing = null
660 context_element = null
669 console.log "Parse error at character #{cur} of #{txt.length}"
671 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
672 # "Noah's Ark clause" but with three
673 afe_push = (new_el) ->
676 if el.type is TYPE_AFE_MARKER
678 if el.name is new_el.name and el.namespace is new_el.namespace
681 unless new_el.attrs[k] is v
685 for k, v of new_el.attrs
686 unless el.attrs[k] is v
696 afe.unshift new_afe_marker()
698 # the functions below implement the Tree Construction algorithm
699 # http://www.w3.org/TR/html5/syntax.html#tree-construction
701 # But first... the helpers
702 template_tag_is_open = ->
704 if el.name is 'template' and el.namespace is NS_HTML
707 is_in_scope_x = (tag_name, scope, namespace) ->
709 if el.name is tag_name and (namespace is null or namespace is el.namespace)
711 if scope[el.name] is el.namespace
714 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
716 if el.name is tag_name and (namespace is null or namespace is el.namespace)
718 if scope[el.name] is el.namespace
720 if scope2[el.name] is el.namespace
724 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
725 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
728 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
729 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
731 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra name -> namespace tables passed as the scope set(s) by the
# is_in_*_scope helpers.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope test using only standard_scopers as the scope set.
# namespace: null (the default) means the namespace is not compared.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope test using standard_scopers plus button_scopers as the scope sets.
# namespace: null (the default) means the namespace is not compared.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope test using table_scopers (html, table, template) as the scope set.
# namespace: null (the default) means the namespace is not compared.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope test using standard_scopers plus li_scopers (ol, ul) as the scope sets.
# namespace: null (the default) means the namespace is not compared.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
745 is_in_select_scope = (tag_name, namespace = null) ->
747 if t.name is tag_name and (namespace is null or namespace is t.namespace)
749 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
752 # this checks for a particular element, not by name
753 # this requires a namespace match
754 el_is_in_scope = (needle) ->
758 if standard_scopers[el.name] is el.namespace
762 clear_to_table_stopers = {
767 clear_stack_to_table_context = ->
769 if clear_to_table_stopers[open_els[0].name]?
773 clear_to_table_body_stopers = {
780 clear_stack_to_table_body_context = ->
782 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
786 clear_to_table_row_stopers = {
791 clear_stack_to_table_row_context = ->
793 if clear_to_table_row_stopers[open_els[0].name]?
797 clear_afe_to_marker = ->
799 return unless afe.length > 0 # this happens in fragment case, ?spec error
801 if el.type is TYPE_AFE_MARKER
806 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
808 # 1. Let last be false.
810 # 2. Let node be the last node in the stack of open elements.
812 node = open_els[node_i]
813 # 3. Loop: If node is the first node in the stack of open elements,
814 # then set last to true, and, if the parser was originally created as
815 # part of the HTML fragment parsing algorithm (fragment case) set node
816 # to the context element.
818 if node_i is open_els.length - 1
820 if flag_fragment_parsing
821 node = context_element
822 # 4. If node is a select element, run these substeps:
823 if node.name is 'select' and node.namespace is NS_HTML
824 # 1. If last is true, jump to the step below labeled done.
826 # 2. Let ancestor be node.
829 # 3. Loop: If ancestor is the first node in the stack of
830 # open elements, jump to the step below labeled done.
832 if ancestor_i is open_els.length - 1
834 # 4. Let ancestor be the node before ancestor in the stack
837 ancestor = open_els[ancestor_i]
838 # 5. If ancestor is a template node, jump to the step below
840 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
842 # 6. If ancestor is a table node, switch the insertion mode
843 # to "in select in table" and abort these steps.
844 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
845 ins_mode = ins_mode_in_select_in_table
847 # 7. Jump back to the step labeled loop.
848 # 8. Done: Switch the insertion mode to "in select" and abort
850 ins_mode = ins_mode_in_select
852 # 5. If node is a td or th element and last is false, then switch
853 # the insertion mode to "in cell" and abort these steps.
854 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
855 ins_mode = ins_mode_in_cell
857 # 6. If node is a tr element, then switch the insertion mode to "in
858 # row" and abort these steps.
859 if node.name is 'tr' and node.namespace is NS_HTML
860 ins_mode = ins_mode_in_row
862 # 7. If node is a tbody, thead, or tfoot element, then switch the
863 # insertion mode to "in table body" and abort these steps.
864 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
865 ins_mode = ins_mode_in_table_body
867 # 8. If node is a caption element, then switch the insertion mode
868 # to "in caption" and abort these steps.
869 if node.name is 'caption' and node.namespace is NS_HTML
870 ins_mode = ins_mode_in_caption
872 # 9. If node is a colgroup element, then switch the insertion mode
873 # to "in column group" and abort these steps.
874 if node.name is 'colgroup' and node.namespace is NS_HTML
875 ins_mode = ins_mode_in_column_group
877 # 10. If node is a table element, then switch the insertion mode to
878 # "in table" and abort these steps.
879 if node.name is 'table' and node.namespace is NS_HTML
880 ins_mode = ins_mode_in_table
882 # 11. If node is a template element, then switch the insertion mode
883 # to the current template insertion mode and abort these steps.
884 if node.name is 'template' and node.namespace is NS_HTML
885 ins_mode = template_ins_modes[0]
887 # 12. If node is a head element and last is true, then switch the
888 # insertion mode to "in body" ("in body"! not "in head"!) and abort
889 # these steps. (fragment case)
890 if node.name is 'head' and node.namespace is NS_HTML and last
891 ins_mode = ins_mode_in_body
893 # 13. If node is a head element and last is false, then switch the
894 # insertion mode to "in head" and abort these steps.
895 if node.name is 'head' and node.namespace is NS_HTML and last is false
896 ins_mode = ins_mode_in_head
898 # 14. If node is a body element, then switch the insertion mode to
899 # "in body" and abort these steps.
900 if node.name is 'body' and node.namespace is NS_HTML
901 ins_mode = ins_mode_in_body
903 # 15. If node is a frameset element, then switch the insertion mode
904 # to "in frameset" and abort these steps. (fragment case)
905 if node.name is 'frameset' and node.namespace is NS_HTML
906 ins_mode = ins_mode_in_frameset
908 # 16. If node is an html element, run these substeps:
909 if node.name is 'html' and node.namespace is NS_HTML
910 # 1. If the head element pointer is null, switch the insertion
911 # mode to "before head" and abort these steps. (fragment case)
912 if head_element_pointer is null
913 ins_mode = ins_mode_before_head
915 # 2. Otherwise, the head element pointer is not null,
916 # switch the insertion mode to "after head" and abort these
918 ins_mode = ins_mode_after_head
920 # 17. If last is true, then switch the insertion mode to "in body"
921 # and abort these steps. (fragment case)
923 ins_mode = ins_mode_in_body
925 # 18. Let node now be the node before node in the stack of open
928 node = open_els[node_i]
929 # 19. Return to the step labeled loop.
933 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
934 adjusted_current_node = ->
935 if open_els.length is 1 and flag_fragment_parsing
936 return context_element
939 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
940 # this implementation is structured (mostly) as described at the link above.
941 # capitalized comments are the "labels" described at the link above.
943 return if afe.length is 0
944 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
949 if i is afe.length - 1
952 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
957 el = insert_html_element afe[i].token
962 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
963 # adoption agency algorithm
965 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
966 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
967 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
968 adoption_agency = (subject) ->
969 debug_log "adoption_agency()"
970 debug_log "tree: #{serialize_els doc.children, false, true}"
971 debug_log "open_els: #{serialize_els open_els, true, true}"
972 debug_log "afe: #{serialize_els afe, true, true}"
973 # this block implements the W3C spec
974 # # 1. If the current node is an HTML element whose tag name is subject,
975 # # then run these substeps:
977 # # 1. Let element be the current node.
979 # # 2. Pop element off the stack of open elements.
981 # # 3. If element is also in the list of active formatting elements,
982 # # remove the element from the list.
984 # # 4. Abort the adoption agency algorithm.
985 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
986 # el = open_els.shift()
987 # # remove it from the list of active formatting elements (if found)
992 # debug_log "aaa: starting off with subject on top of stack, exiting"
994 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
995 # If the current node is an HTML element whose tag name is subject, and
996 # the current node is not in the list of active formatting elements,
997 # then pop the current node off the stack of open elements, and abort
999 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
1000 debug_log "aaa: starting off with subject on top of stack, exiting"
1001 # remove it from the list of active formatting elements (if found)
1004 if el is open_els[0]
1008 debug_log "aaa: ...and not in afe, aaa done"
1018 # 5. Let formatting element be the last element in the list of
1019 # active formatting elements that: is between the end of the list
1020 # and the last scope marker in the list, if any, or the start of
1021 # the list otherwise, and has the tag name subject.
1023 for t, fe_of_afe in afe
1024 if t.type is TYPE_AFE_MARKER
1026 if t.name is subject
1029 # If there is no such element, then abort these steps and instead
1030 # act as described in the "any other end tag" entry above.
1032 debug_log "aaa: fe not found in afe"
1033 in_body_any_other_end_tag subject
1035 # 6. If formatting element is not in the stack of open elements,
1036 # then this is a parse error; remove the element from the list, and
1037 # abort these steps.
1039 for t, fe_of_open_els in open_els
1044 debug_log "aaa: fe not found in open_els"
1046 # "remove it from the list" must mean afe, since it's not in open_els
1047 afe.splice fe_of_afe, 1
1049 # 7. If formatting element is in the stack of open elements, but
1050 # the element is not in scope, then this is a parse error; abort
1052 unless el_is_in_scope fe
1053 debug_log "aaa: fe not in scope"
1056 # 8. If formatting element is not the current node, this is a parse
1057 # error. (But do not abort these steps.)
1058 unless open_els[0] is fe
1061 # 9. Let furthest block be the topmost node in the stack of open
1062 # elements that is lower in the stack than formatting element, and
1063 # is an element in the special category. There might not be one.
1065 fb_of_open_els = null
1066 for t, i in open_els
1072 # and continue, to see if there's one that's more "topmost"
1073 # 10. If there is no furthest block, then the UA must first pop all
1074 # the nodes from the bottom of the stack of open elements, from the
1075 # current node up to and including formatting element, then remove
1076 # formatting element from the list of active formatting elements,
1077 # and finally abort these steps.
1079 debug_log "aaa: no fb"
1081 t = open_els.shift()
1083 afe.splice fe_of_afe, 1
1085 # 11. Let common ancestor be the element immediately above
1086 # formatting element in the stack of open elements.
1087 ca = open_els[fe_of_open_els + 1] # common ancestor
1089 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1090 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1091 bookmark = new_aaa_bookmark()
1094 afe.splice i, 0, bookmark
1096 node = last_node = fb
1100 # 3. Let node be the element immediately above node in the
1101 # stack of open elements, or if node is no longer in the stack
1102 # of open elements (e.g. because it got removed by this
1103 # algorithm), the element that was immediately above node in
1104 # the stack of open elements before node was removed.
1106 for t, i in open_els
1108 node_next = open_els[i + 1]
1110 node = node_next ? node_above
1111 debug_log "inner loop #{inner}"
1112 debug_log "tree: #{serialize_els doc.children, false, true}"
1113 debug_log "open_els: #{serialize_els open_els, true, true}"
1114 debug_log "afe: #{serialize_els afe, true, true}"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "node: #{node.serialize true, true}"
1119 # TODO make sure node_above gets re-set if/when node is removed from open_els
1121 # 4. If node is formatting element, then go to the next step in
1122 # the overall algorithm.
1125 debug_log "the meat"
1126 # 5. If inner loop counter is greater than three and node is in
1127 # the list of active formatting elements, then remove node from
1128 # the list of active formatting elements.
1134 debug_log "max out inner"
1139 # 6. If node is not in the list of active formatting elements,
1140 # then remove node from the stack of open elements and then go
1141 # back to the step labeled inner loop.
1143 debug_log "not in afe"
1144 for t, i in open_els
1146 node_above = open_els[i + 1]
1147 open_els.splice i, 1
1150 debug_log "the bones"
1151 # 7. create an element for the token for which the element node
1152 # was created, in the HTML namespace, with common ancestor as
1153 # the intended parent; replace the entry for node in the list
1154 # of active formatting elements with an entry for the new
1155 # element, replace the entry for node in the stack of open
1156 # elements with an entry for the new element, and let node be
1158 new_node = token_to_element node.token, NS_HTML, ca
1162 debug_log "replaced in afe"
1164 for t, i in open_els
1166 node_above = open_els[i + 1]
1167 open_els[i] = new_node
1168 debug_log "replaced in open_els"
1171 # 8. If last node is furthest block, then move the
1172 # aforementioned bookmark to be immediately after the new node
1173 # in the list of active formatting elements.
1178 debug_log "removed bookmark"
1182 # "after" means lower
1183 afe.splice i, 0, bookmark # "after as <-
1184 debug_log "placed bookmark after node"
1185 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1187 # 9. Insert last node into node, first removing it from its
1188 # previous parent node if any.
1189 if last_node.parent?
1190 debug_log "last_node has parent"
1191 for c, i in last_node.parent.children
1193 debug_log "removing last_node from parent"
1194 last_node.parent.children.splice i, 1
1196 node.children.push last_node
1197 last_node.parent = node
1198 # 10. Let last node be node.
1201 # 11. Return to the step labeled inner loop.
1202 # 14. Insert whatever last node ended up being in the previous step
1203 # at the appropriate place for inserting a node, but using common
1204 # ancestor as the override target.
1206 # In the case where fe is immediately followed by fb:
1207 # * inner loop exits out early (node==fe)
1209 # * last_node is still in the tree (not a duplicate)
1210 if last_node.parent?
1211 debug_log "FEFIRST? last_node has parent"
1212 for c, i in last_node.parent.children
1214 debug_log "removing last_node from parent"
1215 last_node.parent.children.splice i, 1
1218 debug_log "after aaa inner loop"
1219 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1220 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1221 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1222 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1223 debug_log "tree: #{serialize_els doc.children, false, true}"
1228 # can't use standard insert token thing, because it's already in
1229 # open_els and must stay at it's current position in open_els
1230 dest = adjusted_insertion_location ca
1231 dest[0].children.splice dest[1], 0, last_node
1232 last_node.parent = dest[0]
1235 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1236 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1237 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1238 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1239 debug_log "tree: #{serialize_els doc.children, false, true}"
1241 # 15. Create an element for the token for which formatting element
1242 # was created, in the HTML namespace, with furthest block as the
1244 new_element = token_to_element fe.token, NS_HTML, fb
1245 # 16. Take all of the child nodes of furthest block and append them
1246 # to the element created in the last step.
1247 while fb.children.length
1248 t = fb.children.shift()
1249 t.parent = new_element
1250 new_element.children.push t
1251 # 17. Append that new element to furthest block.
1252 new_element.parent = fb
1253 fb.children.push new_element
1254 # 18. Remove formatting element from the list of active formatting
1255 # elements, and insert the new element into the list of active
1256 # formatting elements at the position of the aforementioned
1264 afe[i] = new_element
1266 # 19. Remove formatting element from the stack of open elements,
1267 # and insert the new element into the stack of open elements
1268 # immediately below the position of furthest block in that stack.
1269 for t, i in open_els
1271 open_els.splice i, 1
1273 for t, i in open_els
1275 open_els.splice i, 0, new_element
1277 # 20. Jump back to the step labeled outer loop.
1278 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1279 debug_log "tree: #{serialize_els doc.children, false, true}"
1280 debug_log "open_els: #{serialize_els open_els, true, true}"
1281 debug_log "afe: #{serialize_els afe, true, true}"
1282 debug_log "AAA DONE"
1284 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
#
# close_p_element(): takes no arguments and works by mutating open_els.
# Generates implied end tags (leaving 'p' itself alone), then pops entries
# off open_els, testing each popped element for an HTML 'p'.
#
# NOTE(review): the statements guarded by the `unless` and by the final `if`
# are not shown in this listing. Presumably they are a parse-error report
# and the loop exit; confirm against the full source.
1285 close_p_element = ->
1286 generate_implied_end_tags 'p' # arg is exception
1287 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
# the length check means the last remaining entry of open_els is never popped
1289 while open_els.length > 1 # just in case
1290 el = open_els.shift()
1291 if el.name is 'p' and el.namespace is NS_HTML
# close_p_if_in_button_scope(): tests whether an HTML 'p' element is in
# button scope.
# NOTE(review): the guarded statement is not shown in this listing.
# Presumably it is a call to close_p_element(); confirm against the full
# source.
1293 close_p_if_in_button_scope = ->
1294 if is_in_button_scope 'p', NS_HTML
1297 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1298 # aka insert_a_character = (t) ->
#
# t: the node/token to insert (assumed to be of TYPE_TEXT; TODO confirm).
# The insertion point is the [parent, index] pair returned by
# adjusted_insertion_location().
1299 insert_character = (t) ->
1300 dest = adjusted_insertion_location()
1301 # fixfull check for Document node
# prev is the sibling just before the insertion index
1303 prev = dest[0].children[dest[1] - 1]
# NOTE(review): the body of this branch is not shown in this listing.
# Presumably t's text is merged into prev instead of inserting a second
# text node; confirm. It is also unclear from here whether a guard protects
# the case where there is no previous sibling (prev undefined).
1304 if prev.type is TYPE_TEXT
# insert t into the parent's children at the insertion index
1307 dest[0].children.splice dest[1], 0, t
1310 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
#
# process_token(t): tree-construction entry point for one token `t`.
# Tests the adjusted current node (acn) and the token's type/name against
# the conditions below. The last visible statement sends the token to
# in_foreign_content.
#
# NOTE(review): the statements guarded by each condition are not shown in
# this listing. Presumably each one dispatches t to the current insertion
# mode and returns; confirm against the full source.
1311 process_token = (t) ->
1312 acn = adjusted_current_node()
1316 if acn.namespace is NS_HTML
1319 if is_mathml_text_integration_point(acn)
1320 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1323 if t.type is TYPE_TEXT
1326 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1329 if is_html_integration acn
1330 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1333 if t.type is TYPE_EOF
1336 in_foreign_content t
1340 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1341 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
#
# adjusted_insertion_location(override_target = null)
#   override_target: optional node to use as the target instead of the
#                    current node (open_els[0]).
#   returns: a two-element array [target, target_i], where target is the
#            node to insert into and target_i is the index within
#            target.children at which to insert.
#
# Index reminder: open_els[0] is the current node and
# open_els[open_els.length - 1] is the first element pushed (the html
# element), so a smaller index means "lower"/"more recently added".
1342 adjusted_insertion_location = (override_target = null) ->
1343 # 1. If there was an override target specified, then let target be the
1346 target = override_target
1347 else # Otherwise, let target be the current node.
1348 target = open_els[0]
1349 # 2. Determine the adjusted insertion location using the first matching
1350 # steps from the following list:
1352 # If foster parenting is enabled and target is a table, tbody, tfoot,
1353 # thead, or tr element Foster parenting happens when content is
1354 # misnested in tables.
# the table lookup by name is compared with the namespace, so tag name and
# namespace are tested together
1355 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1356 loop # once. this is here so we can ``break`` to "abort these substeps"
1357 # 1. Let last template be the last template element in the
1358 # stack of open elements, if any.
1359 last_template = null
1360 last_template_i = null
1361 for el, i in open_els
1362 if el.name is 'template' and el.namespace is NS_HTML
1366 # 2. Let last table be the last table element in the stack of
1367 # open elements, if any.
1370 for el, i in open_els
1371 if el.name is 'table' and el.namespace is NS_HTML
1375 # 3. If there is a last template and either there is no last
1376 # table, or there is one, but last template is lower (more
1377 # recently added) than last table in the stack of open
1378 # elements, then: let adjusted insertion location be inside
1379 # last template's template contents, after its last child (if
1380 # any), and abort these substeps.
# "lower" is a smaller index, hence the < comparison
1381 if last_template and (last_table is null or last_template_i < last_table_i)
1382 target = last_template # fixfull should be its contents
1383 target_i = target.children.length
1385 # 4. If there is no last table, then let adjusted insertion
1386 # location be inside the first element in the stack of open
1387 # elements (the html element), after its last child (if any),
1388 # and abort these substeps. (fragment case)
1389 if last_table is null
1391 target = open_els[open_els.length - 1]
1392 target_i = target.children.length
1394 # 5. If last table has a parent element, then let adjusted
1395 # insertion location be inside last table's parent element,
1396 # immediately before last table, and abort these substeps.
1397 if last_table.parent?
1398 for c, i in last_table.parent.children
1400 target = last_table.parent
1404 # 6. Let previous element be the element immediately above last
1405 # table in the stack of open elements.
1407 # huh? how could it not have a parent?
1408 previous_element = open_els[last_table_i + 1]
1409 # 7. Let adjusted insertion location be inside previous
1410 # element, after its last child (if any).
1411 target = previous_element
1412 target_i = target.children.length
1413 # Note: These steps are involved in part because it's possible
1414 # for elements, the table element in this case in particular,
1415 # to have been moved by a script around in the DOM, or indeed
1416 # removed from the DOM entirely, after the element was inserted
1418 break # don't really loop
1420 # Otherwise Let adjusted insertion location be inside target, after
1421 # its last child (if any).
1422 target_i = target.children.length
1424 # 3. If the adjusted insertion location is inside a template element,
1425 # let it instead be inside the template element's template contents,
1426 # after its last child (if any).
1427 # fixfull (template)
1429 # 4. Return the adjusted insertion location.
1430 return [target, target_i]
1432 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1433 # aka create_an_element_for_token
#
# token_to_element(t, namespace, intended_parent)
#   t: tag token; t.name becomes the element name and the token itself is
#      kept on the node (token: t).
#   namespace: namespace given to the new element.
#   intended_parent: not referenced in the lines shown here.
#
# NOTE(review): the return statement is not visible in this listing.
# Callers use the result as the new element (e.g. they push it onto
# children arrays).
1434 token_to_element = (t, namespace, intended_parent) ->
1435 # convert attributes into a hash
# each attribute `a` is a [name, value] pair
1438 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1439 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1441 # TODO 2. If the newly created element has an xmlns attribute in the
1442 # XMLNS namespace whose value is not exactly the same as the element's
1443 # namespace, that is a parse error. Similarly, if the newly created
1444 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1445 # value is not the XLink Namespace, that is a parse error.
1447 # fixfull: the spec says stuff about form pointers and ownerDocument
1451 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
#
# insert_foreign_element(token, namespace): creates an element for `token`
# in `namespace` via token_to_element and splices it into ail_el.children
# at index ail_i.
#
# NOTE(review): the assignments of ail_el and ail_i are not shown in this
# listing. Presumably they are ail[0] and ail[1] of the [parent, index]
# pair; confirm. Callers use the return value as the new element.
1452 insert_foreign_element = (token, namespace) ->
1453 ail = adjusted_insertion_location()
1456 el = token_to_element token, namespace, ail_el
1457 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1459 ail_el.children.splice ail_i, 0, el
1462 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
#
# insert_html_element(token): shorthand for insert_foreign_element with the
# HTML namespace. The result of that call is this function's (implicit)
# return value.
1463 insert_html_element = (token) ->
1464 insert_foreign_element token, NS_HTML
1466 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1467 # position should be [node, index_within_children]
#
# insert_comment(t, position = null): splices the comment `t` into
# position[0].children at index position[1].
1468 insert_comment = (t, position = null) ->
# when no position was passed, fall back to the adjusted insertion location
1469 position ?= adjusted_insertion_location()
1470 position[0].children.splice position[1], 0, t
1473 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
#
# parse_generic_raw_text(t): inserts an HTML element for token `t`, switches
# the tokenizer to the rawtext state, and switches the insertion mode to
# "text" after saving the current one in original_ins_mode.
1474 parse_generic_raw_text = (t) ->
1475 insert_html_element t
1476 tok_state = tok_state_rawtext
# save the current mode before overwriting it
1477 original_ins_mode = ins_mode
1478 ins_mode = ins_mode_text
# parse_generic_rcdata_text(t): same shape as parse_generic_raw_text, but
# switches the tokenizer to the rcdata state. Inserts an HTML element for
# token `t`, saves the current insertion mode in original_ins_mode, and
# switches the insertion mode to "text".
1479 parse_generic_rcdata_text = (t) ->
1480 insert_html_element t
1481 tok_state = tok_state_rcdata
# save the current mode before overwriting it
1482 original_ins_mode = ins_mode
1483 ins_mode = ins_mode_text
1485 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1486 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
#
# generate_implied_end_tags(except = null)
#   except: optional tag name that must NOT be treated as implied.
#
# Loops while the current node (open_els[0]) is listed in end_tag_implied
# (looked up by name, compared with the namespace) and its name is not
# `except`.
# NOTE(review): the loop body is not shown in this listing. Presumably it
# pops the current node off open_els; confirm against the full source.
1487 generate_implied_end_tags = (except = null) ->
1488 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1491 # 8.2.5.4 The rules for parsing tokens in HTML content
1492 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1494 # 8.2.5.4.1 The "initial" insertion mode
1495 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
#
# is_quirks_yes_doctype(t): tests a DOCTYPE token against the quirks-mode
# conditions: the force-quirks flag, a name other than 'html', and the
# public/system identifiers. Identifiers are lower-cased before comparing.
#
# NOTE(review): the statements guarded by each condition are not shown in
# this listing. Presumably each one returns true, with a final return
# false; confirm. ins_mode_initial uses the result as a boolean.
1496 is_quirks_yes_doctype = (t) ->
1497 if t.flag 'force-quirks'
1499 if t.name isnt 'html'
1501 if t.public_identifier?
1502 pi = t.public_identifier.toLowerCase()
# prefix match against each entry of quirks_yes_pi_prefixes
1503 for p in quirks_yes_pi_prefixes
1504 if pi.substr(0, p.length) is p
# exact (whole-string) matches
1506 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1508 if t.system_identifier?
1509 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
# reached when there is no system identifier: the HTML 4.01 frameset and
# transitional public identifiers are then tested here
1511 else if t.public_identifier?
1512 # already did this: pi = t.public_identifier.toLowerCase()
# 32 and 36 are the lengths of the two prefixes being compared
1513 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# is_quirks_limited_doctype(t): tests a DOCTYPE token's public identifier
# (lower-cased) against the XHTML 1.0 frameset/transitional prefixes and,
# when a system identifier is present, against the HTML 4.01
# frameset/transitional prefixes.
#
# NOTE(review): the guarded statements are not shown in this listing;
# presumably each returns true, with a final return false. Indentation is
# also not visible here, so confirm that the system_identifier test is
# nested inside the public_identifier test. Otherwise `pi` would be
# undefined when there is no public identifier.
1516 is_quirks_limited_doctype = (t) ->
1517 if t.public_identifier?
1518 pi = t.public_identifier.toLowerCase()
# 32 and 36 are the lengths of the two prefixes being compared
1519 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1521 if t.system_identifier?
1522 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# ins_mode_initial(t): handler for one token `t` in the "initial" insertion
# mode. For a DOCTYPE token it sets the document's 'quirks mode' flag to
# QUIRKS_YES or QUIRKS_LIMITED according to the two doctype tests. In the
# visible paths it then switches ins_mode to ins_mode_before_html.
#
# NOTE(review): several statements, including the handling of comments and
# the guard before the final fallback, are not shown in this listing.
1525 ins_mode_initial = (t) ->
1528 if t.type is TYPE_COMMENT
1532 if t.type is TYPE_DOCTYPE
1533 # fixfull syntax error from first paragraph and following bullets
1534 # fixfull set doc.doctype
1535 # fixfull is the "not an iframe srcdoc" thing relevant?
1536 if is_quirks_yes_doctype t
1537 doc.flag 'quirks mode', QUIRKS_YES
1538 else if is_quirks_limited_doctype t
1539 doc.flag 'quirks mode', QUIRKS_LIMITED
1541 ins_mode = ins_mode_before_html
1544 # fixfull not iframe srcdoc?
# fallback path: quirks mode is switched on
1546 doc.flag 'quirks mode', QUIRKS_YES
1547 ins_mode = ins_mode_before_html
1551 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
#
# ins_mode_before_html(t): handler for one token `t` in the "before html"
# insertion mode. A start tag named 'html' becomes the root element: it is
# appended to doc.children and put on open_els. In the fallback path a
# synthetic 'html' open tag is used instead. Both paths switch ins_mode to
# ins_mode_before_head.
#
# NOTE(review): several statements (e.g. the bodies of the DOCTYPE, comment
# and end-tag branches) are not shown in this listing.
1552 ins_mode_before_html = (t) ->
1553 if t.type is TYPE_DOCTYPE
1556 if t.type is TYPE_COMMENT
1561 if t.type is TYPE_START_TAG and t.name is 'html'
1562 el = token_to_element t, NS_HTML, doc
1563 doc.children.push el
1565 open_els.unshift(el)
1566 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1567 ins_mode = ins_mode_before_head
1569 if t.type is TYPE_END_TAG
1570 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1571 # fall through to "anything else"
# "anything else": create the root element from a synthetic open tag
1576 el = token_to_element new_open_tag('html'), NS_HTML, doc
1577 doc.children.push el
1580 # ?fixfull browsing context
1581 ins_mode = ins_mode_before_head
1585 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
#
# ins_mode_before_head(t): handler for one token `t` in the "before head"
# insertion mode. A 'head' start tag, or a synthetic one in the fallback
# path, is inserted as an HTML element and remembered in
# head_element_pointer. ins_mode then becomes ins_mode_in_head.
#
# NOTE(review): the bodies of the other branches are not shown in this
# listing.
1586 ins_mode_before_head = (t) ->
1589 if t.type is TYPE_COMMENT
1592 if t.type is TYPE_DOCTYPE
1595 if t.type is TYPE_START_TAG and t.name is 'html'
1598 if t.type is TYPE_START_TAG and t.name is 'head'
1599 el = insert_html_element t
1600 head_element_pointer = el
1601 ins_mode = ins_mode_in_head
1603 if t.type is TYPE_END_TAG
1604 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1605 # fall through to Anything else below
# "anything else": insert a head element from a synthetic open tag
1610 el = insert_html_element new_open_tag 'head'
1611 head_element_pointer = el
1612 ins_mode = ins_mode_in_head
1615 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
#
# ins_mode_in_head_else(t): the "anything else" case of the "in head"
# insertion mode. Pops the current node off open_els and switches ins_mode
# to ins_mode_after_head.
# NOTE(review): `t` is not used in the lines shown here. Presumably the
# token is reprocessed in the new mode afterwards; confirm.
1616 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1617 open_els.shift() # spec says this will be a 'head' node
1618 ins_mode = ins_mode_after_head
# ins_mode_in_head(t): handler for one token `t` in the "in head" insertion
# mode. Branches on the token's type and tag name. Depending on the branch
# it inserts elements, switches the tokenizer state (tok_state) and the
# insertion mode (ins_mode), and updates open_els / template_ins_modes.
# Tokens matching no branch end up in ins_mode_in_head_else.
#
# NOTE(review): several guarded statements are not shown in this listing;
# only the visible lines are described here.
1620 ins_mode_in_head = (t) ->
1621 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1624 if t.type is TYPE_COMMENT
1627 if t.type is TYPE_DOCTYPE
1630 if t.type is TYPE_START_TAG and t.name is 'html'
1633 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1634 el = insert_html_element t
1636 t.acknowledge_self_closing()
1638 if t.type is TYPE_START_TAG and t.name is 'meta'
1639 el = insert_html_element t
1641 t.acknowledge_self_closing()
1642 # fixfull encoding stuff
1644 if t.type is TYPE_START_TAG and t.name is 'title'
1645 parse_generic_rcdata_text t
1647 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1648 parse_generic_raw_text t
1650 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1651 insert_html_element t
1652 ins_mode = ins_mode_in_head_noscript
1654 if t.type is TYPE_START_TAG and t.name is 'script'
1655 ail = adjusted_insertion_location()
# ail is a [parent, index] pair; the intended parent is the node itself,
# not the pair (same convention as insert_foreign_element)
1656 el = token_to_element t, NS_HTML, ail[0]
1657 el.flag 'parser-inserted', true
1658 # fixfull fragment case
1659 ail[0].children.splice ail[1], 0, el
1661 tok_state = tok_state_script_data
1662 original_ins_mode = ins_mode # make sure orig... is defined
1663 ins_mode = ins_mode_text
1665 if t.type is TYPE_END_TAG and t.name is 'head'
1666 open_els.shift() # will be a head element... spec says so
1667 ins_mode = ins_mode_after_head
1669 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1670 ins_mode_in_head_else t
1672 if t.type is TYPE_START_TAG and t.name is 'template'
1673 insert_html_element t
1675 flag_frameset_ok = false
1676 ins_mode = ins_mode_in_template
1677 template_ins_modes.unshift ins_mode_in_template
1679 if t.type is TYPE_END_TAG and t.name is 'template'
1680 if template_tag_is_open()
# the parentheses are required: a bare function name is a no-op expression
# in CoffeeScript, so without them no implied end tags were generated
1681 generate_implied_end_tags()
1682 if open_els[0].name isnt 'template'
1685 el = open_els.shift()
1686 if el.name is 'template' and el.namespace is NS_HTML
1688 clear_afe_to_marker()
1689 template_ins_modes.shift()
1694 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1697 ins_mode_in_head_else t
1699 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
#
# ins_mode_in_head_noscript_else(t): the "anything else" case of the
# "in head noscript" insertion mode. The visible statement switches
# ins_mode back to ins_mode_in_head.
# NOTE(review): the other statements of this function are not shown in
# this listing.
1700 ins_mode_in_head_noscript_else = (t) ->
1703 ins_mode = ins_mode_in_head
# ins_mode_in_head_noscript(t): handler for one token `t` in the
# "in head noscript" insertion mode. Branches on token type and tag name.
# A </noscript> end tag leads back to ins_mode_in_head; a </br> end tag and
# the final fallback both call ins_mode_in_head_noscript_else.
#
# NOTE(review): the bodies of most branches are not shown in this listing.
1705 ins_mode_in_head_noscript = (t) ->
1706 if t.type is TYPE_DOCTYPE
1709 if t.type is TYPE_START_TAG and t.name is 'html'
1712 if t.type is TYPE_END_TAG and t.name is 'noscript'
1714 ins_mode = ins_mode_in_head
1716 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1719 if t.type is TYPE_END_TAG and t.name is 'br'
1720 ins_mode_in_head_noscript_else t
1722 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1726 ins_mode_in_head_noscript_else t
1731 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
#
# ins_mode_after_head_else(t): the "anything else" case of the "after head"
# insertion mode. Inserts a body element made from a synthetic 'body' open
# tag and switches ins_mode to ins_mode_in_body.
# NOTE(review): `t` is not used in the lines shown here. Presumably the
# token is reprocessed in the new mode afterwards; confirm.
1732 ins_mode_after_head_else = (t) ->
1733 body_tok = new_open_tag 'body'
1734 insert_html_element body_tok
1735 ins_mode = ins_mode_in_body
# ins_mode_after_head(t): handler for one token `t` in the "after head"
# insertion mode. 'body' and 'frameset' start tags are inserted and switch
# ins_mode to ins_mode_in_body / ins_mode_in_frameset. Tokens matching no
# branch go to ins_mode_after_head_else.
#
# NOTE(review): the bodies of several branches are not shown in this
# listing.
1738 ins_mode_after_head = (t) ->
1742 if t.type is TYPE_COMMENT
1745 if t.type is TYPE_DOCTYPE
1748 if t.type is TYPE_START_TAG and t.name is 'html'
1751 if t.type is TYPE_START_TAG and t.name is 'body'
1752 insert_html_element t
1753 flag_frameset_ok = false
1754 ins_mode = ins_mode_in_body
1756 if t.type is TYPE_START_TAG and t.name is 'frameset'
1757 insert_html_element t
1758 ins_mode = ins_mode_in_frameset
1760 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# head-only tags after </head>: the head element is put back on the stack
# of open elements, then searched for and removed again below.
# NOTE(review): the statement between these two steps is not shown;
# presumably the token is handled by the "in head" rules in between.
1762 open_els.unshift head_element_pointer
1764 for el, i in open_els
1765 if el is head_element_pointer
1766 open_els.splice i, 1
1768 console.log "warning: 23904 couldn't find head element in open_els"
1770 if t.type is TYPE_END_TAG and t.name is 'template'
1773 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1774 ins_mode_after_head_else t
1776 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1780 ins_mode_after_head_else t
1782 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
#
# in_body_any_other_end_tag(name)
#   name: tag name (string) of the end tag being handled.
#
# Looks for an HTML element called `name` on open_els. When `node` matches,
# implied end tags are generated (except for `name`) and entries are popped
# off open_els. When `node` is in special_elements the search is given up.
# Otherwise `node` advances to open_els[i + 1], the entry above it.
#
# NOTE(review): the loop header, the parse-error reports and the
# return/break statements are not shown in this listing; confirm the exact
# control flow against the full source.
1783 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1786 if node.name is name and node.namespace is NS_HTML
1787 generate_implied_end_tags name # arg is exception
1788 unless node is open_els[0]
1791 el = open_els.shift()
# lookup by name compared with the namespace: tests name and namespace together
1794 if special_elements[node.name] is node.namespace
1797 for el, i in open_els
1799 node = open_els[i + 1]
1802 ins_mode_in_body = (t) ->
1803 if t.type is TYPE_TEXT and t.text is "\u0000"
1810 if t.type is TYPE_TEXT
1813 flag_frameset_ok = false
1815 if t.type is TYPE_COMMENT
1818 if t.type is TYPE_DOCTYPE
1821 if t.type is TYPE_START_TAG and t.name is 'html'
1823 return if template_tag_is_open()
1824 root_attrs = open_els[open_els.length - 1].attrs
1826 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1829 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1832 if t.type is TYPE_START_TAG and t.name is 'body'
1834 return if open_els.length < 2
1835 second = open_els[open_els.length - 2]
1836 return unless second.namespace is NS_HTML
1837 return unless second.name is 'body'
1838 return if template_tag_is_open()
1839 flag_frameset_ok = false
1841 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1843 if t.type is TYPE_START_TAG and t.name is 'frameset'
1845 return if open_els.length < 2
1846 second_i = open_els.length - 2
1847 second = open_els[second_i]
1848 return unless second.namespace is NS_HTML
1849 return unless second.name is 'body'
1850 if flag_frameset_ok is false
1853 for el, i in second.parent.children
1855 second.parent.children.splice i, 1
1857 open_els.splice second_i, 1
1858 # pop everything except the "root html element"
1859 while open_els.length > 1
1861 insert_html_element t
1862 ins_mode = ins_mode_in_frameset
1864 if t.type is TYPE_EOF
1866 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1867 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1868 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1871 unless ok_tags[t.name] is el.namespace
1874 if template_ins_modes.length > 0
1875 ins_mode_in_template t
1879 if t.type is TYPE_END_TAG and t.name is 'body'
1880 unless is_in_scope 'body', NS_HTML
1884 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1885 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1886 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1887 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1891 unless ok_tags[t.name] is el.namespace
1894 ins_mode = ins_mode_after_body
1896 if t.type is TYPE_END_TAG and t.name is 'html'
1897 unless is_in_scope 'body', NS_HTML
1901 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1902 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1903 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1904 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1908 unless ok_tags[t.name] is el.namespace
1911 ins_mode = ins_mode_after_body
1914 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1915 close_p_if_in_button_scope()
1916 insert_html_element t
1918 if t.type is TYPE_START_TAG and h_tags[t.name]?
1919 close_p_if_in_button_scope()
1920 if h_tags[open_els[0].name] is open_els[0].namespace
1923 insert_html_element t
1925 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1926 close_p_if_in_button_scope()
1927 insert_html_element t
1928 eat_next_token_if_newline()
1929 flag_frameset_ok = false
1931 if t.type is TYPE_START_TAG and t.name is 'form'
1932 unless form_element_pointer is null or template_tag_is_open()
1935 close_p_if_in_button_scope()
1936 el = insert_html_element t
1937 unless template_tag_is_open()
1938 form_element_pointer = el
1940 if t.type is TYPE_START_TAG and t.name is 'li'
1941 flag_frameset_ok = false
1942 for node in open_els
1943 if node.name is 'li' and node.namespace is NS_HTML
1944 generate_implied_end_tags 'li' # arg is exception
1945 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1948 el = open_els.shift()
1949 if el.name is 'li' and el.namespace is NS_HTML
1952 if el_is_special_not_adp node
1954 close_p_if_in_button_scope()
1955 insert_html_element t
1957 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1958 flag_frameset_ok = false
1959 for node in open_els
1960 if node.name is 'dd' and node.namespace is NS_HTML
1961 generate_implied_end_tags 'dd' # arg is exception
1962 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1965 el = open_els.shift()
1966 if el.name is 'dd' and el.namespace is NS_HTML
1969 if node.name is 'dt' and node.namespace is NS_HTML
1970 generate_implied_end_tags 'dt' # arg is exception
1971 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1974 el = open_els.shift()
1975 if el.name is 'dt' and el.namespace is NS_HTML
1978 if el_is_special_not_adp node
1980 close_p_if_in_button_scope()
1981 insert_html_element t
1983 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1984 close_p_if_in_button_scope()
1985 insert_html_element t
1986 tok_state = tok_state_plaintext
1988 if t.type is TYPE_START_TAG and t.name is 'button'
1989 if is_in_scope 'button', NS_HTML
1991 generate_implied_end_tags()
1993 el = open_els.shift()
1994 if el.name is 'button' and el.namespace is NS_HTML
1997 insert_html_element t
1998 flag_frameset_ok = false
2000 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
2001 unless is_in_scope t.name, NS_HTML
2004 generate_implied_end_tags()
2005 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2008 el = open_els.shift()
2009 if el.name is t.name and el.namespace is NS_HTML
2012 if t.type is TYPE_END_TAG and t.name is 'form'
2013 unless template_tag_is_open()
2014 node = form_element_pointer
2015 form_element_pointer = null
2016 if node is null or not el_is_in_scope node
2019 generate_implied_end_tags()
2020 if open_els[0] isnt node
2022 for el, i in open_els
2024 open_els.splice i, 1
2027 unless is_in_scope 'form', NS_HTML
2030 generate_implied_end_tags()
2031 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2034 el = open_els.shift()
2035 if el.name is 'form' and el.namespace is NS_HTML
2038 if t.type is TYPE_END_TAG and t.name is 'p'
2039 unless is_in_button_scope 'p', NS_HTML
2041 insert_html_element new_open_tag 'p'
2044 if t.type is TYPE_END_TAG and t.name is 'li'
2045 unless is_in_li_scope 'li', NS_HTML
2048 generate_implied_end_tags 'li' # arg is exception
2049 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2052 el = open_els.shift()
2053 if el.name is 'li' and el.namespace is NS_HTML
2056 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2057 unless is_in_scope t.name, NS_HTML
2060 generate_implied_end_tags t.name # arg is exception
2061 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2064 el = open_els.shift()
2065 if el.name is t.name and el.namespace is NS_HTML
2068 if t.type is TYPE_END_TAG and h_tags[t.name]?
2071 if h_tags[el.name] is el.namespace
2074 if standard_scopers[el.name] is el.namespace
2079 generate_implied_end_tags()
2080 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2083 el = open_els.shift()
2084 if h_tags[el.name] is el.namespace
2088 if t.type is TYPE_START_TAG and t.name is 'a'
2089 # If the list of active formatting elements contains an a element
2090 # between the end of the list and the last marker on the list (or
2091 # the start of the list if there is no marker on the list), then
2092 # this is a parse error; run the adoption agency algorithm for the
2093 # tag name "a", then remove that element from the list of active
2094 # formatting elements and the stack of open elements if the
2095 # adoption agency algorithm didn't already remove it (it might not
2096 # have if the element is not in table scope).
2099 if el.type is TYPE_AFE_MARKER
2101 if el.name is 'a' and el.namespace is NS_HTML
2109 for el, i in open_els
2111 open_els.splice i, 1
2113 el = insert_html_element t
2116 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2118 el = insert_html_element t
2121 if t.type is TYPE_START_TAG and t.name is 'nobr'
2123 if is_in_scope 'nobr', NS_HTML
2125 adoption_agency 'nobr'
2127 el = insert_html_element t
2130 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2131 adoption_agency t.name
2133 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2135 insert_html_element t
2137 flag_frameset_ok = false
2139 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2140 unless is_in_scope t.name, NS_HTML
2143 generate_implied_end_tags()
2144 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2147 el = open_els.shift()
2148 if el.name is t.name and el.namespace is NS_HTML
2150 clear_afe_to_marker()
2152 if t.type is TYPE_START_TAG and t.name is 'table'
2153 unless doc.flag('quirks mode') is QUIRKS_YES
2154 close_p_if_in_button_scope() # test
2155 insert_html_element t
2156 flag_frameset_ok = false
2157 ins_mode = ins_mode_in_table
2159 if t.type is TYPE_END_TAG and t.name is 'br'
2161 # W3C: t.type = TYPE_START_TAG
2162 t = new_open_tag 'br' # WHATWG
2164 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2166 insert_html_element t
2168 t.acknowledge_self_closing()
2169 flag_frameset_ok = false
2171 if t.type is TYPE_START_TAG and t.name is 'input'
2173 insert_html_element t
2175 t.acknowledge_self_closing()
2176 unless is_input_hidden_tok t
2177 flag_frameset_ok = false
2179 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2180 # WHATWG adds 'menuitem' for this block
2181 insert_html_element t
2183 t.acknowledge_self_closing()
2185 if t.type is TYPE_START_TAG and t.name is 'hr'
2186 close_p_if_in_button_scope()
2187 insert_html_element t
2189 t.acknowledge_self_closing()
2190 flag_frameset_ok = false
2192 if t.type is TYPE_START_TAG and t.name is 'image'
2197 if t.type is TYPE_START_TAG and t.name is 'isindex'
2199 if template_tag_is_open() is false and form_element_pointer isnt null
2201 t.acknowledge_self_closing()
2202 flag_frameset_ok = false
2203 close_p_if_in_button_scope()
2204 el = insert_html_element new_open_tag 'form'
2205 unless template_tag_is_open()
2206 form_element_pointer = el
2209 el.attrs['action'] = a[1]
2211 insert_html_element new_open_tag 'hr'
2214 insert_html_element new_open_tag 'label'
2215 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2216 input_el = new_open_tag 'input'
2221 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2222 input_el.attrs_a.push [a[0], a[1]]
2223 input_el.attrs_a.push ['name', 'isindex']
2224 # fixfull this next bit is in english... internationalize?
2225 prompt ?= "This is a searchable index. Enter search keywords: "
2226 insert_character new_character_token prompt # fixfull split
2227 # TODO submit typo "balue" in spec
2228 insert_html_element input_el
2230 # insert_character '' # you can put chars here if prompt attr missing
2232 insert_html_element new_open_tag 'hr'
2235 unless template_tag_is_open()
2236 form_element_pointer = null
2238 if t.type is TYPE_START_TAG and t.name is 'textarea'
2239 insert_html_element t
2240 eat_next_token_if_newline()
2241 tok_state = tok_state_rcdata
2242 original_ins_mode = ins_mode
2243 flag_frameset_ok = false
2244 ins_mode = ins_mode_text
2246 if t.type is TYPE_START_TAG and t.name is 'xmp'
2247 close_p_if_in_button_scope()
2249 flag_frameset_ok = false
2250 parse_generic_raw_text t
2252 if t.type is TYPE_START_TAG and t.name is 'iframe'
2253 flag_frameset_ok = false
2254 parse_generic_raw_text t
2256 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2257 parse_generic_raw_text t
2259 if t.type is TYPE_START_TAG and t.name is 'select'
2261 insert_html_element t
2262 flag_frameset_ok = false
2263 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2264 ins_mode = ins_mode_in_select_in_table
2266 ins_mode = ins_mode_in_select
2268 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2269 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2272 insert_html_element t
2274 # this comment block implements the W3C spec
2275 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2276 # if is_in_scope 'ruby', NS_HTML
2277 # generate_implied_end_tags()
2278 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2280 # insert_html_element t
2282 # if t.type is TYPE_START_TAG and t.name is 'rt'
2283 # if is_in_scope 'ruby', NS_HTML
2284 # generate_implied_end_tags 'rtc' # arg is exception
2285 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2287 # insert_html_element t
2289 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2290 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2291 if is_in_scope 'ruby', NS_HTML
2292 generate_implied_end_tags()
2293 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2295 insert_html_element t
2297 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2298 if is_in_scope 'ruby', NS_HTML
2299 generate_implied_end_tags 'rtc'
2300 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2302 insert_html_element t
2305 if t.type is TYPE_START_TAG and t.name is 'math'
2307 adjust_mathml_attributes t
2308 adjust_foreign_attributes t
2309 insert_foreign_element t, NS_MATHML
2310 if t.flag 'self-closing'
2312 t.acknowledge_self_closing()
2314 if t.type is TYPE_START_TAG and t.name is 'svg'
2316 adjust_svg_attributes t
2317 adjust_foreign_attributes t
2318 insert_foreign_element t, NS_SVG
2319 if t.flag 'self-closing'
2321 t.acknowledge_self_closing()
2323 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2326 if t.type is TYPE_START_TAG # any other start tag
2328 insert_html_element t
2330 if t.type is TYPE_END_TAG # any other end tag
2331 in_body_any_other_end_tag t.name
2335 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Tree-construction handler for the "text" insertion mode; t is the token
# being processed.
# NOTE(review): the nesting of the statements below is presumed; indentation
# and several lines of this function are not visible here -- confirm.
2336 ins_mode_text = (t) ->
2337 if t.type is TYPE_TEXT
2340 if t.type is TYPE_EOF
# a current node that is an HTML <script> gets its 'already started' flag set
2342 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2343 open_els[0].flag 'already started', true
# switch back to the insertion mode saved in original_ins_mode
2345 ins_mode = original_ins_mode
2348 if t.type is TYPE_END_TAG and t.name is 'script'
2350 ins_mode = original_ins_mode
2351 # fixfull the spec seems to assume that I'm going to run the script
2352 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2354 if t.type is TYPE_END_TAG
2356 ins_mode = original_ins_mode
# presumably only reached when t matched none of the branches above
2358 console.log 'warning: end of ins_mode_text reached'
2360 # the functions below implement the tokenizer states described here:
2361 # http://www.w3.org/TR/html5/syntax.html#tokenization
2363 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Fallback called by ins_mode_in_table and ins_mode_in_table_text for tokens
# they do not handle themselves; t is the token.
# flag_foster_parenting is switched on and then off again.
# NOTE(review): the work done between the two assignments is not visible
# here -- presumably t is processed with foster parenting enabled; confirm.
2364 ins_mode_in_table_else = (t) ->
2366 flag_foster_parenting = true
2368 flag_foster_parenting = false
# Tree-construction handler for the "in table" insertion mode; t is the token
# being processed. Tokens without a dedicated branch are passed to
# ins_mode_in_table_else.
# NOTE(review): several branch headers (token-type tests and some `when`
# labels) are not visible here, so the grouping below is presumed -- confirm.
2370 ins_mode_in_table = (t) ->
2373 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
# start a fresh buffer, remember the current mode, and switch to
# ins_mode_in_table_text
2374 pending_table_character_tokens = []
2375 original_ins_mode = ins_mode
2376 ins_mode = ins_mode_in_table_text
2379 ins_mode_in_table_else t
2387 clear_stack_to_table_context()
2389 insert_html_element t
2390 ins_mode = ins_mode_in_caption
2392 clear_stack_to_table_context()
2393 insert_html_element t
2394 ins_mode = ins_mode_in_column_group
2396 clear_stack_to_table_context()
# a synthetic <colgroup> start tag is inserted here rather than t itself
2397 insert_html_element new_open_tag 'colgroup'
2398 ins_mode = ins_mode_in_column_group
2400 when 'tbody', 'tfoot', 'thead'
2401 clear_stack_to_table_context()
2402 insert_html_element t
2403 ins_mode = ins_mode_in_table_body
2404 when 'td', 'th', 'tr'
2405 clear_stack_to_table_context()
# a synthetic <tbody> start tag is inserted here rather than t itself
2406 insert_html_element new_open_tag 'tbody'
2407 ins_mode = ins_mode_in_table_body
2411 if is_in_table_scope 'table', NS_HTML
2413 el = open_els.shift()
2414 if el.name is 'table' and el.namespace is NS_HTML
2418 when 'style', 'script', 'template'
# inputs for which is_input_hidden_tok is false go to the fallback
2421 unless is_input_hidden_tok t
2422 ins_mode_in_table_else t
2425 el = insert_html_element t
2427 t.acknowledge_self_closing()
2430 if form_element_pointer?
2432 if template_tag_is_open()
# the newly inserted element becomes the form element pointer
2434 form_element_pointer = insert_html_element t
2437 ins_mode_in_table_else t
2441 if is_in_table_scope 'table', NS_HTML
2443 el = open_els.shift()
2444 if el.name is 'table' and el.namespace is NS_HTML
2449 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2454 ins_mode_in_table_else t
2458 ins_mode_in_table_else t
2461 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Tree-construction handler for the "in table text" insertion mode; t is the
# token being processed. Text tokens are appended to
# pending_table_character_tokens. Later the buffered tokens are replayed,
# either through insert_character or through ins_mode_in_table_else, after
# which the buffer is emptied and ins_mode returns to original_ins_mode.
# NOTE(review): the condition choosing between the two replay loops is not
# visible here -- presumably it is whether any buffered token failed
# is_space_tok; confirm.
2462 ins_mode_in_table_text = (t) ->
2463 if t.type is TYPE_TEXT and t.text is "\u0000"
2467 if t.type is TYPE_TEXT
2468 pending_table_character_tokens.push t
# scan the buffer for a token that is not a space token
2472 for old in pending_table_character_tokens
2473 unless is_space_tok old
2477 for old in pending_table_character_tokens
2478 insert_character old
2480 for old in pending_table_character_tokens
2481 ins_mode_in_table_else old
2482 pending_table_character_tokens = []
2483 ins_mode = original_ins_mode
2486 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Tree-construction handler for the "in caption" insertion mode; t is the
# token being processed. Both closing paths visible below shift elements off
# open_els, test for an HTML <caption>, call clear_afe_to_marker() and set
# ins_mode to ins_mode_in_table.
# NOTE(review): the loop headers around the open_els.shift() calls are not
# visible here -- presumably they pop until the caption is removed; confirm.
2487 ins_mode_in_caption = (t) ->
2488 if t.type is TYPE_END_TAG and t.name is 'caption'
2489 if is_in_table_scope 'caption', NS_HTML
2490 generate_implied_end_tags()
# NOTE(review): this current-node test compares only the name; the similar
# checks in other handlers also compare namespace with NS_HTML -- confirm
# the difference is intended
2491 if open_els[0].name isnt 'caption'
2494 el = open_els.shift()
2495 if el.name is 'caption' and el.namespace is NS_HTML
2497 clear_afe_to_marker()
2498 ins_mode = ins_mode_in_table
2503 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2505 if is_in_table_scope 'caption', NS_HTML
2507 el = open_els.shift()
2508 if el.name is 'caption' and el.namespace is NS_HTML
2510 clear_afe_to_marker()
2511 ins_mode = ins_mode_in_table
2513 # else fragment case
2515 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2521 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Tree-construction handler for the "in column group" insertion mode; t is
# the token being processed. A <col> start tag is inserted and its
# self-closing flag acknowledged; closing the colgroup sets ins_mode to
# ins_mode_in_table.
# NOTE(review): the bodies of several branches are not visible here.
2522 ins_mode_in_column_group = (t) ->
2526 if t.type is TYPE_COMMENT
2529 if t.type is TYPE_DOCTYPE
2532 if t.type is TYPE_START_TAG and t.name is 'html'
2535 if t.type is TYPE_START_TAG and t.name is 'col'
2536 el = insert_html_element t
2538 t.acknowledge_self_closing()
2540 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# the namespace must be read from the current node (open_els[0]); reading it
# from the open_els array itself is always undefined, which made this test
# fail for every </colgroup>
2541 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2543 ins_mode = ins_mode_in_table
2547 if t.type is TYPE_END_TAG and t.name is 'col'
2550 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2553 if t.type is TYPE_EOF
2557 if open_els[0].name isnt 'colgroup'
2561 ins_mode = ins_mode_in_table
2565 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Tree-construction handler for the "in table body" insertion mode; t is the
# token being processed. Row starts move to ins_mode_in_row; closing the
# body section moves back to ins_mode_in_table.
# NOTE(review): several statements of this function are not visible here.
2566 ins_mode_in_table_body = (t) ->
2567 if t.type is TYPE_START_TAG and t.name is 'tr'
2568 clear_stack_to_table_body_context()
2569 insert_html_element t
2570 ins_mode = ins_mode_in_row
2572 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2574 clear_stack_to_table_body_context()
# a synthetic <tr> start tag is inserted here rather than t itself
2575 insert_html_element new_open_tag 'tr'
2576 ins_mode = ins_mode_in_row
2579 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2580 unless is_in_table_scope t.name, NS_HTML
2583 clear_stack_to_table_body_context()
2585 ins_mode = ins_mode_in_table
2587 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# presumably inside a scan over open_els (loop header not visible): looks for
# an HTML tbody/tfoot/thead, and tests each element against table_scopers
2590 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2593 if table_scopers[el.name] is el.namespace
2598 clear_stack_to_table_body_context()
2600 ins_mode = ins_mode_in_table
2603 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2609 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Tree-construction handler for the "in row" insertion mode; t is the token
# being processed. A th/td start tag is inserted and switches to
# ins_mode_in_cell; the row-closing branches call
# clear_stack_to_table_row_context() and switch to ins_mode_in_table_body.
# NOTE(review): several statements of this function are not visible here.
2610 ins_mode_in_row = (t) ->
2611 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2612 clear_stack_to_table_row_context()
2613 insert_html_element t
2614 ins_mode = ins_mode_in_cell
2617 if t.type is TYPE_END_TAG and t.name is 'tr'
2618 if is_in_table_scope 'tr', NS_HTML
2619 clear_stack_to_table_row_context()
2621 ins_mode = ins_mode_in_table_body
2625 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2626 if is_in_table_scope 'tr', NS_HTML
2627 clear_stack_to_table_row_context()
2629 ins_mode = ins_mode_in_table_body
2634 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# two scope checks: first the named section, then the <tr>
2635 if is_in_table_scope t.name, NS_HTML
2636 if is_in_table_scope 'tr', NS_HTML
2637 clear_stack_to_table_row_context()
2639 ins_mode = ins_mode_in_table_body
2644 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2650 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps from the spec section linked above: generate implied
# end tags, check that the current node is an HTML td/th, shift elements off
# open_els with a test for an HTML td/th, then clear the active formatting
# elements to the last marker and switch to ins_mode_in_row.
# NOTE(review): the definition line of this helper and the loop around the
# shift are not visible here.
2652 generate_implied_end_tags()
# compare the current node's NAME with 'th'; comparing the element object
# itself with the string was always false
2653 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2656 el = open_els.shift()
2657 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2659 clear_afe_to_marker()
2660 ins_mode = ins_mode_in_row
2662 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Tree-construction handler for the "in cell" insertion mode; t is the token
# being processed. A matching </td> or </th> that is in table scope
# generates implied end tags, shifts elements off open_els, clears the
# active formatting elements to the last marker and switches to
# ins_mode_in_row.
# NOTE(review): several statements of this function are not visible here.
2663 ins_mode_in_cell = (t) ->
2664 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2665 if is_in_table_scope t.name, NS_HTML
2666 generate_implied_end_tags()
2667 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2670 el = open_els.shift()
2671 if el.name is t.name and el.namespace is NS_HTML
2673 clear_afe_to_marker()
2674 ins_mode = ins_mode_in_row
2678 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# presumably inside a scan over open_els (loop header not visible): looks for
# an HTML td/th, and tests each element against table_scopers
2681 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2684 if table_scopers[el.name] is el.namespace
2692 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2695 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2696 if is_in_table_scope t.name, NS_HTML
2705 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Tree-construction handler for the "in select" insertion mode; t is the
# token being processed. option/optgroup start tags are inserted after
# tests on the current node; the select-closing branches shift elements off
# open_els and test for an HTML <select>.
# NOTE(review): the bodies of several branches are not visible here.
2706 ins_mode_in_select = (t) ->
2707 if t.type is TYPE_TEXT and t.text is "\u0000"
2710 if t.type is TYPE_TEXT
2713 if t.type is TYPE_COMMENT
2716 if t.type is TYPE_DOCTYPE
2719 if t.type is TYPE_START_TAG and t.name is 'html'
2722 if t.type is TYPE_START_TAG and t.name is 'option'
2723 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2725 insert_html_element t
2727 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2728 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2730 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2732 insert_html_element t
2734 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2735 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# both the name and the namespace are read from the node just above the
# current one (open_els[1]); open_els[0]'s namespace was already tested by
# the enclosing condition
2736 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2738 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2743 if t.type is TYPE_END_TAG and t.name is 'option'
2744 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2749 if t.type is TYPE_END_TAG and t.name is 'select'
2750 if is_in_select_scope 'select', NS_HTML
2752 el = open_els.shift()
2753 if el.name is 'select' and el.namespace is NS_HTML
2759 if t.type is TYPE_START_TAG and t.name is 'select'
2762 el = open_els.shift()
2763 if el.name is 'select' and el.namespace is NS_HTML
2766 # spec says that this is the same as </select> but it doesn't say
2767 # to check scope first
2769 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2771 unless is_in_select_scope 'select', NS_HTML
2774 el = open_els.shift()
2775 if el.name is 'select' and el.namespace is NS_HTML
2780 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2783 if t.type is TYPE_EOF
2790 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Tree-construction handler for the "in select in table" insertion mode; t is
# the token being processed. Table-structure start and end tags shift
# elements off open_els with a test for an HTML <select>; the call at the
# end forwards t to ins_mode_in_select.
# NOTE(review): the loop headers and what follows the shifts are not visible
# here.
2791 ins_mode_in_select_in_table = (t) ->
2792 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2795 el = open_els.shift()
2796 if el.name is 'select' and el.namespace is NS_HTML
2801 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
# the end-tag branch tests table scope for the tag's own name first
2803 unless is_in_table_scope t.name, NS_HTML
2806 el = open_els.shift()
2807 if el.name is 'select' and el.namespace is NS_HTML
2813 ins_mode_in_select t
2816 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Tree-construction handler for the "in template" insertion mode; t is the
# token being processed. For start tags, the entry at the front of
# template_ins_modes is replaced (shift + unshift) with the insertion mode
# chosen by the tag name, and ins_mode is set to that same mode.
# NOTE(review): what happens after each switch is not visible here --
# presumably the token is reprocessed in the new mode; confirm.
2817 ins_mode_in_template = (t) ->
2818 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2821 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2824 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2825 template_ins_modes.shift()
2826 template_ins_modes.unshift ins_mode_in_table
2827 ins_mode = ins_mode_in_table
2830 if t.type is TYPE_START_TAG and t.name is 'col'
2831 template_ins_modes.shift()
2832 template_ins_modes.unshift ins_mode_in_column_group
2833 ins_mode = ins_mode_in_column_group
2836 if t.type is TYPE_START_TAG and t.name is 'tr'
2837 template_ins_modes.shift()
2838 template_ins_modes.unshift ins_mode_in_table_body
2839 ins_mode = ins_mode_in_table_body
2842 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2843 template_ins_modes.shift()
2844 template_ins_modes.unshift ins_mode_in_row
2845 ins_mode = ins_mode_in_row
# any other start tag: continue in ins_mode_in_body
2848 if t.type is TYPE_START_TAG
2849 template_ins_modes.shift()
2850 template_ins_modes.unshift ins_mode_in_body
2851 ins_mode = ins_mode_in_body
2854 if t.type is TYPE_END_TAG
2857 if t.type is TYPE_EOF
2858 unless template_tag_is_open()
# elements are shifted off open_els with a test for an HTML <template>, then
# the formatting list is cleared to its last marker and the front template
# insertion mode is dropped
2863 el = open_els.shift()
2864 if el.name is 'template' and el.namespace is NS_HTML
2866 clear_afe_to_marker()
2867 template_ins_modes.shift()
2871 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Tree-construction handler for the "after body" insertion mode; t is the
# token being processed.
# NOTE(review): several statements of this function are not visible here.
2872 ins_mode_after_body = (t) ->
2876 if t.type is TYPE_COMMENT
# the comment is inserted at the end of the children of the element stored
# at the last index of open_els
2877 first = open_els[open_els.length - 1]
2878 insert_comment t, [first, first.children.length]
2880 if t.type is TYPE_DOCTYPE
2883 if t.type is TYPE_START_TAG and t.name is 'html'
2886 if t.type is TYPE_END_TAG and t.name is 'html'
2887 if flag_fragment_parsing
2890 ins_mode = ins_mode_after_after_body
2892 if t.type is TYPE_EOF
# presumably the "anything else" case: switch back to ins_mode_in_body
2897 ins_mode = ins_mode_in_body
2900 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Tree-construction handler for the "in frameset" insertion mode; t is the
# token being processed. frameset and frame start tags are inserted (frame
# also acknowledges its self-closing flag).
# NOTE(review): several statements of this function are not visible here.
2901 ins_mode_in_frameset = (t) ->
2905 if t.type is TYPE_COMMENT
2908 if t.type is TYPE_DOCTYPE
2911 if t.type is TYPE_START_TAG and t.name is 'html'
2914 if t.type is TYPE_START_TAG and t.name is 'frameset'
2915 insert_html_element t
2917 if t.type is TYPE_END_TAG and t.name is 'frameset'
2918 if open_els.length is 1
2920 return # fragment case
# outside fragment parsing, a current node that is not a frameset leads to
# ins_mode_after_frameset
2922 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2923 ins_mode = ins_mode_after_frameset
2925 if t.type is TYPE_START_TAG and t.name is 'frame'
2926 insert_html_element t
2928 t.acknowledge_self_closing()
2930 if t.type is TYPE_START_TAG and t.name is 'noframes'
2933 if t.type is TYPE_EOF
2934 if open_els.length isnt 1
2942 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Tree-construction handler for the "after frameset" insertion mode; t is the
# token being processed. An </html> end tag switches to
# ins_mode_after_after_frameset.
# NOTE(review): the bodies of the other branches are not visible here.
2943 ins_mode_after_frameset = (t) ->
2947 if t.type is TYPE_COMMENT
2950 if t.type is TYPE_DOCTYPE
2953 if t.type is TYPE_START_TAG and t.name is 'html'
2956 if t.type is TYPE_END_TAG and t.name is 'html'
2957 ins_mode = ins_mode_after_after_frameset
2959 if t.type is TYPE_START_TAG and t.name is 'noframes'
2962 if t.type is TYPE_EOF
2969 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Tree-construction handler for the "after after body" insertion mode; t is
# the token being processed. Comments are inserted at the end of doc's
# children.
# NOTE(review): the bodies of the other branches are not visible here.
2970 ins_mode_after_after_body = (t) ->
2971 if t.type is TYPE_COMMENT
2972 insert_comment t, [doc, doc.children.length]
2974 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2977 if t.type is TYPE_EOF
# presumably the "anything else" case: switch back to ins_mode_in_body
2982 ins_mode = ins_mode_in_body
2986 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Tree-construction handler for the "after after frameset" insertion mode;
# t is the token being processed. Comments are inserted at the end of doc's
# children.
# NOTE(review): the bodies of the other branches are not visible here.
2987 ins_mode_after_after_frameset = (t) ->
2988 if t.type is TYPE_COMMENT
2989 insert_comment t, [doc, doc.children.length]
2991 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2994 if t.type is TYPE_EOF
2997 if t.type is TYPE_START_TAG and t.name is 'noframes'
3004 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Helper for in_foreign_content's <font> test: checks an attribute name
# (a[0]) against 'color', 'face' and 'size'.
# NOTE(review): the loop that produces `a` and the return statements are not
# visible here -- presumably it walks the attributes of token t and returns
# whether any name matched; confirm.
3005 has_color_face_or_size = (t) ->
3007 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Called by in_foreign_content_other_start for a self-closing script start
# tag, and by in_foreign_content for a </script> whose current node is an
# SVG script element.
# NOTE(review): the body of this function is not visible here.
3010 in_foreign_content_end_script = ->
# Handles a start tag token t inside foreign content. Depending on the
# namespace of the adjusted current node it adjusts MathML or SVG attributes
# (and renames t through svg_name_fixes for SVG), adjusts foreign attributes,
# inserts the element in that namespace, and acknowledges a self-closing flag.
3014 in_foreign_content_other_start = (t) ->
3015 acn = adjusted_current_node()
3016 if acn.namespace is NS_MATHML
3017 adjust_mathml_attributes t
# the tag name is replaced only when svg_name_fixes has an entry for it
3018 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3019 t.name = svg_name_fixes[t.name]
3020 if acn.namespace is NS_SVG
3021 adjust_svg_attributes t
3022 adjust_foreign_attributes t
3023 insert_foreign_element t, acn.namespace
3024 if t.flag 'self-closing'
# a self-closing script additionally runs the script end-tag handling
3025 if t.name is 'script'
3026 t.acknowledge_self_closing()
3027 in_foreign_content_end_script()
3031 t.acknowledge_self_closing()
# Token handler for foreign (MathML/SVG) content; t is the token being
# processed. Start tags that have no special branch go to
# in_foreign_content_other_start; the final call hands t to the current HTML
# insertion mode.
# NOTE(review): several statements of this function are not visible here.
3033 in_foreign_content = (t) ->
3034 if t.type is TYPE_TEXT and t.text is "\u0000"
# a U+0000 text token is inserted as U+FFFD instead
3036 insert_character new_character_token "\ufffd"
3041 if t.type is TYPE_TEXT
3042 flag_frameset_ok = false
3045 if t.type is TYPE_COMMENT
3048 if t.type is TYPE_DOCTYPE
3051 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2053_PLACEHOLDER
3088 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer "data" state: consumes one character from txt at cur and either
# returns a token or switches tok_state to tok_state_tag_open.
# NOTE(review): the definition line and the `when` labels are not visible
# here -- other states refer to tok_state_data, presumably this function.
3090 switch c = txt.charAt(cur++)
3092 return new_text_node parse_character_reference()
3094 tok_state = tok_state_tag_open
3097 return new_text_node c
3099 return new_eof_token()
3101 return new_text_node c
3104 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3105 # not needed: tok_state_character_reference_in_data = ->
3106 # just call parse_character_reference()
3108 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer "RCDATA" state: consumes one character from txt at cur and either
# returns a token or switches tok_state to tok_state_rcdata_less_than_sign.
# NOTE(review): the `when` labels are not visible here.
3109 tok_state_rcdata = ->
3110 switch c = txt.charAt(cur++)
3112 return new_text_node parse_character_reference()
3114 tok_state = tok_state_rcdata_less_than_sign
# U+FFFD replacement character
3117 return new_character_token "\ufffd"
3119 return new_eof_token()
3121 return new_character_token c
3124 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3125 # not needed: tok_state_character_reference_in_rcdata = ->
3126 # just call parse_character_reference()
3128 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer "RAWTEXT" state: consumes one character from txt at cur and
# either returns a token or switches tok_state to
# tok_state_rawtext_less_than_sign.
# NOTE(review): the `when` labels are not visible here.
3129 tok_state_rawtext = ->
3130 switch c = txt.charAt(cur++)
3132 tok_state = tok_state_rawtext_less_than_sign
# U+FFFD replacement character
3135 return new_character_token "\ufffd"
3137 return new_eof_token()
3139 return new_character_token c
3142 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer "script data" state: consumes one character from txt at cur and
# either returns a token or switches tok_state to
# tok_state_script_data_less_than_sign.
# NOTE(review): the `when` labels are not visible here.
3143 tok_state_script_data = ->
3144 switch c = txt.charAt(cur++)
3146 tok_state = tok_state_script_data_less_than_sign
# U+FFFD replacement character
3149 return new_character_token "\ufffd"
3151 return new_eof_token()
3153 return new_character_token c
3156 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer "PLAINTEXT" state: consumes one character from txt at cur and
# returns a token for it; no tok_state change is visible in this state.
# NOTE(review): the `when` labels are not visible here.
3157 tok_state_plaintext = ->
3158 switch c = txt.charAt(cur++)
# U+FFFD replacement character
3161 return new_character_token "\ufffd"
3163 return new_eof_token()
3165 return new_character_token c
3169 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer "tag open" state, entered after a '<'. Consumes one character
# and, depending on it, switches to the markup-declaration, end-tag-open,
# tag-name or bogus-comment state, or falls back to the data state.
# NOTE(review): the conditions selecting each branch are not visible here.
3170 tok_state_tag_open = ->
3171 c = txt.charAt(cur++)
3173 tok_state = tok_state_markup_declaration_open
3176 tok_state = tok_state_end_tag_open
# start a new open-tag token whose name begins with the lowercased character
3179 tok_cur_tag = new_open_tag c.toLowerCase()
3180 tok_state = tok_state_tag_name
3183 tok_cur_tag = new_open_tag c
3184 tok_state = tok_state_tag_name
3188 tok_cur_tag = new_comment_token '?' # FIXME right?
3189 tok_state = tok_state_bogus_comment
# fallback: the '<' is returned as text and the character is handed back
3193 tok_state = tok_state_data
3194 cur -= 1 # we didn't parse/handle the char after <
3195 return new_text_node '<'
3197 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer "end tag open" state. Consumes one character and, depending on
# it, starts an end-tag token and moves to the tag-name state, returns to
# the data state, or starts a comment token and moves to the bogus-comment
# state.
# NOTE(review): the conditions selecting each branch are not visible here.
3198 tok_state_end_tag_open = ->
3199 c = txt.charAt(cur++)
# start a new end-tag token whose name begins with the lowercased character
3201 tok_cur_tag = new_end_tag c.toLowerCase()
3202 tok_state = tok_state_tag_name
3205 tok_cur_tag = new_end_tag c
3206 tok_state = tok_state_tag_name
3210 tok_state = tok_state_data
3214 tok_state = tok_state_data
3215 return new_text_node '</'
3218 tok_cur_tag = new_comment_token c
3219 tok_state = tok_state_bogus_comment
3222 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer "tag name" state: consumes one character; whitespace moves to
# the before-attribute-name state, other branches move to the self-closing or
# data state, and name characters are appended to tok_cur_tag.name.
# NOTE(review): all `when` labels except the whitespace one are not visible
# here.
3223 tok_state_tag_name = ->
3224 switch c = txt.charAt(cur++)
3225 when "\t", "\n", "\u000c", ' '
3226 tok_state = tok_state_before_attribute_name
3228 tok_state = tok_state_self_closing_start_tag
3230 tok_state = tok_state_data
# U+FFFD replacement character is appended to the tag name
3236 tok_cur_tag.name += "\ufffd"
3239 tok_state = tok_state_data
# tag names are accumulated in lowercase
3242 tok_cur_tag.name += c.toLowerCase()
3244 tok_cur_tag.name += c
3247 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer "RCDATA less-than sign" state: consumes one character and either
# resets temporary_buffer and moves to the RCDATA end-tag-open state, or goes
# back to RCDATA, un-consumes the character and returns a '<' character token.
# NOTE(review): the condition selecting the branch is not visible here.
3248 tok_state_rcdata_less_than_sign = ->
3249 c = txt.charAt(cur++)
3251 temporary_buffer = ''
3252 tok_state = tok_state_rcdata_end_tag_open
3255 tok_state = tok_state_rcdata
3256 cur -= 1 # reconsume the input character
3257 return new_character_token '<'
3259 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer "RCDATA end tag open" state: consumes one character and either
# starts an end-tag token (recording the raw character in temporary_buffer)
# and moves to the RCDATA end-tag-name state, or goes back to RCDATA,
# un-consumes the character and returns "</" as a character token.
# NOTE(review): the conditions selecting each branch are not visible here.
3260 tok_state_rcdata_end_tag_open = ->
3261 c = txt.charAt(cur++)
# the tag name gets the lowercased character, the buffer the original one
3263 tok_cur_tag = new_end_tag c.toLowerCase()
3264 temporary_buffer += c
3265 tok_state = tok_state_rcdata_end_tag_name
3268 tok_cur_tag = new_end_tag c
3269 temporary_buffer += c
3270 tok_state = tok_state_rcdata_end_tag_name
3273 tok_state = tok_state_rcdata
3274 cur -= 1 # reconsume the input character
3275 return new_character_token "</" # fixfull separate these
3277 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token t is an end tag whose name equals the name of the
# current node (open_els[0]); logs the token and the open-element stack
# through debug_log on every call.
3278 is_appropriate_end_tag = (t) ->
3279 # spec says to check against "the tag name of the last start tag to
3280 # have been emitted from this tokenizer", but this is only called from
3281 # the various "raw" states, so it's hopefully ok to assume that
3282 # open_els[0].name will work instead TODO: verify this after the script
3283 # data states are implemented
3284 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3285 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3287 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer "RCDATA end tag name" state: consumes one character. When
# tok_cur_tag is an appropriate end tag, whitespace and two further
# characters move to the before-attribute-name, self-closing-start-tag and
# data states. Name characters are appended to both tok_cur_tag.name and
# temporary_buffer. Otherwise the state returns to RCDATA, un-consumes the
# character and returns '</' plus the buffered text as a character token.
# NOTE(review): the character tests for most branches are not visible here.
3288 tok_state_rcdata_end_tag_name = ->
3289 c = txt.charAt(cur++)
3290 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3291 if is_appropriate_end_tag tok_cur_tag
3292 tok_state = tok_state_before_attribute_name
3294 # else fall through to "Anything else"
3296 if is_appropriate_end_tag tok_cur_tag
3297 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3299 # else fall through to "Anything else"
3301 if is_appropriate_end_tag tok_cur_tag
3302 tok_state = tok_state_data
3304 # else fall through to "Anything else"
# the tag name gets the lowercased character, the buffer the original one
3306 tok_cur_tag.name += c.toLowerCase()
3307 temporary_buffer += c
3310 tok_cur_tag.name += c
3311 temporary_buffer += c
3314 tok_state = tok_state_rcdata
3315 cur -= 1 # reconsume the input character
3316 return new_character_token '</' + temporary_buffer # fixfull separate these
3318 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3319 tok_state_rawtext_less_than_sign = ->
3320 c = txt.charAt(cur++)
3322 temporary_buffer = ''
3323 tok_state = tok_state_rawtext_end_tag_open
3326 tok_state = tok_state_rawtext
3327 cur -= 1 # reconsume the input character
3328 return new_character_token '<'
3330 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3331 tok_state_rawtext_end_tag_open = ->
3332 c = txt.charAt(cur++)
3334 tok_cur_tag = new_end_tag c.toLowerCase()
3335 temporary_buffer += c
3336 tok_state = tok_state_rawtext_end_tag_name
3339 tok_cur_tag = new_end_tag c
3340 temporary_buffer += c
3341 tok_state = tok_state_rawtext_end_tag_name
3344 tok_state = tok_state_rawtext
3345 cur -= 1 # reconsume the input character
3346 return new_character_token "</" # fixfull separate these
3348 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3349 tok_state_rawtext_end_tag_name = ->
3350 c = txt.charAt(cur++)
3351 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3352 if is_appropriate_end_tag tok_cur_tag
3353 tok_state = tok_state_before_attribute_name
3355 # else fall through to "Anything else"
3357 if is_appropriate_end_tag tok_cur_tag
3358 tok_state = tok_state_self_closing_start_tag
3360 # else fall through to "Anything else"
3362 if is_appropriate_end_tag tok_cur_tag
3363 tok_state = tok_state_data
3365 # else fall through to "Anything else"
3367 tok_cur_tag.name += c.toLowerCase()
3368 temporary_buffer += c
3371 tok_cur_tag.name += c
3372 temporary_buffer += c
3375 tok_state = tok_state_rawtext
3376 cur -= 1 # reconsume the input character
3377 return new_character_token '</' + temporary_buffer # fixfull separate these
3379 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3380 tok_state_script_data_less_than_sign = ->
3381 c = txt.charAt(cur++)
3383 temporary_buffer = ''
3384 tok_state = tok_state_script_data_end_tag_open
3387 tok_state = tok_state_script_data_escape_start
3388 return new_character_token '<!' # fixfull split
3390 tok_state = tok_state_script_data
3391 cur -= 1 # Reconsume
3392 return new_character_token '<'
3394 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3395 tok_state_script_data_end_tag_open = ->
3396 c = txt.charAt(cur++)
3398 tok_cur_tag = new_end_tag c.toLowerCase()
3399 temporary_buffer += c
3400 tok_state = tok_state_script_data_end_tag_name
3403 tok_cur_tag = new_end_tag c
3404 temporary_buffer += c
3405 tok_state = tok_state_script_data_end_tag_name
3408 tok_state = tok_state_script_data
3409 cur -= 1 # Reconsume
3410 return new_character_token '</'
3412 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3413 tok_state_script_data_end_tag_name = ->
3414 c = txt.charAt(cur++)
3415 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3416 if is_appropriate_end_tag tok_cur_tag
3417 tok_state = tok_state_before_attribute_name
3421 if is_appropriate_end_tag tok_cur_tag
3422 tok_state = tok_state_self_closing_start_tag
3426 if is_appropriate_end_tag tok_cur_tag
3427 tok_state = tok_state_data
3431 tok_cur_tag.name += c.toLowerCase()
3432 temporary_buffer += c
3435 tok_cur_tag.name += c
3436 temporary_buffer += c
3439 tok_state = tok_state_script_data
3440 cur -= 1 # Reconsume
3441 return new_character_token "</#{temporary_buffer}" # fixfull split
3443 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3444 tok_state_script_data_escape_start = ->
3445 c = txt.charAt(cur++)
3447 tok_state = tok_state_script_data_escape_start_dash
3448 return new_character_token '-'
3450 tok_state = tok_state_script_data
3451 cur -= 1 # Reconsume
3454 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3455 tok_state_script_data_escape_start_dash = ->
3456 c = txt.charAt(cur++)
3458 tok_state = tok_state_script_data_escaped_dash_dash
3459 return new_character_token '-'
3461 tok_state = tok_state_script_data
3462 cur -= 1 # Reconsume
3465 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3466 tok_state_script_data_escaped = ->
3467 c = txt.charAt(cur++)
3469 tok_state = tok_state_script_data_escaped_dash
3470 return new_character_token '-'
3472 tok_state = tok_state_script_data_escaped_less_than_sign
3476 return new_character_token "\ufffd"
3478 tok_state = tok_state_data
3480 cur -= 1 # Reconsume
3483 return new_character_token c
3485 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3486 tok_state_script_data_escaped_dash = ->
3487 c = txt.charAt(cur++)
3489 tok_state = tok_state_script_data_escaped_dash_dash
3490 return new_character_token '-'
3492 tok_state = tok_state_script_data_escaped_less_than_sign
3496 tok_state = tok_state_script_data_escaped
3497 return new_character_token "\ufffd"
3499 tok_state = tok_state_data
3501 cur -= 1 # Reconsume
3504 tok_state = tok_state_script_data_escaped
3505 return new_character_token c
3507 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3508 tok_state_script_data_escaped_dash_dash = ->
3509 c = txt.charAt(cur++)
3511 return new_character_token '-'
3513 tok_state = tok_state_script_data_escaped_less_than_sign
3516 tok_state = tok_state_script_data
3517 return new_character_token '>'
3520 tok_state = tok_state_script_data_escaped
3521 return new_character_token "\ufffd"
3524 tok_state = tok_state_data
3525 cur -= 1 # Reconsume
3528 tok_state = tok_state_script_data_escaped
3529 return new_character_token c
3531 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3532 tok_state_script_data_escaped_less_than_sign = ->
3533 c = txt.charAt(cur++)
3535 temporary_buffer = ''
3536 tok_state = tok_state_script_data_escaped_end_tag_open
3539 temporary_buffer = c.toLowerCase() # yes, really
3540 tok_state = tok_state_script_data_double_escape_start
3541 return new_character_token "<#{c}" # fixfull split
3543 temporary_buffer = c
3544 tok_state = tok_state_script_data_double_escape_start
3545 return new_character_token "<#{c}" # fixfull split
3547 tok_state = tok_state_script_data_escaped
3548 cur -= 1 # Reconsume
3549 return new_character_token '<'
3551 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3552 tok_state_script_data_escaped_end_tag_open = ->
3553 c = txt.charAt(cur++)
3555 tok_cur_tag = new_end_tag c.toLowerCase()
3556 temporary_buffer += c
3557 tok_state = tok_state_script_data_escaped_end_tag_name
3560 tok_cur_tag = new_end_tag c
3561 temporary_buffer += c
3562 tok_state = tok_state_script_data_escaped_end_tag_name
3565 tok_state = tok_state_script_data_escaped
3566 cur -= 1 # Reconsume
3567 return new_character_token '</' # fixfull split
3569 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3570 tok_state_script_data_escaped_end_tag_name = -> # consumes one char of a candidate end tag name inside escaped script data
3571 c = txt.charAt(cur++)
3572 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3573 if is_appropriate_end_tag tok_cur_tag
3574 tok_state = tok_state_before_attribute_name
3578 if is_appropriate_end_tag tok_cur_tag
3579 tok_state = tok_state_self_closing_start_tag
3583 if is_appropriate_end_tag tok_cur_tag
3584 tok_state = tok_state_data
3588 tok_cur_tag.name += c.toLowerCase()
3589 temporary_buffer += c # keep the input's case: the buffer is emitted verbatim below when this is not an end tag
3592 tok_cur_tag.name += c
3593 temporary_buffer += c
3596 tok_state = tok_state_script_data_escaped
3597 cur -= 1 # Reconsume
3598 return new_character_token "</#{temporary_buffer}" # fixfull split
3600 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3601 tok_state_script_data_double_escape_start = ->
3602 c = txt.charAt(cur++)
3603 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3604 if temporary_buffer is 'script'
3605 tok_state = tok_state_script_data_double_escaped
3607 tok_state = tok_state_script_data_escaped
3608 return new_character_token c
3610 temporary_buffer += c.toLowerCase() # yes, really lowercase
3611 return new_character_token c
3613 temporary_buffer += c
3614 return new_character_token c
3616 tok_state = tok_state_script_data_escaped
3617 cur -= 1 # Reconsume
3620 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3621 tok_state_script_data_double_escaped = ->
3622 c = txt.charAt(cur++)
3624 tok_state = tok_state_script_data_double_escaped_dash
3625 return new_character_token '-'
3627 tok_state = tok_state_script_data_double_escaped_less_than_sign
3628 return new_character_token '<'
3631 return new_character_token "\ufffd"
3634 tok_state = tok_state_data
3635 cur -= 1 # Reconsume
3638 return new_character_token c
3640 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3641 tok_state_script_data_double_escaped_dash = ->
3642 c = txt.charAt(cur++)
3644 tok_state = tok_state_script_data_double_escaped_dash_dash
3645 return new_character_token '-'
3647 tok_state = tok_state_script_data_double_escaped_less_than_sign
3648 return new_character_token '<'
3651 tok_state = tok_state_script_data_double_escaped
3652 return new_character_token "\ufffd"
3655 tok_state = tok_state_data
3656 cur -= 1 # Reconsume
3659 tok_state = tok_state_script_data_double_escaped
3660 return new_character_token c
3662 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3663 tok_state_script_data_double_escaped_dash_dash = ->
3664 c = txt.charAt(cur++)
3666 return new_character_token '-'
3668 tok_state = tok_state_script_data_double_escaped_less_than_sign
3669 return new_character_token '<'
3671 tok_state = tok_state_script_data
3672 return new_character_token '>'
3675 tok_state = tok_state_script_data_double_escaped
3676 return new_character_token "\ufffd"
3679 tok_state = tok_state_data
3680 cur -= 1 # Reconsume
3683 tok_state = tok_state_script_data_double_escaped
3684 return new_character_token c
3686 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3687 tok_state_script_data_double_escaped_less_than_sign = ->
3688 c = txt.charAt(cur++)
3690 temporary_buffer = ''
3691 tok_state = tok_state_script_data_double_escape_end
3692 return new_character_token '/'
3694 tok_state = tok_state_script_data_double_escaped
3695 cur -= 1 # Reconsume
3698 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3699 tok_state_script_data_double_escape_end = ->
3700 c = txt.charAt(cur++)
3701 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3702 if temporary_buffer is 'script'
3703 tok_state = tok_state_script_data_escaped
3705 tok_state = tok_state_script_data_double_escaped
3706 return new_character_token c
3708 temporary_buffer += c.toLowerCase() # yes, really lowercase
3709 return new_character_token c
3711 temporary_buffer += c
3712 return new_character_token c
3714 tok_state = tok_state_script_data_double_escaped
3715 cur -= 1 # Reconsume
3718 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3719 tok_state_before_attribute_name = ->
3721 switch c = txt.charAt(cur++)
3722 when "\t", "\n", "\u000c", ' '
3725 tok_state = tok_state_self_closing_start_tag
3728 tok_state = tok_state_data
3734 attr_name = "\ufffd"
3735 when '"', "'", '<', '='
3740 tok_state = tok_state_data
3743 attr_name = c.toLowerCase()
3747 tok_cur_tag.attrs_a.unshift [attr_name, '']
3748 tok_state = tok_state_attribute_name
3751 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3752 tok_state_attribute_name = ->
3753 switch c = txt.charAt(cur++)
3754 when "\t", "\n", "\u000c", ' '
3755 tok_state = tok_state_after_attribute_name
3757 tok_state = tok_state_self_closing_start_tag
3759 tok_state = tok_state_before_attribute_value
3761 tok_state = tok_state_data
3767 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3770 tok_cur_tag.attrs_a[0][0] += c
3773 tok_state = tok_state_data
3776 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3778 tok_cur_tag.attrs_a[0][0] += c
3781 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3782 tok_state_after_attribute_name = ->
3783 c = txt.charAt(cur++)
3784 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3787 tok_state = tok_state_self_closing_start_tag
3790 tok_state = tok_state_before_attribute_value
3793 tok_state = tok_state_data
3796 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3797 tok_state = tok_state_attribute_name
3801 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3802 tok_state = tok_state_attribute_name
3806 tok_state = tok_state_data
3807 cur -= 1 # reconsume
3809 if c is '"' or c is "'" or c is '<'
3811 # fall through to Anything else
3813 tok_cur_tag.attrs_a.unshift [c, '']
3814 tok_state = tok_state_attribute_name
3816 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3817 tok_state_before_attribute_value = ->
3818 switch c = txt.charAt(cur++)
3819 when "\t", "\n", "\u000c", ' '
3822 tok_state = tok_state_attribute_value_double_quoted
3824 tok_state = tok_state_attribute_value_unquoted
3827 tok_state = tok_state_attribute_value_single_quoted
3830 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3831 tok_state = tok_state_attribute_value_unquoted
3834 tok_state = tok_state_data
3840 tok_state = tok_state_data
3842 tok_cur_tag.attrs_a[0][1] += c
3843 tok_state = tok_state_attribute_value_unquoted
3846 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3847 tok_state_attribute_value_double_quoted = ->
3848 switch c = txt.charAt(cur++)
3850 tok_state = tok_state_after_attribute_value_quoted
3852 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3855 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3858 tok_state = tok_state_data
3860 tok_cur_tag.attrs_a[0][1] += c
3863 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3864 tok_state_attribute_value_single_quoted = ->
3865 switch c = txt.charAt(cur++)
3867 tok_state = tok_state_after_attribute_value_quoted
3869 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3872 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3875 tok_state = tok_state_data
3877 tok_cur_tag.attrs_a[0][1] += c
3880 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3881 tok_state_attribute_value_unquoted = ->
3882 switch c = txt.charAt(cur++)
3883 when "\t", "\n", "\u000c", ' '
3884 tok_state = tok_state_before_attribute_name
3886 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3888 tok_state = tok_state_data
3893 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3896 tok_state = tok_state_data
3898 # Parse Error if ', <, = or ` (backtick)
3899 tok_cur_tag.attrs_a[0][1] += c
3902 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3903 tok_state_after_attribute_value_quoted = ->
3904 switch c = txt.charAt(cur++)
3905 when "\t", "\n", "\u000c", ' '
3906 tok_state = tok_state_before_attribute_name
3908 tok_state = tok_state_self_closing_start_tag
3910 tok_state = tok_state_data
3916 tok_state = tok_state_data
3919 tok_state = tok_state_before_attribute_name
3920 cur -= 1 # we didn't handle that char
3923 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3924 tok_state_self_closing_start_tag = ->
3925 c = txt.charAt(cur++)
3927 tok_cur_tag.flag 'self-closing', true
3928 tok_state = tok_state_data
3932 tok_state = tok_state_data
3933 cur -= 1 # Reconsume
3937 tok_state = tok_state_before_attribute_name
3938 cur -= 1 # Reconsume
3941 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3942 # WARNING: put a comment token in tok_cur_tag before setting this state
3943 tok_state_bogus_comment = ->
3944 next_gt = txt.indexOf '>', cur
3946 val = txt.substr cur
3949 val = txt.substr cur, (next_gt - cur)
3951 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3952 tok_cur_tag.text += val
3953 tok_state = tok_state_data
3956 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3957 tok_state_markup_declaration_open = ->
3958 if txt.substr(cur, 2) is '--'
3960 tok_cur_tag = new_comment_token ''
3961 tok_state = tok_state_comment_start
3963 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3965 tok_state = tok_state_doctype
3967 acn = adjusted_current_node()
3968 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3970 tok_state = tok_state_cdata_section
3974 tok_cur_tag = new_comment_token ''
3975 tok_state = tok_state_bogus_comment
3978 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3979 tok_state_comment_start = -> # handles the first character after the opening of a comment
3980 switch c = txt.charAt(cur++)
3982 tok_state = tok_state_comment_start_dash
3985 tok_state = tok_state_comment
3986 tok_cur_tag.text += "\ufffd" # append to the comment's data (as the other comment states do) rather than emitting a character token
3989 tok_state = tok_state_data
3993 tok_state = tok_state_data
3994 cur -= 1 # Reconsume
3997 tok_cur_tag.text += c
3998 tok_state = tok_state_comment
4001 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
4002 tok_state_comment_start_dash = ->
4003 switch c = txt.charAt(cur++)
4005 tok_state = tok_state_comment_end
4008 tok_cur_tag.text += "-\ufffd"
4009 tok_state = tok_state_comment
4012 tok_state = tok_state_data
4016 tok_state = tok_state_data
4017 cur -= 1 # Reconsume
4020 tok_cur_tag.text += "-#{c}"
4021 tok_state = tok_state_comment
4024 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
4025 tok_state_comment = ->
4026 switch c = txt.charAt(cur++)
4028 tok_state = tok_state_comment_end_dash
4031 tok_cur_tag.text += "\ufffd"
4034 tok_state = tok_state_data
4035 cur -= 1 # Reconsume
4038 tok_cur_tag.text += c
4041 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
4042 tok_state_comment_end_dash = ->
4043 switch c = txt.charAt(cur++)
4045 tok_state = tok_state_comment_end
4048 tok_cur_tag.text += "-\ufffd"
4049 tok_state = tok_state_comment
4052 tok_state = tok_state_data
4053 cur -= 1 # Reconsume
4056 tok_cur_tag.text += "-#{c}"
4057 tok_state = tok_state_comment
4060 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
4061 tok_state_comment_end = ->
4062 switch c = txt.charAt(cur++)
4064 tok_state = tok_state_data
4068 tok_cur_tag.text += "--\ufffd"
4069 tok_state = tok_state_comment
4072 tok_state = tok_state_comment_end_bang
4075 tok_cur_tag.text += '-'
4078 tok_state = tok_state_data
4079 cur -= 1 # Reconsume
4083 tok_cur_tag.text += "--#{c}"
4084 tok_state = tok_state_comment
4087 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4088 tok_state_comment_end_bang = -> # handles the character following "--!" inside a comment
4089 switch c = txt.charAt(cur++)
4091 tok_cur_tag.text += "--!" # tok_state_comment_end_dash appends the pending '-' itself, so adding c here would double it
4092 tok_state = tok_state_comment_end_dash
4094 tok_state = tok_state_data
4098 tok_cur_tag.text += "--!\ufffd"
4099 tok_state = tok_state_comment
4102 tok_state = tok_state_data
4103 cur -= 1 # Reconsume
4106 tok_cur_tag.text += "--!#{c}"
4107 tok_state = tok_state_comment
4110 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4111 tok_state_doctype = ->
4112 switch c = txt.charAt(cur++)
4113 when "\t", "\u000a", "\u000c", ' '
4114 tok_state = tok_state_before_doctype_name
4117 tok_state = tok_state_data
4118 el = new_doctype_token ''
4119 el.flag 'force-quirks', true
4120 cur -= 1 # Reconsume
4124 tok_state = tok_state_before_doctype_name
4125 cur -= 1 # Reconsume
4128 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
4129 tok_state_before_doctype_name = ->
4130 c = txt.charAt(cur++)
4131 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4134 tok_cur_tag = new_doctype_token c.toLowerCase()
4135 tok_state = tok_state_doctype_name
4139 tok_cur_tag = new_doctype_token "\ufffd"
4140 tok_state = tok_state_doctype_name
4144 el = new_doctype_token ''
4145 el.flag 'force-quirks', true
4146 tok_state = tok_state_data
4150 tok_state = tok_state_data
4151 el = new_doctype_token ''
4152 el.flag 'force-quirks', true
4153 cur -= 1 # Reconsume
4156 tok_cur_tag = new_doctype_token c
4157 tok_state = tok_state_doctype_name
4160 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4161 tok_state_doctype_name = ->
4162 c = txt.charAt(cur++)
4163 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4164 tok_state = tok_state_after_doctype_name
4167 tok_state = tok_state_data
4170 tok_cur_tag.name += c.toLowerCase()
4174 tok_cur_tag.name += "\ufffd"
4178 tok_state = tok_state_data
4179 tok_cur_tag.flag 'force-quirks', true
4180 cur -= 1 # Reconsume
4183 tok_cur_tag.name += c
4186 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4187 tok_state_after_doctype_name = ->
4188 c = txt.charAt(cur++)
4189 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4192 tok_state = tok_state_data
4196 tok_state = tok_state_data
4197 tok_cur_tag.flag 'force-quirks', true
4198 cur -= 1 # Reconsume
4201 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4203 tok_state = tok_state_after_doctype_public_keyword
4205 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4207 tok_state = tok_state_after_doctype_system_keyword
4210 tok_cur_tag.flag 'force-quirks', true
4211 tok_state = tok_state_bogus_doctype
4214 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4215 tok_state_after_doctype_public_keyword = ->
4216 c = txt.charAt(cur++)
4217 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4218 tok_state = tok_state_before_doctype_public_identifier
4222 tok_cur_tag.public_identifier = ''
4223 tok_state = tok_state_doctype_public_identifier_double_quoted
4227 tok_cur_tag.public_identifier = ''
4228 tok_state = tok_state_doctype_public_identifier_single_quoted
4232 tok_cur_tag.flag 'force-quirks', true
4233 tok_state = tok_state_data
4237 tok_state = tok_state_data
4238 tok_cur_tag.flag 'force-quirks', true
4239 cur -= 1 # Reconsume
4243 tok_cur_tag.flag 'force-quirks', true
4244 tok_state = tok_state_bogus_doctype
4247 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4248 tok_state_before_doctype_public_identifier = ->
4249 c = txt.charAt(cur++)
4250 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4254 tok_cur_tag.public_identifier = ''
4255 tok_state = tok_state_doctype_public_identifier_double_quoted
4259 tok_cur_tag.public_identifier = ''
4260 tok_state = tok_state_doctype_public_identifier_single_quoted
4264 tok_cur_tag.flag 'force-quirks', true
4265 tok_state = tok_state_data
4269 tok_state = tok_state_data
4270 tok_cur_tag.flag 'force-quirks', true
4271 cur -= 1 # Reconsume
4275 tok_cur_tag.flag 'force-quirks', true
4276 tok_state = tok_state_bogus_doctype
4280 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4281 tok_state_doctype_public_identifier_double_quoted = ->
4282 c = txt.charAt(cur++)
4284 tok_state = tok_state_after_doctype_public_identifier
4288 tok_cur_tag.public_identifier += "\ufffd"
4292 tok_cur_tag.flag 'force-quirks', true
4293 tok_state = tok_state_data
4297 tok_state = tok_state_data
4298 tok_cur_tag.flag 'force-quirks', true
4299 cur -= 1 # Reconsume
4302 tok_cur_tag.public_identifier += c
4305 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4306 tok_state_doctype_public_identifier_single_quoted = ->
4307 c = txt.charAt(cur++)
4309 tok_state = tok_state_after_doctype_public_identifier
4313 tok_cur_tag.public_identifier += "\ufffd"
4317 tok_cur_tag.flag 'force-quirks', true
4318 tok_state = tok_state_data
4322 tok_state = tok_state_data
4323 tok_cur_tag.flag 'force-quirks', true
4324 cur -= 1 # Reconsume
4327 tok_cur_tag.public_identifier += c
4330 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4331 tok_state_after_doctype_public_identifier = ->
4332 c = txt.charAt(cur++)
4333 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4334 tok_state = tok_state_between_doctype_public_and_system_identifiers
4337 tok_state = tok_state_data
4341 tok_cur_tag.system_identifier = ''
4342 tok_state = tok_state_doctype_system_identifier_double_quoted
4346 tok_cur_tag.system_identifier = ''
4347 tok_state = tok_state_doctype_system_identifier_single_quoted
4351 tok_state = tok_state_data
4352 tok_cur_tag.flag 'force-quirks', true
4353 cur -= 1 # Reconsume
4357 tok_cur_tag.flag 'force-quirks', true
4358 tok_state = tok_state_bogus_doctype
4361 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4362 tok_state_between_doctype_public_and_system_identifiers = ->
4363 c = txt.charAt(cur++)
4364 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4367 tok_state = tok_state_data
4371 tok_cur_tag.system_identifier = ''
4372 tok_state = tok_state_doctype_system_identifier_double_quoted
4376 tok_cur_tag.system_identifier = ''
4377 tok_state = tok_state_doctype_system_identifier_single_quoted
4381 tok_state = tok_state_data
4382 tok_cur_tag.flag 'force-quirks', true
4383 cur -= 1 # Reconsume
4387 tok_cur_tag.flag 'force-quirks', true
4388 tok_state = tok_state_bogus_doctype
4391 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4392 tok_state_after_doctype_system_keyword = ->
4393 c = txt.charAt(cur++)
4394 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4395 tok_state = tok_state_before_doctype_system_identifier
4399 tok_cur_tag.system_identifier = ''
4400 tok_state = tok_state_doctype_system_identifier_double_quoted
4404 tok_cur_tag.system_identifier = ''
4405 tok_state = tok_state_doctype_system_identifier_single_quoted
4409 tok_cur_tag.flag 'force-quirks', true
4410 tok_state = tok_state_data
4414 tok_state = tok_state_data
4415 tok_cur_tag.flag 'force-quirks', true
4416 cur -= 1 # Reconsume
4420 tok_cur_tag.flag 'force-quirks', true
4421 tok_state = tok_state_bogus_doctype
4424 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4425 tok_state_before_doctype_system_identifier = ->
4426 c = txt.charAt(cur++)
4427 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4430 tok_cur_tag.system_identifier = ''
4431 tok_state = tok_state_doctype_system_identifier_double_quoted
4434 tok_cur_tag.system_identifier = ''
4435 tok_state = tok_state_doctype_system_identifier_single_quoted
4439 tok_cur_tag.flag 'force-quirks', true
4440 tok_state = tok_state_data
4444 tok_state = tok_state_data
4445 tok_cur_tag.flag 'force-quirks', true
4446 cur -= 1 # Reconsume
4450 tok_cur_tag.flag 'force-quirks', true
4451 tok_state = tok_state_bogus_doctype
4454 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4455 tok_state_doctype_system_identifier_double_quoted = ->
4456 c = txt.charAt(cur++)
4458 tok_state = tok_state_after_doctype_system_identifier
4462 tok_cur_tag.system_identifier += "\ufffd"
4466 tok_cur_tag.flag 'force-quirks', true
4467 tok_state = tok_state_data
4471 tok_state = tok_state_data
4472 tok_cur_tag.flag 'force-quirks', true
4473 cur -= 1 # Reconsume
4476 tok_cur_tag.system_identifier += c
4479 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4480 tok_state_doctype_system_identifier_single_quoted = ->
4481 c = txt.charAt(cur++)
4483 tok_state = tok_state_after_doctype_system_identifier
4487 tok_cur_tag.system_identifier += "\ufffd"
4491 tok_cur_tag.flag 'force-quirks', true
4492 tok_state = tok_state_data
4496 tok_state = tok_state_data
4497 tok_cur_tag.flag 'force-quirks', true
4498 cur -= 1 # Reconsume
4501 tok_cur_tag.system_identifier += c
4504 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4505 tok_state_after_doctype_system_identifier = ->
4506 c = txt.charAt(cur++)
4507 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4510 tok_state = tok_state_data
4514 tok_state = tok_state_data
4515 tok_cur_tag.flag 'force-quirks', true
4516 cur -= 1 # Reconsume
4520 # do _not_ tok_cur_tag.flag 'force-quirks', true
4521 tok_state = tok_state_bogus_doctype
4524 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4525 tok_state_bogus_doctype = ->
4526 c = txt.charAt(cur++)
4528 tok_state = tok_state_data
4531 tok_state = tok_state_data
4532 cur -= 1 # Reconsume
4537 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4538 tok_state_cdata_section = ->
4539 tok_state = tok_state_data
4540 next_gt = txt.indexOf ']]>', cur
4542 val = txt.substr cur
4545 val = txt.substr cur, (next_gt - cur)
4547 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
4549 return new_character_token val # fixfull split
4552 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4553 # Don't set this as a state, just call it
4554 # returns a string (NOT a text node)
4555 parse_character_reference = (allowed_char = null, in_attr = false) ->
4556 if cur >= txt.length
4558 switch c = txt.charAt(cur)
4559 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4560 # explicitly not a parse error
4563 # there has to be "one or more" alnums between & and ; to be a parse error
4566 if cur + 1 >= txt.length
4568 if txt.charAt(cur + 1).toLowerCase() is 'x'
4577 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4582 if txt.charAt(start + i) is ';'
4586 code_point = txt.substr(start, i)
4587 while code_point.charAt(0) is '0' and code_point.length > 1
4588 code_point = code_point.substr 1
4589 code_point = parseInt(code_point, base)
4590 if unicode_fixes[code_point]?
4592 return unicode_fixes[code_point]
4594 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4598 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4600 return from_code_point code_point
4604 if alnum.indexOf(txt.charAt(cur + i)) is -1
4607 # exit early, because parse_error() below needs at least one alnum
4609 if txt.charAt(cur + i) is ';'
4610 i += 1 # include ';' terminator in value
4611 decoded = decode_named_char_ref txt.substr(cur, i)
4618 # no ';' terminator (only legacy char refs)
4620 for i in [2..max] # no prefix matches, so ok to check shortest first
4621 c = legacy_char_refs[txt.substr(cur, i)]
4624 if txt.charAt(cur + i) is '='
4625 # "because some legacy user agents will
4626 # misinterpret the markup in those cases"
4629 if alnum.indexOf(txt.charAt(cur + i)) > -1
4630 # this makes attributes forgiving about url args
4632 # ok, and besides the weird exceptions for attributes...
4633 # return the matching char
4634 cur += i # consume entity chars
4635 parse_error() # because no terminating ";"
4639 return # never reached
# eat_next_token_if_newline: examines a token `t` and, when it is a text
# token, tests whether its text is a newline. Which characters are tested
# depends on how far `cur` advanced from `old_cur`.
# NOTE(review): the embedded line numbers jump (4641 -> 4646, 4650 -> 4653),
# so the statements that set `t`/`old_cur` and that act on a match are not
# visible here -- confirm against the complete source before editing.
4641 eat_next_token_if_newline = ->
4646 if t.type is TYPE_TEXT
4647 # definition of a newline depends on whether it was a character ref or not
4648 if cur - old_cur is 1
4649 # not a character reference
4650 if t.text is "\u000d" or t.text is "\u000a"
# presumably the character-reference case, where only LF is tested -- TODO confirm
4653 if t.text is "\u000a"
# Initial values for the parser state. The tree builder starts in
# ins_mode_initial and the tokenizer in tok_state_data. The fragment-related
# variables (fragment_root, flag_fragment_parsing, context_element) start out
# null/false here.
4659 # tree constructor initialization
4660 # see comments on TYPE_TAG/etc for the structure of this data
# root node: a TYPE_TAG Node named 'document' in the HTML namespace, with its
# 'quirks mode' flag set to QUIRKS_NO
4663 doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4664 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4665 fragment_root = null # fragment parsing algorithm returns children of this
4667 afe = [] # active formatting elements
4668 template_ins_modes = []
4669 ins_mode = ins_mode_initial
4670 original_ins_mode = ins_mode # TODO check spec
# existential operator: args.scripting is used unless it is null/undefined,
# in which case scripting defaults to true
4671 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4672 flag_frameset_ok = true
4674 flag_foster_parenting = false
4675 form_element_pointer = null
4676 temporary_buffer = null
4677 pending_table_character_tokens = []
4678 head_element_pointer = null
4679 flag_fragment_parsing = false
4680 context_element = null
4681 prev_node_id = 0 # just for debugging
4683 # tokenizer initialization
4684 tok_state = tok_state_data
# Fragment parsing setup. The context element is either built from a text
# description (via token_to_element) or taken directly from args.context.
# After that: flag_fragment_parsing is turned on, doc is replaced by a fresh
# 'html' Node, the tokenizer start state is chosen from the context element,
# and a new fragment_root becomes the only entry in open_els.
# NOTE(review): the embedded line numbers jump repeatedly in this section, so
# several `if`/`when`/loop headers are not visible -- confirm the nesting
# against the complete source before editing.
4687 # fragment parsing (text arg)
4689 # this handles the fragment from the tests in the format described here:
4690 # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
# a leading 'math ' or 'svg ' on the description is tested here; presumably it
# selects the namespace (`ns`) passed to token_to_element below -- TODO confirm
4693 if f.substr(0, 5) is 'math '
4696 else if f.substr(0, 4) is 'svg '
4700 context_element = token_to_element t, ns
# give the synthesized context element its own 'document' node, with the
# 'quirks mode' flag set to QUIRKS_NO
4701 context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4702 context_element.document.flag 'quirks mode', QUIRKS_NO
4703 # fragment parsing (Node arg)
4705 context_element = args.context
4707 # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4708 # fragment parsing algorithm
4710 flag_fragment_parsing = true
4711 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4712 # search up the tree from context, to try to find its document,
4713 # because this file only puts a "document" property on the root
4716 el = context_element
4719 old_doc = el.document
# carry the 'quirks mode' flag over from the document that was found
4726 doc.flag 'quirks mode', old_doc.flag 'quirks mode'
# choose the tokenizer's starting state from the context element's name
# (only when the context element is in the HTML namespace)
4728 if context_element.namespace is NS_HTML
4729 switch context_element.name
4730 when 'title', 'textarea'
4731 tok_state = tok_state_rcdata
4732 when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
4733 tok_state = tok_state_rawtext
# NOTE(review): the `when` clauses guarding the next three assignments are
# not visible here
4735 tok_state = tok_state_script_data
4738 tok_state = tok_state_rawtext
4740 tok_state = tok_state_plaintext
# a fresh 'html' element is attached under doc and becomes the sole entry in
# open_els
4741 fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4742 doc.children.push fragment_root
4743 fragment_root.document = doc
4744 open_els = [fragment_root]
# an HTML <template> context puts ins_mode_in_template at the front of
# template_ins_modes
4745 if context_element.name is 'template' and context_element.namespace is NS_HTML
4746 template_ins_modes.unshift ins_mode_in_template
4747 # fixfull create token for context (it should have its original one already)
4749 # set form_element pointer... in the foreign doc?!
# starting from the context element, an HTML 'form' element becomes
# form_element_pointer (the traversal statements are not visible here)
4750 el = context_element
4752 if el.name is 'form' and el.namespace is NS_HTML
4753 form_element_pointer = el
4760 # text pre-processing
4761 # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# Normalize line endings in the input: every CR LF pair is replaced by a
# single LF first, then every remaining lone CR is replaced by LF. The order
# matters: running the lone-CR replacement first would turn each CR LF pair
# into two LFs.
4762 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4763 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4765 # http://www.w3.org/TR/html5/syntax.html#tree-construction
# parse_main_loop: when fragment parsing is active, the result returned is
# the list of children of fragment_root.
# NOTE(review): the embedded line numbers jump (4766 -> 4771 -> 4776), so the
# loop body and the non-fragment return value are not visible here.
4766 parse_main_loop = ->
4771 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4776 if flag_fragment_parsing
4777 return fragment_root.children
# serialize_els: appends the result of t.serialize(shallow, show_ids) to the
# string `serialized`; `shallow` and `show_ids` are passed through unchanged.
# Presumably this happens once per entry of `els` and `serialized` is then
# returned -- the initialisation, loop header and return are not visible here
# (embedded line numbers jump 4780 -> 4786), so confirm against the complete
# source.
4780 serialize_els = (els, shallow, show_ids) ->
4786 serialized += t.serialize shallow, show_ids
# Public API. When no `module.exports` exists, the top of this file sets
# `module` to an object whose `exports` is window.wheic, so these assignments
# then land on that object.
# parser entry point and debug-log helpers
4789 module.exports.parse_html = parse_html
4790 module.exports.debug_log_reset = debug_log_reset
4791 module.exports.debug_log_each = debug_log_each
# node type constants
4792 module.exports.TYPE_TAG = TYPE_TAG
4793 module.exports.TYPE_TEXT = TYPE_TEXT
4794 module.exports.TYPE_COMMENT = TYPE_COMMENT
4795 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
# namespace constants
4796 module.exports.NS_HTML = NS_HTML
4797 module.exports.NS_MATHML = NS_MATHML
4798 module.exports.NS_SVG = NS_SVG
# quirks-mode constants (values for the document's 'quirks mode' flag)
4799 module.exports.QUIRKS_NO = QUIRKS_NO
4800 module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4801 module.exports.QUIRKS_YES = QUIRKS_YES