1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WHATWG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # The spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains them. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
87 # quirks mode constants
97 debug_log_each = (cb) ->
98 for str in g_debug_log
103 constructor: (type, args = {}) ->
104 @type = type # one of the TYPE_* constants above
105 @name = args.name ? '' # tag name
106 @text = args.text ? '' # contents for text/comment nodes
107 @attrs = args.attrs ? {}
108 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
109 @children = args.children ? []
110 @namespace = args.namespace ? NS_HTML
111 @parent = args.parent ? null
112 @token = args.token ? null
113 @flags = args.flags ? {}
117 @id = "#{++prev_node_id}"
118 acknowledge_self_closing: ->
120 @token.flag 'did_self_close', true
122 @flag 'did_self_close', true
123 flag: (key, value = null) ->
128 serialize: (shallow = false, show_ids = false) -> # for unit tests
133 ret += JSON.stringify @name
148 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
154 ret += c.serialize shallow, show_ids
158 ret += JSON.stringify @text
161 ret += JSON.stringify @text
163 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
166 when TYPE_AAA_BOOKMARK
167 ret += 'aaa_bookmark'
170 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
173 # helpers: (only take args that are normally known when parser creates nodes)
# Create a TYPE_START_TAG node carrying the given tag name.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name: name})
# Create a TYPE_END_TAG node carrying the given tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name: name})
# Create a TYPE_TAG (element) node with the given tag name.
new_element = (name) ->
	new Node(TYPE_TAG, {name: name})
# Create a TYPE_TEXT node holding txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are built exactly like text nodes, so the name is an alias.
new_character_token = new_text_node
# Create a TYPE_COMMENT node whose text is txt.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Create a TYPE_DOCTYPE node with the given doctype name.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name: name})
188 return new Node TYPE_EOF
190 return new Node TYPE_AFE_MARKER
# Create a TYPE_AAA_BOOKMARK node (takes no arguments).
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character-class lookup strings: membership is tested with indexOf().
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
digits = "0123456789"
# letters (both cases) followed by digits
alnum = [lc_alpha, uc_alpha, digits].join ''
# decimal digits followed by the hex letters in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True only when str is exactly one character long and that character
# appears in uc_alpha.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1
# True only when str is exactly one character long and that character
# appears in lc_alpha.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
205 # some SVG elements have dashes in them
206 tag_name_chars = alnum + "-"
208 # http://www.w3.org/TR/html5/infrastructure.html#space-character
209 space_chars = "\u0009\u000a\u000c\u000d\u0020"
211 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a TYPE_TEXT token whose text is a single character found in
# space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) isnt -1
215 is_input_hidden_tok = (t) ->
216 return false unless t.type is TYPE_START_TAG
219 if a[1].toLowerCase() is 'hidden'
224 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
225 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
228 unicode_fixes[0x00] = "\uFFFD"
229 unicode_fixes[0x80] = "\u20AC"
230 unicode_fixes[0x82] = "\u201A"
231 unicode_fixes[0x83] = "\u0192"
232 unicode_fixes[0x84] = "\u201E"
233 unicode_fixes[0x85] = "\u2026"
234 unicode_fixes[0x86] = "\u2020"
235 unicode_fixes[0x87] = "\u2021"
236 unicode_fixes[0x88] = "\u02C6"
237 unicode_fixes[0x89] = "\u2030"
238 unicode_fixes[0x8A] = "\u0160"
239 unicode_fixes[0x8B] = "\u2039"
240 unicode_fixes[0x8C] = "\u0152"
241 unicode_fixes[0x8E] = "\u017D"
242 unicode_fixes[0x91] = "\u2018"
243 unicode_fixes[0x92] = "\u2019"
244 unicode_fixes[0x93] = "\u201C"
245 unicode_fixes[0x94] = "\u201D"
246 unicode_fixes[0x95] = "\u2022"
247 unicode_fixes[0x96] = "\u2013"
248 unicode_fixes[0x97] = "\u2014"
249 unicode_fixes[0x98] = "\u02DC"
250 unicode_fixes[0x99] = "\u2122"
251 unicode_fixes[0x9A] = "\u0161"
252 unicode_fixes[0x9B] = "\u203A"
253 unicode_fixes[0x9C] = "\u0153"
254 unicode_fixes[0x9E] = "\u017E"
255 unicode_fixes[0x9F] = "\u0178"
257 quirks_yes_pi_prefixes = [
258 "+//silmaril//dtd html pro v0r11 19970101//"
259 "-//as//dtd html 3.0 aswedit + extensions//"
260 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
261 "-//ietf//dtd html 2.0 level 1//"
262 "-//ietf//dtd html 2.0 level 2//"
263 "-//ietf//dtd html 2.0 strict level 1//"
264 "-//ietf//dtd html 2.0 strict level 2//"
265 "-//ietf//dtd html 2.0 strict//"
266 "-//ietf//dtd html 2.0//"
267 "-//ietf//dtd html 2.1e//"
268 "-//ietf//dtd html 3.0//"
269 "-//ietf//dtd html 3.2 final//"
270 "-//ietf//dtd html 3.2//"
271 "-//ietf//dtd html 3//"
272 "-//ietf//dtd html level 0//"
273 "-//ietf//dtd html level 1//"
274 "-//ietf//dtd html level 2//"
275 "-//ietf//dtd html level 3//"
276 "-//ietf//dtd html strict level 0//"
277 "-//ietf//dtd html strict level 1//"
278 "-//ietf//dtd html strict level 2//"
279 "-//ietf//dtd html strict level 3//"
280 "-//ietf//dtd html strict//"
281 "-//ietf//dtd html//"
282 "-//metrius//dtd metrius presentational//"
283 "-//microsoft//dtd internet explorer 2.0 html strict//"
284 "-//microsoft//dtd internet explorer 2.0 html//"
285 "-//microsoft//dtd internet explorer 2.0 tables//"
286 "-//microsoft//dtd internet explorer 3.0 html strict//"
287 "-//microsoft//dtd internet explorer 3.0 html//"
288 "-//microsoft//dtd internet explorer 3.0 tables//"
289 "-//netscape comm. corp.//dtd html//"
290 "-//netscape comm. corp.//dtd strict html//"
291 "-//o'reilly and associates//dtd html 2.0//"
292 "-//o'reilly and associates//dtd html extended 1.0//"
293 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
294 "-//sq//dtd html 2.0 hotmetal + extensions//"
295 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
296 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
297 "-//spyglass//dtd html 2.0 extended//"
298 "-//sun microsystems corp.//dtd hotjava html//"
299 "-//sun microsystems corp.//dtd hotjava strict html//"
300 "-//w3c//dtd html 3 1995-03-24//"
301 "-//w3c//dtd html 3.2 draft//"
302 "-//w3c//dtd html 3.2 final//"
303 "-//w3c//dtd html 3.2//"
304 "-//w3c//dtd html 3.2s draft//"
305 "-//w3c//dtd html 4.0 frameset//"
306 "-//w3c//dtd html 4.0 transitional//"
307 "-//w3c//dtd html experimental 19960712//"
308 "-//w3c//dtd html experimental 970421//"
309 "-//w3c//dtd w3 html//"
310 "-//w3o//dtd w3 html 3.0//"
311 "-//webtechs//dtd mozilla html 2.0//"
312 "-//webtechs//dtd mozilla html//"
315 # These are the character references that don't need a terminating semicolon
316 # min length: 2, max: 6, none are a prefix of any other.
318 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
319 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
320 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
321 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
322 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
323 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
324 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
325 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
326 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
327 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
328 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
329 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
330 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
331 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
332 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
333 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
334 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists, grouped by element kind as named by each variable.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input'
	'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script'
	'style'
]
escapable_raw_text_elements = [
	'textarea'
	'title'
]
341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
343 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
344 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
345 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
346 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
347 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
348 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
349 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
350 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
351 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
352 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
353 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
354 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
355 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
356 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
362 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
363 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
364 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
365 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
366 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
367 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
368 'determinant', 'diff', 'divergence', 'divide', 'domain',
369 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
370 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
371 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
372 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
373 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
374 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
375 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
376 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
377 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
378 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
379 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
380 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
381 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
382 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
383 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
384 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
385 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
386 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
387 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
388 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
389 'vectorproduct', 'xor'
391 # foreign_elements = [svg_elements..., mathml_elements...]
392 #normal_elements = All other allowed HTML elements are normal elements.
396 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
397 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
398 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
399 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
400 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
401 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
402 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
403 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
404 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
405 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
406 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
408 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
410 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
411 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
412 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
413 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
414 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
415 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
416 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
419 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
420 'annotation-xml':NS_MATHML,
423 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
426 formatting_elements = {
427 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
428 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
432 mathml_text_integration = {
433 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el's name is listed in mathml_text_integration and el's namespace
# equals the namespace recorded there.
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	expected_ns is el.namespace
437 is_html_integration = (el) -> # DON'T PASS A TOKEN
438 if el.namespace is NS_MATHML
439 if el.name is 'annotation-xml'
440 if el.attrs.encoding?
441 if el.attrs.encoding.toLowerCase() is 'text/html'
443 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
446 if el.namespace is NS_SVG
447 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
452 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
455 foster_parenting_targets = {
# True when e's name is a key of special_elements and the namespace stored
# there equals e's own namespace.
el_is_special = (e) ->
	listed_ns = special_elements[e.name]
	listed_ns is e.namespace
# The three HTML elements (address, div, p) excluded by el_is_special_not_adp.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
# True when el matches an entry of special_elements (name and namespace) but
# does not match an entry of adp_els.
el_is_special_not_adp = (el) ->
	{name, namespace} = el
	return false unless special_elements[name] is namespace
	adp_els[name] isnt namespace
485 altglyphdef: 'altGlyphDef'
486 altglyphitem: 'altGlyphItem'
487 animatecolor: 'animateColor'
488 animatemotion: 'animateMotion'
489 animatetransform: 'animateTransform'
492 fecolormatrix: 'feColorMatrix'
493 fecomponenttransfer: 'feComponentTransfer'
494 fecomposite: 'feComposite'
495 feconvolvematrix: 'feConvolveMatrix'
496 fediffuselighting: 'feDiffuseLighting'
497 fedisplacementmap: 'feDisplacementMap'
498 fedistantlight: 'feDistantLight'
499 fedropshadow: 'feDropShadow'
505 fegaussianblur: 'feGaussianBlur'
508 femergenode: 'feMergeNode'
509 femorphology: 'feMorphology'
511 fepointlight: 'fePointLight'
512 fespecularlighting: 'feSpecularLighting'
513 fespotlight: 'feSpotLight'
515 feturbulence: 'feTurbulence'
516 foreignobject: 'foreignObject'
518 lineargradient: 'linearGradient'
519 radialgradient: 'radialGradient'
522 svg_attribute_fixes = {
523 attributename: 'attributeName'
524 attributetype: 'attributeType'
525 basefrequency: 'baseFrequency'
526 baseprofile: 'baseProfile'
528 clippathunits: 'clipPathUnits'
529 contentscripttype: 'contentScriptType'
530 contentstyletype: 'contentStyleType'
531 diffuseconstant: 'diffuseConstant'
533 externalresourcesrequired: 'externalResourcesRequired'
534 # WHATWG removes this: filterres: 'filterRes'
535 filterunits: 'filterUnits'
537 gradienttransform: 'gradientTransform'
538 gradientunits: 'gradientUnits'
539 kernelmatrix: 'kernelMatrix'
540 kernelunitlength: 'kernelUnitLength'
541 keypoints: 'keyPoints'
542 keysplines: 'keySplines'
544 lengthadjust: 'lengthAdjust'
545 limitingconeangle: 'limitingConeAngle'
546 markerheight: 'markerHeight'
547 markerunits: 'markerUnits'
548 markerwidth: 'markerWidth'
549 maskcontentunits: 'maskContentUnits'
550 maskunits: 'maskUnits'
551 numoctaves: 'numOctaves'
552 pathlength: 'pathLength'
553 patterncontentunits: 'patternContentUnits'
554 patterntransform: 'patternTransform'
555 patternunits: 'patternUnits'
556 pointsatx: 'pointsAtX'
557 pointsaty: 'pointsAtY'
558 pointsatz: 'pointsAtZ'
559 preservealpha: 'preserveAlpha'
560 preserveaspectratio: 'preserveAspectRatio'
561 primitiveunits: 'primitiveUnits'
564 repeatcount: 'repeatCount'
565 repeatdur: 'repeatDur'
566 requiredextensions: 'requiredExtensions'
567 requiredfeatures: 'requiredFeatures'
568 specularconstant: 'specularConstant'
569 specularexponent: 'specularExponent'
570 spreadmethod: 'spreadMethod'
571 startoffset: 'startOffset'
572 stddeviation: 'stdDeviation'
573 stitchtiles: 'stitchTiles'
574 surfacescale: 'surfaceScale'
575 systemlanguage: 'systemLanguage'
576 tablevalues: 'tableValues'
579 textlength: 'textLength'
581 viewtarget: 'viewTarget'
582 xchannelselector: 'xChannelSelector'
583 ychannelselector: 'yChannelSelector'
584 zoomandpan: 'zoomAndPan'
586 foreign_attr_fixes = {
587 'xlink:actuate': 'xlink actuate'
588 'xlink:arcrole': 'xlink arcrole'
589 'xlink:href': 'xlink href'
590 'xlink:role': 'xlink role'
591 'xlink:show': 'xlink show'
592 'xlink:title': 'xlink title'
593 'xlink:type': 'xlink type'
594 'xml:base': 'xml base'
595 'xml:lang': 'xml lang'
596 'xml:space': 'xml space'
598 'xmlns:xlink': 'xmlns xlink'
600 adjust_mathml_attributes = (t) ->
602 if a[0] is 'definitionurl'
603 a[0] = 'definitionURL'
605 adjust_svg_attributes = (t) ->
607 if svg_attribute_fixes[a[0]]?
608 a[0] = svg_attribute_fixes[a[0]]
610 adjust_foreign_attributes = (t) ->
613 if foreign_attr_fixes[a[0]]?
614 a[0] = foreign_attr_fixes[a[0]]
617 # decode_named_char_ref()
619 # The list of named character references is _huge_ so ask the browser to decode
620 # for us instead of wasting bandwidth/space on including the table here.
622 # Pass without the "&" but with the ";" examples:
623 # for "&amp;" pass "amp;"
624 # for "′" pass "x2032;"
627 textarea: document.createElement('textarea')
629 # TODO test this in IE8
630 decode_named_char_ref = (txt) ->
632 decoded = g_dncr.cache[txt]
633 return decoded if decoded?
634 g_dncr.textarea.innerHTML = txt
635 decoded = g_dncr.textarea.value
636 return null if decoded is txt
637 return g_dncr.cache[txt] = decoded
639 parse_html = (args) ->
641 cur = null # index of next char in txt to be parsed
642 # declare doc and tokenizer variables so they're in scope below
644 open_els = null # stack of open elements
645 afe = null # active formatting elements
646 template_ins_modes = null
648 original_ins_mode = null
650 tok_cur_tag = null # partially parsed tag
651 flag_scripting = null
652 flag_frameset_ok = null
654 flag_foster_parenting = null
655 form_element_pointer = null
656 temporary_buffer = null
657 pending_table_character_tokens = null
658 head_element_pointer = null
659 flag_fragment_parsing = null
660 context_element = null
669 console.log "Parse error at character #{cur} of #{txt.length}"
671 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
672 # "Noah's Ark clause" but with three
673 afe_push = (new_el) ->
676 if el.type is TYPE_AFE_MARKER
678 if el.name is new_el.name and el.namespace is new_el.namespace
681 unless new_el.attrs[k] is v
685 for k, v of new_el.attrs
686 unless el.attrs[k] is v
696 afe.unshift new_afe_marker()
698 # the functions below implement the Tree Construction algorithm
699 # http://www.w3.org/TR/html5/syntax.html#tree-construction
701 # But first... the helpers
702 template_tag_is_open = ->
704 if el.name is 'template' and el.namespace is NS_HTML
707 is_in_scope_x = (tag_name, scope, namespace) ->
709 if el.name is tag_name and (namespace is null or namespace is el.namespace)
711 if scope[el.name] is el.namespace
714 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
716 if el.name is tag_name and (namespace is null or namespace is el.namespace)
718 if scope[el.name] is el.namespace
720 if scope2[el.name] is el.namespace
724 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
725 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
728 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
729 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
731 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope-boundary tables (element name -> namespace) handed to the is_in_*_scope
# helpers. button_scopers and li_scopers are passed together with
# standard_scopers; table_scopers is passed on its own.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers =
	html: NS_HTML
	table: NS_HTML
	template: NS_HTML
# Scope check for tag_name using standard_scopers as the boundary set.
# namespace defaults to null and is forwarded unchanged to is_in_scope_x.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check for tag_name using standard_scopers plus button_scopers as the
# boundary sets. namespace defaults to null and is forwarded to is_in_scope_x_y.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check for tag_name using only table_scopers as the boundary set.
# namespace defaults to null and is forwarded unchanged to is_in_scope_x.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope check for tag_name using standard_scopers plus li_scopers as the
# boundary sets. namespace defaults to null and is forwarded to is_in_scope_x_y.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
745 is_in_select_scope = (tag_name, namespace = null) ->
747 if t.name is tag_name and (namespace is null or namespace is t.namespace)
749 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
752 # this checks for a particular element, not by name
753 # this requires a namespace match
754 el_is_in_scope = (needle) ->
758 if standard_scopers[el.name] is el.namespace
762 clear_to_table_stopers = {
767 clear_stack_to_table_context = ->
769 if clear_to_table_stopers[open_els[0].name]?
773 clear_to_table_body_stopers = {
780 clear_stack_to_table_body_context = ->
782 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
786 clear_to_table_row_stopers = {
791 clear_stack_to_table_row_context = ->
793 if clear_to_table_row_stopers[open_els[0].name]?
797 clear_afe_to_marker = ->
799 return unless afe.length > 0 # this happens in fragment case, ?spec error
801 if el.type is TYPE_AFE_MARKER
806 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
808 # 1. Let last be false.
810 # 2. Let node be the last node in the stack of open elements.
812 node = open_els[node_i]
813 # 3. Loop: If node is the first node in the stack of open elements,
814 # then set last to true, and, if the parser was originally created as
815 # part of the HTML fragment parsing algorithm (fragment case) set node
816 # to the context element.
818 if node_i is open_els.length - 1
820 # fixfull (fragment case)
822 # 4. If node is a select element, run these substeps:
823 if node.name is 'select' and node.namespace is NS_HTML
824 # 1. If last is true, jump to the step below labeled done.
826 # 2. Let ancestor be node.
829 # 3. Loop: If ancestor is the first node in the stack of
830 # open elements, jump to the step below labeled done.
832 if ancestor_i is open_els.length - 1
834 # 4. Let ancestor be the node before ancestor in the stack
837 ancestor = open_els[ancestor_i]
838 # 5. If ancestor is a template node, jump to the step below
840 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
842 # 6. If ancestor is a table node, switch the insertion mode
843 # to "in select in table" and abort these steps.
844 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
845 ins_mode = ins_mode_in_select_in_table
847 # 7. Jump back to the step labeled loop.
848 # 8. Done: Switch the insertion mode to "in select" and abort
850 ins_mode = ins_mode_in_select
852 # 5. If node is a td or th element and last is false, then switch
853 # the insertion mode to "in cell" and abort these steps.
854 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
855 ins_mode = ins_mode_in_cell
857 # 6. If node is a tr element, then switch the insertion mode to "in
858 # row" and abort these steps.
859 if node.name is 'tr' and node.namespace is NS_HTML
860 ins_mode = ins_mode_in_row
862 # 7. If node is a tbody, thead, or tfoot element, then switch the
863 # insertion mode to "in table body" and abort these steps.
864 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
865 ins_mode = ins_mode_in_table_body
867 # 8. If node is a caption element, then switch the insertion mode
868 # to "in caption" and abort these steps.
869 if node.name is 'caption' and node.namespace is NS_HTML
870 ins_mode = ins_mode_in_caption
872 # 9. If node is a colgroup element, then switch the insertion mode
873 # to "in column group" and abort these steps.
874 if node.name is 'colgroup' and node.namespace is NS_HTML
875 ins_mode = ins_mode_in_column_group
877 # 10. If node is a table element, then switch the insertion mode to
878 # "in table" and abort these steps.
879 if node.name is 'table' and node.namespace is NS_HTML
880 ins_mode = ins_mode_in_table
882 # 11. If node is a template element, then switch the insertion mode
883 # to the current template insertion mode and abort these steps.
884 if node.name is 'template' and node.namespace is NS_HTML
885 ins_mode = template_ins_modes[0]
887 # 12. If node is a head element and last is true, then switch the
888 # insertion mode to "in body" ("in body"! not "in head"!) and abort
889 # these steps. (fragment case)
890 if node.name is 'head' and node.namespace is NS_HTML and last
891 ins_mode = ins_mode_in_body
893 # 13. If node is a head element and last is false, then switch the
894 # insertion mode to "in head" and abort these steps.
895 if node.name is 'head' and node.namespace is NS_HTML and last is false
896 ins_mode = ins_mode_in_head
898 # 14. If node is a body element, then switch the insertion mode to
899 # "in body" and abort these steps.
900 if node.name is 'body' and node.namespace is NS_HTML
901 ins_mode = ins_mode_in_body
903 # 15. If node is a frameset element, then switch the insertion mode
904 # to "in frameset" and abort these steps. (fragment case)
905 if node.name is 'frameset' and node.namespace is NS_HTML
906 ins_mode = ins_mode_in_frameset
908 # 16. If node is an html element, run these substeps:
909 if node.name is 'html' and node.namespace is NS_HTML
910 # 1. If the head element pointer is null, switch the insertion
911 # mode to "before head" and abort these steps. (fragment case)
912 if head_element_pointer is null
913 ins_mode = ins_mode_before_head
915 # 2. Otherwise, the head element pointer is not null,
916 # switch the insertion mode to "after head" and abort these
918 ins_mode = ins_mode_after_head
920 # 17. If last is true, then switch the insertion mode to "in body"
921 # and abort these steps. (fragment case)
923 ins_mode = ins_mode_in_body
925 # 18. Let node now be the node before node in the stack of open
928 node = open_els[node_i]
929 # 19. Return to the step labeled loop.
933 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
934 adjusted_current_node = ->
935 if open_els.length is 1 and flag_fragment_parsing
936 return context_element
939 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
940 # this implementation is structured (mostly) as described at the link above.
941 # capitalized comments are the "labels" described at the link above.
943 return if afe.length is 0
944 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
949 if i is afe.length - 1
952 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
957 el = insert_html_element afe[i].token
962 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
963 # adoption agency algorithm
965 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
966 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
967 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
968 adoption_agency = (subject) ->
969 debug_log "adoption_agency()"
970 debug_log "tree: #{serialize_els doc.children, false, true}"
971 debug_log "open_els: #{serialize_els open_els, true, true}"
972 debug_log "afe: #{serialize_els afe, true, true}"
973 # this block implements the W3C spec
974 # # 1. If the current node is an HTML element whose tag name is subject,
975 # # then run these substeps:
977 # # 1. Let element be the current node.
979 # # 2. Pop element off the stack of open elements.
981 # # 3. If element is also in the list of active formatting elements,
982 # # remove the element from the list.
984 # # 4. Abort the adoption agency algorithm.
985 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
986 # el = open_els.shift()
987 # # remove it from the list of active formatting elements (if found)
992 # debug_log "aaa: starting off with subject on top of stack, exiting"
994 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
995 # If the current node is an HTML element whose tag name is subject, and
996 # the current node is not in the list of active formatting elements,
997 # then pop the current node off the stack of open elements, and abort
999 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
1000 debug_log "aaa: starting off with subject on top of stack, exiting"
1001 # remove it from the list of active formatting elements (if found)
1004 if el is open_els[0]
1008 debug_log "aaa: ...and not in afe, aaa done"
1018 # 5. Let formatting element be the last element in the list of
1019 # active formatting elements that: is between the end of the list
1020 # and the last scope marker in the list, if any, or the start of
1021 # the list otherwise, and has the tag name subject.
1023 for t, fe_of_afe in afe
1024 if t.type is TYPE_AFE_MARKER
1026 if t.name is subject
1029 # If there is no such element, then abort these steps and instead
1030 # act as described in the "any other end tag" entry above.
1032 debug_log "aaa: fe not found in afe"
1033 in_body_any_other_end_tag subject
1035 # 6. If formatting element is not in the stack of open elements,
1036 # then this is a parse error; remove the element from the list, and
1037 # abort these steps.
1039 for t, fe_of_open_els in open_els
1044 debug_log "aaa: fe not found in open_els"
1046 # "remove it from the list" must mean afe, since it's not in open_els
1047 afe.splice fe_of_afe, 1
1049 # 7. If formatting element is in the stack of open elements, but
1050 # the element is not in scope, then this is a parse error; abort
1052 unless el_is_in_scope fe
1053 debug_log "aaa: fe not in scope"
1056 # 8. If formatting element is not the current node, this is a parse
1057 # error. (But do not abort these steps.)
1058 unless open_els[0] is fe
1061 # 9. Let furthest block be the topmost node in the stack of open
1062 # elements that is lower in the stack than formatting element, and
1063 # is an element in the special category. There might not be one.
1065 fb_of_open_els = null
1066 for t, i in open_els
1072 # and continue, to see if there's one that's more "topmost"
1073 # 10. If there is no furthest block, then the UA must first pop all
1074 # the nodes from the bottom of the stack of open elements, from the
1075 # current node up to and including formatting element, then remove
1076 # formatting element from the list of active formatting elements,
1077 # and finally abort these steps.
1079 debug_log "aaa: no fb"
1081 t = open_els.shift()
1083 afe.splice fe_of_afe, 1
1085 # 11. Let common ancestor be the element immediately above
1086 # formatting element in the stack of open elements.
1087 ca = open_els[fe_of_open_els + 1] # common ancestor
1089 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1090 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1091 bookmark = new_aaa_bookmark()
1094 afe.splice i, 0, bookmark
1096 node = last_node = fb
1100 # 3. Let node be the element immediately above node in the
1101 # stack of open elements, or if node is no longer in the stack
1102 # of open elements (e.g. because it got removed by this
1103 # algorithm), the element that was immediately above node in
1104 # the stack of open elements before node was removed.
1106 for t, i in open_els
1108 node_next = open_els[i + 1]
1110 node = node_next ? node_above
1111 debug_log "inner loop #{inner}"
1112 debug_log "tree: #{serialize_els doc.children, false, true}"
1113 debug_log "open_els: #{serialize_els open_els, true, true}"
1114 debug_log "afe: #{serialize_els afe, true, true}"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "node: #{node.serialize true, true}"
1119 # TODO make sure node_above gets re-set if/when node is removed from open_els
1121 # 4. If node is formatting element, then go to the next step in
1122 # the overall algorithm.
1125 debug_log "the meat"
1126 # 5. If inner loop counter is greater than three and node is in
1127 # the list of active formatting elements, then remove node from
1128 # the list of active formatting elements.
1134 debug_log "max out inner"
1139 # 6. If node is not in the list of active formatting elements,
1140 # then remove node from the stack of open elements and then go
1141 # back to the step labeled inner loop.
1143 debug_log "not in afe"
1144 for t, i in open_els
1146 node_above = open_els[i + 1]
1147 open_els.splice i, 1
1150 debug_log "the bones"
1151 # 7. create an element for the token for which the element node
1152 # was created, in the HTML namespace, with common ancestor as
1153 # the intended parent; replace the entry for node in the list
1154 # of active formatting elements with an entry for the new
1155 # element, replace the entry for node in the stack of open
1156 # elements with an entry for the new element, and let node be
1158 new_node = token_to_element node.token, NS_HTML, ca
1162 debug_log "replaced in afe"
1164 for t, i in open_els
1166 node_above = open_els[i + 1]
1167 open_els[i] = new_node
1168 debug_log "replaced in open_els"
1171 # 8. If last node is furthest block, then move the
1172 # aforementioned bookmark to be immediately after the new node
1173 # in the list of active formatting elements.
1178 debug_log "removed bookmark"
1182 # "after" means lower
1183 afe.splice i, 0, bookmark # "after as <-
1184 debug_log "placed bookmark after node"
1185 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1187 # 9. Insert last node into node, first removing it from its
1188 # previous parent node if any.
1189 if last_node.parent?
1190 debug_log "last_node has parent"
1191 for c, i in last_node.parent.children
1193 debug_log "removing last_node from parent"
1194 last_node.parent.children.splice i, 1
1196 node.children.push last_node
1197 last_node.parent = node
1198 # 10. Let last node be node.
1201 # 11. Return to the step labeled inner loop.
1202 # 14. Insert whatever last node ended up being in the previous step
1203 # at the appropriate place for inserting a node, but using common
1204 # ancestor as the override target.
1206 # In the case where fe is immediately followed by fb:
1207 # * inner loop exits out early (node==fe)
1209 # * last_node is still in the tree (not a duplicate)
1210 if last_node.parent?
1211 debug_log "FEFIRST? last_node has parent"
1212 for c, i in last_node.parent.children
1214 debug_log "removing last_node from parent"
1215 last_node.parent.children.splice i, 1
1218 debug_log "after aaa inner loop"
1219 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1220 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1221 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1222 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1223 debug_log "tree: #{serialize_els doc.children, false, true}"
1228 # can't use standard insert token thing, because it's already in
1229 # open_els and must stay at it's current position in open_els
1230 dest = adjusted_insertion_location ca
1231 dest[0].children.splice dest[1], 0, last_node
1232 last_node.parent = dest[0]
1235 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1236 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1237 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1238 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1239 debug_log "tree: #{serialize_els doc.children, false, true}"
1241 # 15. Create an element for the token for which formatting element
1242 # was created, in the HTML namespace, with furthest block as the
1244 new_element = token_to_element fe.token, NS_HTML, fb
1245 # 16. Take all of the child nodes of furthest block and append them
1246 # to the element created in the last step.
1247 while fb.children.length
1248 t = fb.children.shift()
1249 t.parent = new_element
1250 new_element.children.push t
1251 # 17. Append that new element to furthest block.
1252 new_element.parent = fb
1253 fb.children.push new_element
1254 # 18. Remove formatting element from the list of active formatting
1255 # elements, and insert the new element into the list of active
1256 # formatting elements at the position of the aforementioned
1264 afe[i] = new_element
1266 # 19. Remove formatting element from the stack of open elements,
1267 # and insert the new element into the stack of open elements
1268 # immediately below the position of furthest block in that stack.
1269 for t, i in open_els
1271 open_els.splice i, 1
1273 for t, i in open_els
1275 open_els.splice i, 0, new_element
1277 # 20. Jump back to the step labeled outer loop.
1278 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1279 debug_log "tree: #{serialize_els doc.children, false, true}"
1280 debug_log "open_els: #{serialize_els open_els, true, true}"
1281 debug_log "afe: #{serialize_els afe, true, true}"
1282 debug_log "AAA DONE"
1284 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
#
# "Close a p element": generates implied end tags (with 'p' as the
# exception), tests whether the current node (open_els[0]) is an HTML 'p',
# and then shifts elements off open_els -- never the last remaining entry --
# checking each shifted element for being the HTML 'p'.
# Takes no arguments; works on the shared open_els stack.
1285 close_p_element = ->
1286 generate_implied_end_tags 'p' # arg is exception
1287 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1289 while open_els.length > 1 # just in case
1290 el = open_els.shift()
1291 if el.name is 'p' and el.namespace is NS_HTML
# Guarded helper used by the "in body" rules: it only acts when an HTML 'p'
# element is in button scope, as reported by is_in_button_scope.
1293 close_p_if_in_button_scope = ->
1294 if is_in_button_scope 'p', NS_HTML
1297 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1298 # aka insert_a_character = (t) ->
#
# Inserts the text token t at the adjusted insertion location, which is a
# [parent, index] pair (dest). The node just before that index is inspected
# to see whether it is a text node (TYPE_TEXT); the last statement splices t
# into the parent's children at the index.
# NOTE(review): presumably adjacent text is merged into the previous text
# node rather than inserted separately -- confirm.
1299 insert_character = (t) ->
1300 dest = adjusted_insertion_location()
1301 # fixfull check for Document node
1303 prev = dest[0].children[dest[1] - 1]
1304 if prev.type is TYPE_TEXT
1307 dest[0].children.splice dest[1], 0, t
1310 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
#
# Tree construction dispatcher for a single token t. Examines the adjusted
# current node (acn) together with the token's type and name; the final
# statement hands the token to in_foreign_content.
# NOTE(review): the conditions look like the spec's list of cases that are
# processed by the current insertion mode instead of as foreign content --
# confirm against the spec section linked above.
1311 process_token = (t) ->
1312 acn = adjusted_current_node()
1316 if acn.namespace is NS_HTML
1319 if is_mathml_text_integration_point(acn)
1320 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1323 if t.type is TYPE_TEXT
1326 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1329 if is_html_integration acn
1330 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1333 if t.type is TYPE_EOF
1336 in_foreign_content t
1340 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1341 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
#
# Computes the "appropriate place for inserting a node".
#
# override_target: optional node to start from instead of the current node
#   (open_els[0]).
#
# Returns a two-element array [target, target_i]: target is the node whose
# children list should receive the new node, and target_i is the index within
# target.children at which to insert it.
#
# When flag_foster_parenting is set and the target is listed in
# foster_parenting_targets for its namespace, the location is derived from
# the last template / last table on the stack of open elements instead.
1342 adjusted_insertion_location = (override_target = null) ->
1343 # 1. If there was an override target specified, then let target be the
1346 target = override_target
1347 else # Otherwise, let target be the current node.
1348 target = open_els[0]
1349 # 2. Determine the adjusted insertion location using the first matching
1350 # steps from the following list:
1352 # If foster parenting is enabled and target is a table, tbody, tfoot,
1353 # thead, or tr element Foster parenting happens when content is
1354 # misnested in tables.
1355 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1356 loop # once. this is here so we can ``break`` to "abort these substeps"
1357 # 1. Let last template be the last template element in the
1358 # stack of open elements, if any.
1359 last_template = null
1360 last_template_i = null
1361 for el, i in open_els
1362 if el.name is 'template' and el.namespace is NS_HTML
1366 # 2. Let last table be the last table element in the stack of
1367 # open elements, if any.
1370 for el, i in open_els
1371 if el.name is 'table' and el.namespace is NS_HTML
1375 # 3. If there is a last template and either there is no last
1376 # table, or there is one, but last template is lower (more
1377 # recently added) than last table in the stack of open
1378 # elements, then: let adjusted insertion location be inside
1379 # last template's template contents, after its last child (if
1380 # any), and abort these substeps.
1381 if last_template and (last_table is null or last_template_i < last_table_i)
1382 target = last_template # fixfull should be its contents
1383 target_i = target.children.length
1385 # 4. If there is no last table, then let adjusted insertion
1386 # location be inside the first element in the stack of open
1387 # elements (the html element), after its last child (if any),
1388 # and abort these substeps. (fragment case)
1389 if last_table is null
1391 target = open_els[open_els.length - 1]
1392 target_i = target.children.length
1394 # 5. If last table has a parent element, then let adjusted
1395 # insertion location be inside last table's parent element,
1396 # immediately before last table, and abort these substeps.
1397 if last_table.parent?
1398 for c, i in last_table.parent.children
1400 target = last_table.parent
1404 # 6. Let previous element be the element immediately above last
1405 # table in the stack of open elements.
1407 # huh? how could it not have a parent?
1408 previous_element = open_els[last_table_i + 1]
1409 # 7. Let adjusted insertion location be inside previous
1410 # element, after its last child (if any).
1411 target = previous_element
1412 target_i = target.children.length
1413 # Note: These steps are involved in part because it's possible
1414 # for elements, the table element in this case in particular,
1415 # to have been moved by a script around in the DOM, or indeed
1416 # removed from the DOM entirely, after the element was inserted
1418 break # don't really loop
1420 # Otherwise Let adjusted insertion location be inside target, after
1421 # its last child (if any).
1422 target_i = target.children.length
1424 # 3. If the adjusted insertion location is inside a template element,
1425 # let it instead be inside the template element's template contents,
1426 # after its last child (if any).
1427 # fixfull (template)
1429 # 4. Return the adjusted insertion location.
1430 return [target, target_i]
1432 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1433 # aka create_an_element_for_token
#
# Builds a new element Node (TYPE_TAG) for tag token t in the given
# namespace. The node keeps the token's name, an attrs hash, and a reference
# back to the token itself (``token: t``), which adoption_agency relies on to
# re-create elements later.
#
# t: the tag token
# namespace: namespace constant for the new element (e.g. NS_HTML)
# intended_parent: the node the element is about to be inserted into
1434 token_to_element = (t, namespace, intended_parent) ->
1435 # convert attributes into a hash
# each attribute pair ``a`` is stored as attrs[name] = value; as written, a
# later pair with the same a[0] overwrites the earlier entry
1438 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1439 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1441 # TODO 2. If the newly created element has an xmlns attribute in the
1442 # XMLNS namespace whose value is not exactly the same as the element's
1443 # namespace, that is a parse error. Similarly, if the newly created
1444 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1445 # value is not the XLink Namespace, that is a parse error.
1447 # fixfull: the spec says stuff about form pointers and ownerDocument
1451 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
#
# Creates an element for ``token`` in ``namespace`` (via token_to_element)
# and splices it into the children of the node at the adjusted insertion
# location, at that location's index.
# Callers (e.g. ins_mode_before_head) use the result as the new element.
1452 insert_foreign_element = (token, namespace) ->
1453 ail = adjusted_insertion_location()
1456 el = token_to_element token, namespace, ail_el
1457 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1459 ail_el.children.splice ail_i, 0, el
1462 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
#
# Convenience wrapper: inserts an element for ``token`` in the HTML namespace
# by delegating to insert_foreign_element, and passes its result through.
1463 insert_html_element = (token) ->
1464 insert_foreign_element token, NS_HTML
1466 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1467 # position should be [node, index_within_children]
#
# Splices the comment token t into position[0].children at index
# position[1]. When no position is given, the adjusted insertion location is
# used.
1468 insert_comment = (t, position = null) ->
1469 position ?= adjusted_insertion_location()
1470 position[0].children.splice position[1], 0, t
1473 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
#
# Generic raw text element parsing: inserts an HTML element for start tag
# token t, switches the tokenizer to the RAWTEXT state, remembers the current
# insertion mode in original_ins_mode, and switches the insertion mode to
# "text".
1474 parse_generic_raw_text = (t) ->
1475 insert_html_element t
1476 tok_state = tok_state_rawtext
1477 original_ins_mode = ins_mode
1478 ins_mode = ins_mode_text
# Generic RCDATA element parsing: same sequence as parse_generic_raw_text,
# except that the tokenizer is switched to the RCDATA state. Inserts an HTML
# element for start tag token t, saves the current insertion mode in
# original_ins_mode, and switches the insertion mode to "text".
1479 parse_generic_rcdata_text = (t) ->
1480 insert_html_element t
1481 tok_state = tok_state_rcdata
1482 original_ins_mode = ins_mode
1483 ins_mode = ins_mode_text
1485 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1486 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
#
# Generate implied end tags. Loops for as long as the current node
# (open_els[0]) is listed in end_tag_implied for its namespace and its name
# is not the excepted one.
#
# except: optional tag name to leave open (null means no exception)
1487 generate_implied_end_tags = (except = null) ->
1488 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1491 # 8.2.5.4 The rules for parsing tokens in HTML content
1492 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1494 # 8.2.5.4.1 The "initial" insertion mode
1495 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
#
# Predicate used by ins_mode_initial to decide whether doctype token t puts
# the document into (full) quirks mode. Checks, in order: the 'force-quirks'
# flag, a name other than 'html', the public identifier (prefix matches
# against quirks_yes_pi_prefixes plus three exact matches), and the system
# identifier. Identifiers are lowercased before comparison, so all literals
# below are lowercase.
1496 is_quirks_yes_doctype = (t) ->
1497 if t.flag 'force-quirks'
1499 if t.name isnt 'html'
1501 if t.public_identifier?
1502 pi = t.public_identifier.toLowerCase()
1503 for p in quirks_yes_pi_prefixes
1504 if pi.substr(0, p.length) is p
1506 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1508 if t.system_identifier?
1509 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1511 else if t.public_identifier?
1512 # already did this: pi = t.public_identifier.toLowerCase()
# 32 and 36 are the lengths of the two prefixes being compared
1513 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Predicate used by ins_mode_initial to decide whether doctype token t puts
# the document into limited-quirks mode. Compares the lowercased public
# identifier against the XHTML 1.0 Frameset/Transitional prefixes, and --
# when a system identifier is present -- against the HTML 4.01
# Frameset/Transitional prefixes (32 and 36 are the prefix lengths).
1516 is_quirks_limited_doctype = (t) ->
1517 if t.public_identifier?
1518 pi = t.public_identifier.toLowerCase()
1519 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1521 if t.system_identifier?
1522 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Token handler for the "initial" insertion mode; t is the token to process.
# For a doctype token, the document's 'quirks mode' flag is set to QUIRKS_YES
# or QUIRKS_LIMITED according to the two doctype predicates, and the
# insertion mode becomes "before html". The trailing statements also set
# QUIRKS_YES and switch to "before html".
# NOTE(review): the trailing statements are presumably the spec's "anything
# else" case -- confirm.
1525 ins_mode_initial = (t) ->
1528 if t.type is TYPE_COMMENT
1532 if t.type is TYPE_DOCTYPE
1533 # fixfull syntax error from first paragraph and following bullets
1534 # fixfull set doc.doctype
1535 # fixfull is the "not an iframe srcdoc" thing relevant?
1536 if is_quirks_yes_doctype t
1537 doc.flag 'quirks mode', QUIRKS_YES
1538 else if is_quirks_limited_doctype t
1539 doc.flag 'quirks mode', QUIRKS_LIMITED
1541 ins_mode = ins_mode_before_html
1544 # fixfull not iframe srcdoc?
1546 doc.flag 'quirks mode', QUIRKS_YES
1547 ins_mode = ins_mode_before_html
1551 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
#
# Token handler for the "before html" insertion mode; t is the token to
# process. An 'html' start tag creates the root element from the token,
# appends it to doc.children, pushes it onto open_els and switches to
# "before head". The trailing statements create an 'html' element from a
# synthesized start tag (new_open_tag) and likewise switch to "before head".
1552 ins_mode_before_html = (t) ->
1553 if t.type is TYPE_DOCTYPE
1556 if t.type is TYPE_COMMENT
1561 if t.type is TYPE_START_TAG and t.name is 'html'
1562 el = token_to_element t, NS_HTML, doc
1563 doc.children.push el
1564 open_els.unshift(el)
1565 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1566 ins_mode = ins_mode_before_head
1568 if t.type is TYPE_END_TAG
1569 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1570 # fall through to "anything else"
1575 el = token_to_element new_open_tag('html'), NS_HTML, doc
1576 doc.children.push el
1579 # ?fixfull browsing context
1580 ins_mode = ins_mode_before_head
1584 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
#
# Token handler for the "before head" insertion mode; t is the token to
# process. A 'head' start tag is inserted as an HTML element, remembered in
# head_element_pointer, and the mode becomes "in head". The trailing
# statements do the same with a synthesized 'head' start tag (new_open_tag).
1585 ins_mode_before_head = (t) ->
1588 if t.type is TYPE_COMMENT
1591 if t.type is TYPE_DOCTYPE
1594 if t.type is TYPE_START_TAG and t.name is 'html'
1597 if t.type is TYPE_START_TAG and t.name is 'head'
1598 el = insert_html_element t
1599 head_element_pointer = el
1600 ins_mode = ins_mode_in_head
1602 if t.type is TYPE_END_TAG
1603 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1604 # fall through to Anything else below
1609 el = insert_html_element new_open_tag 'head'
1610 head_element_pointer = el
1611 ins_mode = ins_mode_in_head
1614 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
#
# "Anything else" branch of the "in head" insertion mode: pops the current
# node off open_els and switches the insertion mode to "after head".
# t is the token that triggered the branch.
1615 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1616 open_els.shift() # spec says this will be a 'head' node
1617 ins_mode = ins_mode_after_head
# Token handler for the "in head" insertion mode; t is the token to process.
# Each ``if`` below corresponds to one token case of the spec section
# "8.2.5.4.4 The 'in head' insertion mode":
#   * base/basefont/bgsound/link and meta start tags: insert the element and
#     acknowledge the self-closing flag
#   * title: generic RCDATA parsing; noframes/style (and noscript when
#     flag_scripting is set): generic raw text parsing
#   * noscript with scripting off: insert and switch to "in head noscript"
#   * script: create the element, flag it 'parser-inserted', insert it, switch
#     the tokenizer to script data and the insertion mode to "text"
#   * </head>: pop the head element, switch to "after head"
#   * template start/end tags: template insertion-mode stack handling
#   * everything else is routed through ins_mode_in_head_else
1619 ins_mode_in_head = (t) ->
1620 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1623 if t.type is TYPE_COMMENT
1626 if t.type is TYPE_DOCTYPE
1629 if t.type is TYPE_START_TAG and t.name is 'html'
1632 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1633 el = insert_html_element t
1635 t.acknowledge_self_closing()
1637 if t.type is TYPE_START_TAG and t.name is 'meta'
1638 el = insert_html_element t
1640 t.acknowledge_self_closing()
1641 # fixfull encoding stuff
1643 if t.type is TYPE_START_TAG and t.name is 'title'
1644 parse_generic_rcdata_text t
1646 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1647 parse_generic_raw_text t
1649 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1650 insert_html_element t
1651 ins_mode = ins_mode_in_head_noscript
1653 if t.type is TYPE_START_TAG and t.name is 'script'
1654 ail = adjusted_insertion_location()
# ail is a [node, index] pair; the intended parent is the node, ail[0]
1655 el = token_to_element t, NS_HTML, ail[0]
1656 el.flag 'parser-inserted', true
1657 # fixfull fragment case
1658 ail[0].children.splice ail[1], 0, el
1660 tok_state = tok_state_script_data
1661 original_ins_mode = ins_mode # make sure orig... is defined
1662 ins_mode = ins_mode_text
1664 if t.type is TYPE_END_TAG and t.name is 'head'
1665 open_els.shift() # will be a head element... spec says so
1666 ins_mode = ins_mode_after_head
1668 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1669 ins_mode_in_head_else t
1671 if t.type is TYPE_START_TAG and t.name is 'template'
1672 insert_html_element t
1674 flag_frameset_ok = false
1675 ins_mode = ins_mode_in_template
1676 template_ins_modes.unshift ins_mode_in_template
1678 if t.type is TYPE_END_TAG and t.name is 'template'
1679 if template_tag_is_open()
# the parentheses are required: a bare ``generate_implied_end_tags`` is only
# a reference to the function in CoffeeScript and would never run it
1680 generate_implied_end_tags()
1681 if open_els[0].name isnt 'template'
1684 el = open_els.shift()
1685 if el.name is 'template' and el.namespace is NS_HTML
1687 clear_afe_to_marker()
1688 template_ins_modes.shift()
1693 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1696 ins_mode_in_head_else t
1698 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
#
# "Anything else" branch of the "in head noscript" insertion mode, factored
# out so several cases can share it; it switches the insertion mode back to
# "in head". t is the token that triggered the branch.
1699 ins_mode_in_head_noscript_else = (t) ->
1702 ins_mode = ins_mode_in_head
# Token handler for the "in head noscript" insertion mode; t is the token to
# process. A </noscript> end tag switches the insertion mode back to
# "in head"; a </br> end tag and the final fallthrough are routed through
# ins_mode_in_head_noscript_else.
1704 ins_mode_in_head_noscript = (t) ->
1705 if t.type is TYPE_DOCTYPE
1708 if t.type is TYPE_START_TAG and t.name is 'html'
1711 if t.type is TYPE_END_TAG and t.name is 'noscript'
1713 ins_mode = ins_mode_in_head
1715 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1718 if t.type is TYPE_END_TAG and t.name is 'br'
1719 ins_mode_in_head_noscript_else t
1721 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1725 ins_mode_in_head_noscript_else t
1730 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
#
# "Anything else" branch of the "after head" insertion mode: inserts an HTML
# element for a synthesized 'body' start tag (new_open_tag) and switches the
# insertion mode to "in body". t is the token that triggered the branch.
1731 ins_mode_after_head_else = (t) ->
1732 body_tok = new_open_tag 'body'
1733 insert_html_element body_tok
1734 ins_mode = ins_mode_in_body
# Token handler for the "after head" insertion mode; t is the token to
# process.
#   * 'body' start tag: insert the element, clear flag_frameset_ok, switch to
#     "in body"
#   * 'frameset' start tag: insert the element, switch to "in frameset"
#   * head-content start tags (base ... title): head_element_pointer is pushed
#     back onto open_els, and afterwards located in open_els and spliced out
#     again; the console.log warning reports the case where it is not found
#   * </body>, </html>, </br> and the final fallthrough go through
#     ins_mode_after_head_else
1737 ins_mode_after_head = (t) ->
1741 if t.type is TYPE_COMMENT
1744 if t.type is TYPE_DOCTYPE
1747 if t.type is TYPE_START_TAG and t.name is 'html'
1750 if t.type is TYPE_START_TAG and t.name is 'body'
1751 insert_html_element t
1752 flag_frameset_ok = false
1753 ins_mode = ins_mode_in_body
1755 if t.type is TYPE_START_TAG and t.name is 'frameset'
1756 insert_html_element t
1757 ins_mode = ins_mode_in_frameset
1759 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1761 open_els.unshift head_element_pointer
1763 for el, i in open_els
1764 if el is head_element_pointer
1765 open_els.splice i, 1
1767 console.log "warning: 23904 couldn't find head element in open_els"
1769 if t.type is TYPE_END_TAG and t.name is 'template'
1772 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1773 ins_mode_after_head_else t
1775 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1779 ins_mode_after_head_else t
1781 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
#
# "Any other end tag" entry of the "in body" rules, for an end tag with the
# given tag name (a string). Walks the stack of open elements through
# ``node``: on an HTML element whose name matches, implied end tags are
# generated (except for ``name``) and elements are shifted off open_els;
# elements listed in special_elements for their namespace are tested for
# along the way; otherwise node advances to open_els[i + 1], i.e. the element
# above it in the stack.
1782 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1785 if node.name is name and node.namespace is NS_HTML
1786 generate_implied_end_tags name # arg is exception
1787 unless node is open_els[0]
1790 el = open_els.shift()
1793 if special_elements[node.name] is node.namespace
1796 for el, i in open_els
1798 node = open_els[i + 1]
1801 ins_mode_in_body = (t) ->
1802 if t.type is TYPE_TEXT and t.text is "\u0000"
1809 if t.type is TYPE_TEXT
1812 flag_frameset_ok = false
1814 if t.type is TYPE_COMMENT
1817 if t.type is TYPE_DOCTYPE
1820 if t.type is TYPE_START_TAG and t.name is 'html'
1822 return if template_tag_is_open()
1823 root_attrs = open_els[open_els.length - 1].attrs
1825 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1828 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1831 if t.type is TYPE_START_TAG and t.name is 'body'
1833 return if open_els.length < 2
1834 second = open_els[open_els.length - 2]
1835 return unless second.namespace is NS_HTML
1836 return unless second.name is 'body'
1837 return if template_tag_is_open()
1838 flag_frameset_ok = false
1840 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1842 if t.type is TYPE_START_TAG and t.name is 'frameset'
1844 return if open_els.length < 2
1845 second_i = open_els.length - 2
1846 second = open_els[second_i]
1847 return unless second.namespace is NS_HTML
1848 return unless second.name is 'body'
1849 if flag_frameset_ok is false
1852 for el, i in second.parent.children
1854 second.parent.children.splice i, 1
1856 open_els.splice second_i, 1
1857 # pop everything except the "root html element"
1858 while open_els.length > 1
1860 insert_html_element t
1861 ins_mode = ins_mode_in_frameset
1863 if t.type is TYPE_EOF
1865 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1866 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1867 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1870 unless ok_tags[t.name] is el.namespace
1873 if template_ins_modes.length > 0
1874 ins_mode_in_template t
1878 if t.type is TYPE_END_TAG and t.name is 'body'
1879 unless is_in_scope 'body', NS_HTML
1883 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1884 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1885 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1886 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1890 unless ok_tags[t.name] is el.namespace
1893 ins_mode = ins_mode_after_body
1895 if t.type is TYPE_END_TAG and t.name is 'html'
1896 unless is_in_scope 'body', NS_HTML
1900 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1901 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1902 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1903 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1907 unless ok_tags[t.name] is el.namespace
1910 ins_mode = ins_mode_after_body
1913 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1914 close_p_if_in_button_scope()
1915 insert_html_element t
1917 if t.type is TYPE_START_TAG and h_tags[t.name]?
1918 close_p_if_in_button_scope()
1919 if h_tags[open_els[0].name] is open_els[0].namespace
1922 insert_html_element t
1924 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1925 close_p_if_in_button_scope()
1926 insert_html_element t
1927 # spec: If the next token is a "LF" (U+000A) character token, then
1928 # ignore that token and move on to the next one. (Newlines at the
1929 # start of pre blocks are ignored as an authoring convenience.)
1930 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1932 flag_frameset_ok = false
1934 if t.type is TYPE_START_TAG and t.name is 'form'
1935 unless form_element_pointer is null or template_tag_is_open()
1938 close_p_if_in_button_scope()
1939 el = insert_html_element t
1940 unless template_tag_is_open()
1941 form_element_pointer = el
1943 if t.type is TYPE_START_TAG and t.name is 'li'
1944 flag_frameset_ok = false
1945 for node in open_els
1946 if node.name is 'li' and node.namespace is NS_HTML
1947 generate_implied_end_tags 'li' # arg is exception
1948 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1951 el = open_els.shift()
1952 if el.name is 'li' and el.namespace is NS_HTML
1955 if el_is_special_not_adp node
1957 close_p_if_in_button_scope()
1958 insert_html_element t
1960 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1961 flag_frameset_ok = false
1962 for node in open_els
1963 if node.name is 'dd' and node.namespace is NS_HTML
1964 generate_implied_end_tags 'dd' # arg is exception
1965 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1968 el = open_els.shift()
1969 if el.name is 'dd' and el.namespace is NS_HTML
1972 if node.name is 'dt' and node.namespace is NS_HTML
1973 generate_implied_end_tags 'dt' # arg is exception
1974 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1977 el = open_els.shift()
1978 if el.name is 'dt' and el.namespace is NS_HTML
1981 if el_is_special_not_adp node
1983 close_p_if_in_button_scope()
1984 insert_html_element t
1986 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1987 close_p_if_in_button_scope()
1988 insert_html_element t
1989 tok_state = tok_state_plaintext
1991 if t.type is TYPE_START_TAG and t.name is 'button'
1992 if is_in_scope 'button', NS_HTML
1994 generate_implied_end_tags()
1996 el = open_els.shift()
1997 if el.name is 'button' and el.namespace is NS_HTML
2000 insert_html_element t
2001 flag_frameset_ok = false
2003 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
2004 unless is_in_scope t.name, NS_HTML
2007 generate_implied_end_tags()
2008 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2011 el = open_els.shift()
2012 if el.name is t.name and el.namespace is NS_HTML
2015 if t.type is TYPE_END_TAG and t.name is 'form'
2016 unless template_tag_is_open()
2017 node = form_element_pointer
2018 form_element_pointer = null
2019 if node is null or not el_is_in_scope node
2022 generate_implied_end_tags()
2023 if open_els[0] isnt node
2025 for el, i in open_els
2027 open_els.splice i, 1
2030 unless is_in_scope 'form', NS_HTML
2033 generate_implied_end_tags()
2034 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2037 el = open_els.shift()
2038 if el.name is 'form' and el.namespace is NS_HTML
2041 if t.type is TYPE_END_TAG and t.name is 'p'
2042 unless is_in_button_scope 'p', NS_HTML
2044 insert_html_element new_open_tag 'p'
2047 if t.type is TYPE_END_TAG and t.name is 'li'
2048 unless is_in_li_scope 'li', NS_HTML
2051 generate_implied_end_tags 'li' # arg is exception
2052 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2055 el = open_els.shift()
2056 if el.name is 'li' and el.namespace is NS_HTML
2059 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2060 unless is_in_scope t.name, NS_HTML
2063 generate_implied_end_tags t.name # arg is exception
2064 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2067 el = open_els.shift()
2068 if el.name is t.name and el.namespace is NS_HTML
2071 if t.type is TYPE_END_TAG and h_tags[t.name]?
2074 if h_tags[el.name] is el.namespace
2077 if standard_scopers[el.name] is el.namespace
2082 generate_implied_end_tags()
2083 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2086 el = open_els.shift()
2087 if h_tags[el.name] is el.namespace
2091 if t.type is TYPE_START_TAG and t.name is 'a'
2092 # If the list of active formatting elements contains an a element
2093 # between the end of the list and the last marker on the list (or
2094 # the start of the list if there is no marker on the list), then
2095 # this is a parse error; run the adoption agency algorithm for the
2096 # tag name "a", then remove that element from the list of active
2097 # formatting elements and the stack of open elements if the
2098 # adoption agency algorithm didn't already remove it (it might not
2099 # have if the element is not in table scope).
2102 if el.type is TYPE_AFE_MARKER
2104 if el.name is 'a' and el.namespace is NS_HTML
2112 for el, i in open_els
2114 open_els.splice i, 1
2116 el = insert_html_element t
2119 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2121 el = insert_html_element t
2124 if t.type is TYPE_START_TAG and t.name is 'nobr'
2126 if is_in_scope 'nobr', NS_HTML
2128 adoption_agency 'nobr'
2130 el = insert_html_element t
2133 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2134 adoption_agency t.name
2136 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2138 insert_html_element t
2140 flag_frameset_ok = false
2142 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2143 unless is_in_scope t.name, NS_HTML
2146 generate_implied_end_tags()
2147 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2150 el = open_els.shift()
2151 if el.name is t.name and el.namespace is NS_HTML
2153 clear_afe_to_marker()
2155 if t.type is TYPE_START_TAG and t.name is 'table'
2156 unless doc.flag('quirks mode') is QUIRKS_YES
2157 close_p_if_in_button_scope() # test
2158 insert_html_element t
2159 flag_frameset_ok = false
2160 ins_mode = ins_mode_in_table
2162 if t.type is TYPE_END_TAG and t.name is 'br'
2164 t.type = TYPE_START_TAG
2166 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2168 insert_html_element t
2170 t.acknowledge_self_closing()
2171 flag_frameset_ok = false
2173 if t.type is TYPE_START_TAG and t.name is 'input'
2175 insert_html_element t
2177 t.acknowledge_self_closing()
2178 unless is_input_hidden_tok t
2179 flag_frameset_ok = false
2181 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2182 # WHATWG adds 'menuitem' for this block
2183 insert_html_element t
2185 t.acknowledge_self_closing()
2187 if t.type is TYPE_START_TAG and t.name is 'hr'
2188 close_p_if_in_button_scope()
2189 insert_html_element t
2191 t.acknowledge_self_closing()
2192 flag_frameset_ok = false
2194 if t.type is TYPE_START_TAG and t.name is 'image'
2199 if t.type is TYPE_START_TAG and t.name is 'isindex'
2201 if template_tag_is_open() is false and form_element_pointer isnt null
2203 t.acknowledge_self_closing()
2204 flag_frameset_ok = false
2205 close_p_if_in_button_scope()
2206 el = insert_html_element new_open_tag 'form'
2207 unless template_tag_is_open()
2208 form_element_pointer = el
2211 el.attrs['action'] = a[1]
2213 insert_html_element new_open_tag 'hr'
2216 insert_html_element new_open_tag 'label'
2217 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2218 input_el = new_open_tag 'input'
2223 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2224 input_el.attrs_a.push [a[0], a[1]]
2225 input_el.attrs_a.push ['name', 'isindex']
2226 # fixfull this next bit is in english... internationalize?
2227 prompt ?= "This is a searchable index. Enter search keywords: "
2228 insert_character new_character_token prompt # fixfull split
2229 # TODO submit typo "balue" in spec
2230 insert_html_element input_el
2232 # insert_character '' # you can put chars here if prompt attr missing
2234 insert_html_element new_open_tag 'hr'
2237 unless template_tag_is_open()
2238 form_element_pointer = null
2240 if t.type is TYPE_START_TAG and t.name is 'textarea'
2241 insert_html_element t
2242 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2244 tok_state = tok_state_rcdata
2245 original_ins_mode = ins_mode
2246 flag_frameset_ok = false
2247 ins_mode = ins_mode_text
2249 if t.type is TYPE_START_TAG and t.name is 'xmp'
2250 close_p_if_in_button_scope()
2252 flag_frameset_ok = false
2253 parse_generic_raw_text t
2255 if t.type is TYPE_START_TAG and t.name is 'iframe'
2256 flag_frameset_ok = false
2257 parse_generic_raw_text t
2259 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2260 parse_generic_raw_text t
2262 if t.type is TYPE_START_TAG and t.name is 'select'
2264 insert_html_element t
2265 flag_frameset_ok = false
2266 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2267 ins_mode = ins_mode_in_select_in_table
2269 ins_mode = ins_mode_in_select
2271 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2272 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2275 insert_html_element t
2277 # this comment block implements the W3C spec
2278 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2279 # if is_in_scope 'ruby', NS_HTML
2280 # generate_implied_end_tags()
2281 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2283 # insert_html_element t
2285 # if t.type is TYPE_START_TAG and t.name is 'rt'
2286 # if is_in_scope 'ruby', NS_HTML
2287 # generate_implied_end_tags 'rtc' # arg is exception
2288 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2290 # insert_html_element t
2292 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2293 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2294 if is_in_scope 'ruby', NS_HTML
2295 generate_implied_end_tags()
2296 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2298 insert_html_element t
2300 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2301 if is_in_scope 'ruby', NS_HTML
2302 generate_implied_end_tags 'rtc'
2303 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2305 insert_html_element t
2308 if t.type is TYPE_START_TAG and t.name is 'math'
2310 adjust_mathml_attributes t
2311 adjust_foreign_attributes t
2312 insert_foreign_element t, NS_MATHML
2313 if t.flag 'self-closing'
2315 t.acknowledge_self_closing()
2317 if t.type is TYPE_START_TAG and t.name is 'svg'
2319 adjust_svg_attributes t
2320 adjust_foreign_attributes t
2321 insert_foreign_element t, NS_SVG
2322 if t.flag 'self-closing'
2324 t.acknowledge_self_closing()
2326 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2329 if t.type is TYPE_START_TAG # any other start tag
2331 insert_html_element t
2333 if t.type is TYPE_END_TAG # any other end tag
2334 in_body_any_other_end_tag t.name
2338 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Insertion-mode handler for the "text" mode (token t).
# Branches on text tokens, EOF, </script> and any other end tag. The EOF and
# end-tag branches restore the insertion mode that was saved in
# original_ins_mode; on EOF an HTML <script> current node is additionally
# flagged 'already started'.
# The console warning at the bottom is presumably only reached when none of
# the branches above handled the token.
2339 ins_mode_text = (t) ->
2340 if t.type is TYPE_TEXT
2343 if t.type is TYPE_EOF
2345 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2346 open_els[0].flag 'already started', true
2348 ins_mode = original_ins_mode
2351 if t.type is TYPE_END_TAG and t.name is 'script'
2353 ins_mode = original_ins_mode
2354 # fixfull the spec seems to assume that I'm going to run the script
2355 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2357 if t.type is TYPE_END_TAG
2359 ins_mode = original_ins_mode
2361 console.log 'warning: end of ins_mode_text reached'
2363 # the functions below implement the tokenizer states described here:
2364 # http://www.w3.org/TR/html5/syntax.html#tokenization
2366 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# "Anything else" fallback shared by the table insertion modes; it is called
# from ins_mode_in_table and ins_mode_in_table_text with the token t.
# Turns flag_foster_parenting on and then off again (presumably around the
# actual processing of t — TODO confirm).
2367 ins_mode_in_table_else = (t) ->
2369 flag_foster_parenting = true
2371 flag_foster_parenting = false
# Insertion-mode handler for "in table" (token t).
# - When the current node (open_els[0]) is an HTML table/tbody/tfoot/thead/tr,
#   the handler can start collecting table text: it empties
#   pending_table_character_tokens, saves the current mode in
#   original_ins_mode and switches to ins_mode_in_table_text.
# - Table-structure start tags clear the stack to a table context, insert
#   either the token's element or a synthesized <colgroup>/<tbody>, and switch
#   to the matching insertion mode.
# - The branches guarded by is_in_table_scope 'table' pop open_els while
#   checking for the HTML table element.
# - Tokens with no dedicated handling are passed to ins_mode_in_table_else.
2373 ins_mode_in_table = (t) ->
2376 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2377 pending_table_character_tokens = []
2378 original_ins_mode = ins_mode
2379 ins_mode = ins_mode_in_table_text
2382 ins_mode_in_table_else t
2390 clear_stack_to_table_context()
2392 insert_html_element t
2393 ins_mode = ins_mode_in_caption
2395 clear_stack_to_table_context()
2396 insert_html_element t
2397 ins_mode = ins_mode_in_column_group
2399 clear_stack_to_table_context()
# a <colgroup> is synthesized here instead of inserting the token itself
2400 insert_html_element new_open_tag 'colgroup'
2401 ins_mode = ins_mode_in_column_group
2403 when 'tbody', 'tfoot', 'thead'
2404 clear_stack_to_table_context()
2405 insert_html_element t
2406 ins_mode = ins_mode_in_table_body
2407 when 'td', 'th', 'tr'
2408 clear_stack_to_table_context()
# a <tbody> is synthesized here instead of inserting the token itself
2409 insert_html_element new_open_tag 'tbody'
2410 ins_mode = ins_mode_in_table_body
2414 if is_in_table_scope 'table', NS_HTML
2416 el = open_els.shift()
2417 if el.name is 'table' and el.namespace is NS_HTML
2421 when 'style', 'script', 'template'
2424 unless is_input_hidden_tok t
2425 ins_mode_in_table_else t
2428 el = insert_html_element t
2430 t.acknowledge_self_closing()
2433 if form_element_pointer?
2435 if template_tag_is_open()
2437 form_element_pointer = insert_html_element t
2440 ins_mode_in_table_else t
2444 if is_in_table_scope 'table', NS_HTML
2446 el = open_els.shift()
2447 if el.name is 'table' and el.namespace is NS_HTML
2452 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2457 ins_mode_in_table_else t
2461 ins_mode_in_table_else t
2464 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Insertion-mode handler for "in table text" (token t).
# U+0000 text tokens get their own branch; other text tokens are buffered in
# pending_table_character_tokens.
# For the remaining tokens the buffer is examined with is_space_tok and then
# flushed, either through insert_character or through ins_mode_in_table_else
# (apparently depending on whether a non-space token was found). Afterwards
# the buffer is emptied and the mode saved in original_ins_mode is restored.
2465 ins_mode_in_table_text = (t) ->
2466 if t.type is TYPE_TEXT and t.text is "\u0000"
2470 if t.type is TYPE_TEXT
2471 pending_table_character_tokens.push t
2475 for old in pending_table_character_tokens
2476 unless is_space_tok old
2480 for old in pending_table_character_tokens
2481 insert_character old
2483 for old in pending_table_character_tokens
2484 ins_mode_in_table_else old
2485 pending_table_character_tokens = []
2486 ins_mode = original_ins_mode
2489 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Insertion-mode handler for "in caption" (token t).
# Two branches close the caption, both only when a caption is in table scope:
#   - </caption>
#   - a table-structure start tag, or </table>
# Closing pops open_els (checking each popped element for the HTML caption),
# clears the active formatting elements back to the last marker and switches
# to ins_mode_in_table.
# The end tags listed in the last condition have their own branch.
2490 ins_mode_in_caption = (t) ->
2491 if t.type is TYPE_END_TAG and t.name is 'caption'
2492 if is_in_table_scope 'caption', NS_HTML
2493 generate_implied_end_tags()
# NOTE(review): only the name is compared here; the pop check two lines
# below also compares the namespace — confirm whether that is intended.
2494 if open_els[0].name isnt 'caption'
2497 el = open_els.shift()
2498 if el.name is 'caption' and el.namespace is NS_HTML
2500 clear_afe_to_marker()
2501 ins_mode = ins_mode_in_table
2506 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2508 if is_in_table_scope 'caption', NS_HTML
2510 el = open_els.shift()
2511 if el.name is 'caption' and el.namespace is NS_HTML
2513 clear_afe_to_marker()
2514 ins_mode = ins_mode_in_table
2516 # else fragment case
2518 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2524 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Insertion-mode handler for "in column group" (token t).
# Has dedicated branches for comment, doctype, <html>, <col>, </colgroup>,
# </col>, template start/end tags and EOF.
#   - <col> is inserted and its self-closing flag is acknowledged.
#   - </colgroup> switches to ins_mode_in_table when the current node is an
#     HTML <colgroup>.
#   - The final part of the function tests whether the current node is a
#     colgroup and also ends in ins_mode_in_table.
2525 ins_mode_in_column_group = (t) ->
2529 if t.type is TYPE_COMMENT
2532 if t.type is TYPE_DOCTYPE
2535 if t.type is TYPE_START_TAG and t.name is 'html'
2538 if t.type is TYPE_START_TAG and t.name is 'col'
2539 el = insert_html_element t
2541 t.acknowledge_self_closing()
2543 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# The namespace must be read from the current node (open_els[0]); the
# open_els array itself has no .namespace, so testing it could never match.
2544 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2546 ins_mode = ins_mode_in_table
2550 if t.type is TYPE_END_TAG and t.name is 'col'
2553 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2556 if t.type is TYPE_EOF
2560 if open_els[0].name isnt 'colgroup'
2564 ins_mode = ins_mode_in_table
2568 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Insertion-mode handler for "in table body" (token t).
#   - <tr>: clear the stack to a table body context, insert the element and
#     switch to ins_mode_in_row.
#   - <th>/<td>: same, but a <tr> is synthesized (new_open_tag 'tr') instead
#     of inserting the token itself.
#   - </tbody>, </tfoot>, </thead>: guarded by an is_in_table_scope check for
#     that tag name; clears the stack to a table body context and switches to
#     ins_mode_in_table.
#   - table-structure start tags and </table>: examine elements (el) for an
#     HTML tbody/tfoot/thead, consulting table_scopers, then clear the stack
#     to a table body context and switch to ins_mode_in_table.
#   - the end tags in the last condition have their own branch.
2569 ins_mode_in_table_body = (t) ->
2570 if t.type is TYPE_START_TAG and t.name is 'tr'
2571 clear_stack_to_table_body_context()
2572 insert_html_element t
2573 ins_mode = ins_mode_in_row
2575 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2577 clear_stack_to_table_body_context()
2578 insert_html_element new_open_tag 'tr'
2579 ins_mode = ins_mode_in_row
2582 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2583 unless is_in_table_scope t.name, NS_HTML
2586 clear_stack_to_table_body_context()
2588 ins_mode = ins_mode_in_table
2590 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2593 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2596 if table_scopers[el.name] is el.namespace
2601 clear_stack_to_table_body_context()
2603 ins_mode = ins_mode_in_table
2606 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2612 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Insertion-mode handler for "in row" (token t).
#   - <th>/<td>: clear the stack to a table row context, insert the cell and
#     switch to ins_mode_in_cell.
#   - </tr>, table-structure start tags, </table> and section end tags
#     (</tbody>, </tfoot>, </thead>): when a tr is in table scope, clear the
#     stack to a table row context and switch to ins_mode_in_table_body.
#     Section end tags additionally require the named section itself to be in
#     table scope.
#   - the end tags in the last condition have their own branch.
2613 ins_mode_in_row = (t) ->
2614 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2615 clear_stack_to_table_row_context()
2616 insert_html_element t
2617 ins_mode = ins_mode_in_cell
2620 if t.type is TYPE_END_TAG and t.name is 'tr'
2621 if is_in_table_scope 'tr', NS_HTML
2622 clear_stack_to_table_row_context()
2624 ins_mode = ins_mode_in_table_body
2628 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2629 if is_in_table_scope 'tr', NS_HTML
2630 clear_stack_to_table_row_context()
2632 ins_mode = ins_mode_in_table_body
2637 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2638 if is_in_table_scope t.name, NS_HTML
2639 if is_in_table_scope 'tr', NS_HTML
2640 clear_stack_to_table_row_context()
2642 ins_mode = ins_mode_in_table_body
2647 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2653 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps: generate implied end tags, verify that the current
# node (open_els[0]) is an HTML td/th, pop open_els while checking each popped
# element for an HTML td/th, clear the active formatting elements back to the
# last marker and switch to ins_mode_in_row.
2655 generate_implied_end_tags()
# Both alternatives must compare the element's name: an element object is
# never equal to the string 'th', so comparing open_els[0] itself would make
# this check fail for every <th> cell.
2656 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2659 el = open_els.shift()
2660 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2662 clear_afe_to_marker()
2663 ins_mode = ins_mode_in_row
2665 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Insertion-mode handler for "in cell" (token t).
#   - </td> / </th>: when that cell is in table scope, generate implied end
#     tags, compare the current node with the tag name, pop open_els while
#     checking for the matching HTML cell, clear the active formatting
#     elements back to the last marker and switch to ins_mode_in_row.
#   - table-structure start tags: examine elements (el) for an HTML td/th,
#     consulting table_scopers.
#   - </body>, </caption>, </col>, </colgroup>, </html>: own branch.
#   - </table>, </tbody>, </tfoot>, </thead>, </tr>: guarded by an
#     is_in_table_scope check for that tag name.
2666 ins_mode_in_cell = (t) ->
2667 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2668 if is_in_table_scope t.name, NS_HTML
2669 generate_implied_end_tags()
2670 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2673 el = open_els.shift()
2674 if el.name is t.name and el.namespace is NS_HTML
2676 clear_afe_to_marker()
2677 ins_mode = ins_mode_in_row
2681 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2684 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2687 if table_scopers[el.name] is el.namespace
2695 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2698 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2699 if is_in_table_scope t.name, NS_HTML
2708 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Insertion-mode handler for "in select" (token t).
# Has branches for U+0000 text, other text, comment, doctype, <html>,
# <option>, <optgroup>, </optgroup>, </option>, </select>, a nested <select>,
# <input>/<keygen>/<textarea>, <script>/<template> and EOF.
#   - <option>/<optgroup> test whether the current node (open_els[0]) is an
#     HTML option/optgroup before inserting the new element.
#   - </optgroup> looks at the current node and at the element just above it
#     (open_els[1]).
#   - the select-closing branches pop open_els while checking for the HTML
#     select element; </select> and <input>/<keygen>/<textarea> first check
#     is_in_select_scope.
2709 ins_mode_in_select = (t) ->
2710 if t.type is TYPE_TEXT and t.text is "\u0000"
2713 if t.type is TYPE_TEXT
2716 if t.type is TYPE_COMMENT
2719 if t.type is TYPE_DOCTYPE
2722 if t.type is TYPE_START_TAG and t.name is 'html'
2725 if t.type is TYPE_START_TAG and t.name is 'option'
2726 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2728 insert_html_element t
2730 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2731 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2733 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2735 insert_html_element t
2737 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2738 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# Name and namespace must both be read from the same element, open_els[1]
# (the element just above the current <option>).
2739 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2741 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2746 if t.type is TYPE_END_TAG and t.name is 'option'
2747 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2752 if t.type is TYPE_END_TAG and t.name is 'select'
2753 if is_in_select_scope 'select', NS_HTML
2755 el = open_els.shift()
2756 if el.name is 'select' and el.namespace is NS_HTML
2762 if t.type is TYPE_START_TAG and t.name is 'select'
2765 el = open_els.shift()
2766 if el.name is 'select' and el.namespace is NS_HTML
2769 # spec says that this is the same as </select> but it doesn't say
2770 # to check scope first
2772 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2774 if is_in_select_scope 'select', NS_HTML
2777 el = open_els.shift()
2778 if el.name is 'select' and el.namespace is NS_HTML
2783 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2786 if t.type is TYPE_EOF
2793 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Insertion-mode handler for "in select in table" (token t).
# Table-structure start tags, and table-structure end tags (the latter guarded
# by an is_in_table_scope check for the tag name), pop open_els while checking
# for the HTML select element.
# The call to ins_mode_in_select at the bottom presumably handles every other
# token — TODO confirm.
2794 ins_mode_in_select_in_table = (t) ->
2795 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2798 el = open_els.shift()
2799 if el.name is 'select' and el.namespace is NS_HTML
2804 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2806 unless is_in_table_scope t.name, NS_HTML
2809 el = open_els.shift()
2810 if el.name is 'select' and el.namespace is NS_HTML
2816 ins_mode_in_select t
2819 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Insertion-mode handler for "in template" (token t).
# Text, comment and doctype tokens share one branch; the head-related start
# tags and </template> share another.
# For the start tags below, the entry at the front of template_ins_modes is
# replaced (shift, then unshift) and ins_mode is set to the same mode:
#   caption / colgroup / tbody / tfoot / thead -> ins_mode_in_table
#   col                                        -> ins_mode_in_column_group
#   tr                                         -> ins_mode_in_table_body
#   td / th                                    -> ins_mode_in_row
#   any other start tag                        -> ins_mode_in_body
# Other end tags have their own branch.
# EOF: after the template_tag_is_open check, open_els is popped while checking
# for the HTML template element, the active formatting elements are cleared
# back to the last marker and the front entry of template_ins_modes is dropped.
2820 ins_mode_in_template = (t) ->
2821 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2824 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2827 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2828 template_ins_modes.shift()
2829 template_ins_modes.unshift ins_mode_in_table
2830 ins_mode = ins_mode_in_table
2833 if t.type is TYPE_START_TAG and t.name is 'col'
2834 template_ins_modes.shift()
2835 template_ins_modes.unshift ins_mode_in_column_group
2836 ins_mode = ins_mode_in_column_group
2839 if t.type is TYPE_START_TAG and t.name is 'tr'
2840 template_ins_modes.shift()
2841 template_ins_modes.unshift ins_mode_in_table_body
2842 ins_mode = ins_mode_in_table_body
2845 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2846 template_ins_modes.shift()
2847 template_ins_modes.unshift ins_mode_in_row
2848 ins_mode = ins_mode_in_row
2851 if t.type is TYPE_START_TAG
2852 template_ins_modes.shift()
2853 template_ins_modes.unshift ins_mode_in_body
2854 ins_mode = ins_mode_in_body
2857 if t.type is TYPE_END_TAG
2860 if t.type is TYPE_EOF
2861 unless template_tag_is_open()
2866 el = open_els.shift()
2867 if el.name is 'template' and el.namespace is NS_HTML
2869 clear_afe_to_marker()
2870 template_ins_modes.shift()
2874 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Insertion-mode handler for "after body" (token t).
#   - comment: inserted into the element at the far end of open_els (index
#     length - 1) at position children.length, i.e. presumably appended as
#     its last child.
#   - doctype, <html>, EOF: own branches.
#   - </html>: tests flag_fragment_parsing first, then switches to
#     ins_mode_after_after_body.
#   - the final assignment switches back to ins_mode_in_body.
2875 ins_mode_after_body = (t) ->
2879 if t.type is TYPE_COMMENT
2880 first = open_els[open_els.length - 1]
2881 insert_comment t, [first, first.children.length]
2883 if t.type is TYPE_DOCTYPE
2886 if t.type is TYPE_START_TAG and t.name is 'html'
2889 if t.type is TYPE_END_TAG and t.name is 'html'
2890 if flag_fragment_parsing
2893 ins_mode = ins_mode_after_after_body
2895 if t.type is TYPE_EOF
2900 ins_mode = ins_mode_in_body
2903 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Insertion-mode handler for "in frameset" (token t).
#   - comment, doctype, <html>, <noframes>: own branches.
#   - <frameset>: inserted.
#   - <frame>: inserted and its self-closing flag acknowledged.
#   - </frameset>: returns early (fragment case) when open_els holds a single
#     element; otherwise, when not parsing a fragment and the current node is
#     not a frameset, the mode becomes ins_mode_after_frameset.
#   - EOF: checks whether open_els holds more than one element.
2904 ins_mode_in_frameset = (t) ->
2908 if t.type is TYPE_COMMENT
2911 if t.type is TYPE_DOCTYPE
2914 if t.type is TYPE_START_TAG and t.name is 'html'
2917 if t.type is TYPE_START_TAG and t.name is 'frameset'
2918 insert_html_element t
2920 if t.type is TYPE_END_TAG and t.name is 'frameset'
2921 if open_els.length is 1
2923 return # fragment case
2925 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2926 ins_mode = ins_mode_after_frameset
2928 if t.type is TYPE_START_TAG and t.name is 'frame'
2929 insert_html_element t
2931 t.acknowledge_self_closing()
2933 if t.type is TYPE_START_TAG and t.name is 'noframes'
2936 if t.type is TYPE_EOF
2937 if open_els.length isnt 1
2945 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion-mode handler for "after frameset" (token t).
# Has branches for comment, doctype, <html>, </html>, <noframes> and EOF;
# </html> switches to ins_mode_after_after_frameset.
2946 ins_mode_after_frameset = (t) ->
2950 if t.type is TYPE_COMMENT
2953 if t.type is TYPE_DOCTYPE
2956 if t.type is TYPE_START_TAG and t.name is 'html'
2959 if t.type is TYPE_END_TAG and t.name is 'html'
2960 ins_mode = ins_mode_after_after_frameset
2962 if t.type is TYPE_START_TAG and t.name is 'noframes'
2965 if t.type is TYPE_EOF
2972 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Insertion-mode handler for "after after body" (token t).
#   - comment: inserted into doc at position doc.children.length.
#   - doctype, whitespace (is_space_tok) and <html>: one shared branch.
#   - EOF: own branch.
#   - the final assignment switches back to ins_mode_in_body.
2973 ins_mode_after_after_body = (t) ->
2974 if t.type is TYPE_COMMENT
2975 insert_comment t, [doc, doc.children.length]
2977 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2980 if t.type is TYPE_EOF
2985 ins_mode = ins_mode_in_body
2989 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Insertion-mode handler for "after after frameset" (token t).
#   - comment: inserted into doc at position doc.children.length.
#   - doctype, whitespace (is_space_tok) and <html>: one shared branch.
#   - EOF and <noframes>: own branches.
2990 ins_mode_after_after_frameset = (t) ->
2991 if t.type is TYPE_COMMENT
2992 insert_comment t, [doc, doc.children.length]
2994 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2997 if t.type is TYPE_EOF
3000 if t.type is TYPE_START_TAG and t.name is 'noframes'
3007 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Helper for the <font> test in in_foreign_content: checks token t for an
# attribute whose name (a[0]) is 'color', 'face' or 'size'.
# Presumably `a` iterates over the token's attribute pairs — TODO confirm.
3008 has_color_face_or_size = (t) ->
3010 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Hook for the end of a <script> element in foreign content. It is called
# from in_foreign_content for </script> when the current node is an SVG
# script, and from in_foreign_content_other_start for a self-closing <script>
# start tag. Takes no arguments.
3013 in_foreign_content_end_script = ->
# "Any other start tag" handling for foreign content (start-tag token t).
# The token is adjusted according to the namespace of the adjusted current
# node (acn):
#   - MathML: adjust_mathml_attributes
#   - SVG: tag name replaced via svg_name_fixes when an entry exists, then
#     adjust_svg_attributes
# adjust_foreign_attributes is then applied and the element is inserted in
# acn's namespace.
# Self-closing tokens are acknowledged; a self-closing <script> additionally
# runs in_foreign_content_end_script.
3017 in_foreign_content_other_start = (t) ->
3018 acn = adjusted_current_node()
3019 if acn.namespace is NS_MATHML
3020 adjust_mathml_attributes t
3021 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3022 t.name = svg_name_fixes[t.name]
3023 if acn.namespace is NS_SVG
3024 adjust_svg_attributes t
3025 adjust_foreign_attributes t
3026 insert_foreign_element t, acn.namespace
3027 if t.flag 'self-closing'
3028 if t.name is 'script'
3029 t.acknowledge_self_closing()
3030 in_foreign_content_end_script()
3034 t.acknowledge_self_closing()
# Tree-construction rules for a token t that is processed in foreign
# (MathML/SVG) content.
#   - U+0000 text: a U+FFFD character token is inserted instead.
#   - text reaching the generic text branch: clears flag_frameset_ok.
#   - comment, doctype: own branches.
#   - the long list of HTML start tags (plus <font> carrying a color/face/size
#     attribute): in the fragment case the token goes through
#     in_foreign_content_other_start; otherwise a loop examines the current
#     node for a MathML text integration point, an HTML integration point or
#     an HTML-namespace element.
#   - any other start tag: in_foreign_content_other_start.
#   - </script> with an SVG script as current node:
#     in_foreign_content_end_script.
#   - any other end tag: walks nodes comparing the lower-cased node name with
#     t.name, popping open_els on a match; on reaching an HTML-namespace node
#     the token is handed to the current HTML insertion mode (ins_mode).
3036 in_foreign_content = (t) ->
3037 if t.type is TYPE_TEXT and t.text is "\u0000"
3039 insert_character new_character_token "\ufffd"
3044 if t.type is TYPE_TEXT
3045 flag_frameset_ok = false
3048 if t.type is TYPE_COMMENT
3051 if t.type is TYPE_DOCTYPE
3054 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3056 if flag_fragment_parsing
3057 in_foreign_content_other_start t
3059 loop # is this safe?
3061 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3065 if t.type is TYPE_START_TAG
3066 in_foreign_content_other_start t
3068 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3069 in_foreign_content_end_script()
3071 if t.type is TYPE_END_TAG
3074 if node.name.toLowerCase() isnt t.name
3077 if node is open_els[open_els.length - 1]
3079 if node.name.toLowerCase() is t.name
3081 el = open_els.shift()
3086 if node.namespace is NS_HTML
3088 ins_mode t # explicitly call HTML insertion mode
3091 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer data state: consumes one character c from txt (advancing cur).
# Depending on c it returns a text node built from
# parse_character_reference(), switches tok_state to tok_state_tag_open,
# returns a U+FFFD text node, returns an EOF token, or returns c itself as a
# text node.
3093 switch c = txt.charAt(cur++)
3095 return new_text_node parse_character_reference()
3097 tok_state = tok_state_tag_open
3100 return new_text_node "\ufffd"
3102 return new_eof_token()
3104 return new_text_node c
3107 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3108 # not needed: tok_state_character_reference_in_data = ->
3109 # just call parse_character_reference()
3111 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer RCDATA state: consumes one character c from txt (advancing cur).
# Depending on c it returns the result of parse_character_reference(),
# switches tok_state to tok_state_rcdata_less_than_sign, returns a U+FFFD
# character token, returns an EOF token, or returns c as a character token.
# NOTE(review): the character-reference branch wraps its result with
# new_text_node while every other branch uses new_character_token — confirm
# the two are interchangeable.
3112 tok_state_rcdata = ->
3113 switch c = txt.charAt(cur++)
3115 return new_text_node parse_character_reference()
3117 tok_state = tok_state_rcdata_less_than_sign
3120 return new_character_token "\ufffd"
3122 return new_eof_token()
3124 return new_character_token c
3127 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3128 # not needed: tok_state_character_reference_in_rcdata = ->
3129 # just call parse_character_reference()
3131 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer RAWTEXT state: consumes one character c from txt (advancing cur).
# Depending on c it switches tok_state to tok_state_rawtext_less_than_sign,
# returns a U+FFFD character token, returns an EOF token, or returns c as a
# character token.
3132 tok_state_rawtext = ->
3133 switch c = txt.charAt(cur++)
3135 tok_state = tok_state_rawtext_less_than_sign
3138 return new_character_token "\ufffd"
3140 return new_eof_token()
3142 return new_character_token c
3145 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer script data state: consumes one character c from txt (advancing
# cur). Depending on c it switches tok_state to
# tok_state_script_data_less_than_sign, returns a U+FFFD character token,
# returns an EOF token, or returns c as a character token.
3146 tok_state_script_data = ->
3147 switch c = txt.charAt(cur++)
3149 tok_state = tok_state_script_data_less_than_sign
3152 return new_character_token "\ufffd"
3154 return new_eof_token()
3156 return new_character_token c
3159 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer PLAINTEXT state: consumes one character c from txt (advancing
# cur) and returns a U+FFFD character token, an EOF token, or c as a
# character token. None of the visible branches changes tok_state.
3160 tok_state_plaintext = ->
3161 switch c = txt.charAt(cur++)
3164 return new_character_token "\ufffd"
3166 return new_eof_token()
3168 return new_character_token c
3172 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer tag open state; section 8.2.4.8 of the HTML5 spec linked above.
# Consumes one character c and, depending on it:
#   - moves to tok_state_markup_declaration_open or tok_state_end_tag_open,
#   - starts a new open tag in tok_cur_tag (c lower-cased in one branch, used
#     as-is in the other) and moves to tok_state_tag_name,
#   - starts a comment token and moves to tok_state_bogus_comment, or
#   - falls back: returns to tok_state_data, un-consumes c and returns "<" as
#     a text node.
3173 tok_state_tag_open = ->
3174 c = txt.charAt(cur++)
3176 tok_state = tok_state_markup_declaration_open
3179 tok_state = tok_state_end_tag_open
3182 tok_cur_tag = new_open_tag c.toLowerCase()
3183 tok_state = tok_state_tag_name
3186 tok_cur_tag = new_open_tag c
3187 tok_state = tok_state_tag_name
3191 tok_cur_tag = new_comment_token '?' # FIXME right?
3192 tok_state = tok_state_bogus_comment
3196 tok_state = tok_state_data
3197 cur -= 1 # we didn't parse/handle the char after <
3198 return new_text_node '<'
3200 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer end tag open state; section 8.2.4.9 of the HTML5 spec linked above.
# Consumes one character c and, depending on it:
#   - starts an end tag in tok_cur_tag (c lower-cased in one branch, used
#     as-is in the other) and moves to tok_state_tag_name,
#   - returns to tok_state_data (one such branch returns "</" as a text node),
#     or
#   - starts a comment token seeded with c and moves to
#     tok_state_bogus_comment.
3201 tok_state_end_tag_open = ->
3202 c = txt.charAt(cur++)
3204 tok_cur_tag = new_end_tag c.toLowerCase()
3205 tok_state = tok_state_tag_name
3208 tok_cur_tag = new_end_tag c
3209 tok_state = tok_state_tag_name
3213 tok_state = tok_state_data
3217 tok_state = tok_state_data
3218 return new_text_node '</'
3221 tok_cur_tag = new_comment_token c
3222 tok_state = tok_state_bogus_comment
3225 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer tag name state: consumes one character c.
#   - tab, LF, form feed or space: move to tok_state_before_attribute_name.
#   - other delimiters: move to tok_state_self_closing_start_tag or back to
#     tok_state_data.
#   - otherwise the character is appended to tok_cur_tag.name: as U+FFFD,
#     lower-cased, or unchanged, depending on the branch.
3226 tok_state_tag_name = ->
3227 switch c = txt.charAt(cur++)
3228 when "\t", "\n", "\u000c", ' '
3229 tok_state = tok_state_before_attribute_name
3231 tok_state = tok_state_self_closing_start_tag
3233 tok_state = tok_state_data
3239 tok_cur_tag.name += "\ufffd"
3242 tok_state = tok_state_data
3245 tok_cur_tag.name += c.toLowerCase()
3247 tok_cur_tag.name += c
3250 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer RCDATA less-than sign state: consumes one character c.
# Either empties temporary_buffer and moves to tok_state_rcdata_end_tag_open,
# or falls back: returns to tok_state_rcdata, un-consumes c and returns "<"
# as a character token.
3251 tok_state_rcdata_less_than_sign = ->
3252 c = txt.charAt(cur++)
3254 temporary_buffer = ''
3255 tok_state = tok_state_rcdata_end_tag_open
3258 tok_state = tok_state_rcdata
3259 cur -= 1 # reconsume the input character
3260 return new_character_token '<'
3262 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer RCDATA end tag open state: consumes one character c.
# Either starts an end tag in tok_cur_tag (c lower-cased in one branch, used
# as-is in the other), appends the original c to temporary_buffer and moves
# to tok_state_rcdata_end_tag_name, or falls back: returns to
# tok_state_rcdata, un-consumes c and returns "</" as one character token.
3263 tok_state_rcdata_end_tag_open = ->
3264 c = txt.charAt(cur++)
3266 tok_cur_tag = new_end_tag c.toLowerCase()
3267 temporary_buffer += c
3268 tok_state = tok_state_rcdata_end_tag_name
3271 tok_cur_tag = new_end_tag c
3272 temporary_buffer += c
3273 tok_state = tok_state_rcdata_end_tag_name
3276 tok_state = tok_state_rcdata
3277 cur -= 1 # reconsume the input character
3278 return new_character_token "</" # fixfull separate these
3280 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token t is an end tag whose name equals the name of the
# current node (open_els[0]); logs the token and the open-element stack via
# debug_log first.
# The name comparison is only evaluated for end tags (short-circuit `and`).
3281 is_appropriate_end_tag = (t) ->
3282 # spec says to check against "the tag name of the last start tag to
3283 # have been emitted from this tokenizer", but this is only called from
3284 # the various "raw" states, so it's hopefully ok to assume that
3285 # open_els[0].name will work instead TODO: verify this after the script
3286 # data states are implemented
3287 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3288 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3290 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Handler for the "RCDATA end tag name" state. Three visible branches leave the
# state (to before_attribute_name, self_closing_start_tag or data) only when
# is_appropriate_end_tag accepts tok_cur_tag. Two branches extend both
# tok_cur_tag.name and temporary_buffer. The final path gives up on the end tag:
# back to tok_state_rcdata, back cur up by one, and return '</' plus the
# buffered characters as one character token.
# NOTE(review): several condition lines (and whatever each successful branch
# returns) are missing from this copy of the file; confirm against the spec
# section linked above.
3291 tok_state_rcdata_end_tag_name = ->
3292 c = txt.charAt(cur++)
3293 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3294 if is_appropriate_end_tag tok_cur_tag
3295 tok_state = tok_state_before_attribute_name
3297 # else fall through to "Anything else"
3299 if is_appropriate_end_tag tok_cur_tag
3300 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3302 # else fall through to "Anything else"
3304 if is_appropriate_end_tag tok_cur_tag
3305 tok_state = tok_state_data
3307 # else fall through to "Anything else"
3309 tok_cur_tag.name += c.toLowerCase()
3310 temporary_buffer += c
3313 tok_cur_tag.name += c
3314 temporary_buffer += c
3317 tok_state = tok_state_rcdata
3318 cur -= 1 # reconsume the input character
3319 return new_character_token '</' + temporary_buffer # fixfull separate these
3321 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Handler for the "RAWTEXT less-than sign" state. One visible path resets
# temporary_buffer and moves on to tok_state_rawtext_end_tag_open; the other
# returns to tok_state_rawtext, backs cur up by one and returns a '<' character
# token.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3322 tok_state_rawtext_less_than_sign = ->
3323 c = txt.charAt(cur++)
3325 temporary_buffer = ''
3326 tok_state = tok_state_rawtext_end_tag_open
3329 tok_state = tok_state_rawtext
3330 cur -= 1 # reconsume the input character
3331 return new_character_token '<'
3333 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Handler for the "RAWTEXT end tag open" state. The first two visible paths
# start a new end tag token in tok_cur_tag named after the consumed character
# (lower-cased in the first path), record the character in temporary_buffer and
# switch to tok_state_rawtext_end_tag_name. The last path returns to
# tok_state_rawtext, backs cur up by one and returns "</" as one character token.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3334 tok_state_rawtext_end_tag_open = ->
3335 c = txt.charAt(cur++)
3337 tok_cur_tag = new_end_tag c.toLowerCase()
3338 temporary_buffer += c
3339 tok_state = tok_state_rawtext_end_tag_name
3342 tok_cur_tag = new_end_tag c
3343 temporary_buffer += c
3344 tok_state = tok_state_rawtext_end_tag_name
3347 tok_state = tok_state_rawtext
3348 cur -= 1 # reconsume the input character
3349 return new_character_token "</" # fixfull separate these
3351 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Handler for the "RAWTEXT end tag name" state. Three visible branches leave the
# state (to before_attribute_name, self_closing_start_tag or data) only when
# is_appropriate_end_tag accepts tok_cur_tag. Two branches extend both
# tok_cur_tag.name and temporary_buffer. The final path gives up on the end tag:
# back to tok_state_rawtext, back cur up by one, and return '</' plus the
# buffered characters as one character token.
# NOTE(review): several condition lines (and whatever each successful branch
# returns) are missing from this copy of the file; confirm against the spec
# section linked above.
3352 tok_state_rawtext_end_tag_name = ->
3353 c = txt.charAt(cur++)
3354 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3355 if is_appropriate_end_tag tok_cur_tag
3356 tok_state = tok_state_before_attribute_name
3358 # else fall through to "Anything else"
3360 if is_appropriate_end_tag tok_cur_tag
3361 tok_state = tok_state_self_closing_start_tag
3363 # else fall through to "Anything else"
3365 if is_appropriate_end_tag tok_cur_tag
3366 tok_state = tok_state_data
3368 # else fall through to "Anything else"
3370 tok_cur_tag.name += c.toLowerCase()
3371 temporary_buffer += c
3374 tok_cur_tag.name += c
3375 temporary_buffer += c
3378 tok_state = tok_state_rawtext
3379 cur -= 1 # reconsume the input character
3380 return new_character_token '</' + temporary_buffer # fixfull separate these
3382 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Handler for the "script data less-than sign" state. Visible paths: reset
# temporary_buffer and go to tok_state_script_data_end_tag_open; go to
# tok_state_script_data_escape_start and return '<!' as one character token; or
# fall back to tok_state_script_data, back cur up by one and return '<'.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3383 tok_state_script_data_less_than_sign = ->
3384 c = txt.charAt(cur++)
3386 temporary_buffer = ''
3387 tok_state = tok_state_script_data_end_tag_open
3390 tok_state = tok_state_script_data_escape_start
3391 return new_character_token '<!' # fixfull split
3393 tok_state = tok_state_script_data
3394 cur -= 1 # Reconsume
3395 return new_character_token '<'
3397 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Handler for the "script data end tag open" state. The first two visible paths
# start a new end tag token in tok_cur_tag named after the consumed character
# (lower-cased in the first path), record the character in temporary_buffer and
# switch to tok_state_script_data_end_tag_name. The last path returns to
# tok_state_script_data, backs cur up by one and returns '</' as one character
# token.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3398 tok_state_script_data_end_tag_open = ->
3399 c = txt.charAt(cur++)
3401 tok_cur_tag = new_end_tag c.toLowerCase()
3402 temporary_buffer += c
3403 tok_state = tok_state_script_data_end_tag_name
3406 tok_cur_tag = new_end_tag c
3407 temporary_buffer += c
3408 tok_state = tok_state_script_data_end_tag_name
3411 tok_state = tok_state_script_data
3412 cur -= 1 # Reconsume
3413 return new_character_token '</'
3415 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Handler for the "script data end tag name" state. Three visible branches leave
# the state (to before_attribute_name, self_closing_start_tag or data) only when
# is_appropriate_end_tag accepts tok_cur_tag. Two branches extend both
# tok_cur_tag.name and temporary_buffer. The final path gives up on the end tag:
# back to tok_state_script_data, back cur up by one, and return '</' plus the
# buffered characters as one character token.
# NOTE(review): several condition lines (and whatever each successful branch
# returns) are missing from this copy of the file; confirm against the spec
# section linked above.
3416 tok_state_script_data_end_tag_name = ->
3417 c = txt.charAt(cur++)
3418 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3419 if is_appropriate_end_tag tok_cur_tag
3420 tok_state = tok_state_before_attribute_name
3424 if is_appropriate_end_tag tok_cur_tag
3425 tok_state = tok_state_self_closing_start_tag
3429 if is_appropriate_end_tag tok_cur_tag
3430 tok_state = tok_state_data
3434 tok_cur_tag.name += c.toLowerCase()
3435 temporary_buffer += c
3438 tok_cur_tag.name += c
3439 temporary_buffer += c
3442 tok_state = tok_state_script_data
3443 cur -= 1 # Reconsume
3444 return new_character_token "</#{temporary_buffer}" # fixfull split
3446 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Handler for the "script data escape start" state. One visible path moves to
# tok_state_script_data_escape_start_dash and returns a '-' character token; the
# other returns to tok_state_script_data and backs cur up by one.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3447 tok_state_script_data_escape_start = ->
3448 c = txt.charAt(cur++)
3450 tok_state = tok_state_script_data_escape_start_dash
3451 return new_character_token '-'
3453 tok_state = tok_state_script_data
3454 cur -= 1 # Reconsume
3457 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Handler for the "script data escape start dash" state. One visible path moves
# to tok_state_script_data_escaped_dash_dash and returns a '-' character token;
# the other returns to tok_state_script_data and backs cur up by one.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3458 tok_state_script_data_escape_start_dash = ->
3459 c = txt.charAt(cur++)
3461 tok_state = tok_state_script_data_escaped_dash_dash
3462 return new_character_token '-'
3464 tok_state = tok_state_script_data
3465 cur -= 1 # Reconsume
3468 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Handler for the "script data escaped" state. Visible paths: go to
# ..._escaped_dash and return '-'; go to ..._escaped_less_than_sign; return a
# U+FFFD character token; bail out to tok_state_data with cur backed up by one;
# or return the consumed character as a character token.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3469 tok_state_script_data_escaped = ->
3470 c = txt.charAt(cur++)
3472 tok_state = tok_state_script_data_escaped_dash
3473 return new_character_token '-'
3475 tok_state = tok_state_script_data_escaped_less_than_sign
3479 return new_character_token "\ufffd"
3481 tok_state = tok_state_data
3483 cur -= 1 # Reconsume
3486 return new_character_token c
3488 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Handler for the "script data escaped dash" state. Visible paths: go to
# ..._escaped_dash_dash and return '-'; go to ..._escaped_less_than_sign; go
# back to ..._escaped returning U+FFFD; bail out to tok_state_data with cur
# backed up by one; or go back to ..._escaped returning the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3489 tok_state_script_data_escaped_dash = ->
3490 c = txt.charAt(cur++)
3492 tok_state = tok_state_script_data_escaped_dash_dash
3493 return new_character_token '-'
3495 tok_state = tok_state_script_data_escaped_less_than_sign
3499 tok_state = tok_state_script_data_escaped
3500 return new_character_token "\ufffd"
3502 tok_state = tok_state_data
3504 cur -= 1 # Reconsume
3507 tok_state = tok_state_script_data_escaped
3508 return new_character_token c
3510 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Handler for the "script data escaped dash dash" state. Visible paths: stay
# here and return '-'; go to ..._escaped_less_than_sign; leave escaping by going
# to tok_state_script_data and returning '>'; go to ..._escaped returning
# U+FFFD; bail out to tok_state_data with cur backed up by one; or go to
# ..._escaped returning the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3511 tok_state_script_data_escaped_dash_dash = ->
3512 c = txt.charAt(cur++)
3514 return new_character_token '-'
3516 tok_state = tok_state_script_data_escaped_less_than_sign
3519 tok_state = tok_state_script_data
3520 return new_character_token '>'
3523 tok_state = tok_state_script_data_escaped
3524 return new_character_token "\ufffd"
3527 tok_state = tok_state_data
3528 cur -= 1 # Reconsume
3531 tok_state = tok_state_script_data_escaped
3532 return new_character_token c
3534 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Handler for the "script data escaped less-than sign" state. Visible paths:
# reset temporary_buffer and go to ..._escaped_end_tag_open; seed
# temporary_buffer with the consumed character (lower-cased in the first of the
# two paths), go to ..._double_escape_start and return "<" plus the original
# character as one token; or fall back to ..._escaped, back cur up by one and
# return '<'.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3535 tok_state_script_data_escaped_less_than_sign = ->
3536 c = txt.charAt(cur++)
3538 temporary_buffer = ''
3539 tok_state = tok_state_script_data_escaped_end_tag_open
3542 temporary_buffer = c.toLowerCase() # yes, really
3543 tok_state = tok_state_script_data_double_escape_start
3544 return new_character_token "<#{c}" # fixfull split
3546 temporary_buffer = c
3547 tok_state = tok_state_script_data_double_escape_start
3548 return new_character_token "<#{c}" # fixfull split
3550 tok_state = tok_state_script_data_escaped
3551 cur -= 1 # Reconsume
3552 return new_character_token '<'
3554 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Handler for the "script data escaped end tag open" state. The first two
# visible paths start a new end tag token in tok_cur_tag named after the
# consumed character (lower-cased in the first path), record the character in
# temporary_buffer and switch to ..._escaped_end_tag_name. The last path returns
# to ..._escaped, backs cur up by one and returns '</' as one character token.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3555 tok_state_script_data_escaped_end_tag_open = ->
3556 c = txt.charAt(cur++)
3558 tok_cur_tag = new_end_tag c.toLowerCase()
3559 temporary_buffer += c
3560 tok_state = tok_state_script_data_escaped_end_tag_name
3563 tok_cur_tag = new_end_tag c
3564 temporary_buffer += c
3565 tok_state = tok_state_script_data_escaped_end_tag_name
3568 tok_state = tok_state_script_data_escaped
3569 cur -= 1 # Reconsume
3570 return new_character_token '</' # fixfull split
3572 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Handler for the "script data escaped end tag name" state. Three visible
# branches leave the state (to before_attribute_name, self_closing_start_tag or
# data) only when is_appropriate_end_tag accepts tok_cur_tag. Two branches
# extend both tok_cur_tag.name and temporary_buffer. The final path gives up on
# the end tag: back to ..._escaped, back cur up by one, and return "</" plus the
# buffered characters as one character token.
# NOTE(review): the rcdata/rawtext/script-data *_end_tag_name handlers append c
# unchanged to temporary_buffer, but this one appends c.toLowerCase(), so the
# fallback "</..." token would not preserve the input's case -- confirm whether
# that is intended.
# NOTE(review): several condition lines (and whatever each successful branch
# returns) are missing from this copy of the file; confirm against the spec
# section linked above.
3573 tok_state_script_data_escaped_end_tag_name = ->
3574 c = txt.charAt(cur++)
3575 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3576 if is_appropriate_end_tag tok_cur_tag
3577 tok_state = tok_state_before_attribute_name
3581 if is_appropriate_end_tag tok_cur_tag
3582 tok_state = tok_state_self_closing_start_tag
3586 if is_appropriate_end_tag tok_cur_tag
3587 tok_state = tok_state_data
3591 tok_cur_tag.name += c.toLowerCase()
3592 temporary_buffer += c.toLowerCase()
3595 tok_cur_tag.name += c
3596 temporary_buffer += c.toLowerCase()
3599 tok_state = tok_state_script_data_escaped
3600 cur -= 1 # Reconsume
3601 return new_character_token "</#{temporary_buffer}" # fixfull split
3603 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Handler for the "script data double escape start" state. On whitespace, '/' or
# '>' the buffered name decides where to go: 'script' selects
# ..._double_escaped, and the other visible assignment selects ..._escaped; the
# character is returned as a token. Two further paths add the character to
# temporary_buffer (lower-cased in the first) and return it as a token. The last
# path returns to ..._escaped and backs cur up by one.
# NOTE(review): the else/condition lines are missing from this copy of the file;
# confirm against the spec section linked above.
3604 tok_state_script_data_double_escape_start = ->
3605 c = txt.charAt(cur++)
3606 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3607 if temporary_buffer is 'script'
3608 tok_state = tok_state_script_data_double_escaped
3610 tok_state = tok_state_script_data_escaped
3611 return new_character_token c
3613 temporary_buffer += c.toLowerCase() # yes, really lowercase
3614 return new_character_token c
3616 temporary_buffer += c
3617 return new_character_token c
3619 tok_state = tok_state_script_data_escaped
3620 cur -= 1 # Reconsume
3623 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Handler for the "script data double escaped" state. Visible paths: go to
# ..._double_escaped_dash and return '-'; go to ..._double_escaped_less_than_sign
# and return '<'; return a U+FFFD character token; bail out to tok_state_data
# with cur backed up by one; or return the consumed character as a token.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3624 tok_state_script_data_double_escaped = ->
3625 c = txt.charAt(cur++)
3627 tok_state = tok_state_script_data_double_escaped_dash
3628 return new_character_token '-'
3630 tok_state = tok_state_script_data_double_escaped_less_than_sign
3631 return new_character_token '<'
3634 return new_character_token "\ufffd"
3637 tok_state = tok_state_data
3638 cur -= 1 # Reconsume
3641 return new_character_token c
3643 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Handler for the "script data double escaped dash" state. Visible paths: go to
# ..._double_escaped_dash_dash and return '-'; go to
# ..._double_escaped_less_than_sign and return '<'; go back to ..._double_escaped
# returning U+FFFD; bail out to tok_state_data with cur backed up by one; or go
# back to ..._double_escaped returning the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3644 tok_state_script_data_double_escaped_dash = ->
3645 c = txt.charAt(cur++)
3647 tok_state = tok_state_script_data_double_escaped_dash_dash
3648 return new_character_token '-'
3650 tok_state = tok_state_script_data_double_escaped_less_than_sign
3651 return new_character_token '<'
3654 tok_state = tok_state_script_data_double_escaped
3655 return new_character_token "\ufffd"
3658 tok_state = tok_state_data
3659 cur -= 1 # Reconsume
3662 tok_state = tok_state_script_data_double_escaped
3663 return new_character_token c
3665 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Handler for the "script data double escaped dash dash" state. Visible paths:
# stay here and return '-'; go to ..._double_escaped_less_than_sign and return
# '<'; go to tok_state_script_data and return '>'; go to ..._double_escaped
# returning U+FFFD; bail out to tok_state_data with cur backed up by one; or go
# to ..._double_escaped returning the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3666 tok_state_script_data_double_escaped_dash_dash = ->
3667 c = txt.charAt(cur++)
3669 return new_character_token '-'
3671 tok_state = tok_state_script_data_double_escaped_less_than_sign
3672 return new_character_token '<'
3674 tok_state = tok_state_script_data
3675 return new_character_token '>'
3678 tok_state = tok_state_script_data_double_escaped
3679 return new_character_token "\ufffd"
3682 tok_state = tok_state_data
3683 cur -= 1 # Reconsume
3686 tok_state = tok_state_script_data_double_escaped
3687 return new_character_token c
3689 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Handler for the "script data double escaped less-than sign" state. One visible
# path resets temporary_buffer, moves to ..._double_escape_end and returns a '/'
# character token; the other returns to ..._double_escaped and backs cur up by
# one.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3690 tok_state_script_data_double_escaped_less_than_sign = ->
3691 c = txt.charAt(cur++)
3693 temporary_buffer = ''
3694 tok_state = tok_state_script_data_double_escape_end
3695 return new_character_token '/'
3697 tok_state = tok_state_script_data_double_escaped
3698 cur -= 1 # Reconsume
3701 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Handler for the "script data double escape end" state. On whitespace, '/' or
# '>' the buffered name decides where to go: 'script' selects ..._escaped, and
# the other visible assignment selects ..._double_escaped; the character is
# returned as a token. Two further paths add the character to temporary_buffer
# (lower-cased in the first) and return it as a token. The last path returns to
# ..._double_escaped and backs cur up by one.
# NOTE(review): the else/condition lines are missing from this copy of the file;
# confirm against the spec section linked above.
3702 tok_state_script_data_double_escape_end = ->
3703 c = txt.charAt(cur++)
3704 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3705 if temporary_buffer is 'script'
3706 tok_state = tok_state_script_data_escaped
3708 tok_state = tok_state_script_data_double_escaped
3709 return new_character_token c
3711 temporary_buffer += c.toLowerCase() # yes, really lowercase
3712 return new_character_token c
3714 temporary_buffer += c
3715 return new_character_token c
3717 tok_state = tok_state_script_data_double_escaped
3718 cur -= 1 # Reconsume
3721 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Handler for the "before attribute name" state. Some visible branches leave for
# self_closing_start_tag or data. The others pick an initial attr_name (U+FFFD
# or the lower-cased character); the handler then unshifts a new
# [attr_name, ''] pair onto tok_cur_tag.attrs_a (so the attribute being built is
# always attrs_a[0]) and moves to tok_state_attribute_name.
# NOTE(review): several condition lines and branch bodies are missing from this
# copy of the file; confirm against the spec section linked above.
3722 tok_state_before_attribute_name = ->
3724 switch c = txt.charAt(cur++)
3725 when "\t", "\n", "\u000c", ' '
3728 tok_state = tok_state_self_closing_start_tag
3731 tok_state = tok_state_data
3737 attr_name = "\ufffd"
3738 when '"', "'", '<', '='
3743 tok_state = tok_state_data
3746 attr_name = c.toLowerCase()
3750 tok_cur_tag.attrs_a.unshift [attr_name, '']
3751 tok_state = tok_state_attribute_name
3754 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Handler for the "attribute name" state. Either leaves the state (to
# after_attribute_name, self_closing_start_tag, before_attribute_value or data)
# or appends to tok_cur_tag.attrs_a[0][0], the name slot of the first entry in
# attrs_a (U+FFFD, the character as-is, or the lower-cased character).
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3755 tok_state_attribute_name = ->
3756 switch c = txt.charAt(cur++)
3757 when "\t", "\n", "\u000c", ' '
3758 tok_state = tok_state_after_attribute_name
3760 tok_state = tok_state_self_closing_start_tag
3762 tok_state = tok_state_before_attribute_value
3764 tok_state = tok_state_data
3770 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3773 tok_cur_tag.attrs_a[0][0] += c
3776 tok_state = tok_state_data
3779 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3781 tok_cur_tag.attrs_a[0][0] += c
3784 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Handler for the "after attribute name" state. Visible paths either leave for
# self_closing_start_tag, before_attribute_value or data (one of the data paths
# also backs cur up by one), or start another attribute by unshifting a
# [name, ''] pair onto tok_cur_tag.attrs_a (name is the lower-cased character,
# U+FFFD, or the character as-is) and switching to tok_state_attribute_name.
# NOTE(review): several condition lines and branch bodies are missing from this
# copy of the file; confirm against the spec section linked above.
3785 tok_state_after_attribute_name = ->
3786 c = txt.charAt(cur++)
3787 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3790 tok_state = tok_state_self_closing_start_tag
3793 tok_state = tok_state_before_attribute_value
3796 tok_state = tok_state_data
3799 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3800 tok_state = tok_state_attribute_name
3804 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3805 tok_state = tok_state_attribute_name
3809 tok_state = tok_state_data
3810 cur -= 1 # reconsume
3812 if c is '"' or c is "'" or c is '<'
3814 # fall through to Anything else
3816 tok_cur_tag.attrs_a.unshift [c, '']
3817 tok_state = tok_state_attribute_name
3819 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Handler for the "before attribute value" state. Visible paths select one of
# the three attribute-value states (double quoted, unquoted, single quoted),
# leave for tok_state_data, or append a first character (U+FFFD or the consumed
# character) to tok_cur_tag.attrs_a[0][1], the value slot of the first entry in
# attrs_a, before continuing in the unquoted state.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3820 tok_state_before_attribute_value = ->
3821 switch c = txt.charAt(cur++)
3822 when "\t", "\n", "\u000c", ' '
3825 tok_state = tok_state_attribute_value_double_quoted
3827 tok_state = tok_state_attribute_value_unquoted
3830 tok_state = tok_state_attribute_value_single_quoted
3833 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3834 tok_state = tok_state_attribute_value_unquoted
3837 tok_state = tok_state_data
3843 tok_state = tok_state_data
3845 tok_cur_tag.attrs_a[0][1] += c
3846 tok_state = tok_state_attribute_value_unquoted
3849 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Handler for the double-quoted attribute value state. Visible paths: move on to
# tok_state_after_attribute_value_quoted; append the result of
# parse_character_reference (called with '"' as allowed_char and in_attr true)
# to the current attribute's value; append U+FFFD; leave for tok_state_data; or
# append the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3850 tok_state_attribute_value_double_quoted = ->
3851 switch c = txt.charAt(cur++)
3853 tok_state = tok_state_after_attribute_value_quoted
3855 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3858 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3861 tok_state = tok_state_data
3863 tok_cur_tag.attrs_a[0][1] += c
3866 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Handler for the single-quoted attribute value state. Visible paths: move on to
# tok_state_after_attribute_value_quoted; append the result of
# parse_character_reference (called with "'" as allowed_char and in_attr true)
# to the current attribute's value; append U+FFFD; leave for tok_state_data; or
# append the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3867 tok_state_attribute_value_single_quoted = ->
3868 switch c = txt.charAt(cur++)
3870 tok_state = tok_state_after_attribute_value_quoted
3872 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3875 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3878 tok_state = tok_state_data
3880 tok_cur_tag.attrs_a[0][1] += c
3883 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Handler for the unquoted attribute value state. Whitespace ends the value and
# returns to tok_state_before_attribute_name. Other visible paths append the
# result of parse_character_reference (allowed_char '>', in_attr true), leave
# for tok_state_data, append U+FFFD, or append the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3884 tok_state_attribute_value_unquoted = ->
3885 switch c = txt.charAt(cur++)
3886 when "\t", "\n", "\u000c", ' '
3887 tok_state = tok_state_before_attribute_name
3889 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3891 tok_state = tok_state_data
3896 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3899 tok_state = tok_state_data
3901 # Parse Error if ', <, = or ` (backtick)
3902 tok_cur_tag.attrs_a[0][1] += c
3905 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Handler for the state right after a quoted attribute value. Whitespace goes to
# tok_state_before_attribute_name; other visible paths go to
# self_closing_start_tag or data. The last path switches to
# before_attribute_name and backs cur up by one so that state re-reads the
# character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3906 tok_state_after_attribute_value_quoted = ->
3907 switch c = txt.charAt(cur++)
3908 when "\t", "\n", "\u000c", ' '
3909 tok_state = tok_state_before_attribute_name
3911 tok_state = tok_state_self_closing_start_tag
3913 tok_state = tok_state_data
3919 tok_state = tok_state_data
3922 tok_state = tok_state_before_attribute_name
3923 cur -= 1 # we didn't handle that char
3926 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Handler for the "self-closing start tag" state. Visible paths: set the
# 'self-closing' flag on tok_cur_tag and go to tok_state_data; go to
# tok_state_data with cur backed up by one; or go to
# tok_state_before_attribute_name with cur backed up by one.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3927 tok_state_self_closing_start_tag = ->
3928 c = txt.charAt(cur++)
3930 tok_cur_tag.flag 'self-closing', true
3931 tok_state = tok_state_data
3935 tok_state = tok_state_data
3936 cur -= 1 # Reconsume
3940 tok_state = tok_state_before_attribute_name
3941 cur -= 1 # Reconsume
3944 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3945 # WARNING: put a comment token in tok_cur_tag before setting this state
# Collects comment text in one step rather than character by character: looks
# for the next '>' at or after cur, takes either the rest of txt or the text up
# to that '>', replaces every U+0000 with U+FFFD, appends the result to
# tok_cur_tag.text and returns to tok_state_data.
# NOTE(review): the lines that choose between the two substr calls, advance cur
# and return the token are missing from this copy of the file.
3946 tok_state_bogus_comment = ->
3947 next_gt = txt.indexOf '>', cur
3949 val = txt.substr cur
3952 val = txt.substr cur, (next_gt - cur)
3954 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3955 tok_cur_tag.text += val
3956 tok_state = tok_state_data
3959 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Looks ahead in txt (without a per-character switch) to decide what follows:
#   '--'                      -> new empty comment token, tok_state_comment_start
#   'doctype' (any case)      -> tok_state_doctype
#   '[CDATA[' while the adjusted current node exists and is not in NS_HTML
#                             -> tok_state_cdata_section
#   otherwise (presumably)    -> new empty comment token, tok_state_bogus_comment
# NOTE(review): the lines that advance cur past the matched text are missing
# from this copy of the file.
3960 tok_state_markup_declaration_open = ->
3961 if txt.substr(cur, 2) is '--'
3963 tok_cur_tag = new_comment_token ''
3964 tok_state = tok_state_comment_start
3966 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3968 tok_state = tok_state_doctype
3970 acn = adjusted_current_node()
3971 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3973 tok_state = tok_state_cdata_section
3977 tok_cur_tag = new_comment_token ''
3978 tok_state = tok_state_bogus_comment
3981 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Handler for the "comment start" state. Visible paths: go to
# tok_state_comment_start_dash; go to tok_state_comment and return a U+FFFD
# character token; go to tok_state_data (twice, once with cur backed up by one);
# or append the consumed character to tok_cur_tag.text and go to
# tok_state_comment.
# NOTE(review): the U+FFFD path returns a character token, whereas
# tok_state_comment_start_dash and tok_state_comment append the replacement
# character to tok_cur_tag.text -- confirm which behaviour is intended here.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
3982 tok_state_comment_start = ->
3983 switch c = txt.charAt(cur++)
3985 tok_state = tok_state_comment_start_dash
3988 tok_state = tok_state_comment
3989 return new_character_token "\ufffd"
3992 tok_state = tok_state_data
3996 tok_state = tok_state_data
3997 cur -= 1 # Reconsume
4000 tok_cur_tag.text += c
4001 tok_state = tok_state_comment
4004 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Handler for the "comment start dash" state. Visible paths: go to
# tok_state_comment_end; append "-" plus U+FFFD to tok_cur_tag.text and go to
# tok_state_comment; go to tok_state_data (twice, once with cur backed up by
# one); or append "-" plus the consumed character and go to tok_state_comment.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4005 tok_state_comment_start_dash = ->
4006 switch c = txt.charAt(cur++)
4008 tok_state = tok_state_comment_end
4011 tok_cur_tag.text += "-\ufffd"
4012 tok_state = tok_state_comment
4015 tok_state = tok_state_data
4019 tok_state = tok_state_data
4020 cur -= 1 # Reconsume
4023 tok_cur_tag.text += "-#{c}"
4024 tok_state = tok_state_comment
4027 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Handler for the "comment" state. Visible paths: go to
# tok_state_comment_end_dash; append U+FFFD to tok_cur_tag.text; go to
# tok_state_data with cur backed up by one; or append the consumed character to
# tok_cur_tag.text.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4028 tok_state_comment = ->
4029 switch c = txt.charAt(cur++)
4031 tok_state = tok_state_comment_end_dash
4034 tok_cur_tag.text += "\ufffd"
4037 tok_state = tok_state_data
4038 cur -= 1 # Reconsume
4041 tok_cur_tag.text += c
4044 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Handler for the "comment end dash" state. Visible paths: go to
# tok_state_comment_end; append "-" plus U+FFFD to tok_cur_tag.text and return
# to tok_state_comment; go to tok_state_data with cur backed up by one; or
# append "-" plus the consumed character and return to tok_state_comment.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4045 tok_state_comment_end_dash = ->
4046 switch c = txt.charAt(cur++)
4048 tok_state = tok_state_comment_end
4051 tok_cur_tag.text += "-\ufffd"
4052 tok_state = tok_state_comment
4055 tok_state = tok_state_data
4056 cur -= 1 # Reconsume
4059 tok_cur_tag.text += "-#{c}"
4060 tok_state = tok_state_comment
4063 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Handler for the "comment end" state. Visible paths: go to tok_state_data;
# append "--" plus U+FFFD to tok_cur_tag.text and return to tok_state_comment;
# go to tok_state_comment_end_bang; append a single '-' and stay in this state;
# go to tok_state_data with cur backed up by one; or append "--" plus the
# consumed character and return to tok_state_comment.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4064 tok_state_comment_end = ->
4065 switch c = txt.charAt(cur++)
4067 tok_state = tok_state_data
4071 tok_cur_tag.text += "--\ufffd"
4072 tok_state = tok_state_comment
4075 tok_state = tok_state_comment_end_bang
4078 tok_cur_tag.text += '-'
4081 tok_state = tok_state_data
4082 cur -= 1 # Reconsume
4086 tok_cur_tag.text += "--#{c}"
4087 tok_state = tok_state_comment
4090 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Handler for the "comment end bang" state. Visible paths: append "--!" plus the
# consumed character to tok_cur_tag.text and go to tok_state_comment_end_dash;
# go to tok_state_data; append "--!" plus U+FFFD and return to
# tok_state_comment; go to tok_state_data with cur backed up by one; or append
# "--!" plus the consumed character and return to tok_state_comment.
# NOTE(review): if the first branch is the '-' case, the linked spec section
# appends only "--!" before switching to the comment end dash state; appending
# "--!#{c}" would add an extra '-' to the comment text -- confirm.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4091 tok_state_comment_end_bang = ->
4092 switch c = txt.charAt(cur++)
4094 tok_cur_tag.text += "--!#{c}"
4095 tok_state = tok_state_comment_end_dash
4097 tok_state = tok_state_data
4101 tok_cur_tag.text += "--!\ufffd"
4102 tok_state = tok_state_comment
4105 tok_state = tok_state_data
4106 cur -= 1 # Reconsume
4109 tok_cur_tag.text += "--!#{c}"
4110 tok_state = tok_state_comment
4113 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Handler for the "DOCTYPE" state. Whitespace moves to
# tok_state_before_doctype_name. Another visible path goes to tok_state_data,
# builds an empty-named doctype token with the 'force-quirks' flag set and backs
# cur up by one. The last path moves to tok_state_before_doctype_name with cur
# backed up by one.
# NOTE(review): the condition lines, and what happens to the local token `el`
# afterwards, are missing from this copy of the file.
4114 tok_state_doctype = ->
4115 switch c = txt.charAt(cur++)
4116 when "\t", "\u000a", "\u000c", ' '
4117 tok_state = tok_state_before_doctype_name
4120 tok_state = tok_state_data
4121 el = new_doctype_token ''
4122 el.flag 'force-quirks', true
4123 cur -= 1 # Reconsume
4127 tok_state = tok_state_before_doctype_name
4128 cur -= 1 # Reconsume
4131 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Handler for the "before DOCTYPE name" state. Visible paths: start a doctype
# token in tok_cur_tag whose name begins with the lower-cased character, U+FFFD,
# or the character as-is, then go to tok_state_doctype_name; or build an
# empty-named doctype token with 'force-quirks' set and go to tok_state_data
# (one of those two paths also backs cur up by one).
# NOTE(review): the condition lines, and what happens to the local token `el`
# afterwards, are missing from this copy of the file.
4132 tok_state_before_doctype_name = ->
4133 c = txt.charAt(cur++)
4134 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4137 tok_cur_tag = new_doctype_token c.toLowerCase()
4138 tok_state = tok_state_doctype_name
4142 tok_cur_tag = new_doctype_token "\ufffd"
4143 tok_state = tok_state_doctype_name
4147 el = new_doctype_token ''
4148 el.flag 'force-quirks', true
4149 tok_state = tok_state_data
4153 tok_state = tok_state_data
4154 el = new_doctype_token ''
4155 el.flag 'force-quirks', true
4156 cur -= 1 # Reconsume
4159 tok_cur_tag = new_doctype_token c
4160 tok_state = tok_state_doctype_name
4163 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Handler for the "DOCTYPE name" state. Whitespace moves to
# tok_state_after_doctype_name. Other visible paths: go to tok_state_data;
# append the lower-cased character, U+FFFD or the character as-is to
# tok_cur_tag.name; or go to tok_state_data with 'force-quirks' set and cur
# backed up by one.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4164 tok_state_doctype_name = ->
4165 c = txt.charAt(cur++)
4166 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4167 tok_state = tok_state_after_doctype_name
4170 tok_state = tok_state_data
4173 tok_cur_tag.name += c.toLowerCase()
4177 tok_cur_tag.name += "\ufffd"
4181 tok_state = tok_state_data
4182 tok_cur_tag.flag 'force-quirks', true
4183 cur -= 1 # Reconsume
4186 tok_cur_tag.name += c
4189 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Handler for the "after DOCTYPE name" state. Besides the data-state exits (one
# sets 'force-quirks' and backs cur up by one), it checks case-insensitively
# whether the six characters starting at the one just consumed (cur - 1) spell
# 'public' or 'system' and moves to the matching after-keyword state. The last
# visible path sets 'force-quirks' and goes to tok_state_bogus_doctype.
# NOTE(review): the condition lines and the lines that advance cur past the
# keyword are missing from this copy of the file.
4190 tok_state_after_doctype_name = ->
4191 c = txt.charAt(cur++)
4192 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4195 tok_state = tok_state_data
4199 tok_state = tok_state_data
4200 tok_cur_tag.flag 'force-quirks', true
4201 cur -= 1 # Reconsume
4204 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4206 tok_state = tok_state_after_doctype_public_keyword
4208 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4210 tok_state = tok_state_after_doctype_system_keyword
4213 tok_cur_tag.flag 'force-quirks', true
4214 tok_state = tok_state_bogus_doctype
4217 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Handler for the state after the PUBLIC keyword. Whitespace moves to
# tok_state_before_doctype_public_identifier. Two paths reset
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# identifier state. The remaining paths set 'force-quirks' and go to
# tok_state_data (one also backs cur up by one) or to tok_state_bogus_doctype.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4218 tok_state_after_doctype_public_keyword = ->
4219 c = txt.charAt(cur++)
4220 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4221 tok_state = tok_state_before_doctype_public_identifier
4225 tok_cur_tag.public_identifier = ''
4226 tok_state = tok_state_doctype_public_identifier_double_quoted
4230 tok_cur_tag.public_identifier = ''
4231 tok_state = tok_state_doctype_public_identifier_single_quoted
4235 tok_cur_tag.flag 'force-quirks', true
4236 tok_state = tok_state_data
4240 tok_state = tok_state_data
4241 tok_cur_tag.flag 'force-quirks', true
4242 cur -= 1 # Reconsume
4246 tok_cur_tag.flag 'force-quirks', true
4247 tok_state = tok_state_bogus_doctype
4250 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Handler for the state before the DOCTYPE public identifier. Two visible paths
# reset tok_cur_tag.public_identifier to '' and enter the double- or
# single-quoted identifier state. The remaining paths set 'force-quirks' and go
# to tok_state_data (one also backs cur up by one) or to
# tok_state_bogus_doctype.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4251 tok_state_before_doctype_public_identifier = ->
4252 c = txt.charAt(cur++)
4253 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4257 tok_cur_tag.public_identifier = ''
4258 tok_state = tok_state_doctype_public_identifier_double_quoted
4262 tok_cur_tag.public_identifier = ''
4263 tok_state = tok_state_doctype_public_identifier_single_quoted
4267 tok_cur_tag.flag 'force-quirks', true
4268 tok_state = tok_state_data
4272 tok_state = tok_state_data
4273 tok_cur_tag.flag 'force-quirks', true
4274 cur -= 1 # Reconsume
4278 tok_cur_tag.flag 'force-quirks', true
4279 tok_state = tok_state_bogus_doctype
4283 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Handler for the double-quoted DOCTYPE public identifier. Visible paths: move
# on to tok_state_after_doctype_public_identifier; append U+FFFD to
# tok_cur_tag.public_identifier; set 'force-quirks' and go to tok_state_data
# (twice, once with cur backed up by one); or append the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4284 tok_state_doctype_public_identifier_double_quoted = ->
4285 c = txt.charAt(cur++)
4287 tok_state = tok_state_after_doctype_public_identifier
4291 tok_cur_tag.public_identifier += "\ufffd"
4295 tok_cur_tag.flag 'force-quirks', true
4296 tok_state = tok_state_data
4300 tok_state = tok_state_data
4301 tok_cur_tag.flag 'force-quirks', true
4302 cur -= 1 # Reconsume
4305 tok_cur_tag.public_identifier += c
4308 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Handler for the single-quoted DOCTYPE public identifier. Visible paths: move
# on to tok_state_after_doctype_public_identifier; append U+FFFD to
# tok_cur_tag.public_identifier; set 'force-quirks' and go to tok_state_data
# (twice, once with cur backed up by one); or append the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4309 tok_state_doctype_public_identifier_single_quoted = ->
4310 c = txt.charAt(cur++)
4312 tok_state = tok_state_after_doctype_public_identifier
4316 tok_cur_tag.public_identifier += "\ufffd"
4320 tok_cur_tag.flag 'force-quirks', true
4321 tok_state = tok_state_data
4325 tok_state = tok_state_data
4326 tok_cur_tag.flag 'force-quirks', true
4327 cur -= 1 # Reconsume
4330 tok_cur_tag.public_identifier += c
4333 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Handler for the state after the DOCTYPE public identifier. Whitespace moves to
# tok_state_between_doctype_public_and_system_identifiers. Other visible paths:
# go to tok_state_data; reset tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state; go to tok_state_data with
# 'force-quirks' set and cur backed up by one; or set 'force-quirks' and go to
# tok_state_bogus_doctype.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4334 tok_state_after_doctype_public_identifier = ->
4335 c = txt.charAt(cur++)
4336 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4337 tok_state = tok_state_between_doctype_public_and_system_identifiers
4340 tok_state = tok_state_data
4344 tok_cur_tag.system_identifier = ''
4345 tok_state = tok_state_doctype_system_identifier_double_quoted
4349 tok_cur_tag.system_identifier = ''
4350 tok_state = tok_state_doctype_system_identifier_single_quoted
4354 tok_state = tok_state_data
4355 tok_cur_tag.flag 'force-quirks', true
4356 cur -= 1 # Reconsume
4360 tok_cur_tag.flag 'force-quirks', true
4361 tok_state = tok_state_bogus_doctype
4364 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Handler for the state between the DOCTYPE public and system identifiers.
# Visible paths: go to tok_state_data; reset tok_cur_tag.system_identifier to ''
# and enter the double- or single-quoted system identifier state; go to
# tok_state_data with 'force-quirks' set and cur backed up by one; or set
# 'force-quirks' and go to tok_state_bogus_doctype.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4365 tok_state_between_doctype_public_and_system_identifiers = ->
4366 c = txt.charAt(cur++)
4367 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4370 tok_state = tok_state_data
4374 tok_cur_tag.system_identifier = ''
4375 tok_state = tok_state_doctype_system_identifier_double_quoted
4379 tok_cur_tag.system_identifier = ''
4380 tok_state = tok_state_doctype_system_identifier_single_quoted
4384 tok_state = tok_state_data
4385 tok_cur_tag.flag 'force-quirks', true
4386 cur -= 1 # Reconsume
4390 tok_cur_tag.flag 'force-quirks', true
4391 tok_state = tok_state_bogus_doctype
4394 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Handler for the state after the SYSTEM keyword. Whitespace moves to
# tok_state_before_doctype_system_identifier. Two paths reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# identifier state. The remaining paths set 'force-quirks' and go to
# tok_state_data (one also backs cur up by one) or to tok_state_bogus_doctype.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4395 tok_state_after_doctype_system_keyword = ->
4396 c = txt.charAt(cur++)
4397 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4398 tok_state = tok_state_before_doctype_system_identifier
4402 tok_cur_tag.system_identifier = ''
4403 tok_state = tok_state_doctype_system_identifier_double_quoted
4407 tok_cur_tag.system_identifier = ''
4408 tok_state = tok_state_doctype_system_identifier_single_quoted
4412 tok_cur_tag.flag 'force-quirks', true
4413 tok_state = tok_state_data
4417 tok_state = tok_state_data
4418 tok_cur_tag.flag 'force-quirks', true
4419 cur -= 1 # Reconsume
4423 tok_cur_tag.flag 'force-quirks', true
4424 tok_state = tok_state_bogus_doctype
4427 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Handler for the state before the DOCTYPE system identifier. Two visible paths
# reset tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted identifier state. The remaining paths set 'force-quirks' and go
# to tok_state_data (one also backs cur up by one) or to
# tok_state_bogus_doctype.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4428 tok_state_before_doctype_system_identifier = ->
4429 c = txt.charAt(cur++)
4430 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4433 tok_cur_tag.system_identifier = ''
4434 tok_state = tok_state_doctype_system_identifier_double_quoted
4437 tok_cur_tag.system_identifier = ''
4438 tok_state = tok_state_doctype_system_identifier_single_quoted
4442 tok_cur_tag.flag 'force-quirks', true
4443 tok_state = tok_state_data
4447 tok_state = tok_state_data
4448 tok_cur_tag.flag 'force-quirks', true
4449 cur -= 1 # Reconsume
4453 tok_cur_tag.flag 'force-quirks', true
4454 tok_state = tok_state_bogus_doctype
4457 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Handler for the double-quoted DOCTYPE system identifier. Visible paths: move
# on to tok_state_after_doctype_system_identifier; append U+FFFD to
# tok_cur_tag.system_identifier; set 'force-quirks' and go to tok_state_data
# (twice, once with cur backed up by one); or append the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4458 tok_state_doctype_system_identifier_double_quoted = ->
4459 c = txt.charAt(cur++)
4461 tok_state = tok_state_after_doctype_system_identifier
4465 tok_cur_tag.system_identifier += "\ufffd"
4469 tok_cur_tag.flag 'force-quirks', true
4470 tok_state = tok_state_data
4474 tok_state = tok_state_data
4475 tok_cur_tag.flag 'force-quirks', true
4476 cur -= 1 # Reconsume
4479 tok_cur_tag.system_identifier += c
4482 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Handler for the single-quoted DOCTYPE system identifier. Visible paths: move
# on to tok_state_after_doctype_system_identifier; append U+FFFD to
# tok_cur_tag.system_identifier; set 'force-quirks' and go to tok_state_data
# (twice, once with cur backed up by one); or append the consumed character.
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4483 tok_state_doctype_system_identifier_single_quoted = ->
4484 c = txt.charAt(cur++)
4486 tok_state = tok_state_after_doctype_system_identifier
4490 tok_cur_tag.system_identifier += "\ufffd"
4494 tok_cur_tag.flag 'force-quirks', true
4495 tok_state = tok_state_data
4499 tok_state = tok_state_data
4500 tok_cur_tag.flag 'force-quirks', true
4501 cur -= 1 # Reconsume
4504 tok_cur_tag.system_identifier += c
4507 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Handler for the state after the DOCTYPE system identifier. Visible paths: go
# to tok_state_data; go to tok_state_data with 'force-quirks' set and cur backed
# up by one; or go to tok_state_bogus_doctype, deliberately without setting
# 'force-quirks' (see the inline comment).
# NOTE(review): the lines that select each branch are missing from this copy of
# the file; confirm the conditions against the spec section linked above.
4508 tok_state_after_doctype_system_identifier = ->
4509 c = txt.charAt(cur++)
4510 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4513 tok_state = tok_state_data
4517 tok_state = tok_state_data
4518 tok_cur_tag.flag 'force-quirks', true
4519 cur -= 1 # Reconsume
4523 # do _not_ tok_cur_tag.flag 'force-quirks', true
4524 tok_state = tok_state_bogus_doctype
4527 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Handler for the "bogus DOCTYPE" state. Both visible paths return to
# tok_state_data; the second also backs cur up by one so the character is read
# again there.
# NOTE(review): the lines that select each branch (and any return of the doctype
# token) are missing from this copy of the file.
4528 tok_state_bogus_doctype = ->
4529 c = txt.charAt(cur++)
4531 tok_state = tok_state_data
4534 tok_state = tok_state_data
4535 cur -= 1 # Reconsume
4540 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Consumes a CDATA section in one step: switches straight back to
# tok_state_data, searches for ']]>' at or after cur (the index is kept in
# next_gt despite the name), takes either the rest of txt or the text before the
# terminator, and returns it as a single character token.
# NOTE(review): the lines that choose between the two substr calls and advance
# cur are missing from this copy of the file.
4541 tok_state_cdata_section = ->
4542 tok_state = tok_state_data
4543 next_gt = txt.indexOf ']]>', cur
4545 val = txt.substr cur
4548 val = txt.substr cur, (next_gt - cur)
4551 return new_character_token val # fixfull split
4554 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4555 # Don't set this as a state, just call it
4556 # returns a string (NOT a text node)
# Parameters:
#   allowed_char - extra character that is treated like whitespace, '<', '&' or
#                  end of input in the first switch below (default null)
#   in_attr      - the attribute-value handlers in this file pass true;
#                  presumably it enables the attribute-only exceptions near the
#                  end of this function -- the lines using it are not visible
# Reads txt starting at cur (the first character is peeked, not consumed).
# Three kinds of reference are visible: numeric (digits parsed with parseInt in
# `base`), named with a ';' terminator (decode_named_char_ref), and legacy names
# without ';' (legacy_char_refs).
# NOTE(review): many lines (returns, parse_error calls, loop setup, cur updates)
# are missing from this copy of the file; confirm details against the spec
# section linked above.
4557 parse_character_reference = (allowed_char = null, in_attr = false) ->
4558 if cur >= txt.length
4560 switch c = txt.charAt(cur)
4561 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4562 # explicitly not a parse error
4565 # there has to be "one or more" alnums between & and ; to be a parse error
4568 if cur + 1 >= txt.length
4570 if txt.charAt(cur + 1).toLowerCase() is 'x'
# count how many characters from `start` belong to `charset`
4579 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4584 if txt.charAt(start + i) is ';'
# drop leading zeros (keeping at least one digit) before parsing the number
4588 code_point = txt.substr(start, i)
4589 while code_point.charAt(0) is '0' and code_point.length > 1
4590 code_point = code_point.substr 1
4591 code_point = parseInt(code_point, base)
# an entry in unicode_fixes overrides the literal code point
4592 if unicode_fixes[code_point]?
4594 return unicode_fixes[code_point]
# surrogates and values beyond U+10FFFF are singled out
4596 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
# control characters and noncharacters are singled out
4600 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4602 return from_code_point code_point
4606 if alnum.indexOf(txt.charAt(cur + i)) is -1
4609 # exit early, because parse_error() below needs at least one alnum
4611 if txt.charAt(cur + i) is ';'
4612 i += 1 # include ';' terminator in value
4613 decoded = decode_named_char_ref txt.substr(cur, i)
4620 # no ';' terminator (only legacy char refs)
4622 for i in [2..max] # no prefix matches, so ok to check shortest first
4623 c = legacy_char_refs[txt.substr(cur, i)]
4626 if txt.charAt(cur + i) is '='
4627 # "because some legacy user agents will
4628 # misinterpret the markup in those cases"
4631 if alnum.indexOf(txt.charAt(cur + i)) > -1
4632 # this makes attributes forgiving about url args
4634 # ok, and besides the weird exceptions for attributes...
4635 # return the matching char
4636 cur += i # consume entity chars
4637 parse_error() # because no terminating ";"
4641 return # never reached
4643 # tree constructor initialization
4644 # see comments on TYPE_TAG/etc for the structure of this data
4647 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4648 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4650 afe = [] # active formatting elements
4651 template_ins_modes = []
4652 ins_mode = ins_mode_initial
4653 original_ins_mode = ins_mode # TODO check spec
4654 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4655 flag_frameset_ok = true
4657 flag_foster_parenting = false
4658 form_element_pointer = null
4659 temporary_buffer = null
4660 pending_table_character_tokens = []
4661 head_element_pointer = null
4662 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4663 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4664 prev_node_id = 0 # just for debugging
4666 # tokenizer initialization
4667 tok_state = tok_state_data
4669 # text pre-processing
4670 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4671 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4672 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4673 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4675 if args.name is "tests23.dat #1"
4678 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4683 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4686 serialize_els = (els, shallow, show_ids) ->
4692 serialized += t.serialize shallow, show_ids
# Public API of this file: the parser entry point, the debug-log helpers, and
# the node-type, namespace and quirks-mode constants. Each one is published on
# module.exports under its own name (when no module.exports exists, the top of
# this file sets `module` to an object whose `exports` is window.wheic).
public_api = {
  parse_html
  debug_log_reset
  debug_log_each
  TYPE_TAG
  TYPE_TEXT
  TYPE_COMMENT
  TYPE_DOCTYPE
  NS_HTML
  NS_MATHML
  NS_SVG
  QUIRKS_NO
  QUIRKS_LIMITED
  QUIRKS_YES
}
# `own` keeps this to exactly the keys listed above, in the order listed
for own export_key, export_val of public_api
  module.exports[export_key] = export_val