1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WHATWG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # the spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
87 # quirks mode constants
97 debug_log_each = (cb) ->
98 for str in g_debug_log
103 constructor: (type, args = {}) ->
104 @type = type # one of the TYPE_* constants above
105 @name = args.name ? '' # tag name
106 @text = args.text ? '' # contents for text/comment nodes
107 @attrs = args.attrs ? {}
108 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
109 @children = args.children ? []
110 @namespace = args.namespace ? NS_HTML
111 @parent = args.parent ? null
112 @token = args.token ? null
113 @flags = args.flags ? {}
117 @id = "#{++prev_node_id}"
118 acknowledge_self_closing: ->
120 @token.flag 'did_self_close', true
122 @flag 'did_self_close', true
123 flag: (key, value = null) ->
128 serialize: (shallow = false, show_ids = false) -> # for unit tests
133 ret += JSON.stringify @name
148 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
154 ret += c.serialize shallow, show_ids
158 ret += JSON.stringify @text
161 ret += JSON.stringify @text
163 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
166 when TYPE_AAA_BOOKMARK
167 ret += 'aaa_bookmark'
170 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
173 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) that carries only its tag name.
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name})
# Build an end-tag token (TYPE_END_TAG) that carries only its tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name})
# Build an element node (TYPE_TAG) with the given tag name; every other
# field takes the Node constructor's defaults.
new_element = (name) ->
	new Node(TYPE_TAG, {name})
# Build a text node (TYPE_TEXT) whose contents are txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, text: txt)
# Character tokens are represented by text nodes, so both names share one
# constructor function.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose contents are txt.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, text: txt)
# Build a doctype token (TYPE_DOCTYPE) with the given name.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name})
188 return new Node TYPE_EOF
190 return new Node TYPE_AFE_MARKER
# Build a placeholder node of type TYPE_AAA_BOOKMARK (the bookmark used by
# the adoption agency algorithm).
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character sets referenced by the character-classification helpers below.
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# letters (both cases) followed by the decimal digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# decimal digits followed by a-f in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True only when str is exactly one character long and that character
# appears in uc_alpha.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) > -1
# True only when str is exactly one character long and that character
# appears in lc_alpha.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) > -1
205 # some SVG elements have dashes in them
206 tag_name_chars = alnum + "-"
208 # http://www.w3.org/TR/html5/infrastructure.html#space-character
209 space_chars = "\u0009\u000a\u000c\u000d\u0020"
211 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a text token (TYPE_TEXT) whose text is exactly one
# character long and that character is listed in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) > -1
215 is_input_hidden_tok = (t) ->
216 return false unless t.type is TYPE_START_TAG
219 if a[1].toLowerCase() is 'hidden'
224 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
225 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
228 unicode_fixes[0x00] = "\uFFFD"
229 unicode_fixes[0x80] = "\u20AC"
230 unicode_fixes[0x82] = "\u201A"
231 unicode_fixes[0x83] = "\u0192"
232 unicode_fixes[0x84] = "\u201E"
233 unicode_fixes[0x85] = "\u2026"
234 unicode_fixes[0x86] = "\u2020"
235 unicode_fixes[0x87] = "\u2021"
236 unicode_fixes[0x88] = "\u02C6"
237 unicode_fixes[0x89] = "\u2030"
238 unicode_fixes[0x8A] = "\u0160"
239 unicode_fixes[0x8B] = "\u2039"
240 unicode_fixes[0x8C] = "\u0152"
241 unicode_fixes[0x8E] = "\u017D"
242 unicode_fixes[0x91] = "\u2018"
243 unicode_fixes[0x92] = "\u2019"
244 unicode_fixes[0x93] = "\u201C"
245 unicode_fixes[0x94] = "\u201D"
246 unicode_fixes[0x95] = "\u2022"
247 unicode_fixes[0x96] = "\u2013"
248 unicode_fixes[0x97] = "\u2014"
249 unicode_fixes[0x98] = "\u02DC"
250 unicode_fixes[0x99] = "\u2122"
251 unicode_fixes[0x9A] = "\u0161"
252 unicode_fixes[0x9B] = "\u203A"
253 unicode_fixes[0x9C] = "\u0153"
254 unicode_fixes[0x9E] = "\u017E"
255 unicode_fixes[0x9F] = "\u0178"
257 quirks_yes_pi_prefixes = [
258 "+//silmaril//dtd html pro v0r11 19970101//"
259 "-//as//dtd html 3.0 aswedit + extensions//"
260 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
261 "-//ietf//dtd html 2.0 level 1//"
262 "-//ietf//dtd html 2.0 level 2//"
263 "-//ietf//dtd html 2.0 strict level 1//"
264 "-//ietf//dtd html 2.0 strict level 2//"
265 "-//ietf//dtd html 2.0 strict//"
266 "-//ietf//dtd html 2.0//"
267 "-//ietf//dtd html 2.1e//"
268 "-//ietf//dtd html 3.0//"
269 "-//ietf//dtd html 3.2 final//"
270 "-//ietf//dtd html 3.2//"
271 "-//ietf//dtd html 3//"
272 "-//ietf//dtd html level 0//"
273 "-//ietf//dtd html level 1//"
274 "-//ietf//dtd html level 2//"
275 "-//ietf//dtd html level 3//"
276 "-//ietf//dtd html strict level 0//"
277 "-//ietf//dtd html strict level 1//"
278 "-//ietf//dtd html strict level 2//"
279 "-//ietf//dtd html strict level 3//"
280 "-//ietf//dtd html strict//"
281 "-//ietf//dtd html//"
282 "-//metrius//dtd metrius presentational//"
283 "-//microsoft//dtd internet explorer 2.0 html strict//"
284 "-//microsoft//dtd internet explorer 2.0 html//"
285 "-//microsoft//dtd internet explorer 2.0 tables//"
286 "-//microsoft//dtd internet explorer 3.0 html strict//"
287 "-//microsoft//dtd internet explorer 3.0 html//"
288 "-//microsoft//dtd internet explorer 3.0 tables//"
289 "-//netscape comm. corp.//dtd html//"
290 "-//netscape comm. corp.//dtd strict html//"
291 "-//o'reilly and associates//dtd html 2.0//"
292 "-//o'reilly and associates//dtd html extended 1.0//"
293 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
294 "-//sq//dtd html 2.0 hotmetal + extensions//"
295 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
296 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
297 "-//spyglass//dtd html 2.0 extended//"
298 "-//sun microsystems corp.//dtd hotjava html//"
299 "-//sun microsystems corp.//dtd hotjava strict html//"
300 "-//w3c//dtd html 3 1995-03-24//"
301 "-//w3c//dtd html 3.2 draft//"
302 "-//w3c//dtd html 3.2 final//"
303 "-//w3c//dtd html 3.2//"
304 "-//w3c//dtd html 3.2s draft//"
305 "-//w3c//dtd html 4.0 frameset//"
306 "-//w3c//dtd html 4.0 transitional//"
307 "-//w3c//dtd html experimental 19960712//"
308 "-//w3c//dtd html experimental 970421//"
309 "-//w3c//dtd w3 html//"
310 "-//w3o//dtd w3 html 3.0//"
311 "-//webtechs//dtd mozilla html 2.0//"
312 "-//webtechs//dtd mozilla html//"
315 # These are the character references that don't need a terminating semicolon
316 # min length: 2, max: 6, none are a prefix of any other.
318 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
319 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
320 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
321 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
322 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
323 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
324 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
325 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
326 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
327 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
328 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
329 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
330 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
331 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
332 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
333 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
334 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
338 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
339 raw_text_elements = ['script', 'style']
340 escapable_raw_text_elements = ['textarea', 'title']
341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
343 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
344 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
345 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
346 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
347 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
348 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
349 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
350 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
351 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
352 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
353 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
354 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
355 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
356 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
362 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
363 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
364 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
365 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
366 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
367 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
368 'determinant', 'diff', 'divergence', 'divide', 'domain',
369 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
370 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
371 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
372 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
373 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
374 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
375 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
376 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
377 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
378 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
379 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
380 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
381 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
382 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
383 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
384 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
385 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
386 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
387 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
388 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
389 'vectorproduct', 'xor'
391 # foreign_elements = [svg_elements..., mathml_elements...]
392 #normal_elements = All other allowed HTML elements are normal elements.
396 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
397 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
398 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
399 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
400 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
401 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
402 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
403 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
404 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
405 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
406 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
408 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
410 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
411 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
412 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
413 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
414 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
415 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
416 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
419 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
420 'annotation-xml':NS_MATHML,
423 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
426 formatting_elements = {
427 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
428 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
432 mathml_text_integration = {
433 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el's name is listed in mathml_text_integration and el's
# namespace equals the namespace recorded there for that name.
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	expected_ns is el.namespace
437 is_html_integration = (el) -> # DON'T PASS A TOKEN
438 if el.namespace is NS_MATHML
439 if el.name is 'annotation-xml'
440 if el.attrs.encoding?
441 if el.attrs.encoding.toLowerCase() is 'text/html'
443 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
446 if el.namespace is NS_SVG
447 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
452 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
455 foster_parenting_targets = {
# True when e's name is a key of special_elements and the namespace stored
# under that key equals e's namespace.
el_is_special = (e) ->
	expected_ns = special_elements[e.name]
	expected_ns is e.namespace
# The three HTML elements (address, div, p) excluded by
# el_is_special_not_adp.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
# True when el matches an entry in special_elements (name and namespace)
# and does not match an entry in adp_els.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
485 altglyphdef: 'altGlyphDef'
486 altglyphitem: 'altGlyphItem'
487 animatecolor: 'animateColor'
488 animatemotion: 'animateMotion'
489 animatetransform: 'animateTransform'
492 fecolormatrix: 'feColorMatrix'
493 fecomponenttransfer: 'feComponentTransfer'
494 fecomposite: 'feComposite'
495 feconvolvematrix: 'feConvolveMatrix'
496 fediffuselighting: 'feDiffuseLighting'
497 fedisplacementmap: 'feDisplacementMap'
498 fedistantlight: 'feDistantLight'
499 fedropshadow: 'feDropShadow'
505 fegaussianblur: 'feGaussianBlur'
508 femergenode: 'feMergeNode'
509 femorphology: 'feMorphology'
511 fepointlight: 'fePointLight'
512 fespecularlighting: 'feSpecularLighting'
513 fespotlight: 'feSpotLight'
515 feturbulence: 'feTurbulence'
516 foreignobject: 'foreignObject'
518 lineargradient: 'linearGradient'
519 radialgradient: 'radialGradient'
522 svg_attribute_fixes = {
523 attributename: 'attributeName'
524 attributetype: 'attributeType'
525 basefrequency: 'baseFrequency'
526 baseprofile: 'baseProfile'
528 clippathunits: 'clipPathUnits'
529 contentscripttype: 'contentScriptType'
530 contentstyletype: 'contentStyleType'
531 diffuseconstant: 'diffuseConstant'
533 externalresourcesrequired: 'externalResourcesRequired'
534 # WHATWG removes this: filterres: 'filterRes'
535 filterunits: 'filterUnits'
537 gradienttransform: 'gradientTransform'
538 gradientunits: 'gradientUnits'
539 kernelmatrix: 'kernelMatrix'
540 kernelunitlength: 'kernelUnitLength'
541 keypoints: 'keyPoints'
542 keysplines: 'keySplines'
544 lengthadjust: 'lengthAdjust'
545 limitingconeangle: 'limitingConeAngle'
546 markerheight: 'markerHeight'
547 markerunits: 'markerUnits'
548 markerwidth: 'markerWidth'
549 maskcontentunits: 'maskContentUnits'
550 maskunits: 'maskUnits'
551 numoctaves: 'numOctaves'
552 pathlength: 'pathLength'
553 patterncontentunits: 'patternContentUnits'
554 patterntransform: 'patternTransform'
555 patternunits: 'patternUnits'
556 pointsatx: 'pointsAtX'
557 pointsaty: 'pointsAtY'
558 pointsatz: 'pointsAtZ'
559 preservealpha: 'preserveAlpha'
560 preserveaspectratio: 'preserveAspectRatio'
561 primitiveunits: 'primitiveUnits'
564 repeatcount: 'repeatCount'
565 repeatdur: 'repeatDur'
566 requiredextensions: 'requiredExtensions'
567 requiredfeatures: 'requiredFeatures'
568 specularconstant: 'specularConstant'
569 specularexponent: 'specularExponent'
570 spreadmethod: 'spreadMethod'
571 startoffset: 'startOffset'
572 stddeviation: 'stdDeviation'
573 stitchtiles: 'stitchTiles'
574 surfacescale: 'surfaceScale'
575 systemlanguage: 'systemLanguage'
576 tablevalues: 'tableValues'
579 textlength: 'textLength'
581 viewtarget: 'viewTarget'
582 xchannelselector: 'xChannelSelector'
583 ychannelselector: 'yChannelSelector'
584 zoomandpan: 'zoomAndPan'
586 foreign_attr_fixes = {
587 'xlink:actuate': 'xlink actuate'
588 'xlink:arcrole': 'xlink arcrole'
589 'xlink:href': 'xlink href'
590 'xlink:role': 'xlink role'
591 'xlink:show': 'xlink show'
592 'xlink:title': 'xlink title'
593 'xlink:type': 'xlink type'
594 'xml:base': 'xml base'
595 'xml:lang': 'xml lang'
596 'xml:space': 'xml space'
598 'xmlns:xlink': 'xmlns xlink'
600 adjust_mathml_attributes = (t) ->
602 if a[0] is 'definitionurl'
603 a[0] = 'definitionURL'
605 adjust_svg_attributes = (t) ->
607 if svg_attribute_fixes[a[0]]?
608 a[0] = svg_attribute_fixes[a[0]]
610 adjust_foreign_attributes = (t) ->
613 if foreign_attr_fixes[a[0]]?
614 a[0] = foreign_attr_fixes[a[0]]
617 # decode_named_char_ref()
619 # The list of named character references is _huge_ so ask the browser to decode
620 # for us instead of wasting bandwidth/space on including the table here.
622 # Pass without the "&" but with the ";" examples:
623 # for "&amp;" pass "amp;"
624 # for "&#x2032;" pass "x2032;"
627 textarea: document.createElement('textarea')
629 # TODO test this in IE8
630 decode_named_char_ref = (txt) ->
632 decoded = g_dncr.cache[txt]
633 return decoded if decoded?
634 g_dncr.textarea.innerHTML = txt
635 decoded = g_dncr.textarea.value
636 return null if decoded is txt
637 return g_dncr.cache[txt] = decoded
639 parse_html = (args) ->
641 cur = null # index of next char in txt to be parsed
642 # declare doc and tokenizer variables so they're in scope below
644 open_els = null # stack of open elements
645 afe = null # active formatting elements
646 template_ins_modes = null
648 original_ins_mode = null
650 tok_cur_tag = null # partially parsed tag
651 flag_scripting = null
652 flag_frameset_ok = null
654 flag_foster_parenting = null
655 form_element_pointer = null
656 temporary_buffer = null
657 pending_table_character_tokens = null
658 head_element_pointer = null
659 flag_fragment_parsing = null
660 context_element = null
669 console.log "Parse error at character #{cur} of #{txt.length}"
671 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
672 # "Noah's Ark clause" but with three
673 afe_push = (new_el) ->
676 if el.type is TYPE_AFE_MARKER
678 if el.name is new_el.name and el.namespace is new_el.namespace
681 unless new_el.attrs[k] is v
685 for k, v of new_el.attrs
686 unless el.attrs[k] is v
696 afe.unshift new_afe_marker()
698 # the functions below implement the Tree Construction algorithm
699 # http://www.w3.org/TR/html5/syntax.html#tree-construction
701 # But first... the helpers
702 template_tag_is_open = ->
704 if el.name is 'template' and el.namespace is NS_HTML
707 is_in_scope_x = (tag_name, scope, namespace) ->
709 if el.name is tag_name and (namespace is null or namespace is el.namespace)
711 if scope[el.name] is el.namespace
714 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
716 if el.name is tag_name and (namespace is null or namespace is el.namespace)
718 if scope[el.name] is el.namespace
720 if scope2[el.name] is el.namespace
724 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
725 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
728 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
729 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
731 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra name -> namespace tables handed to the scope-checking helpers.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check against standard_scopers; delegates to is_in_scope_x.
# namespace defaults to null, which is_in_scope_x treats as "match any
# namespace".
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check against standard_scopers plus button_scopers; delegates to
# is_in_scope_x_y. namespace defaults to null (match any namespace).
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check against table_scopers only; delegates to is_in_scope_x.
# namespace defaults to null (match any namespace).
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# List item scope (aka is_in_list_item_scope): scope check against
# standard_scopers plus li_scopers; delegates to is_in_scope_x_y.
# namespace defaults to null (match any namespace).
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
745 is_in_select_scope = (tag_name, namespace = null) ->
747 if t.name is tag_name and (namespace is null or namespace is t.namespace)
749 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
752 # this checks for a particular element, not by name
753 # this requires a namespace match
754 el_is_in_scope = (needle) ->
758 if standard_scopers[el.name] is el.namespace
762 clear_to_table_stopers = {
767 clear_stack_to_table_context = ->
769 if clear_to_table_stopers[open_els[0].name]?
773 clear_to_table_body_stopers = {
780 clear_stack_to_table_body_context = ->
782 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
786 clear_to_table_row_stopers = {
791 clear_stack_to_table_row_context = ->
793 if clear_to_table_row_stopers[open_els[0].name]?
797 clear_afe_to_marker = ->
799 return unless afe.length > 0 # this happens in fragment case, ?spec error
801 if el.type is TYPE_AFE_MARKER
806 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
808 # 1. Let last be false.
810 # 2. Let node be the last node in the stack of open elements.
812 node = open_els[node_i]
813 # 3. Loop: If node is the first node in the stack of open elements,
814 # then set last to true, and, if the parser was originally created as
815 # part of the HTML fragment parsing algorithm (fragment case) set node
816 # to the context element.
818 if node_i is open_els.length - 1
820 # fixfull (fragment case)
822 # 4. If node is a select element, run these substeps:
823 if node.name is 'select' and node.namespace is NS_HTML
824 # 1. If last is true, jump to the step below labeled done.
826 # 2. Let ancestor be node.
829 # 3. Loop: If ancestor is the first node in the stack of
830 # open elements, jump to the step below labeled done.
832 if ancestor_i is open_els.length - 1
834 # 4. Let ancestor be the node before ancestor in the stack
837 ancestor = open_els[ancestor_i]
838 # 5. If ancestor is a template node, jump to the step below
840 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
842 # 6. If ancestor is a table node, switch the insertion mode
843 # to "in select in table" and abort these steps.
844 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
845 ins_mode = ins_mode_in_select_in_table
847 # 7. Jump back to the step labeled loop.
848 # 8. Done: Switch the insertion mode to "in select" and abort
850 ins_mode = ins_mode_in_select
852 # 5. If node is a td or th element and last is false, then switch
853 # the insertion mode to "in cell" and abort these steps.
854 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
855 ins_mode = ins_mode_in_cell
857 # 6. If node is a tr element, then switch the insertion mode to "in
858 # row" and abort these steps.
859 if node.name is 'tr' and node.namespace is NS_HTML
860 ins_mode = ins_mode_in_row
862 # 7. If node is a tbody, thead, or tfoot element, then switch the
863 # insertion mode to "in table body" and abort these steps.
864 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
865 ins_mode = ins_mode_in_table_body
867 # 8. If node is a caption element, then switch the insertion mode
868 # to "in caption" and abort these steps.
869 if node.name is 'caption' and node.namespace is NS_HTML
870 ins_mode = ins_mode_in_caption
872 # 9. If node is a colgroup element, then switch the insertion mode
873 # to "in column group" and abort these steps.
874 if node.name is 'colgroup' and node.namespace is NS_HTML
875 ins_mode = ins_mode_in_column_group
877 # 10. If node is a table element, then switch the insertion mode to
878 # "in table" and abort these steps.
879 if node.name is 'table' and node.namespace is NS_HTML
880 ins_mode = ins_mode_in_table
882 # 11. If node is a template element, then switch the insertion mode
883 # to the current template insertion mode and abort these steps.
884 if node.name is 'template' and node.namespace is NS_HTML
885 ins_mode = template_ins_modes[0]
887 # 12. If node is a head element and last is true, then switch the
888 # insertion mode to "in body" ("in body"! not "in head"!) and abort
889 # these steps. (fragment case)
890 if node.name is 'head' and node.namespace is NS_HTML and last
891 ins_mode = ins_mode_in_body
893 # 13. If node is a head element and last is false, then switch the
894 # insertion mode to "in head" and abort these steps.
895 if node.name is 'head' and node.namespace is NS_HTML and last is false
896 ins_mode = ins_mode_in_head
898 # 14. If node is a body element, then switch the insertion mode to
899 # "in body" and abort these steps.
900 if node.name is 'body' and node.namespace is NS_HTML
901 ins_mode = ins_mode_in_body
903 # 15. If node is a frameset element, then switch the insertion mode
904 # to "in frameset" and abort these steps. (fragment case)
905 if node.name is 'frameset' and node.namespace is NS_HTML
906 ins_mode = ins_mode_in_frameset
908 # 16. If node is an html element, run these substeps:
909 if node.name is 'html' and node.namespace is NS_HTML
910 # 1. If the head element pointer is null, switch the insertion
911 # mode to "before head" and abort these steps. (fragment case)
912 if head_element_pointer is null
913 ins_mode = ins_mode_before_head
915 # 2. Otherwise, the head element pointer is not null,
916 # switch the insertion mode to "after head" and abort these
918 ins_mode = ins_mode_after_head
920 # 17. If last is true, then switch the insertion mode to "in body"
921 # and abort these steps. (fragment case)
923 ins_mode = ins_mode_in_body
925 # 18. Let node now be the node before node in the stack of open
928 node = open_els[node_i]
929 # 19. Return to the step labeled loop.
933 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
934 adjusted_current_node = ->
935 if open_els.length is 1 and flag_fragment_parsing
936 return context_element
939 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
940 # this implementation is structured (mostly) as described at the link above.
941 # capitalized comments are the "labels" described at the link above.
943 return if afe.length is 0
944 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
949 if i is afe.length - 1
952 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
957 el = insert_html_element afe[i].token
962 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
963 # adoption agency algorithm
965 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
966 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
967 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
968 adoption_agency = (subject) ->
969 debug_log "adoption_agency()"
970 debug_log "tree: #{serialize_els doc.children, false, true}"
971 debug_log "open_els: #{serialize_els open_els, true, true}"
972 debug_log "afe: #{serialize_els afe, true, true}"
973 # this block implements the W3C spec
974 # # 1. If the current node is an HTML element whose tag name is subject,
975 # # then run these substeps:
977 # # 1. Let element be the current node.
979 # # 2. Pop element off the stack of open elements.
981 # # 3. If element is also in the list of active formatting elements,
982 # # remove the element from the list.
984 # # 4. Abort the adoption agency algorithm.
985 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
986 # el = open_els.shift()
987 # # remove it from the list of active formatting elements (if found)
992 # debug_log "aaa: starting off with subject on top of stack, exiting"
994 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
995 # If the current node is an HTML element whose tag name is subject, and
996 # the current node is not in the list of active formatting elements,
997 # then pop the current node off the stack of open elements, and abort
999 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
1000 debug_log "aaa: starting off with subject on top of stack, exiting"
1001 # remove it from the list of active formatting elements (if found)
1004 if el is open_els[0]
1008 debug_log "aaa: ...and not in afe, aaa done"
1018 # 5. Let formatting element be the last element in the list of
1019 # active formatting elements that: is between the end of the list
1020 # and the last scope marker in the list, if any, or the start of
1021 # the list otherwise, and has the tag name subject.
1023 for t, fe_of_afe in afe
1024 if t.type is TYPE_AFE_MARKER
1026 if t.name is subject
1029 # If there is no such element, then abort these steps and instead
1030 # act as described in the "any other end tag" entry above.
1032 debug_log "aaa: fe not found in afe"
1033 in_body_any_other_end_tag subject
1035 # 6. If formatting element is not in the stack of open elements,
1036 # then this is a parse error; remove the element from the list, and
1037 # abort these steps.
1039 for t, fe_of_open_els in open_els
1044 debug_log "aaa: fe not found in open_els"
1046 # "remove it from the list" must mean afe, since it's not in open_els
1047 afe.splice fe_of_afe, 1
1049 # 7. If formatting element is in the stack of open elements, but
1050 # the element is not in scope, then this is a parse error; abort
1052 unless el_is_in_scope fe
1053 debug_log "aaa: fe not in scope"
1056 # 8. If formatting element is not the current node, this is a parse
1057 # error. (But do not abort these steps.)
1058 unless open_els[0] is fe
1061 # 9. Let furthest block be the topmost node in the stack of open
1062 # elements that is lower in the stack than formatting element, and
1063 # is an element in the special category. There might not be one.
1065 fb_of_open_els = null
1066 for t, i in open_els
1072 # and continue, to see if there's one that's more "topmost"
1073 # 10. If there is no furthest block, then the UA must first pop all
1074 # the nodes from the bottom of the stack of open elements, from the
1075 # current node up to and including formatting element, then remove
1076 # formatting element from the list of active formatting elements,
1077 # and finally abort these steps.
1079 debug_log "aaa: no fb"
1081 t = open_els.shift()
1083 afe.splice fe_of_afe, 1
1085 # 11. Let common ancestor be the element immediately above
1086 # formatting element in the stack of open elements.
1087 ca = open_els[fe_of_open_els + 1] # common ancestor
1089 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1090 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1091 bookmark = new_aaa_bookmark()
1094 afe.splice i, 0, bookmark
1096 node = last_node = fb
1100 # 3. Let node be the element immediately above node in the
1101 # stack of open elements, or if node is no longer in the stack
1102 # of open elements (e.g. because it got removed by this
1103 # algorithm), the element that was immediately above node in
1104 # the stack of open elements before node was removed.
1106 for t, i in open_els
1108 node_next = open_els[i + 1]
1110 node = node_next ? node_above
1111 debug_log "inner loop #{inner}"
1112 debug_log "tree: #{serialize_els doc.children, false, true}"
1113 debug_log "open_els: #{serialize_els open_els, true, true}"
1114 debug_log "afe: #{serialize_els afe, true, true}"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "node: #{node.serialize true, true}"
1119 # TODO make sure node_above gets re-set if/when node is removed from open_els
1121 # 4. If node is formatting element, then go to the next step in
1122 # the overall algorithm.
1125 debug_log "the meat"
1126 # 5. If inner loop counter is greater than three and node is in
1127 # the list of active formatting elements, then remove node from
1128 # the list of active formatting elements.
1134 debug_log "max out inner"
1139 # 6. If node is not in the list of active formatting elements,
1140 # then remove node from the stack of open elements and then go
1141 # back to the step labeled inner loop.
1143 debug_log "not in afe"
1144 for t, i in open_els
1146 node_above = open_els[i + 1]
1147 open_els.splice i, 1
1150 debug_log "the bones"
1151 # 7. create an element for the token for which the element node
1152 # was created, in the HTML namespace, with common ancestor as
1153 # the intended parent; replace the entry for node in the list
1154 # of active formatting elements with an entry for the new
1155 # element, replace the entry for node in the stack of open
1156 # elements with an entry for the new element, and let node be
1158 new_node = token_to_element node.token, NS_HTML, ca
1162 debug_log "replaced in afe"
1164 for t, i in open_els
1166 node_above = open_els[i + 1]
1167 open_els[i] = new_node
1168 debug_log "replaced in open_els"
1171 # 8. If last node is furthest block, then move the
1172 # aforementioned bookmark to be immediately after the new node
1173 # in the list of active formatting elements.
1178 debug_log "removed bookmark"
1182 # "after" means lower
1183 afe.splice i, 0, bookmark # "after as <-
1184 debug_log "placed bookmark after node"
1185 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1187 # 9. Insert last node into node, first removing it from its
1188 # previous parent node if any.
1189 if last_node.parent?
1190 debug_log "last_node has parent"
1191 for c, i in last_node.parent.children
1193 debug_log "removing last_node from parent"
1194 last_node.parent.children.splice i, 1
1196 node.children.push last_node
1197 last_node.parent = node
1198 # 10. Let last node be node.
1201 # 11. Return to the step labeled inner loop.
1202 # 14. Insert whatever last node ended up being in the previous step
1203 # at the appropriate place for inserting a node, but using common
1204 # ancestor as the override target.
1206 # In the case where fe is immediately followed by fb:
1207 # * inner loop exits out early (node==fe)
1209 # * last_node is still in the tree (not a duplicate)
1210 if last_node.parent?
1211 debug_log "FEFIRST? last_node has parent"
1212 for c, i in last_node.parent.children
1214 debug_log "removing last_node from parent"
1215 last_node.parent.children.splice i, 1
1218 debug_log "after aaa inner loop"
1219 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1220 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1221 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1222 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1223 debug_log "tree: #{serialize_els doc.children, false, true}"
1228 # can't use standard insert token thing, because it's already in
1229 # open_els and must stay at it's current position in open_els
1230 dest = adjusted_insertion_location ca
1231 dest[0].children.splice dest[1], 0, last_node
1232 last_node.parent = dest[0]
1235 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1236 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1237 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1238 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1239 debug_log "tree: #{serialize_els doc.children, false, true}"
1241 # 15. Create an element for the token for which formatting element
1242 # was created, in the HTML namespace, with furthest block as the
1244 new_element = token_to_element fe.token, NS_HTML, fb
1245 # 16. Take all of the child nodes of furthest block and append them
1246 # to the element created in the last step.
1247 while fb.children.length
1248 t = fb.children.shift()
1249 t.parent = new_element
1250 new_element.children.push t
1251 # 17. Append that new element to furthest block.
1252 new_element.parent = fb
1253 fb.children.push new_element
1254 # 18. Remove formatting element from the list of active formatting
1255 # elements, and insert the new element into the list of active
1256 # formatting elements at the position of the aforementioned
1264 afe[i] = new_element
1266 # 19. Remove formatting element from the stack of open elements,
1267 # and insert the new element into the stack of open elements
1268 # immediately below the position of furthest block in that stack.
1269 for t, i in open_els
1271 open_els.splice i, 1
1273 for t, i in open_els
1275 open_els.splice i, 0, new_element
1277 # 20. Jump back to the step labeled outer loop.
1278 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1279 debug_log "tree: #{serialize_els doc.children, false, true}"
1280 debug_log "open_els: #{serialize_els open_els, true, true}"
1281 debug_log "afe: #{serialize_els afe, true, true}"
1282 debug_log "AAA DONE"
1284 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes a p element: generates implied end tags (leaving p itself alone),
# then pops entries off open_els, checking each popped element for an HTML p.
# The pop loop never removes the last remaining entry of open_els.
# NOTE(review): the bodies of the `unless` and the inner `if` are not shown in
# this listing; presumably a parse error and the loop exit respectively.
1285 close_p_element = ->
1286 generate_implied_end_tags 'p' # arg is exception
1287 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1289 while open_els.length > 1 # just in case
1290 el = open_els.shift()
1291 if el.name is 'p' and el.namespace is NS_HTML
# Guard used before inserting block-level elements: tests whether an HTML p
# element is in button scope.
# NOTE(review): the body of the `if` is not shown in this listing; presumably
# it calls close_p_element — confirm against the full source.
1293 close_p_if_in_button_scope = ->
1294 if is_in_button_scope 'p', NS_HTML
1297 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1298 # aka insert_a_character = (t) ->
# Inserts text token t at the adjusted insertion location.
# dest is a [parent, index] pair; prev is the child just before that index.
# NOTE(review): lines between the visible ones are missing from this listing;
# presumably a preceding text node absorbs t instead of t being spliced in as
# a new child — confirm against the full source.
1299 insert_character = (t) ->
1300 dest = adjusted_insertion_location()
1301 # fixfull check for Document node
1303 prev = dest[0].children[dest[1] - 1]
1304 if prev.type is TYPE_TEXT
1307 dest[0].children.splice dest[1], 0, t
1310 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for token t.
# The visible checks inspect the adjusted current node (acn): its namespace,
# whether it is a MathML text integration point or an HTML integration point,
# and the token type. The last visible line passes t to in_foreign_content.
# NOTE(review): the branch bodies are missing from this listing; presumably a
# token matching one of the checks is handled by the current insertion mode
# instead of in_foreign_content — confirm against the full source.
1311 process_token = (t) ->
1312 acn = adjusted_current_node()
1316 if acn.namespace is NS_HTML
1319 if is_mathml_text_integration_point(acn)
1320 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1323 if t.type is TYPE_TEXT
1326 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1329 if is_html_integration acn
1330 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1333 if t.type is TYPE_EOF
1336 in_foreign_content t
1340 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1341 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the "appropriate place for inserting a node".
# override_target: optional node to use as the target instead of the current
#   node (open_els[0]).
# Returns a two-element array [target, target_i]: the parent node and the
# index within target.children at which the new node should be spliced in.
# When flag_foster_parenting is set and the target is listed in
# foster_parenting_targets, the foster-parenting substeps below pick the
# location instead.
# NOTE(review): this listing skips some source lines (see the gaps in the
# embedded line numbers), so several branch bodies are not shown here.
1342 adjusted_insertion_location = (override_target = null) ->
1343 # 1. If there was an override target specified, then let target be the
1346 target = override_target
1347 else # Otherwise, let target be the current node.
1348 target = open_els[0]
1349 # 2. Determine the adjusted insertion location using the first matching
1350 # steps from the following list:
1352 # If foster parenting is enabled and target is a table, tbody, tfoot,
1353 # thead, or tr element Foster parenting happens when content is
1354 # misnested in tables.
1355 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1356 loop # once. this is here so we can ``break`` to "abort these substeps"
1357 # 1. Let last template be the last template element in the
1358 # stack of open elements, if any.
1359 last_template = null
1360 last_template_i = null
1361 for el, i in open_els
1362 if el.name is 'template' and el.namespace is NS_HTML
1366 # 2. Let last table be the last table element in the stack of
1367 # open elements, if any.
1370 for el, i in open_els
1371 if el.name is 'table' and el.namespace is NS_HTML
1375 # 3. If there is a last template and either there is no last
1376 # table, or there is one, but last template is lower (more
1377 # recently added) than last table in the stack of open
1378 # elements, then: let adjusted insertion location be inside
1379 # last template's template contents, after its last child (if
1380 # any), and abort these substeps.
# "lower" means a smaller index, since index 0 is the current node
1381 if last_template and (last_table is null or last_template_i < last_table_i)
1382 target = last_template # fixfull should be its contents
1383 target_i = target.children.length
1385 # 4. If there is no last table, then let adjusted insertion
1386 # location be inside the first element in the stack of open
1387 # elements (the html element), after its last child (if any),
1388 # and abort these substeps. (fragment case)
1389 if last_table is null
# the "first" element of the stack is the highest index
1391 target = open_els[open_els.length - 1]
1392 target_i = target.children.length
1394 # 5. If last table has a parent element, then let adjusted
1395 # insertion location be inside last table's parent element,
1396 # immediately before last table, and abort these substeps.
1397 if last_table.parent?
1398 for c, i in last_table.parent.children
1400 target = last_table.parent
1404 # 6. Let previous element be the element immediately above last
1405 # table in the stack of open elements.
1407 # huh? how could it not have a parent?
1408 previous_element = open_els[last_table_i + 1]
1409 # 7. Let adjusted insertion location be inside previous
1410 # element, after its last child (if any).
1411 target = previous_element
1412 target_i = target.children.length
1413 # Note: These steps are involved in part because it's possible
1414 # for elements, the table element in this case in particular,
1415 # to have been moved by a script around in the DOM, or indeed
1416 # removed from the DOM entirely, after the element was inserted
1418 break # don't really loop
1420 # Otherwise Let adjusted insertion location be inside target, after
1421 # its last child (if any).
1422 target_i = target.children.length
1424 # 3. If the adjusted insertion location is inside a template element,
1425 # let it instead be inside the template element's template contents,
1426 # after its last child (if any).
1427 # fixfull (template)
1429 # 4. Return the adjusted insertion location.
1430 return [target, target_i]
1432 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1433 # aka create_an_element_for_token
# Creates a TYPE_TAG Node for tag token t in the given namespace.
# Attributes are copied into a hash keyed by attribute name (a[0]) with the
# attribute value (a[1]); the originating token is kept on the node as `token`.
# intended_parent is not used on any line visible in this listing.
# NOTE(review): the attribute loop header and the end of the function are
# missing from this listing; presumably el is returned — confirm.
1434 token_to_element = (t, namespace, intended_parent) ->
1435 # convert attributes into a hash
1438 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1439 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1441 # TODO 2. If the newly created element has an xmlns attribute in the
1442 # XMLNS namespace whose value is not exactly the same as the element's
1443 # namespace, that is a parse error. Similarly, if the newly created
1444 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1445 # value is not the XLink Namespace, that is a parse error.
1447 # fixfull: the spec says stuff about form pointers and ownerDocument
1451 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the
# children of the node chosen by adjusted_insertion_location.
# NOTE(review): the lines that define ail_el / ail_i and the end of the
# function are missing from this listing; presumably they are the two parts
# of ail, and the new element is pushed on open_els and returned — confirm.
1452 insert_foreign_element = (token, namespace) ->
1453 ail = adjusted_insertion_location()
1456 el = token_to_element token, namespace, ail_el
1457 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1459 ail_el.children.splice ail_i, 0, el
# http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Inserts an element for `token` in the HTML namespace by delegating to
# insert_foreign_element; the delegate's result is handed back to the caller.
insert_html_element = (token) ->
  return insert_foreign_element(token, NS_HTML)
1466 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1467 # position should be [node, index_within_children]
# Splices comment token t into position[0].children at index position[1].
# When no position is given (null/undefined), the adjusted insertion location
# is used instead (`?=` only assigns when position has no value).
1468 insert_comment = (t, position = null) ->
1469 position ?= adjusted_insertion_location()
1470 position[0].children.splice position[1], 0, t
# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: insert an HTML element for t, point the
# tokenizer at the RAWTEXT state, and switch to the "text" insertion mode
# after saving the mode we came from.
parse_generic_raw_text = (t) ->
  insert_html_element(t)
  # remember where to come back to before changing ins_mode
  original_ins_mode = ins_mode
  tok_state = tok_state_rawtext
  return ins_mode = ins_mode_text
# Generic RCDATA element parsing: insert an HTML element for t, point the
# tokenizer at the RCDATA state, and switch to the "text" insertion mode
# after saving the mode we came from.
parse_generic_rcdata_text = (t) ->
  insert_html_element(t)
  # remember where to come back to before changing ins_mode
  original_ins_mode = ins_mode
  tok_state = tok_state_rcdata
  return ins_mode = ins_mode_text
1485 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1486 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) is listed in end_tag_implied for
# its namespace and its name differs from `except`.
# except: optional tag name that must not be closed implicitly.
# NOTE(review): the loop body is missing from this listing; presumably it pops
# the current node off open_els — confirm against the full source.
1487 generate_implied_end_tags = (except = null) ->
1488 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1491 # 8.2.5.4 The rules for parsing tokens in HTML content
1492 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1494 # 8.2.5.4.1 The "initial" insertion mode
1495 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Checks DOCTYPE token t against the conditions that select full quirks mode:
# the force-quirks flag, a name other than 'html', known public identifier
# prefixes (quirks_yes_pi_prefixes) and exact values, one specific system
# identifier, and the HTML 4.01 frameset/transitional public identifiers
# when no system identifier is present.
# Identifiers are lower-cased before comparison.
# NOTE(review): the result lines of each branch are missing from this listing;
# presumably each match returns true and the fall-through returns false.
1496 is_quirks_yes_doctype = (t) ->
1497 if t.flag 'force-quirks'
1499 if t.name isnt 'html'
1501 if t.public_identifier?
1502 pi = t.public_identifier.toLowerCase()
1503 for p in quirks_yes_pi_prefixes
1504 if pi.substr(0, p.length) is p
1506 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1508 if t.system_identifier?
1509 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1511 else if t.public_identifier?
1512 # already did this: pi = t.public_identifier.toLowerCase()
1513 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Checks DOCTYPE token t against the public identifiers that select limited
# quirks mode: XHTML 1.0 frameset/transitional, or HTML 4.01
# frameset/transitional when a system identifier is present.
# NOTE(review): `pi` is only assigned under the public_identifier check; with
# indentation lost in this listing it cannot be confirmed that the
# system_identifier test is nested inside it — verify, otherwise pi may be
# undefined there. The result lines are also missing; presumably true/false.
1516 is_quirks_limited_doctype = (t) ->
1517 if t.public_identifier?
1518 pi = t.public_identifier.toLowerCase()
1519 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1521 if t.system_identifier?
1522 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Handler for token t in the "initial" insertion mode.
# For a DOCTYPE token the document's 'quirks mode' flag is set from
# is_quirks_yes_doctype / is_quirks_limited_doctype and the parser moves on to
# ins_mode_before_html. The last visible branch sets QUIRKS_YES and also moves
# on to ins_mode_before_html.
# NOTE(review): this listing skips some source lines (see the gaps in the
# embedded line numbers), so several branch bodies are not shown here.
1525 ins_mode_initial = (t) ->
1528 if t.type is TYPE_COMMENT
1532 if t.type is TYPE_DOCTYPE
1533 # fixfull syntax error from first paragraph and following bullets
1534 # fixfull set doc.doctype
1535 # fixfull is the "not an iframe srcdoc" thing relevant?
1536 if is_quirks_yes_doctype t
1537 doc.flag 'quirks mode', QUIRKS_YES
1538 else if is_quirks_limited_doctype t
1539 doc.flag 'quirks mode', QUIRKS_LIMITED
1541 ins_mode = ins_mode_before_html
1544 # fixfull not iframe srcdoc?
1546 doc.flag 'quirks mode', QUIRKS_YES
1547 ins_mode = ins_mode_before_html
1551 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handler for token t in the "before html" insertion mode.
# An <html> start tag becomes the root element: it is appended to
# doc.children, put on open_els, and the mode moves to ins_mode_before_head.
# In the final visible branch a synthetic html element is created from
# new_open_tag('html') and appended to doc.children, after which the mode
# also moves to ins_mode_before_head.
# NOTE(review): this listing skips some source lines (see the gaps in the
# embedded line numbers), so several branch bodies are not shown here.
1552 ins_mode_before_html = (t) ->
1553 if t.type is TYPE_DOCTYPE
1556 if t.type is TYPE_COMMENT
1561 if t.type is TYPE_START_TAG and t.name is 'html'
1562 el = token_to_element t, NS_HTML, doc
1563 doc.children.push el
1564 open_els.unshift(el)
1565 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1566 ins_mode = ins_mode_before_head
1568 if t.type is TYPE_END_TAG
1569 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1570 # fall through to "anything else"
1575 el = token_to_element new_open_tag('html'), NS_HTML, doc
1576 doc.children.push el
1579 # ?fixfull browsing context
1580 ins_mode = ins_mode_before_head
1584 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handler for token t in the "before head" insertion mode.
# A <head> start tag is inserted, remembered in head_element_pointer, and the
# mode moves to ins_mode_in_head. In the final visible branch a synthetic head
# element is inserted from new_open_tag 'head' with the same bookkeeping.
# NOTE(review): this listing skips some source lines (see the gaps in the
# embedded line numbers), so several branch bodies are not shown here.
1585 ins_mode_before_head = (t) ->
1588 if t.type is TYPE_COMMENT
1591 if t.type is TYPE_DOCTYPE
1594 if t.type is TYPE_START_TAG and t.name is 'html'
1597 if t.type is TYPE_START_TAG and t.name is 'head'
1598 el = insert_html_element t
1599 head_element_pointer = el
1600 ins_mode = ins_mode_in_head
1602 if t.type is TYPE_END_TAG
1603 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1604 # fall through to Anything else below
1609 el = insert_html_element new_open_tag 'head'
1610 head_element_pointer = el
1611 ins_mode = ins_mode_in_head
1614 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" mode: pops the current node off
# open_els and switches to ins_mode_after_head.
# NOTE(review): the line after the mode switch is missing from this listing;
# presumably t is then reprocessed in the new mode — confirm.
1615 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1616 open_els.shift() # spec says this will be a 'head' node
1617 ins_mode = ins_mode_after_head
# Handler for token t in the "in head" insertion mode.
# Dispatches on token type and tag name: head-level void elements (base,
# basefont, bgsound, link, meta), title (RCDATA), noscript/noframes/style
# (raw text or the in-head-noscript mode, depending on flag_scripting),
# script, template start/end tags, and the </head> end tag. Anything else
# goes through ins_mode_in_head_else.
# NOTE(review): this listing skips some source lines (see the gaps in the
# embedded line numbers), so several branch bodies are not shown here.
1619 ins_mode_in_head = (t) ->
1620 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1623 if t.type is TYPE_COMMENT
1626 if t.type is TYPE_DOCTYPE
1629 if t.type is TYPE_START_TAG and t.name is 'html'
1632 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1633 el = insert_html_element t
1635 t.acknowledge_self_closing()
1637 if t.type is TYPE_START_TAG and t.name is 'meta'
1638 el = insert_html_element t
1640 t.acknowledge_self_closing()
1641 # fixfull encoding stuff
1643 if t.type is TYPE_START_TAG and t.name is 'title'
1644 parse_generic_rcdata_text t
1646 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1647 parse_generic_raw_text t
1649 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1650 insert_html_element t
1651 ins_mode = ins_mode_in_head_noscript
1653 if t.type is TYPE_START_TAG and t.name is 'script'
1654 ail = adjusted_insertion_location()
# ail is a [parent, index] pair; the intended parent is the node, not the pair
1655 el = token_to_element t, NS_HTML, ail[0]
1656 el.flag 'parser-inserted', true
1657 # fixfull fragment case
1658 ail[0].children.splice ail[1], 0, el
1660 tok_state = tok_state_script_data
1661 original_ins_mode = ins_mode # make sure orig... is defined
1662 ins_mode = ins_mode_text
1664 if t.type is TYPE_END_TAG and t.name is 'head'
1665 open_els.shift() # will be a head element... spec says so
1666 ins_mode = ins_mode_after_head
1668 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1669 ins_mode_in_head_else t
1671 if t.type is TYPE_START_TAG and t.name is 'template'
1672 insert_html_element t
1674 flag_frameset_ok = false
1675 ins_mode = ins_mode_in_template
1676 template_ins_modes.unshift ins_mode_in_template
1678 if t.type is TYPE_END_TAG and t.name is 'template'
1679 if template_tag_is_open()
# must be an actual call: a bare function name is a no-op in CoffeeScript
1680 generate_implied_end_tags()
1681 if open_els[0].name isnt 'template'
1684 el = open_els.shift()
1685 if el.name is 'template' and el.namespace is NS_HTML
1687 clear_afe_to_marker()
1688 template_ins_modes.shift()
1693 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1696 ins_mode_in_head_else t
1698 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" mode; the visible line
# switches back to ins_mode_in_head.
# NOTE(review): the lines around the mode switch are missing from this
# listing; presumably the current node is popped first and t is reprocessed
# afterwards — confirm against the full source.
1699 ins_mode_in_head_noscript_else = (t) ->
1702 ins_mode = ins_mode_in_head
# Handler for token t in the "in head noscript" insertion mode.
# A </noscript> end tag returns to ins_mode_in_head; </br> and the final
# visible branch go through ins_mode_in_head_noscript_else.
# NOTE(review): this listing skips some source lines (see the gaps in the
# embedded line numbers), so several branch bodies are not shown here.
1704 ins_mode_in_head_noscript = (t) ->
1705 if t.type is TYPE_DOCTYPE
1708 if t.type is TYPE_START_TAG and t.name is 'html'
1711 if t.type is TYPE_END_TAG and t.name is 'noscript'
1713 ins_mode = ins_mode_in_head
1715 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1718 if t.type is TYPE_END_TAG and t.name is 'br'
1719 ins_mode_in_head_noscript_else t
1721 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1725 ins_mode_in_head_noscript_else t
1730 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" mode: inserts a synthetic body
# element (from new_open_tag 'body') and switches to ins_mode_in_body.
# NOTE(review): the line after the mode switch is missing from this listing;
# presumably t is then reprocessed in the new mode — confirm.
1731 ins_mode_after_head_else = (t) ->
1732 body_tok = new_open_tag 'body'
1733 insert_html_element body_tok
1734 ins_mode = ins_mode_in_body
# Handler for token t in the "after head" insertion mode.
# <body> is inserted, clears flag_frameset_ok and moves to ins_mode_in_body;
# <frameset> is inserted and moves to ins_mode_in_frameset.
# For head-only start tags (base ... title) the head element is temporarily
# put back on open_els and afterwards removed again by identity search; a
# console warning is logged on the path where it is not found.
# NOTE(review): this listing skips some source lines (see the gaps in the
# embedded line numbers), so several branch bodies are not shown here.
1737 ins_mode_after_head = (t) ->
1741 if t.type is TYPE_COMMENT
1744 if t.type is TYPE_DOCTYPE
1747 if t.type is TYPE_START_TAG and t.name is 'html'
1750 if t.type is TYPE_START_TAG and t.name is 'body'
1751 insert_html_element t
1752 flag_frameset_ok = false
1753 ins_mode = ins_mode_in_body
1755 if t.type is TYPE_START_TAG and t.name is 'frameset'
1756 insert_html_element t
1757 ins_mode = ins_mode_in_frameset
1759 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1761 open_els.unshift head_element_pointer
1763 for el, i in open_els
1764 if el is head_element_pointer
1765 open_els.splice i, 1
1767 console.log "warning: 23904 couldn't find head element in open_els"
1769 if t.type is TYPE_END_TAG and t.name is 'template'
1772 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1773 ins_mode_after_head_else t
1775 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1779 ins_mode_after_head_else t
1781 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling for the "in body" mode.
# name: tag name of the end tag.
# Walks open_els from the current node (index 0) upwards. On the first HTML
# element called `name` it generates implied end tags (except for `name`) and
# reports a parse error when that element was not at index 0. The walk also
# checks each node against special_elements.
# NOTE(review): `i` is computed before generate_implied_end_tags runs; if that
# call pops entries off open_els, `i` no longer indexes el — verify whether
# the check should compare el with open_els[0] instead.
# NOTE(review): the lines that pop the matched element, and the body of the
# special_elements check, are missing from this listing.
1782 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1783 for el, i in open_els
1784 if el.name is name and el.namespace is NS_HTML
1785 generate_implied_end_tags name # arg is exception
1786 parse_error() unless i is 0
1791 if special_elements[el.name] is el.namespace
1795 ins_mode_in_body = (t) ->
1796 if t.type is TYPE_TEXT and t.text is "\u0000"
1803 if t.type is TYPE_TEXT
1806 flag_frameset_ok = false
1808 if t.type is TYPE_COMMENT
1811 if t.type is TYPE_DOCTYPE
1814 if t.type is TYPE_START_TAG and t.name is 'html'
1816 return if template_tag_is_open()
1817 root_attrs = open_els[open_els.length - 1].attrs
1819 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1822 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1825 if t.type is TYPE_START_TAG and t.name is 'body'
1827 return if open_els.length < 2
1828 second = open_els[open_els.length - 2]
1829 return unless second.namespace is NS_HTML
1830 return unless second.name is 'body'
1831 return if template_tag_is_open()
1832 flag_frameset_ok = false
1834 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1836 if t.type is TYPE_START_TAG and t.name is 'frameset'
1838 return if open_els.length < 2
1839 second_i = open_els.length - 2
1840 second = open_els[second_i]
1841 return unless second.namespace is NS_HTML
1842 return unless second.name is 'body'
1843 if flag_frameset_ok is false
1846 for el, i in second.parent.children
1848 second.parent.children.splice i, 1
1850 open_els.splice second_i, 1
1851 # pop everything except the "root html element"
1852 while open_els.length > 1
1854 insert_html_element t
1855 ins_mode = ins_mode_in_frameset
1857 if t.type is TYPE_EOF
1859 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1860 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1861 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1864 unless ok_tags[t.name] is el.namespace
1867 if template_ins_modes.length > 0
1868 ins_mode_in_template t
1872 if t.type is TYPE_END_TAG and t.name is 'body'
1873 unless is_in_scope 'body', NS_HTML
1877 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1878 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1879 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1880 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1884 unless ok_tags[t.name] is el.namespace
1887 ins_mode = ins_mode_after_body
1889 if t.type is TYPE_END_TAG and t.name is 'html'
1890 unless is_in_scope 'body', NS_HTML
1894 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1895 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1896 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1897 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1901 unless ok_tags[t.name] is el.namespace
1904 ins_mode = ins_mode_after_body
1907 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1908 close_p_if_in_button_scope()
1909 insert_html_element t
1911 if t.type is TYPE_START_TAG and h_tags[t.name]?
1912 close_p_if_in_button_scope()
1913 if h_tags[open_els[0].name] is open_els[0].namespace
1916 insert_html_element t
1918 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1919 close_p_if_in_button_scope()
1920 insert_html_element t
1921 # spec: If the next token is a "LF" (U+000A) character token, then
1922 # ignore that token and move on to the next one. (Newlines at the
1923 # start of pre blocks are ignored as an authoring convenience.)
1924 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1926 flag_frameset_ok = false
1928 if t.type is TYPE_START_TAG and t.name is 'form'
1929 unless form_element_pointer is null or template_tag_is_open()
1932 close_p_if_in_button_scope()
1933 el = insert_html_element t
1934 unless template_tag_is_open()
1935 form_element_pointer = el
1937 if t.type is TYPE_START_TAG and t.name is 'li'
1938 flag_frameset_ok = false
1939 for node in open_els
1940 if node.name is 'li' and node.namespace is NS_HTML
1941 generate_implied_end_tags 'li' # arg is exception
1942 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1945 el = open_els.shift()
1946 if el.name is 'li' and el.namespace is NS_HTML
1949 if el_is_special_not_adp node
1951 close_p_if_in_button_scope()
1952 insert_html_element t
1954 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1955 flag_frameset_ok = false
1956 for node in open_els
1957 if node.name is 'dd' and node.namespace is NS_HTML
1958 generate_implied_end_tags 'dd' # arg is exception
1959 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1962 el = open_els.shift()
1963 if el.name is 'dd' and el.namespace is NS_HTML
1966 if node.name is 'dt' and node.namespace is NS_HTML
1967 generate_implied_end_tags 'dt' # arg is exception
1968 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1971 el = open_els.shift()
1972 if el.name is 'dt' and el.namespace is NS_HTML
1975 if el_is_special_not_adp node
1977 close_p_if_in_button_scope()
1978 insert_html_element t
1980 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1981 close_p_if_in_button_scope()
1982 insert_html_element t
1983 tok_state = tok_state_plaintext
1985 if t.type is TYPE_START_TAG and t.name is 'button'
1986 if is_in_scope 'button', NS_HTML
1988 generate_implied_end_tags()
1990 el = open_els.shift()
1991 if el.name is 'button' and el.namespace is NS_HTML
1994 insert_html_element t
1995 flag_frameset_ok = false
1997 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1998 unless is_in_scope t.name, NS_HTML
2001 generate_implied_end_tags()
2002 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2005 el = open_els.shift()
2006 if el.name is t.name and el.namespace is NS_HTML
2009 if t.type is TYPE_END_TAG and t.name is 'form'
2010 unless template_tag_is_open()
2011 node = form_element_pointer
2012 form_element_pointer = null
2013 if node is null or not el_is_in_scope node
2016 generate_implied_end_tags()
2017 if open_els[0] isnt node
2019 for el, i in open_els
2021 open_els.splice i, 1
2024 unless is_in_scope 'form', NS_HTML
2027 generate_implied_end_tags()
2028 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2031 el = open_els.shift()
2032 if el.name is 'form' and el.namespace is NS_HTML
2035 if t.type is TYPE_END_TAG and t.name is 'p'
2036 unless is_in_button_scope 'p', NS_HTML
2038 insert_html_element new_open_tag 'p'
2041 if t.type is TYPE_END_TAG and t.name is 'li'
2042 unless is_in_li_scope 'li', NS_HTML
2045 generate_implied_end_tags 'li' # arg is exception
2046 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2049 el = open_els.shift()
2050 if el.name is 'li' and el.namespace is NS_HTML
2053 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2054 unless is_in_scope t.name, NS_HTML
2057 generate_implied_end_tags t.name # arg is exception
2058 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2061 el = open_els.shift()
2062 if el.name is t.name and el.namespace is NS_HTML
2065 if t.type is TYPE_END_TAG and h_tags[t.name]?
2068 if h_tags[el.name] is el.namespace
2071 if standard_scopers[el.name] is el.namespace
2076 generate_implied_end_tags()
2077 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2080 el = open_els.shift()
2081 if h_tags[el.name] is el.namespace
2085 if t.type is TYPE_START_TAG and t.name is 'a'
2086 # If the list of active formatting elements contains an a element
2087 # between the end of the list and the last marker on the list (or
2088 # the start of the list if there is no marker on the list), then
2089 # this is a parse error; run the adoption agency algorithm for the
2090 # tag name "a", then remove that element from the list of active
2091 # formatting elements and the stack of open elements if the
2092 # adoption agency algorithm didn't already remove it (it might not
2093 # have if the element is not in table scope).
2096 if el.type is TYPE_AFE_MARKER
2098 if el.name is 'a' and el.namespace is NS_HTML
2106 for el, i in open_els
2108 open_els.splice i, 1
2110 el = insert_html_element t
2113 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2115 el = insert_html_element t
2118 if t.type is TYPE_START_TAG and t.name is 'nobr'
2120 el = insert_html_element t
2123 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2124 adoption_agency t.name
2126 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2128 insert_html_element t
2130 flag_frameset_ok = false
2132 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2133 unless is_in_scope t.name, NS_HTML
2136 generate_implied_end_tags()
2137 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2140 el = open_els.shift()
2141 if el.name is t.name and el.namespace is NS_HTML
2143 clear_afe_to_marker()
2145 if t.type is TYPE_START_TAG and t.name is 'table'
2146 unless doc.flag('quirks mode') is QUIRKS_YES
2147 close_p_if_in_button_scope() # test
2148 insert_html_element t
2149 flag_frameset_ok = false
2150 ins_mode = ins_mode_in_table
2152 if t.type is TYPE_END_TAG and t.name is 'br'
2154 t.type = TYPE_START_TAG
2156 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2158 insert_html_element t
2160 t.acknowledge_self_closing()
2161 flag_frameset_ok = false
2163 if t.type is TYPE_START_TAG and t.name is 'input'
2165 insert_html_element t
2167 t.acknowledge_self_closing()
2168 unless is_input_hidden_tok t
2169 flag_frameset_ok = false
2171 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2172 insert_html_element t
2174 t.acknowledge_self_closing()
2176 if t.type is TYPE_START_TAG and t.name is 'hr'
2177 close_p_if_in_button_scope()
2178 insert_html_element t
2180 t.acknowledge_self_closing()
2181 flag_frameset_ok = false
2183 if t.type is TYPE_START_TAG and t.name is 'image'
2188 if t.type is TYPE_START_TAG and t.name is 'isindex'
2190 if template_tag_is_open() is false and form_element_pointer isnt null
2192 t.acknowledge_self_closing()
2193 flag_frameset_ok = false
2194 close_p_if_in_button_scope()
2195 el = insert_html_element new_open_tag 'form'
2196 unless template_tag_is_open()
2197 form_element_pointer = el
2200 el.attrs['action'] = a[1]
2202 insert_html_element new_open_tag 'hr'
2205 insert_html_element new_open_tag 'label'
2206 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2207 input_el = new_open_tag 'input'
2212 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2213 input_el.attrs_a.push [a[0], a[1]]
2214 input_el.attrs_a.push ['name', 'isindex']
2215 # fixfull this next bit is in english... internationalize?
2216 prompt ?= "This is a searchable index. Enter search keywords: "
2217 insert_character new_character_token prompt # fixfull split
2218 # TODO submit typo "balue" in spec
2219 insert_html_element input_el
2221 # insert_character '' # you can put chars here if prompt attr missing
2223 insert_html_element new_open_tag 'hr'
2226 unless template_tag_is_open()
2227 form_element_pointer = null
2229 if t.type is TYPE_START_TAG and t.name is 'textarea'
2230 insert_html_element t
2231 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2233 tok_state = tok_state_rcdata
2234 original_ins_mode = ins_mode
2235 flag_frameset_ok = false
2236 ins_mode = ins_mode_text
2238 if t.type is TYPE_START_TAG and t.name is 'xmp'
2239 close_p_if_in_button_scope()
2241 flag_frameset_ok = false
2242 parse_generic_raw_text t
2244 if t.type is TYPE_START_TAG and t.name is 'iframe'
2245 flag_frameset_ok = false
2246 parse_generic_raw_text t
2248 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2249 parse_generic_raw_text t
2251 if t.type is TYPE_START_TAG and t.name is 'select'
2253 insert_html_element t
2254 flag_frameset_ok = false
2255 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2256 ins_mode = ins_mode_in_select_in_table
2258 ins_mode = ins_mode_in_select
2260 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2261 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2264 insert_html_element t
2266 # this comment block implements the W3C spec
2267 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2268 # if is_in_scope 'ruby', NS_HTML
2269 # generate_implied_end_tags()
2270 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2272 # insert_html_element t
2274 # if t.type is TYPE_START_TAG and t.name is 'rt'
2275 # if is_in_scope 'ruby', NS_HTML
2276 # generate_implied_end_tags 'rtc' # arg is exception
2277 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2279 # insert_html_element t
2281 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2282 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2283 if is_in_scope 'ruby', NS_HTML
2284 generate_implied_end_tags()
2285 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2287 insert_html_element t
2289 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2290 if is_in_scope 'ruby', NS_HTML
2291 generate_implied_end_tags 'rtc'
2292 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2294 insert_html_element t
2297 if t.type is TYPE_START_TAG and t.name is 'math'
2299 adjust_mathml_attributes t
2300 adjust_foreign_attributes t
2301 insert_foreign_element t, NS_MATHML
2302 if t.flag 'self-closing'
2304 t.acknowledge_self_closing()
2306 if t.type is TYPE_START_TAG and t.name is 'svg'
2308 adjust_svg_attributes t
2309 adjust_foreign_attributes t
2310 insert_foreign_element t, NS_SVG
2311 if t.flag 'self-closing'
2313 t.acknowledge_self_closing()
2315 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2318 if t.type is TYPE_START_TAG # any other start tag
2320 insert_html_element t
2322 if t.type is TYPE_END_TAG # any other end tag
2323 in_body_any_other_end_tag t.name
2327 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Tree-construction handler for the "text" insertion mode; t is the token to
# process. The EOF and end-tag branches restore ins_mode from
# original_ins_mode (set by whoever switched into this mode, e.g. the
# textarea start-tag handling).
2328 ins_mode_text = (t) ->
2329 if t.type is TYPE_TEXT
2332 if t.type is TYPE_EOF
# EOF while a script element is the current node: flag it 'already started'
2334 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2335 open_els[0].flag 'already started', true
2337 ins_mode = original_ins_mode
2340 if t.type is TYPE_END_TAG and t.name is 'script'
2342 ins_mode = original_ins_mode
2343 # fixfull the spec seems to assume that I'm going to run the script
2344 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
# any other end tag
2346 if t.type is TYPE_END_TAG
2348 ins_mode = original_ins_mode
# diagnostic: logged when control reaches the end of this handler
2350 console.log 'warning: end of ins_mode_text reached'
2352 # the functions below implement the tokenizer states described here:
2353 # http://www.w3.org/TR/html5/syntax.html#tokenization
2355 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# "Anything else" fallback used by ins_mode_in_table and
# ins_mode_in_table_text for token t: foster parenting is switched on and then
# off again around the handling of t (presumably t is processed with the
# in-body rules in between, as the spec describes -- confirm).
2356 ins_mode_in_table_else = (t) ->
2358 flag_foster_parenting = true
2360 flag_foster_parenting = false
# Tree-construction handler for the "in table" insertion mode
# (http://www.w3.org/TR/html5/syntax.html#parsing-main-intable); t is the
# token to process. Tokens it does not handle itself are handed to
# ins_mode_in_table_else.
2362 ins_mode_in_table = (t) ->
# current node is table/tbody/tfoot/thead/tr (HTML namespace): reset the
# pending character list, remember the current mode and switch to the
# in-table-text mode
2365 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2366 pending_table_character_tokens = []
2367 original_ins_mode = ins_mode
2368 ins_mode = ins_mode_in_table_text
2371 ins_mode_in_table_else t
# insert t itself, then continue in the in-caption mode
2379 clear_stack_to_table_context()
2381 insert_html_element t
2382 ins_mode = ins_mode_in_caption
# insert t itself, then continue in the in-column-group mode
2384 clear_stack_to_table_context()
2385 insert_html_element t
2386 ins_mode = ins_mode_in_column_group
# a colgroup element is synthesized (new_open_tag) instead of inserting t
2388 clear_stack_to_table_context()
2389 insert_html_element new_open_tag 'colgroup'
2390 ins_mode = ins_mode_in_column_group
2392 when 'tbody', 'tfoot', 'thead'
2393 clear_stack_to_table_context()
2394 insert_html_element t
2395 ins_mode = ins_mode_in_table_body
# a tbody element is synthesized for a bare td/th/tr start tag
2396 when 'td', 'th', 'tr'
2397 clear_stack_to_table_context()
2398 insert_html_element new_open_tag 'tbody'
2399 ins_mode = ins_mode_in_table_body
# only acts when an HTML table element is in table scope; elements are
# shifted off open_els and compared against the HTML table element
2403 if is_in_table_scope 'table', NS_HTML
2405 el = open_els.shift()
2406 if el.name is 'table' and el.namespace is NS_HTML
2410 when 'style', 'script', 'template'
# input start tags that are not type=hidden take the fallback path
2413 unless is_input_hidden_tok t
2414 ins_mode_in_table_else t
2417 el = insert_html_element t
2419 t.acknowledge_self_closing()
# form handling: the inserted element becomes form_element_pointer
2422 if form_element_pointer?
2424 if template_tag_is_open()
2426 form_element_pointer = insert_html_element t
2429 ins_mode_in_table_else t
2433 if is_in_table_scope 'table', NS_HTML
2435 el = open_els.shift()
2436 if el.name is 'table' and el.namespace is NS_HTML
2441 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2446 ins_mode_in_table_else t
2450 ins_mode_in_table_else t
2453 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Tree-construction handler for the "in table text" insertion mode; t is the
# token to process. Character tokens are collected in
# pending_table_character_tokens, and the list is flushed before the mode
# returns to original_ins_mode.
2454 ins_mode_in_table_text = (t) ->
# U+0000 is tested separately from ordinary text
2455 if t.type is TYPE_TEXT and t.text is "\u0000"
2459 if t.type is TYPE_TEXT
2460 pending_table_character_tokens.push t
# scan the collected tokens for one that is not whitespace
2464 for old in pending_table_character_tokens
2465 unless is_space_tok old
# flush path 1: insert the collected tokens as characters
2469 for old in pending_table_character_tokens
2470 insert_character old
# flush path 2: send each collected token through the in-table fallback
2472 for old in pending_table_character_tokens
2473 ins_mode_in_table_else old
2474 pending_table_character_tokens = []
2475 ins_mode = original_ins_mode
2478 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Tree-construction handler for the "in caption" insertion mode; t is the
# token to process. Closing the caption clears the active formatting elements
# back to the last marker and returns to the in-table mode.
2479 ins_mode_in_caption = (t) ->
2480 if t.type is TYPE_END_TAG and t.name is 'caption'
2481 if is_in_table_scope 'caption', NS_HTML
2482 generate_implied_end_tags()
# NOTE(review): only the name is compared here; sibling checks in this file
# also compare open_els[0].namespace with NS_HTML -- confirm whether the
# namespace test was left out on purpose
2483 if open_els[0].name isnt 'caption'
2486 el = open_els.shift()
2487 if el.name is 'caption' and el.namespace is NS_HTML
2489 clear_afe_to_marker()
2490 ins_mode = ins_mode_in_table
# table-structure start tags, or </table>: the same caption-closing steps
2495 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2497 if is_in_table_scope 'caption', NS_HTML
2499 el = open_els.shift()
2500 if el.name is 'caption' and el.namespace is NS_HTML
2502 clear_afe_to_marker()
2503 ins_mode = ins_mode_in_table
2505 # else fragment case
2507 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2513 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Tree-construction handler for the "in column group" insertion mode; t is the
# token to process. Leaving the colgroup returns to the in-table mode.
2514 ins_mode_in_column_group = (t) ->
2518 if t.type is TYPE_COMMENT
2521 if t.type is TYPE_DOCTYPE
2524 if t.type is TYPE_START_TAG and t.name is 'html'
2527 if t.type is TYPE_START_TAG and t.name is 'col'
2528 el = insert_html_element t
2530 t.acknowledge_self_closing()
2532 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# the namespace must be read from the current node (open_els[0]); reading it
# from the open_els array itself is always undefined, which made this branch
# unreachable
2533 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2535 ins_mode = ins_mode_in_table
2539 if t.type is TYPE_END_TAG and t.name is 'col'
2542 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2545 if t.type is TYPE_EOF
# anything else: behaviour depends on whether the current node is a colgroup
2549 if open_els[0].name isnt 'colgroup'
2553 ins_mode = ins_mode_in_table
2557 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Tree-construction handler for the "in table body" insertion mode; t is the
# token to process.
2558 ins_mode_in_table_body = (t) ->
2559 if t.type is TYPE_START_TAG and t.name is 'tr'
2560 clear_stack_to_table_body_context()
2561 insert_html_element t
2562 ins_mode = ins_mode_in_row
# a bare th/td gets a synthesized tr element (new_open_tag) first
2564 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2566 clear_stack_to_table_body_context()
2567 insert_html_element new_open_tag 'tr'
2568 ins_mode = ins_mode_in_row
2571 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2572 unless is_in_table_scope t.name, NS_HTML
2575 clear_stack_to_table_body_context()
2577 ins_mode = ins_mode_in_table
2579 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# el is tested for being an HTML tbody/tfoot/thead, and against
# table_scopers (presumably while walking open_els -- confirm)
2582 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2585 if table_scopers[el.name] is el.namespace
2590 clear_stack_to_table_body_context()
2592 ins_mode = ins_mode_in_table
2595 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2601 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Tree-construction handler for the "in row" insertion mode; t is the token
# to process. Leaving the row returns to the in-table-body mode.
2602 ins_mode_in_row = (t) ->
2603 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2604 clear_stack_to_table_row_context()
2605 insert_html_element t
2606 ins_mode = ins_mode_in_cell
2609 if t.type is TYPE_END_TAG and t.name is 'tr'
2610 if is_in_table_scope 'tr', NS_HTML
2611 clear_stack_to_table_row_context()
2613 ins_mode = ins_mode_in_table_body
# table-structure start tags, or </table>: same steps as for </tr> above
2617 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2618 if is_in_table_scope 'tr', NS_HTML
2619 clear_stack_to_table_row_context()
2621 ins_mode = ins_mode_in_table_body
# </tbody>, </tfoot>, </thead>: both the named section and a tr must be in
# table scope before the row is left
2626 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2627 if is_in_table_scope t.name, NS_HTML
2628 if is_in_table_scope 'tr', NS_HTML
2629 clear_stack_to_table_row_context()
2631 ins_mode = ins_mode_in_table_body
2636 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2642 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# The spec's "close the cell" steps: generate implied end tags, check that the
# current node is an HTML td or th, shift elements off open_els (testing each
# for being an HTML td/th), clear the active formatting elements up to the
# last marker, and continue in the in-row mode.
2644 generate_implied_end_tags()
# compare the element's name to 'th'; comparing the element object itself to
# a string is always false, so the th case was never recognised here
2645 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2648 el = open_els.shift()
2649 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2651 clear_afe_to_marker()
2652 ins_mode = ins_mode_in_row
2654 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Tree-construction handler for the "in cell" insertion mode; t is the token
# to process.
2655 ins_mode_in_cell = (t) ->
# </td> or </th>: close the matching cell if it is in table scope
2656 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2657 if is_in_table_scope t.name, NS_HTML
2658 generate_implied_end_tags()
2659 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2662 el = open_els.shift()
2663 if el.name is t.name and el.namespace is NS_HTML
2665 clear_afe_to_marker()
2666 ins_mode = ins_mode_in_row
2670 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# el is tested for being an HTML td/th, and against table_scopers
# (presumably while walking open_els -- confirm)
2673 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2676 if table_scopers[el.name] is el.namespace
2684 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2687 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2688 if is_in_table_scope t.name, NS_HTML
2697 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Tree-construction handler for the "in select" insertion mode; t is the token
# to process. ins_mode_in_select_in_table also delegates to this handler.
2698 ins_mode_in_select = (t) ->
2699 if t.type is TYPE_TEXT and t.text is "\u0000"
2702 if t.type is TYPE_TEXT
2705 if t.type is TYPE_COMMENT
2708 if t.type is TYPE_DOCTYPE
2711 if t.type is TYPE_START_TAG and t.name is 'html'
2714 if t.type is TYPE_START_TAG and t.name is 'option'
2715 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2717 insert_html_element t
2719 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2720 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2722 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2724 insert_html_element t
2726 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# the current node must be an HTML option ("is" equality, not the "in"
# membership operator) ...
2727 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# ... and the node just before it (open_els[1]) an HTML optgroup; name and
# namespace are both read from open_els[1]
2728 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2730 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2735 if t.type is TYPE_END_TAG and t.name is 'option'
2736 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2741 if t.type is TYPE_END_TAG and t.name is 'select'
2742 if is_in_select_scope 'select', NS_HTML
2744 el = open_els.shift()
2745 if el.name is 'select' and el.namespace is NS_HTML
2751 if t.type is TYPE_START_TAG and t.name is 'select'
2754 el = open_els.shift()
2755 if el.name is 'select' and el.namespace is NS_HTML
2758 # spec says that this is the same as </select> but it doesn't say
2759 # to check scope first
2761 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2763 if is_in_select_scope 'select', NS_HTML
2766 el = open_els.shift()
2767 if el.name is 'select' and el.namespace is NS_HTML
2772 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2775 if t.type is TYPE_EOF
2782 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Tree-construction handler for the "in select in table" insertion mode; t is
# the token to process. Table-related start and end tags are special-cased;
# everything else is delegated to ins_mode_in_select.
2783 ins_mode_in_select_in_table = (t) ->
2784 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2787 el = open_els.shift()
2788 if el.name is 'select' and el.namespace is NS_HTML
2793 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
# the end-tag case first checks whether the named element is in table scope
2795 unless is_in_table_scope t.name, NS_HTML
2798 el = open_els.shift()
2799 if el.name is 'select' and el.namespace is NS_HTML
2805 ins_mode_in_select t
2808 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Tree-construction handler for the "in template" insertion mode; t is the
# token to process. For start tags the first entry of template_ins_modes is
# replaced (shift, then unshift) with the mode suited to the tag, and
# ins_mode is set to that same mode.
2809 ins_mode_in_template = (t) ->
2810 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2813 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2816 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2817 template_ins_modes.shift()
2818 template_ins_modes.unshift ins_mode_in_table
2819 ins_mode = ins_mode_in_table
2822 if t.type is TYPE_START_TAG and t.name is 'col'
2823 template_ins_modes.shift()
2824 template_ins_modes.unshift ins_mode_in_column_group
2825 ins_mode = ins_mode_in_column_group
2828 if t.type is TYPE_START_TAG and t.name is 'tr'
2829 template_ins_modes.shift()
2830 template_ins_modes.unshift ins_mode_in_table_body
2831 ins_mode = ins_mode_in_table_body
2834 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2835 template_ins_modes.shift()
2836 template_ins_modes.unshift ins_mode_in_row
2837 ins_mode = ins_mode_in_row
# any other start tag: continue in the in-body mode
2840 if t.type is TYPE_START_TAG
2841 template_ins_modes.shift()
2842 template_ins_modes.unshift ins_mode_in_body
2843 ins_mode = ins_mode_in_body
2846 if t.type is TYPE_END_TAG
2849 if t.type is TYPE_EOF
2850 unless template_tag_is_open()
# EOF with a template still open: shift elements off open_els (testing each
# for being the HTML template element), clear the active formatting elements
# up to the last marker and drop the first template insertion mode
2855 el = open_els.shift()
2856 if el.name is 'template' and el.namespace is NS_HTML
2858 clear_afe_to_marker()
2859 template_ins_modes.shift()
2863 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Tree-construction handler for the "after body" insertion mode; t is the
# token to process.
2864 ins_mode_after_body = (t) ->
2868 if t.type is TYPE_COMMENT
# the comment is appended to the last entry of open_els (the "first"/topmost
# element in this file's terminology), after its existing children
2869 first = open_els[open_els.length - 1]
2870 insert_comment t, [first, first.children.length]
2872 if t.type is TYPE_DOCTYPE
2875 if t.type is TYPE_START_TAG and t.name is 'html'
2878 if t.type is TYPE_END_TAG and t.name is 'html'
2879 if flag_fragment_parsing
2882 ins_mode = ins_mode_after_after_body
2884 if t.type is TYPE_EOF
# anything else: go back to the in-body mode
2889 ins_mode = ins_mode_in_body
2892 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Tree-construction handler for the "in frameset" insertion mode; t is the
# token to process.
2893 ins_mode_in_frameset = (t) ->
2897 if t.type is TYPE_COMMENT
2900 if t.type is TYPE_DOCTYPE
2903 if t.type is TYPE_START_TAG and t.name is 'html'
2906 if t.type is TYPE_START_TAG and t.name is 'frameset'
2907 insert_html_element t
2909 if t.type is TYPE_END_TAG and t.name is 'frameset'
# a single entry on open_els is treated as the fragment case
2910 if open_els.length is 1
2912 return # fragment case
# outside fragment parsing, once the current node is no longer a frameset,
# continue in the after-frameset mode
2914 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2915 ins_mode = ins_mode_after_frameset
2917 if t.type is TYPE_START_TAG and t.name is 'frame'
2918 insert_html_element t
2920 t.acknowledge_self_closing()
2922 if t.type is TYPE_START_TAG and t.name is 'noframes'
2925 if t.type is TYPE_EOF
2926 if open_els.length isnt 1
2934 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Tree-construction handler for the "after frameset" insertion mode; t is the
# token to process. An </html> end tag moves on to the after-after-frameset
# mode.
2935 ins_mode_after_frameset = (t) ->
2939 if t.type is TYPE_COMMENT
2942 if t.type is TYPE_DOCTYPE
2945 if t.type is TYPE_START_TAG and t.name is 'html'
2948 if t.type is TYPE_END_TAG and t.name is 'html'
2949 ins_mode = ins_mode_after_after_frameset
2951 if t.type is TYPE_START_TAG and t.name is 'noframes'
2954 if t.type is TYPE_EOF
2961 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Tree-construction handler for the "after after body" insertion mode; t is
# the token to process.
2962 ins_mode_after_after_body = (t) ->
2963 if t.type is TYPE_COMMENT
# comments are appended to the document node itself
2964 insert_comment t, [doc, doc.children.length]
2966 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2969 if t.type is TYPE_EOF
# anything else: go back to the in-body mode
2974 ins_mode = ins_mode_in_body
2978 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Tree-construction handler for the "after after frameset" insertion mode; t
# is the token to process.
2979 ins_mode_after_after_frameset = (t) ->
2980 if t.type is TYPE_COMMENT
# comments are appended to the document node itself
2981 insert_comment t, [doc, doc.children.length]
2983 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2986 if t.type is TYPE_EOF
2989 if t.type is TYPE_START_TAG and t.name is 'noframes'
2996 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Helper for in_foreign_content's <font> start-tag test: looks for an
# attribute named 'color', 'face' or 'size' on token t (a is presumably each
# [name, value] pair of t.attrs_a -- confirm).
2997 has_color_face_or_size = (t) ->
2999 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Script-end handling for foreign content; takes no arguments. Called by
# in_foreign_content for an SVG </script> end tag and by
# in_foreign_content_other_start for a self-closing script start tag.
3002 in_foreign_content_end_script = ->
# "Any other start tag" handling for foreign content: adjusts start-tag token
# t according to the namespace of the adjusted current node, inserts it as a
# foreign element in that namespace, and deals with the self-closing flag.
3006 in_foreign_content_other_start = (t) ->
3007 acn = adjusted_current_node()
3008 if acn.namespace is NS_MATHML
3009 adjust_mathml_attributes t
# SVG: replace the tag name when svg_name_fixes has an entry for it
3010 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3011 t.name = svg_name_fixes[t.name]
3012 if acn.namespace is NS_SVG
3013 adjust_svg_attributes t
3014 adjust_foreign_attributes t
3015 insert_foreign_element t, acn.namespace
3016 if t.flag 'self-closing'
# a self-closing script tag is acknowledged and then handled as a script end
3017 if t.name is 'script'
3018 t.acknowledge_self_closing()
3019 in_foreign_content_end_script()
3023 t.acknowledge_self_closing()
# Tree-construction rules for tokens in foreign (MathML/SVG) content
# (http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign); t is the
# token to process.
3025 in_foreign_content = (t) ->
# U+0000 is replaced by U+FFFD
3026 if t.type is TYPE_TEXT and t.text is "\u0000"
3028 insert_character new_character_token "\ufffd"
3033 if t.type is TYPE_TEXT
3034 flag_frameset_ok = false
3037 if t.type is TYPE_COMMENT
3040 if t.type is TYPE_DOCTYPE
# HTML-only start tags (and <font> carrying color/face/size) get special
# treatment below
3043 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
# in fragment parsing the tag is handled like any other start tag
3045 if flag_fragment_parsing
3046 in_foreign_content_other_start t
3048 loop # is this safe?
# the loop tests for a MathML text integration point, an HTML integration
# point, or an element in the HTML namespace as the current node
3050 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3054 if t.type is TYPE_START_TAG
3055 in_foreign_content_other_start t
3057 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3058 in_foreign_content_end_script()
# any other end tag: node names are compared case-insensitively
# (toLowerCase) with the token name
3060 if t.type is TYPE_END_TAG
3063 if node.name.toLowerCase() isnt t.name
# open_els[open_els.length - 1] is the topmost ("first") element of the stack
3066 if node is open_els[open_els.length - 1]
3068 if node.name.toLowerCase() is t.name
3070 el = open_els.shift()
3075 if node.namespace is NS_HTML
3077 ins_mode t # explicitly call HTML insertion mode
3080 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Data state of the tokenizer: reads the character at txt[cur] (advancing
# cur) and dispatches on it. The visible branches return a text node for a
# parsed character reference, switch to the tag-open state, return U+FFFD,
# return an EOF token, or return the character itself as a text node.
3082 switch c = txt.charAt(cur++)
3084 return new_text_node parse_character_reference()
3086 tok_state = tok_state_tag_open
3089 return new_text_node "\ufffd"
3091 return new_eof_token()
3093 return new_text_node c
3096 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3097 # not needed: tok_state_character_reference_in_data = ->
3098 # just call parse_character_reference()
3100 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# RCDATA state of the tokenizer: reads the character at txt[cur] (advancing
# cur) and dispatches on it. Takes no arguments; the branches visible here
# either return a token or switch tok_state.
3101 tok_state_rcdata = ->
3102 switch c = txt.charAt(cur++)
# NOTE(review): this branch builds its result with new_text_node while the
# other branches use new_character_token -- confirm the two are equivalent
3104 return new_text_node parse_character_reference()
3106 tok_state = tok_state_rcdata_less_than_sign
3109 return new_character_token "\ufffd"
3111 return new_eof_token()
3113 return new_character_token c
3116 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3117 # not needed: tok_state_character_reference_in_rcdata = ->
3118 # just call parse_character_reference()
3120 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# RAWTEXT state of the tokenizer: reads the character at txt[cur] (advancing
# cur) and dispatches on it. The visible branches switch to the
# rawtext-less-than-sign state, return U+FFFD, return an EOF token, or return
# the character itself as a character token.
3121 tok_state_rawtext = ->
3122 switch c = txt.charAt(cur++)
3124 tok_state = tok_state_rawtext_less_than_sign
3127 return new_character_token "\ufffd"
3129 return new_eof_token()
3131 return new_character_token c
3134 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Script data state of the tokenizer: reads the character at txt[cur]
# (advancing cur) and dispatches on it. The visible branches switch to the
# script-data-less-than-sign state, return U+FFFD, return an EOF token, or
# return the character itself as a character token.
3135 tok_state_script_data = ->
3136 switch c = txt.charAt(cur++)
3138 tok_state = tok_state_script_data_less_than_sign
3141 return new_character_token "\ufffd"
3143 return new_eof_token()
3145 return new_character_token c
3148 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# PLAINTEXT state of the tokenizer: reads the character at txt[cur]
# (advancing cur). No visible branch changes tok_state; each returns U+FFFD,
# an EOF token, or the character itself as a character token.
3149 tok_state_plaintext = ->
3150 switch c = txt.charAt(cur++)
3153 return new_character_token "\ufffd"
3155 return new_eof_token()
3157 return new_character_token c
3161 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tag open state of the tokenizer (entered after a '<'): reads one character
# and decides between a markup declaration, an end tag, a new start tag, a
# bogus comment, or emitting the '<' as literal text.
3162 tok_state_tag_open = ->
3163 c = txt.charAt(cur++)
3165 tok_state = tok_state_markup_declaration_open
3168 tok_state = tok_state_end_tag_open
# start a new start-tag token; one branch lower-cases the first character
3171 tok_cur_tag = new_open_tag c.toLowerCase()
3172 tok_state = tok_state_tag_name
3175 tok_cur_tag = new_open_tag c
3176 tok_state = tok_state_tag_name
3180 tok_cur_tag = new_comment_token '?' # FIXME right?
3181 tok_state = tok_state_bogus_comment
# anything else: back to the data state, un-read the character, emit '<'
3185 tok_state = tok_state_data
3186 cur -= 1 # we didn't parse/handle the char after <
3187 return new_text_node '<'
3189 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# End tag open state of the tokenizer (entered after "</"): reads one
# character and starts an end-tag token, returns to the data state, or starts
# a bogus comment.
3190 tok_state_end_tag_open = ->
3191 c = txt.charAt(cur++)
# start a new end-tag token; one branch lower-cases the first character
3193 tok_cur_tag = new_end_tag c.toLowerCase()
3194 tok_state = tok_state_tag_name
3197 tok_cur_tag = new_end_tag c
3198 tok_state = tok_state_tag_name
3202 tok_state = tok_state_data
# this branch emits the "</" as literal text
3206 tok_state = tok_state_data
3207 return new_text_node '</'
# anything else begins a bogus comment whose text starts with c
3210 tok_cur_tag = new_comment_token c
3211 tok_state = tok_state_bogus_comment
3214 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tag name state of the tokenizer: reads one character and either ends the
# tag name (switching state) or appends to tok_cur_tag.name.
3215 tok_state_tag_name = ->
3216 switch c = txt.charAt(cur++)
# tab, line feed, form feed or space ends the name; attributes may follow
3217 when "\t", "\n", "\u000c", ' '
3218 tok_state = tok_state_before_attribute_name
3220 tok_state = tok_state_self_closing_start_tag
3222 tok_state = tok_state_data
# U+FFFD is appended to the name in this branch
3228 tok_cur_tag.name += "\ufffd"
3231 tok_state = tok_state_data
# name characters are appended, lower-cased in one branch and as-is in another
3234 tok_cur_tag.name += c.toLowerCase()
3236 tok_cur_tag.name += c
3239 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# RCDATA less-than sign state of the tokenizer (entered after a '<' in
# RCDATA): reads one character and either begins a possible end tag or emits
# the '<' as a character token.
3240 tok_state_rcdata_less_than_sign = ->
3241 c = txt.charAt(cur++)
# possible end tag: reset temporary_buffer and continue in end-tag-open
3243 temporary_buffer = ''
3244 tok_state = tok_state_rcdata_end_tag_open
# anything else: back to RCDATA, un-read the character, emit '<'
3247 tok_state = tok_state_rcdata
3248 cur -= 1 # reconsume the input character
3249 return new_character_token '<'
3251 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# RCDATA end tag open state of the tokenizer (entered after "</" in RCDATA):
# reads one character and either starts an end-tag token or emits "</" as
# characters.
3252 tok_state_rcdata_end_tag_open = ->
3253 c = txt.charAt(cur++)
# the tag name may be lower-cased, but temporary_buffer keeps c as read
3255 tok_cur_tag = new_end_tag c.toLowerCase()
3256 temporary_buffer += c
3257 tok_state = tok_state_rcdata_end_tag_name
3260 tok_cur_tag = new_end_tag c
3261 temporary_buffer += c
3262 tok_state = tok_state_rcdata_end_tag_name
# anything else: back to RCDATA, un-read the character, emit "</"
3265 tok_state = tok_state_rcdata
3266 cur -= 1 # reconsume the input character
3267 return new_character_token "</" # fixfull separate these
3269 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when t is an end-tag token whose name equals the name of the
# current node (open_els[0]); also writes a debug_log line on every call.
3270 is_appropriate_end_tag = (t) ->
3271 # spec says to check against "the tag name of the last start tag to
3272 # have been emitted from this tokenizer", but this is only called from
3273 # the various "raw" states, so it's hopefully ok to assume that
3274 # open_els[0].name will work instead TODO: verify this after the script
3275 # data states are implemented
3276 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3277 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3279 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# RCDATA end tag name state of the tokenizer: reads one character. A
# terminator only ends the tag when tok_cur_tag is an appropriate end tag;
# otherwise the text collected so far is emitted as characters.
3280 tok_state_rcdata_end_tag_name = ->
3281 c = txt.charAt(cur++)
3282 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3283 if is_appropriate_end_tag tok_cur_tag
3284 tok_state = tok_state_before_attribute_name
3286 # else fall through to "Anything else"
3288 if is_appropriate_end_tag tok_cur_tag
3289 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3291 # else fall through to "Anything else"
3293 if is_appropriate_end_tag tok_cur_tag
3294 tok_state = tok_state_data
3296 # else fall through to "Anything else"
# name characters extend the tag name and temporary_buffer (the buffer keeps
# the character exactly as read)
3298 tok_cur_tag.name += c.toLowerCase()
3299 temporary_buffer += c
3302 tok_cur_tag.name += c
3303 temporary_buffer += c
# anything else: back to RCDATA, un-read the character, emit "</" plus the
# buffered text
3306 tok_state = tok_state_rcdata
3307 cur -= 1 # reconsume the input character
3308 return new_character_token '</' + temporary_buffer # fixfull separate these
3310 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# RAWTEXT less-than sign state. Reads one character and either starts a
# candidate end tag (clears temporary_buffer, continues in the RAWTEXT end tag
# open state) or returns to RAWTEXT, un-consumes the character and emits the
# pending '<' as a character token.
3311 tok_state_rawtext_less_than_sign = ->
3312 c = txt.charAt(cur++)
3314 temporary_buffer = ''
3315 tok_state = tok_state_rawtext_end_tag_open
3318 tok_state = tok_state_rawtext
3319 cur -= 1 # reconsume the input character
3320 return new_character_token '<'
3322 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# RAWTEXT end tag open state. Two branches create a new end tag token from the
# consumed character (one lowercases it first), remember the raw character in
# temporary_buffer and continue in the RAWTEXT end tag name state. The
# fallback returns to RAWTEXT, un-consumes the character and emits "</".
3323 tok_state_rawtext_end_tag_open = ->
3324 c = txt.charAt(cur++)
3326 tok_cur_tag = new_end_tag c.toLowerCase()
3327 temporary_buffer += c
3328 tok_state = tok_state_rawtext_end_tag_name
3331 tok_cur_tag = new_end_tag c
3332 temporary_buffer += c
3333 tok_state = tok_state_rawtext_end_tag_name
3336 tok_state = tok_state_rawtext
3337 cur -= 1 # reconsume the input character
3338 return new_character_token "</" # fixfull separate these
3340 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# RAWTEXT end tag name state. Three branches (the first is whitespace) only
# take effect when tok_cur_tag passes is_appropriate_end_tag; they move to the
# before-attribute-name, self-closing-start-tag and data states respectively.
# Two further branches extend the tag name (one lowercasing) and append the
# raw character to temporary_buffer. The fallback returns to RAWTEXT,
# un-consumes the character and emits '</' plus temporary_buffer as text.
3341 tok_state_rawtext_end_tag_name = ->
3342 c = txt.charAt(cur++)
3343 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3344 if is_appropriate_end_tag tok_cur_tag
3345 tok_state = tok_state_before_attribute_name
3347 # else fall through to "Anything else"
3349 if is_appropriate_end_tag tok_cur_tag
3350 tok_state = tok_state_self_closing_start_tag
3352 # else fall through to "Anything else"
3354 if is_appropriate_end_tag tok_cur_tag
3355 tok_state = tok_state_data
3357 # else fall through to "Anything else"
3359 tok_cur_tag.name += c.toLowerCase()
3360 temporary_buffer += c
3363 tok_cur_tag.name += c
3364 temporary_buffer += c
3367 tok_state = tok_state_rawtext
3368 cur -= 1 # reconsume the input character
3369 return new_character_token '</' + temporary_buffer # fixfull separate these
3371 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Script data less-than sign state. Possible outcomes: start a candidate end
# tag (clear temporary_buffer, go to script data end tag open); enter the
# script data escape start state while emitting '<!' as text; or return to
# script data, un-consume the character and emit the pending '<'.
3372 tok_state_script_data_less_than_sign = ->
3373 c = txt.charAt(cur++)
3375 temporary_buffer = ''
3376 tok_state = tok_state_script_data_end_tag_open
3379 tok_state = tok_state_script_data_escape_start
3380 return new_character_token '<!' # fixfull split
3382 tok_state = tok_state_script_data
3383 cur -= 1 # Reconsume
3384 return new_character_token '<'
3386 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Script data end tag open state. Two branches create a new end tag token from
# the consumed character (one lowercases it first), remember the raw
# character in temporary_buffer and continue in the script data end tag name
# state. The fallback returns to script data, un-consumes the character and
# emits '</'.
3387 tok_state_script_data_end_tag_open = ->
3388 c = txt.charAt(cur++)
3390 tok_cur_tag = new_end_tag c.toLowerCase()
3391 temporary_buffer += c
3392 tok_state = tok_state_script_data_end_tag_name
3395 tok_cur_tag = new_end_tag c
3396 temporary_buffer += c
3397 tok_state = tok_state_script_data_end_tag_name
3400 tok_state = tok_state_script_data
3401 cur -= 1 # Reconsume
3402 return new_character_token '</'
3404 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Script data end tag name state. Three branches (the first is whitespace)
# only take effect when tok_cur_tag passes is_appropriate_end_tag; they move
# to the before-attribute-name, self-closing-start-tag and data states
# respectively. Two further branches extend the tag name (one lowercasing)
# and append the raw character to temporary_buffer. The fallback returns to
# script data, un-consumes the character and emits "</" plus temporary_buffer.
3405 tok_state_script_data_end_tag_name = ->
3406 c = txt.charAt(cur++)
3407 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3408 if is_appropriate_end_tag tok_cur_tag
3409 tok_state = tok_state_before_attribute_name
3413 if is_appropriate_end_tag tok_cur_tag
3414 tok_state = tok_state_self_closing_start_tag
3418 if is_appropriate_end_tag tok_cur_tag
3419 tok_state = tok_state_data
3423 tok_cur_tag.name += c.toLowerCase()
3424 temporary_buffer += c
3427 tok_cur_tag.name += c
3428 temporary_buffer += c
3431 tok_state = tok_state_script_data
3432 cur -= 1 # Reconsume
3433 return new_character_token "</#{temporary_buffer}" # fixfull split
3435 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Script data escape start state. Either emits a '-' character token and moves
# to the escape start dash state, or returns to script data and un-consumes
# the character.
3436 tok_state_script_data_escape_start = ->
3437 c = txt.charAt(cur++)
3439 tok_state = tok_state_script_data_escape_start_dash
3440 return new_character_token '-'
3442 tok_state = tok_state_script_data
3443 cur -= 1 # Reconsume
3446 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Script data escape start dash state. Either emits a '-' character token and
# moves to the escaped dash dash state, or returns to script data and
# un-consumes the character.
3447 tok_state_script_data_escape_start_dash = ->
3448 c = txt.charAt(cur++)
3450 tok_state = tok_state_script_data_escaped_dash_dash
3451 return new_character_token '-'
3453 tok_state = tok_state_script_data
3454 cur -= 1 # Reconsume
3457 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Script data escaped state. Possible outcomes: emit '-' and go to the escaped
# dash state; go to the escaped less-than sign state; emit U+FFFD; switch to
# the data state and un-consume the character; or emit the consumed character
# unchanged.
3458 tok_state_script_data_escaped = ->
3459 c = txt.charAt(cur++)
3461 tok_state = tok_state_script_data_escaped_dash
3462 return new_character_token '-'
3464 tok_state = tok_state_script_data_escaped_less_than_sign
3468 return new_character_token "\ufffd"
3470 tok_state = tok_state_data
3472 cur -= 1 # Reconsume
3475 return new_character_token c
3477 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Script data escaped dash state. Possible outcomes: emit '-' and go to the
# escaped dash dash state; go to the escaped less-than sign state; return to
# the escaped state emitting U+FFFD; switch to the data state and un-consume;
# or return to the escaped state emitting the consumed character.
3478 tok_state_script_data_escaped_dash = ->
3479 c = txt.charAt(cur++)
3481 tok_state = tok_state_script_data_escaped_dash_dash
3482 return new_character_token '-'
3484 tok_state = tok_state_script_data_escaped_less_than_sign
3488 tok_state = tok_state_script_data_escaped
3489 return new_character_token "\ufffd"
3491 tok_state = tok_state_data
3493 cur -= 1 # Reconsume
3496 tok_state = tok_state_script_data_escaped
3497 return new_character_token c
3499 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Script data escaped dash dash state. Possible outcomes: emit '-' and stay in
# this state; go to the escaped less-than sign state; leave the escape by
# returning to script data while emitting '>'; return to the escaped state
# emitting U+FFFD; switch to the data state and un-consume; or return to the
# escaped state emitting the consumed character.
3500 tok_state_script_data_escaped_dash_dash = ->
3501 c = txt.charAt(cur++)
3503 return new_character_token '-'
3505 tok_state = tok_state_script_data_escaped_less_than_sign
3508 tok_state = tok_state_script_data
3509 return new_character_token '>'
3512 tok_state = tok_state_script_data_escaped
3513 return new_character_token "\ufffd"
3516 tok_state = tok_state_data
3517 cur -= 1 # Reconsume
3520 tok_state = tok_state_script_data_escaped
3521 return new_character_token c
3523 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Script data escaped less-than sign state. Possible outcomes: start a
# candidate end tag (clear temporary_buffer, go to escaped end tag open);
# seed temporary_buffer with the consumed character (lowercased in one
# branch), go to the double escape start state and emit '<' plus the
# character; or return to the escaped state, un-consume and emit '<'.
3524 tok_state_script_data_escaped_less_than_sign = ->
3525 c = txt.charAt(cur++)
3527 temporary_buffer = ''
3528 tok_state = tok_state_script_data_escaped_end_tag_open
3531 temporary_buffer = c.toLowerCase() # yes, really
3532 tok_state = tok_state_script_data_double_escape_start
3533 return new_character_token "<#{c}" # fixfull split
3535 temporary_buffer = c
3536 tok_state = tok_state_script_data_double_escape_start
3537 return new_character_token "<#{c}" # fixfull split
3539 tok_state = tok_state_script_data_escaped
3540 cur -= 1 # Reconsume
3541 return new_character_token '<'
3543 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Script data escaped end tag open state. Two branches create a new end tag
# token from the consumed character (one lowercases it first), remember the
# raw character in temporary_buffer and continue in the escaped end tag name
# state. The fallback returns to the escaped state, un-consumes the character
# and emits '</'.
3544 tok_state_script_data_escaped_end_tag_open = ->
3545 c = txt.charAt(cur++)
3547 tok_cur_tag = new_end_tag c.toLowerCase()
3548 temporary_buffer += c
3549 tok_state = tok_state_script_data_escaped_end_tag_name
3552 tok_cur_tag = new_end_tag c
3553 temporary_buffer += c
3554 tok_state = tok_state_script_data_escaped_end_tag_name
3557 tok_state = tok_state_script_data_escaped
3558 cur -= 1 # Reconsume
3559 return new_character_token '</' # fixfull split
3561 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Script data escaped end tag name state. Three branches (the first is
# whitespace) only take effect when tok_cur_tag passes is_appropriate_end_tag;
# they move to the before-attribute-name, self-closing-start-tag and data
# states respectively. Two further branches extend the tag name (one
# lowercasing) and append the character to temporary_buffer. The fallback
# returns to the escaped state, un-consumes the character and emits "</" plus
# temporary_buffer as text.
#
# temporary_buffer must hold the characters exactly as they appeared in the
# input (only the tag name is lowercased): the fallback re-emits the buffer
# as text, so lowercasing it would silently change the document's content.
3562 tok_state_script_data_escaped_end_tag_name = ->
3563 c = txt.charAt(cur++)
3564 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3565 if is_appropriate_end_tag tok_cur_tag
3566 tok_state = tok_state_before_attribute_name
3570 if is_appropriate_end_tag tok_cur_tag
3571 tok_state = tok_state_self_closing_start_tag
3575 if is_appropriate_end_tag tok_cur_tag
3576 tok_state = tok_state_data
3580 tok_cur_tag.name += c.toLowerCase()
3581 temporary_buffer += c
3584 tok_cur_tag.name += c
3585 temporary_buffer += c
3588 tok_state = tok_state_script_data_escaped
3589 cur -= 1 # Reconsume
3590 return new_character_token "</#{temporary_buffer}" # fixfull split
3592 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Script data double escape start state. On whitespace, '/' or '>': go to the
# double escaped state when temporary_buffer is 'script', otherwise back to
# the escaped state; the character is emitted either way. Two further
# branches append the character to temporary_buffer (lowercased in one) and
# emit it. The fallback returns to the escaped state and un-consumes.
3593 tok_state_script_data_double_escape_start = ->
3594 c = txt.charAt(cur++)
3595 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3596 if temporary_buffer is 'script'
3597 tok_state = tok_state_script_data_double_escaped
3599 tok_state = tok_state_script_data_escaped
3600 return new_character_token c
3602 temporary_buffer += c.toLowerCase() # yes, really lowercase
3603 return new_character_token c
3605 temporary_buffer += c
3606 return new_character_token c
3608 tok_state = tok_state_script_data_escaped
3609 cur -= 1 # Reconsume
3612 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Script data double escaped state. Possible outcomes: emit '-' and go to the
# double escaped dash state; emit '<' and go to the double escaped less-than
# sign state; emit U+FFFD; switch to the data state and un-consume; or emit
# the consumed character unchanged.
3613 tok_state_script_data_double_escaped = ->
3614 c = txt.charAt(cur++)
3616 tok_state = tok_state_script_data_double_escaped_dash
3617 return new_character_token '-'
3619 tok_state = tok_state_script_data_double_escaped_less_than_sign
3620 return new_character_token '<'
3623 return new_character_token "\ufffd"
3626 tok_state = tok_state_data
3627 cur -= 1 # Reconsume
3630 return new_character_token c
3632 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Script data double escaped dash state. Possible outcomes: emit '-' and go to
# the double escaped dash dash state; emit '<' and go to the double escaped
# less-than sign state; return to the double escaped state emitting U+FFFD;
# switch to the data state and un-consume; or return to the double escaped
# state emitting the consumed character.
3633 tok_state_script_data_double_escaped_dash = ->
3634 c = txt.charAt(cur++)
3636 tok_state = tok_state_script_data_double_escaped_dash_dash
3637 return new_character_token '-'
3639 tok_state = tok_state_script_data_double_escaped_less_than_sign
3640 return new_character_token '<'
3643 tok_state = tok_state_script_data_double_escaped
3644 return new_character_token "\ufffd"
3647 tok_state = tok_state_data
3648 cur -= 1 # Reconsume
3651 tok_state = tok_state_script_data_double_escaped
3652 return new_character_token c
3654 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Script data double escaped dash dash state. Possible outcomes: emit '-' and
# stay here; emit '<' and go to the double escaped less-than sign state;
# return to script data emitting '>'; return to the double escaped state
# emitting U+FFFD; switch to the data state and un-consume; or return to the
# double escaped state emitting the consumed character.
3655 tok_state_script_data_double_escaped_dash_dash = ->
3656 c = txt.charAt(cur++)
3658 return new_character_token '-'
3660 tok_state = tok_state_script_data_double_escaped_less_than_sign
3661 return new_character_token '<'
3663 tok_state = tok_state_script_data
3664 return new_character_token '>'
3667 tok_state = tok_state_script_data_double_escaped
3668 return new_character_token "\ufffd"
3671 tok_state = tok_state_data
3672 cur -= 1 # Reconsume
3675 tok_state = tok_state_script_data_double_escaped
3676 return new_character_token c
3678 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Script data double escaped less-than sign state. Either clears
# temporary_buffer, moves to the double escape end state and emits '/', or
# returns to the double escaped state and un-consumes the character.
3679 tok_state_script_data_double_escaped_less_than_sign = ->
3680 c = txt.charAt(cur++)
3682 temporary_buffer = ''
3683 tok_state = tok_state_script_data_double_escape_end
3684 return new_character_token '/'
3686 tok_state = tok_state_script_data_double_escaped
3687 cur -= 1 # Reconsume
3690 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Script data double escape end state. On whitespace, '/' or '>': drop back to
# the (single) escaped state when temporary_buffer is 'script', otherwise
# stay double escaped; the character is emitted either way. Two further
# branches append the character to temporary_buffer (lowercased in one) and
# emit it. The fallback returns to the double escaped state and un-consumes.
3691 tok_state_script_data_double_escape_end = ->
3692 c = txt.charAt(cur++)
3693 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3694 if temporary_buffer is 'script'
3695 tok_state = tok_state_script_data_escaped
3697 tok_state = tok_state_script_data_double_escaped
3698 return new_character_token c
3700 temporary_buffer += c.toLowerCase() # yes, really lowercase
3701 return new_character_token c
3703 temporary_buffer += c
3704 return new_character_token c
3706 tok_state = tok_state_script_data_double_escaped
3707 cur -= 1 # Reconsume
3710 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Before attribute name state. Branches cover whitespace, a move to the
# self-closing start tag state, and moves to the data state. Otherwise an
# attr_name is chosen (U+FFFD in one branch, the lowercased character in
# another) and a new [attr_name, ''] pair is unshifted onto
# tok_cur_tag.attrs_a, so index 0 is the attribute being built, before
# continuing in the attribute name state.
3711 tok_state_before_attribute_name = ->
3713 switch c = txt.charAt(cur++)
3714 when "\t", "\n", "\u000c", ' '
3717 tok_state = tok_state_self_closing_start_tag
3720 tok_state = tok_state_data
3726 attr_name = "\ufffd"
3727 when '"', "'", '<', '='
3732 tok_state = tok_state_data
3735 attr_name = c.toLowerCase()
3739 tok_cur_tag.attrs_a.unshift [attr_name, '']
3740 tok_state = tok_state_attribute_name
3743 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Attribute name state. Whitespace moves to the after-attribute-name state;
# other branches move to the self-closing start tag, before-attribute-value
# or data states. The remaining branches grow the name of the attribute at
# tok_cur_tag.attrs_a[0] with U+FFFD, the raw character, or the lowercased
# character.
3744 tok_state_attribute_name = ->
3745 switch c = txt.charAt(cur++)
3746 when "\t", "\n", "\u000c", ' '
3747 tok_state = tok_state_after_attribute_name
3749 tok_state = tok_state_self_closing_start_tag
3751 tok_state = tok_state_before_attribute_value
3753 tok_state = tok_state_data
3759 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3762 tok_cur_tag.attrs_a[0][0] += c
3765 tok_state = tok_state_data
3768 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3770 tok_cur_tag.attrs_a[0][0] += c
3773 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# After attribute name state. Branches move to the self-closing start tag,
# before-attribute-value or data states (one data branch un-consumes the
# character). The other branches start a new attribute by unshifting
# [name, ''] onto tok_cur_tag.attrs_a, where name is the lowercased
# character, U+FFFD, or the raw character, then continue in the attribute
# name state.
3774 tok_state_after_attribute_name = ->
3775 c = txt.charAt(cur++)
3776 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3779 tok_state = tok_state_self_closing_start_tag
3782 tok_state = tok_state_before_attribute_value
3785 tok_state = tok_state_data
3788 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3789 tok_state = tok_state_attribute_name
3793 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3794 tok_state = tok_state_attribute_name
3798 tok_state = tok_state_data
3799 cur -= 1 # reconsume
3801 if c is '"' or c is "'" or c is '<'
3803 # fall through to Anything else
3805 tok_cur_tag.attrs_a.unshift [c, '']
3806 tok_state = tok_state_attribute_name
3808 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Before attribute value state. Picks how the value of the attribute at
# tok_cur_tag.attrs_a[0] will be read: the double-quoted, single-quoted or
# unquoted value state. Two of the unquoted branches first append to the
# value (U+FFFD in one, the consumed character in the other). Two branches
# move to the data state instead.
3809 tok_state_before_attribute_value = ->
3810 switch c = txt.charAt(cur++)
3811 when "\t", "\n", "\u000c", ' '
3814 tok_state = tok_state_attribute_value_double_quoted
3816 tok_state = tok_state_attribute_value_unquoted
3819 tok_state = tok_state_attribute_value_single_quoted
3822 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3823 tok_state = tok_state_attribute_value_unquoted
3826 tok_state = tok_state_data
3832 tok_state = tok_state_data
3834 tok_cur_tag.attrs_a[0][1] += c
3835 tok_state = tok_state_attribute_value_unquoted
3838 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Attribute value (double-quoted) state. Appends to the value of the attribute
# at tok_cur_tag.attrs_a[0]: the result of parse_character_reference (called
# with '"' as the allowed char and in_attr = true), U+FFFD, or the raw
# character. Other branches move to the after-attribute-value (quoted) state
# or to the data state.
3839 tok_state_attribute_value_double_quoted = ->
3840 switch c = txt.charAt(cur++)
3842 tok_state = tok_state_after_attribute_value_quoted
3844 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3847 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3850 tok_state = tok_state_data
3852 tok_cur_tag.attrs_a[0][1] += c
3855 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Attribute value (single-quoted) state. Appends to the value of the attribute
# at tok_cur_tag.attrs_a[0]: the result of parse_character_reference (called
# with "'" as the allowed char and in_attr = true), U+FFFD, or the raw
# character. Other branches move to the after-attribute-value (quoted) state
# or to the data state.
3856 tok_state_attribute_value_single_quoted = ->
3857 switch c = txt.charAt(cur++)
3859 tok_state = tok_state_after_attribute_value_quoted
3861 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3864 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3867 tok_state = tok_state_data
3869 tok_cur_tag.attrs_a[0][1] += c
3872 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Attribute value (unquoted) state. Whitespace ends the value and moves to the
# before-attribute-name state. Other branches append to the value of the
# attribute at tok_cur_tag.attrs_a[0] (the result of
# parse_character_reference with '>' as the allowed char, U+FFFD, or the raw
# character) or move to the data state.
3873 tok_state_attribute_value_unquoted = ->
3874 switch c = txt.charAt(cur++)
3875 when "\t", "\n", "\u000c", ' '
3876 tok_state = tok_state_before_attribute_name
3878 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3880 tok_state = tok_state_data
3885 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3888 tok_state = tok_state_data
3890 # Parse Error if ', <, = or ` (backtick)
3891 tok_cur_tag.attrs_a[0][1] += c
3894 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After attribute value (quoted) state. Whitespace moves to the
# before-attribute-name state; other branches move to the self-closing start
# tag state or the data state. The last visible branch switches to
# before-attribute-name and un-consumes the character so that state handles it.
3895 tok_state_after_attribute_value_quoted = ->
3896 switch c = txt.charAt(cur++)
3897 when "\t", "\n", "\u000c", ' '
3898 tok_state = tok_state_before_attribute_name
3900 tok_state = tok_state_self_closing_start_tag
3902 tok_state = tok_state_data
3908 tok_state = tok_state_data
3911 tok_state = tok_state_before_attribute_name
3912 cur -= 1 # we didn't handle that char
3915 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Self-closing start tag state. One branch sets the 'self-closing' flag on
# tok_cur_tag and moves to the data state; another moves to the data state
# and un-consumes the character; the last moves to the before-attribute-name
# state and un-consumes the character.
3916 tok_state_self_closing_start_tag = ->
3917 c = txt.charAt(cur++)
3919 tok_cur_tag.flag 'self-closing', true
3920 tok_state = tok_state_data
3924 tok_state = tok_state_data
3925 cur -= 1 # Reconsume
3929 tok_state = tok_state_before_attribute_name
3930 cur -= 1 # Reconsume
3933 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3934 # WARNING: put a comment token in tok_cur_tag before setting this state
# Bogus comment state. Looks for the next '>' at or after cur and takes the
# text from cur (either the remainder of txt, or the part before that '>')
# as the comment body. U+0000 characters in that text are replaced with
# U+FFFD, the result is appended to tok_cur_tag.text, and the tokenizer
# returns to the data state.
3935 tok_state_bogus_comment = ->
3936 next_gt = txt.indexOf '>', cur
3938 val = txt.substr cur
3941 val = txt.substr cur, (next_gt - cur)
3943 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3944 tok_cur_tag.text += val
3945 tok_state = tok_state_data
3948 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Markup declaration open state. Looks ahead in txt without consuming a single
# character first:
#  - '--' starts an empty comment token and moves to the comment start state
#  - 'doctype' (case-insensitive) moves to the DOCTYPE state
#  - '[CDATA[' (case-sensitive) moves to the CDATA section state, but only
#    when adjusted_current_node() exists and is not in the HTML namespace
#  - anything else starts an empty comment token and moves to the bogus
#    comment state
3949 tok_state_markup_declaration_open = ->
3950 if txt.substr(cur, 2) is '--'
3952 tok_cur_tag = new_comment_token ''
3953 tok_state = tok_state_comment_start
3955 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3957 tok_state = tok_state_doctype
3959 acn = adjusted_current_node()
3960 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3962 tok_state = tok_state_cdata_section
3966 tok_cur_tag = new_comment_token ''
3967 tok_state = tok_state_bogus_comment
3970 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Comment start state. Possible outcomes: move to the comment start dash
# state; append U+FFFD to the comment text and move to the comment state;
# move to the data state (one such branch un-consumes the character); or
# append the consumed character to the comment text and move to the comment
# state.
#
# The U+FFFD branch appends to tok_cur_tag.text, like the other comment
# states and as the spec requires. Emitting a character token here would
# put the replacement character in the text before the comment instead of
# inside it.
3971 tok_state_comment_start = ->
3972 switch c = txt.charAt(cur++)
3974 tok_state = tok_state_comment_start_dash
3977 tok_state = tok_state_comment
3978 tok_cur_tag.text += "\ufffd"
3981 tok_state = tok_state_data
3985 tok_state = tok_state_data
3986 cur -= 1 # Reconsume
3989 tok_cur_tag.text += c
3990 tok_state = tok_state_comment
3993 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Comment start dash state. Possible outcomes: move to the comment end state;
# append "-" plus U+FFFD to the comment text and move to the comment state;
# move to the data state (one such branch un-consumes the character); or
# append "-" plus the consumed character and move to the comment state.
3994 tok_state_comment_start_dash = ->
3995 switch c = txt.charAt(cur++)
3997 tok_state = tok_state_comment_end
4000 tok_cur_tag.text += "-\ufffd"
4001 tok_state = tok_state_comment
4004 tok_state = tok_state_data
4008 tok_state = tok_state_data
4009 cur -= 1 # Reconsume
4012 tok_cur_tag.text += "-#{c}"
4013 tok_state = tok_state_comment
4016 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Comment state. Possible outcomes: move to the comment end dash state; append
# U+FFFD to the comment text; move to the data state and un-consume the
# character; or append the consumed character to the comment text.
4017 tok_state_comment = ->
4018 switch c = txt.charAt(cur++)
4020 tok_state = tok_state_comment_end_dash
4023 tok_cur_tag.text += "\ufffd"
4026 tok_state = tok_state_data
4027 cur -= 1 # Reconsume
4030 tok_cur_tag.text += c
4033 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Comment end dash state. Possible outcomes: move to the comment end state;
# append "-" plus U+FFFD and return to the comment state; move to the data
# state and un-consume the character; or append "-" plus the consumed
# character and return to the comment state.
4034 tok_state_comment_end_dash = ->
4035 switch c = txt.charAt(cur++)
4037 tok_state = tok_state_comment_end
4040 tok_cur_tag.text += "-\ufffd"
4041 tok_state = tok_state_comment
4044 tok_state = tok_state_data
4045 cur -= 1 # Reconsume
4048 tok_cur_tag.text += "-#{c}"
4049 tok_state = tok_state_comment
4052 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Comment end state. Possible outcomes: move to the data state; append "--"
# plus U+FFFD and return to the comment state; move to the comment end bang
# state; append a single '-' to the comment text; move to the data state and
# un-consume the character; or append "--" plus the consumed character and
# return to the comment state.
4053 tok_state_comment_end = ->
4054 switch c = txt.charAt(cur++)
4056 tok_state = tok_state_data
4060 tok_cur_tag.text += "--\ufffd"
4061 tok_state = tok_state_comment
4064 tok_state = tok_state_comment_end_bang
4067 tok_cur_tag.text += '-'
4070 tok_state = tok_state_data
4071 cur -= 1 # Reconsume
4075 tok_cur_tag.text += "--#{c}"
4076 tok_state = tok_state_comment
4079 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Comment end bang state. Possible outcomes: append "--!" and move to the
# comment end dash state; move to the data state; append "--!" plus U+FFFD
# and return to the comment state; move to the data state and un-consume the
# character; or append "--!" plus the consumed character and return to the
# comment state.
#
# The branch that moves to the comment end dash state appends only "--!".
# The consumed dash is accounted for by the comment end dash state, which
# prepends its own "-" when it falls back to the comment state; appending it
# here as well would duplicate the dash in the comment text.
4080 tok_state_comment_end_bang = ->
4081 switch c = txt.charAt(cur++)
4083 tok_cur_tag.text += "--!"
4084 tok_state = tok_state_comment_end_dash
4086 tok_state = tok_state_data
4090 tok_cur_tag.text += "--!\ufffd"
4091 tok_state = tok_state_comment
4094 tok_state = tok_state_data
4095 cur -= 1 # Reconsume
4098 tok_cur_tag.text += "--!#{c}"
4099 tok_state = tok_state_comment
4102 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# DOCTYPE state. Whitespace moves to the before-DOCTYPE-name state. One branch
# moves to the data state, builds an empty doctype token flagged
# 'force-quirks' and un-consumes the character. The last branch moves to the
# before-DOCTYPE-name state and un-consumes the character.
4103 tok_state_doctype = ->
4104 switch c = txt.charAt(cur++)
4105 when "\t", "\u000a", "\u000c", ' '
4106 tok_state = tok_state_before_doctype_name
4109 tok_state = tok_state_data
4110 el = new_doctype_token ''
4111 el.flag 'force-quirks', true
4112 cur -= 1 # Reconsume
4116 tok_state = tok_state_before_doctype_name
4117 cur -= 1 # Reconsume
4120 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Before DOCTYPE name state. Three branches start a doctype token in
# tok_cur_tag (named with the lowercased character, U+FFFD, or the raw
# character) and move to the DOCTYPE name state. Two branches instead build
# an empty doctype token flagged 'force-quirks' and move to the data state
# (one of them un-consumes the character).
4121 tok_state_before_doctype_name = ->
4122 c = txt.charAt(cur++)
4123 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4126 tok_cur_tag = new_doctype_token c.toLowerCase()
4127 tok_state = tok_state_doctype_name
4131 tok_cur_tag = new_doctype_token "\ufffd"
4132 tok_state = tok_state_doctype_name
4136 el = new_doctype_token ''
4137 el.flag 'force-quirks', true
4138 tok_state = tok_state_data
4142 tok_state = tok_state_data
4143 el = new_doctype_token ''
4144 el.flag 'force-quirks', true
4145 cur -= 1 # Reconsume
4148 tok_cur_tag = new_doctype_token c
4149 tok_state = tok_state_doctype_name
4152 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# DOCTYPE name state. Whitespace moves to the after-DOCTYPE-name state. Other
# branches move to the data state (one of them also sets 'force-quirks' and
# un-consumes the character) or grow tok_cur_tag.name with the lowercased
# character, U+FFFD, or the raw character.
4153 tok_state_doctype_name = ->
4154 c = txt.charAt(cur++)
4155 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4156 tok_state = tok_state_after_doctype_name
4159 tok_state = tok_state_data
4162 tok_cur_tag.name += c.toLowerCase()
4166 tok_cur_tag.name += "\ufffd"
4170 tok_state = tok_state_data
4171 tok_cur_tag.flag 'force-quirks', true
4172 cur -= 1 # Reconsume
4175 tok_cur_tag.name += c
4178 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# After DOCTYPE name state. Branches move to the data state (one of them sets
# 'force-quirks' and un-consumes the character). Otherwise the six characters
# starting at the one just consumed (cur - 1) are compared
# case-insensitively with 'public' and 'system' to pick the matching
# after-keyword state; if neither matches, 'force-quirks' is set and the
# tokenizer moves to the bogus DOCTYPE state.
4179 tok_state_after_doctype_name = ->
4180 c = txt.charAt(cur++)
4181 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4184 tok_state = tok_state_data
4188 tok_state = tok_state_data
4189 tok_cur_tag.flag 'force-quirks', true
4190 cur -= 1 # Reconsume
4193 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4195 tok_state = tok_state_after_doctype_public_keyword
4197 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4199 tok_state = tok_state_after_doctype_system_keyword
4202 tok_cur_tag.flag 'force-quirks', true
4203 tok_state = tok_state_bogus_doctype
4206 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# After DOCTYPE public keyword state. Whitespace moves to the
# before-public-identifier state. Two branches reset
# tok_cur_tag.public_identifier to '' and enter the double-quoted or
# single-quoted public identifier state. The remaining branches set
# 'force-quirks' and move to the data state (one of them un-consumes the
# character) or to the bogus DOCTYPE state.
4207 tok_state_after_doctype_public_keyword = ->
4208 c = txt.charAt(cur++)
4209 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4210 tok_state = tok_state_before_doctype_public_identifier
4214 tok_cur_tag.public_identifier = ''
4215 tok_state = tok_state_doctype_public_identifier_double_quoted
4219 tok_cur_tag.public_identifier = ''
4220 tok_state = tok_state_doctype_public_identifier_single_quoted
4224 tok_cur_tag.flag 'force-quirks', true
4225 tok_state = tok_state_data
4229 tok_state = tok_state_data
4230 tok_cur_tag.flag 'force-quirks', true
4231 cur -= 1 # Reconsume
4235 tok_cur_tag.flag 'force-quirks', true
4236 tok_state = tok_state_bogus_doctype
4239 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Before DOCTYPE public identifier state. Two branches reset
# tok_cur_tag.public_identifier to '' and enter the double-quoted or
# single-quoted public identifier state. The remaining branches set
# 'force-quirks' and move to the data state (one of them un-consumes the
# character) or to the bogus DOCTYPE state.
4240 tok_state_before_doctype_public_identifier = ->
4241 c = txt.charAt(cur++)
4242 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4246 tok_cur_tag.public_identifier = ''
4247 tok_state = tok_state_doctype_public_identifier_double_quoted
4251 tok_cur_tag.public_identifier = ''
4252 tok_state = tok_state_doctype_public_identifier_single_quoted
4256 tok_cur_tag.flag 'force-quirks', true
4257 tok_state = tok_state_data
4261 tok_state = tok_state_data
4262 tok_cur_tag.flag 'force-quirks', true
4263 cur -= 1 # Reconsume
4267 tok_cur_tag.flag 'force-quirks', true
4268 tok_state = tok_state_bogus_doctype
4272 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# DOCTYPE public identifier (double-quoted) state. Possible outcomes: move to
# the after-public-identifier state; append U+FFFD to
# tok_cur_tag.public_identifier; set 'force-quirks' and move to the data
# state (one such branch un-consumes the character); or append the consumed
# character to the identifier.
4273 tok_state_doctype_public_identifier_double_quoted = ->
4274 c = txt.charAt(cur++)
4276 tok_state = tok_state_after_doctype_public_identifier
4280 tok_cur_tag.public_identifier += "\ufffd"
4284 tok_cur_tag.flag 'force-quirks', true
4285 tok_state = tok_state_data
4289 tok_state = tok_state_data
4290 tok_cur_tag.flag 'force-quirks', true
4291 cur -= 1 # Reconsume
4294 tok_cur_tag.public_identifier += c
4297 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# DOCTYPE public identifier (single-quoted) state. Possible outcomes: move to
# the after-public-identifier state; append U+FFFD to
# tok_cur_tag.public_identifier; set 'force-quirks' and move to the data
# state (one such branch un-consumes the character); or append the consumed
# character to the identifier.
4298 tok_state_doctype_public_identifier_single_quoted = ->
4299 c = txt.charAt(cur++)
4301 tok_state = tok_state_after_doctype_public_identifier
4305 tok_cur_tag.public_identifier += "\ufffd"
4309 tok_cur_tag.flag 'force-quirks', true
4310 tok_state = tok_state_data
4314 tok_state = tok_state_data
4315 tok_cur_tag.flag 'force-quirks', true
4316 cur -= 1 # Reconsume
4319 tok_cur_tag.public_identifier += c
4322 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# After DOCTYPE public identifier state. Whitespace moves to the
# between-public-and-system-identifiers state. Other branches: move to the
# data state; reset tok_cur_tag.system_identifier to '' and enter the
# double-quoted or single-quoted system identifier state; set 'force-quirks'
# while moving to the data state (un-consuming the character) or to the
# bogus DOCTYPE state.
4323 tok_state_after_doctype_public_identifier = ->
4324 c = txt.charAt(cur++)
4325 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4326 tok_state = tok_state_between_doctype_public_and_system_identifiers
4329 tok_state = tok_state_data
4333 tok_cur_tag.system_identifier = ''
4334 tok_state = tok_state_doctype_system_identifier_double_quoted
4338 tok_cur_tag.system_identifier = ''
4339 tok_state = tok_state_doctype_system_identifier_single_quoted
4343 tok_state = tok_state_data
4344 tok_cur_tag.flag 'force-quirks', true
4345 cur -= 1 # Reconsume
4349 tok_cur_tag.flag 'force-quirks', true
4350 tok_state = tok_state_bogus_doctype
4353 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Between DOCTYPE public and system identifiers state. Branches: move to the
# data state; reset tok_cur_tag.system_identifier to '' and enter the
# double-quoted or single-quoted system identifier state; set 'force-quirks'
# while moving to the data state (un-consuming the character) or to the
# bogus DOCTYPE state.
4354 tok_state_between_doctype_public_and_system_identifiers = ->
4355 c = txt.charAt(cur++)
4356 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4359 tok_state = tok_state_data
4363 tok_cur_tag.system_identifier = ''
4364 tok_state = tok_state_doctype_system_identifier_double_quoted
4368 tok_cur_tag.system_identifier = ''
4369 tok_state = tok_state_doctype_system_identifier_single_quoted
4373 tok_state = tok_state_data
4374 tok_cur_tag.flag 'force-quirks', true
4375 cur -= 1 # Reconsume
4379 tok_cur_tag.flag 'force-quirks', true
4380 tok_state = tok_state_bogus_doctype
4383 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# After DOCTYPE system keyword state. Whitespace moves to the
# before-system-identifier state. Two branches reset
# tok_cur_tag.system_identifier to '' and enter the double-quoted or
# single-quoted system identifier state. The remaining branches set
# 'force-quirks' and move to the data state (one of them un-consumes the
# character) or to the bogus DOCTYPE state.
4384 tok_state_after_doctype_system_keyword = ->
4385 c = txt.charAt(cur++)
4386 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4387 tok_state = tok_state_before_doctype_system_identifier
4391 tok_cur_tag.system_identifier = ''
4392 tok_state = tok_state_doctype_system_identifier_double_quoted
4396 tok_cur_tag.system_identifier = ''
4397 tok_state = tok_state_doctype_system_identifier_single_quoted
4401 tok_cur_tag.flag 'force-quirks', true
4402 tok_state = tok_state_data
4406 tok_state = tok_state_data
4407 tok_cur_tag.flag 'force-quirks', true
4408 cur -= 1 # Reconsume
4412 tok_cur_tag.flag 'force-quirks', true
4413 tok_state = tok_state_bogus_doctype
4416 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Before DOCTYPE system identifier state. Two branches reset
# tok_cur_tag.system_identifier to '' and enter the double-quoted or
# single-quoted system identifier state. The remaining branches set
# 'force-quirks' and move to the data state (one of them un-consumes the
# character) or to the bogus DOCTYPE state.
4417 tok_state_before_doctype_system_identifier = ->
4418 c = txt.charAt(cur++)
4419 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4422 tok_cur_tag.system_identifier = ''
4423 tok_state = tok_state_doctype_system_identifier_double_quoted
4426 tok_cur_tag.system_identifier = ''
4427 tok_state = tok_state_doctype_system_identifier_single_quoted
4431 tok_cur_tag.flag 'force-quirks', true
4432 tok_state = tok_state_data
4436 tok_state = tok_state_data
4437 tok_cur_tag.flag 'force-quirks', true
4438 cur -= 1 # Reconsume
4442 tok_cur_tag.flag 'force-quirks', true
4443 tok_state = tok_state_bogus_doctype
4446 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# DOCTYPE system identifier (double-quoted) state. Possible outcomes: move to
# the after-system-identifier state; append U+FFFD to
# tok_cur_tag.system_identifier; set 'force-quirks' and move to the data
# state (one such branch un-consumes the character); or append the consumed
# character to the identifier.
4447 tok_state_doctype_system_identifier_double_quoted = ->
4448 c = txt.charAt(cur++)
4450 tok_state = tok_state_after_doctype_system_identifier
4454 tok_cur_tag.system_identifier += "\ufffd"
4458 tok_cur_tag.flag 'force-quirks', true
4459 tok_state = tok_state_data
4463 tok_state = tok_state_data
4464 tok_cur_tag.flag 'force-quirks', true
4465 cur -= 1 # Reconsume
4468 tok_cur_tag.system_identifier += c
4471 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# DOCTYPE system identifier (single-quoted) state. Possible outcomes: move to
# the after-system-identifier state; append U+FFFD to
# tok_cur_tag.system_identifier; set 'force-quirks' and move to the data
# state (one such branch un-consumes the character); or append the consumed
# character to the identifier.
4472 tok_state_doctype_system_identifier_single_quoted = ->
4473 c = txt.charAt(cur++)
4475 tok_state = tok_state_after_doctype_system_identifier
4479 tok_cur_tag.system_identifier += "\ufffd"
4483 tok_cur_tag.flag 'force-quirks', true
4484 tok_state = tok_state_data
4488 tok_state = tok_state_data
4489 tok_cur_tag.flag 'force-quirks', true
4490 cur -= 1 # Reconsume
4493 tok_cur_tag.system_identifier += c
4496 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# After DOCTYPE system identifier state. Branches: move to the data state;
# move to the data state with 'force-quirks' set and the character
# un-consumed; or move to the bogus DOCTYPE state, deliberately without
# setting 'force-quirks'.
4497 tok_state_after_doctype_system_identifier = ->
4498 c = txt.charAt(cur++)
4499 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4502 tok_state = tok_state_data
4506 tok_state = tok_state_data
4507 tok_cur_tag.flag 'force-quirks', true
4508 cur -= 1 # Reconsume
4512 # do _not_ tok_cur_tag.flag 'force-quirks', true
4513 tok_state = tok_state_bogus_doctype
4516 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Bogus DOCTYPE state. Consumes one character per call; two branches return to
# the data state, one of which un-consumes the character first.
4517 tok_state_bogus_doctype = ->
4518 c = txt.charAt(cur++)
4520 tok_state = tok_state_data
4523 tok_state = tok_state_data
4524 cur -= 1 # Reconsume
4529 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# CDATA section state. Switches back to the data state up front, then takes
# the text from cur up to the next ']]>' (or the rest of txt in the other
# branch) and returns it as a single character token.
# Note: next_gt holds the index of the ']]>' terminator, not of a lone '>'.
4530 tok_state_cdata_section = ->
4531 tok_state = tok_state_data
4532 next_gt = txt.indexOf ']]>', cur
4534 val = txt.substr cur
4537 val = txt.substr cur, (next_gt - cur)
4540 return new_character_token val # fixfull split
4543 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4544 # Don't set this as a state, just call it
4545 # returns a string (NOT a text node)
# Attempts to consume a character reference at txt[cur].
#
# allowed_char: an additional character which, when it is the next character,
#   is handled by the same case as whitespace, '<', '&' and end of input.
# in_attr: passed as true by the attribute value states; presumably it
#   enables the attribute-only exceptions commented on in the legacy
#   char-ref loop below (TODO confirm).
#
# Numeric references: a following 'x'/'X' is checked for, digits are scanned
# against charset, leading zeros are stripped and the rest is parsed with
# parseInt in the selected base. unicode_fixes supplies replacements for
# specific code points; surrogates, values above 0x10ffff, and a list of
# control/noncharacter code points are tested separately before
# from_code_point converts the value.
# Named references: decode_named_char_ref is tried when the scanned name is
# followed by ';'; otherwise legacy_char_refs is searched by prefix length.
4546 parse_character_reference = (allowed_char = null, in_attr = false) ->
4547 if cur >= txt.length
4549 switch c = txt.charAt(cur)
4550 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4551 # explicitly not a parse error
4554 # there has to be "one or more" alnums between & and ; to be a parse error
4557 if cur + 1 >= txt.length
4559 if txt.charAt(cur + 1).toLowerCase() is 'x'
4568 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4573 if txt.charAt(start + i) is ';'
4577 code_point = txt.substr(start, i)
4578 while code_point.charAt(0) is '0' and code_point.length > 1
4579 code_point = code_point.substr 1
4580 code_point = parseInt(code_point, base)
4581 if unicode_fixes[code_point]?
4583 return unicode_fixes[code_point]
4585 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4589 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4591 return from_code_point code_point
4595 if alnum.indexOf(txt.charAt(cur + i)) is -1
4598 # exit early, because parse_error() below needs at least one alnum
4600 if txt.charAt(cur + i) is ';'
4601 i += 1 # include ';' terminator in value
4602 decoded = decode_named_char_ref txt.substr(cur, i)
4609 # no ';' terminator (only legacy char refs)
4611 for i in [2..max] # no prefix matches, so ok to check shortest first
4612 c = legacy_char_refs[txt.substr(cur, i)]
4615 if txt.charAt(cur + i) is '='
4616 # "because some legacy user agents will
4617 # misinterpret the markup in those cases"
4620 if alnum.indexOf(txt.charAt(cur + i)) > -1
4621 # this makes attributes forgiving about url args
4623 # ok, and besides the weird exceptions for attributes...
4624 # return the matching char
4625 cur += i # consume entity chars
4626 parse_error() # because no terminating ";"
4630 return # never reached
4632 # tree constructor initialization
4633 # see comments on TYPE_TAG/etc for the structure of this data
4636 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4637 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4639 afe = [] # active formatting elements
4640 template_ins_modes = []
4641 ins_mode = ins_mode_initial
4642 original_ins_mode = ins_mode # TODO check spec
4643 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4644 flag_frameset_ok = true
4646 flag_foster_parenting = false
4647 form_element_pointer = null
4648 temporary_buffer = null
4649 pending_table_character_tokens = []
4650 head_element_pointer = null
4651 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4652 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4653 prev_node_id = 0 # just for debugging
4655 # tokenizer initialization
4656 tok_state = tok_state_data
4658 # text pre-processing
4659 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4660 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4661 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4662 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4664 if args.name is "tests23.dat #1"
4667 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4672 # fixfull parse error if the token has the self-closing flag, but it wasn't acknowledged
# serialize_els: appends the result of t.serialize(shallow, show_ids) onto
# `serialized`; `shallow` and `show_ids` are handed to serialize() untouched.
# NOTE(review): `t` is presumably each entry of `els` in turn, and
# `serialized` presumably the value given back to the caller -- confirm
# against the full body.
4675 serialize_els = (els, shallow, show_ids) ->
4681 serialized += t.serialize shallow, show_ids
# Public API of this file. Every entry in the table below is attached to
# module.exports under the same name, in the order listed: the parser entry
# point, the debug-log helpers, and the TYPE_* / NS_* / QUIRKS_* constants.
public_api =
	parse_html: parse_html
	debug_log_reset: debug_log_reset
	debug_log_each: debug_log_each
	TYPE_TAG: TYPE_TAG
	TYPE_TEXT: TYPE_TEXT
	TYPE_COMMENT: TYPE_COMMENT
	TYPE_DOCTYPE: TYPE_DOCTYPE
	NS_HTML: NS_HTML
	NS_MATHML: NS_MATHML
	NS_SVG: NS_SVG
	QUIRKS_NO: QUIRKS_NO
	QUIRKS_LIMITED: QUIRKS_LIMITED
	QUIRKS_YES: QUIRKS_YES

for api_name, api_value of public_api
	module.exports[api_name] = api_value