1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WHATWG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # the spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
# Outside a CommonJS-style environment (no module.exports), fall back to a
# local `module` stand-in whose exports object is window.wheic.
# NOTE(review): a line between these two is not visible in this excerpt;
# presumably it creates window.wheic — confirm.
57 unless module?.exports?
59 module = exports: window.wheic
# Convert the numeric code point x into a string.
# NOTE(review): the branch lines between the returns below are not visible in
# this excerpt; confirm the exact conditions against the full file.
61 from_code_point = (x) ->
# prefer the native implementation when the runtime provides one
62 if String.fromCodePoint?
63 return String.fromCodePoint x
# fallback: a single UTF-16 code unit (presumably only for x below 0x10000 — confirm)
66 return String.fromCharCode x
# fallback: a surrogate pair, high surrogate based at 0xd800 and low
# surrogate (the bottom 10 bits of x) based at 0xdc00
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
87 # quirks mode constants
# Walk the entries of g_debug_log on behalf of the callback cb.
# NOTE(review): the loop body is not visible in this excerpt; presumably it
# passes each entry to cb — confirm.
97 debug_log_each = (cb) ->
98 for str in g_debug_log
# Build a node/token of the given type. `args` may supply any of the fields
# below; each one falls back to the default shown when it is absent.
103 constructor: (type, args = {}) ->
104 @type = type # one of the TYPE_* constants above
105 @name = args.name ? '' # tag name
106 @text = args.text ? '' # contents for text/comment nodes
107 @attrs = args.attrs ? {}
# NOTE(review): this reads args.attr_k although the field is named attrs_a —
# confirm whether callers really pass `attr_k` or whether `args.attrs_a` was
# intended.
108 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
109 @children = args.children ? []
110 @namespace = args.namespace ? NS_HTML
111 @parent = args.parent ? null
112 @token = args.token ? null
113 @flags = args.flags ? {}
# NOTE(review): lines between @flags and @id are not visible in this excerpt.
# the id is a string made from the pre-incremented prev_node_id counter
117 @id = "#{++prev_node_id}"
# Record that the self-closing flag was acknowledged by setting the
# 'did_self_close' flag, either on @token or on this node itself.
# NOTE(review): the condition that chooses between the two is not visible in
# this excerpt.
118 acknowledge_self_closing: ->
120 @token.flag 'did_self_close', true
122 @flag 'did_self_close', true
123 flag: (key, value = null) ->
# Build a compact string form of this node (accumulated in `ret`) for use by
# the unit tests. `shallow` and `show_ids` are passed through unchanged when
# children are serialized.
# NOTE(review): most of the branch and loop headers of this method are not
# visible in this excerpt; the per-line notes below describe only what each
# visible statement appends.
128 serialize: (shallow = false, show_ids = false) -> # for unit tests
# the tag name, JSON-quoted
133 ret += JSON.stringify @name
# one attribute as JSON-quoted key, a colon, then the JSON-quoted value
148 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
# a child node, serialized recursively with the same options
154 ret += c.serialize shallow, show_ids
# the node's text, JSON-quoted (two separate branches append it)
158 ret += JSON.stringify @text
161 ret += JSON.stringify @text
# doctype: name, then public and system identifiers ('' when unset)
163 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
166 when TYPE_AAA_BOOKMARK
167 ret += 'aaa_bookmark'
# unrecognized node: log it instead of serializing
170 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
173 # helpers: (only take args that are normally known when parser creates nodes)
# Factory: start-tag token (TYPE_START_TAG) carrying only its tag name.
new_open_tag = (name) ->
  new Node(TYPE_START_TAG, {name: name})
# Factory: end-tag token (TYPE_END_TAG) carrying only its tag name.
new_end_tag = (name) ->
  new Node(TYPE_END_TAG, {name: name})
# Factory: element node (TYPE_TAG) with the given tag name and default fields.
new_element = (name) ->
  new Node(TYPE_TAG, {name: name})
# Factory: text node (TYPE_TEXT) holding txt. Character tokens are built the
# same way, so new_character_token is simply an alias.
new_text_node = (txt) ->
  new Node(TYPE_TEXT, {text: txt})
new_character_token = new_text_node
# Factory: comment token (TYPE_COMMENT) whose text is txt.
new_comment_token = (txt) ->
  new Node(TYPE_COMMENT, {text: txt})
# Factory: doctype token (TYPE_DOCTYPE) with the given name.
new_doctype_token = (name) ->
  new Node(TYPE_DOCTYPE, {name: name})
188 return new Node TYPE_EOF
190 return new Node TYPE_AFE_MARKER
# Factory: placeholder node of type TYPE_AAA_BOOKMARK (all other fields default).
new_aaa_bookmark = ->
  new Node(TYPE_AAA_BOOKMARK)
# Character classes, each expressed as a plain string of member characters.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = [lc_alpha, uc_alpha, digits].join ''
hex_chars = digits + "abcdefABCDEF"

# True only when str is exactly one character and that character is an
# ASCII upper-case letter.
is_uc_alpha = (str) ->
  return false unless str.length is 1
  uc_alpha.indexOf(str) isnt -1

# True only when str is exactly one character and that character is an
# ASCII lower-case letter.
is_lc_alpha = (str) ->
  return false unless str.length is 1
  lc_alpha.indexOf(str) isnt -1

# some SVG elements have dashes in them
tag_name_chars = alnum + "-"
208 # http://www.w3.org/TR/html5/infrastructure.html#space-character
209 space_chars = "\u0009\u000a\u000c\u000d\u0020"
211 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token t is a text token whose text is exactly one character
# taken from space_chars.
is_space_tok = (t) ->
  return false unless t.type is TYPE_TEXT
  return false unless t.text.length is 1
  space_chars.indexOf(t.text) > -1
# Test whether token t is a "hidden" input start tag.
# NOTE(review): the loop over the token's attributes and the attribute-name
# test are not visible in this excerpt; presumably `a` is a [key, value] pair
# and the value checked is that of the `type` attribute — confirm.
215 is_input_hidden_tok = (t) ->
# only start-tag tokens can qualify
216 return false unless t.type is TYPE_START_TAG
# the attribute value is compared case-insensitively
219 if a[1].toLowerCase() is 'hidden'
224 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
225 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
228 unicode_fixes[0x00] = "\uFFFD"
229 unicode_fixes[0x80] = "\u20AC"
230 unicode_fixes[0x82] = "\u201A"
231 unicode_fixes[0x83] = "\u0192"
232 unicode_fixes[0x84] = "\u201E"
233 unicode_fixes[0x85] = "\u2026"
234 unicode_fixes[0x86] = "\u2020"
235 unicode_fixes[0x87] = "\u2021"
236 unicode_fixes[0x88] = "\u02C6"
237 unicode_fixes[0x89] = "\u2030"
238 unicode_fixes[0x8A] = "\u0160"
239 unicode_fixes[0x8B] = "\u2039"
240 unicode_fixes[0x8C] = "\u0152"
241 unicode_fixes[0x8E] = "\u017D"
242 unicode_fixes[0x91] = "\u2018"
243 unicode_fixes[0x92] = "\u2019"
244 unicode_fixes[0x93] = "\u201C"
245 unicode_fixes[0x94] = "\u201D"
246 unicode_fixes[0x95] = "\u2022"
247 unicode_fixes[0x96] = "\u2013"
248 unicode_fixes[0x97] = "\u2014"
249 unicode_fixes[0x98] = "\u02DC"
250 unicode_fixes[0x99] = "\u2122"
251 unicode_fixes[0x9A] = "\u0161"
252 unicode_fixes[0x9B] = "\u203A"
253 unicode_fixes[0x9C] = "\u0153"
254 unicode_fixes[0x9E] = "\u017E"
255 unicode_fixes[0x9F] = "\u0178"
257 quirks_yes_pi_prefixes = [
258 "+//silmaril//dtd html pro v0r11 19970101//"
259 "-//as//dtd html 3.0 aswedit + extensions//"
260 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
261 "-//ietf//dtd html 2.0 level 1//"
262 "-//ietf//dtd html 2.0 level 2//"
263 "-//ietf//dtd html 2.0 strict level 1//"
264 "-//ietf//dtd html 2.0 strict level 2//"
265 "-//ietf//dtd html 2.0 strict//"
266 "-//ietf//dtd html 2.0//"
267 "-//ietf//dtd html 2.1e//"
268 "-//ietf//dtd html 3.0//"
269 "-//ietf//dtd html 3.2 final//"
270 "-//ietf//dtd html 3.2//"
271 "-//ietf//dtd html 3//"
272 "-//ietf//dtd html level 0//"
273 "-//ietf//dtd html level 1//"
274 "-//ietf//dtd html level 2//"
275 "-//ietf//dtd html level 3//"
276 "-//ietf//dtd html strict level 0//"
277 "-//ietf//dtd html strict level 1//"
278 "-//ietf//dtd html strict level 2//"
279 "-//ietf//dtd html strict level 3//"
280 "-//ietf//dtd html strict//"
281 "-//ietf//dtd html//"
282 "-//metrius//dtd metrius presentational//"
283 "-//microsoft//dtd internet explorer 2.0 html strict//"
284 "-//microsoft//dtd internet explorer 2.0 html//"
285 "-//microsoft//dtd internet explorer 2.0 tables//"
286 "-//microsoft//dtd internet explorer 3.0 html strict//"
287 "-//microsoft//dtd internet explorer 3.0 html//"
288 "-//microsoft//dtd internet explorer 3.0 tables//"
289 "-//netscape comm. corp.//dtd html//"
290 "-//netscape comm. corp.//dtd strict html//"
291 "-//o'reilly and associates//dtd html 2.0//"
292 "-//o'reilly and associates//dtd html extended 1.0//"
293 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
294 "-//sq//dtd html 2.0 hotmetal + extensions//"
295 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
296 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
297 "-//spyglass//dtd html 2.0 extended//"
298 "-//sun microsystems corp.//dtd hotjava html//"
299 "-//sun microsystems corp.//dtd hotjava strict html//"
300 "-//w3c//dtd html 3 1995-03-24//"
301 "-//w3c//dtd html 3.2 draft//"
302 "-//w3c//dtd html 3.2 final//"
303 "-//w3c//dtd html 3.2//"
304 "-//w3c//dtd html 3.2s draft//"
305 "-//w3c//dtd html 4.0 frameset//"
306 "-//w3c//dtd html 4.0 transitional//"
307 "-//w3c//dtd html experimental 19960712//"
308 "-//w3c//dtd html experimental 970421//"
309 "-//w3c//dtd w3 html//"
310 "-//w3o//dtd w3 html 3.0//"
311 "-//webtechs//dtd mozilla html 2.0//"
312 "-//webtechs//dtd mozilla html//"
315 # These are the character references that don't need a terminating semicolon
316 # min length: 2, max: 6, none are a prefix of any other.
318 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
319 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
320 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
321 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
322 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
323 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
324 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
325 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
326 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
327 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
328 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
329 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
330 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
331 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
332 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
333 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
334 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag names grouped by how their content is handled: elements with no
# content at all, raw text elements, and escapable raw text elements.
void_elements = [
  'area', 'base', 'br', 'col', 'embed'
  'hr', 'img', 'input', 'keygen', 'link'
  'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
  'script'
  'style'
]
escapable_raw_text_elements = [
  'textarea'
  'title'
]
341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
343 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
344 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
345 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
346 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
347 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
348 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
349 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
350 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
351 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
352 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
353 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
354 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
355 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
356 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
362 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
363 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
364 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
365 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
366 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
367 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
368 'determinant', 'diff', 'divergence', 'divide', 'domain',
369 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
370 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
371 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
372 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
373 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
374 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
375 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
376 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
377 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
378 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
379 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
380 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
381 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
382 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
383 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
384 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
385 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
386 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
387 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
388 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
389 'vectorproduct', 'xor'
391 # foreign_elements = [svg_elements..., mathml_elements...]
392 #normal_elements = All other allowed HTML elements are normal elements.
396 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
397 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
398 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
399 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
400 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
401 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
402 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
403 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
404 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
405 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
406 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
408 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
410 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
411 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
412 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
413 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
414 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
415 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
416 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
419 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
420 'annotation-xml':NS_MATHML,
423 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
426 formatting_elements = {
427 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
428 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
432 mathml_text_integration = {
433 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el's name is listed in mathml_text_integration under the same
# namespace that el itself has.
is_mathml_text_integration_point = (el) ->
  listed_namespace = mathml_text_integration[el.name]
  listed_namespace is el.namespace
# Decide whether element el is an HTML integration point.
# It reads el.attrs (the finished attribute object); presumably that is why a
# token must not be passed — confirm.
# NOTE(review): the return statements are not visible in this excerpt; only
# the conditions are.
437 is_html_integration = (el) -> # DON'T PASS A TOKEN
# MathML: only annotation-xml is considered, and only when it has an encoding
# attribute equal (case-insensitively) to text/html or application/xhtml+xml
438 if el.namespace is NS_MATHML
439 if el.name is 'annotation-xml'
440 if el.attrs.encoding?
441 if el.attrs.encoding.toLowerCase() is 'text/html'
443 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
# SVG: foreignObject, desc and title are considered
446 if el.namespace is NS_SVG
447 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
452 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
455 foster_parenting_targets = {
# True when e's name appears in special_elements under e's own namespace.
el_is_special = (e) ->
  listed_namespace = special_elements[e.name]
  listed_namespace is e.namespace
# The three HTML elements (address, div, p) excluded by el_is_special_not_adp.
adp_els =
  address: NS_HTML
  div: NS_HTML
  p: NS_HTML

# True when el is listed in special_elements under its own namespace and is
# not one of the adp_els entries.
el_is_special_not_adp = (el) ->
  return false unless special_elements[el.name] is el.namespace
  adp_els[el.name] isnt el.namespace
485 altglyphdef: 'altGlyphDef'
486 altglyphitem: 'altGlyphItem'
487 animatecolor: 'animateColor'
488 animatemotion: 'animateMotion'
489 animatetransform: 'animateTransform'
492 fecolormatrix: 'feColorMatrix'
493 fecomponenttransfer: 'feComponentTransfer'
494 fecomposite: 'feComposite'
495 feconvolvematrix: 'feConvolveMatrix'
496 fediffuselighting: 'feDiffuseLighting'
497 fedisplacementmap: 'feDisplacementMap'
498 fedistantlight: 'feDistantLight'
499 fedropshadow: 'feDropShadow'
505 fegaussianblur: 'feGaussianBlur'
508 femergenode: 'feMergeNode'
509 femorphology: 'feMorphology'
511 fepointlight: 'fePointLight'
512 fespecularlighting: 'feSpecularLighting'
513 fespotlight: 'feSpotLight'
515 feturbulence: 'feTurbulence'
516 foreignobject: 'foreignObject'
518 lineargradient: 'linearGradient'
519 radialgradient: 'radialGradient'
522 svg_attribute_fixes = {
523 attributename: 'attributeName'
524 attributetype: 'attributeType'
525 basefrequency: 'baseFrequency'
526 baseprofile: 'baseProfile'
528 clippathunits: 'clipPathUnits'
529 contentscripttype: 'contentScriptType'
530 contentstyletype: 'contentStyleType'
531 diffuseconstant: 'diffuseConstant'
533 externalresourcesrequired: 'externalResourcesRequired'
534 # WHATWG removes this: filterres: 'filterRes'
535 filterunits: 'filterUnits'
537 gradienttransform: 'gradientTransform'
538 gradientunits: 'gradientUnits'
539 kernelmatrix: 'kernelMatrix'
540 kernelunitlength: 'kernelUnitLength'
541 keypoints: 'keyPoints'
542 keysplines: 'keySplines'
544 lengthadjust: 'lengthAdjust'
545 limitingconeangle: 'limitingConeAngle'
546 markerheight: 'markerHeight'
547 markerunits: 'markerUnits'
548 markerwidth: 'markerWidth'
549 maskcontentunits: 'maskContentUnits'
550 maskunits: 'maskUnits'
551 numoctaves: 'numOctaves'
552 pathlength: 'pathLength'
553 patterncontentunits: 'patternContentUnits'
554 patterntransform: 'patternTransform'
555 patternunits: 'patternUnits'
556 pointsatx: 'pointsAtX'
557 pointsaty: 'pointsAtY'
558 pointsatz: 'pointsAtZ'
559 preservealpha: 'preserveAlpha'
560 preserveaspectratio: 'preserveAspectRatio'
561 primitiveunits: 'primitiveUnits'
564 repeatcount: 'repeatCount'
565 repeatdur: 'repeatDur'
566 requiredextensions: 'requiredExtensions'
567 requiredfeatures: 'requiredFeatures'
568 specularconstant: 'specularConstant'
569 specularexponent: 'specularExponent'
570 spreadmethod: 'spreadMethod'
571 startoffset: 'startOffset'
572 stddeviation: 'stdDeviation'
573 stitchtiles: 'stitchTiles'
574 surfacescale: 'surfaceScale'
575 systemlanguage: 'systemLanguage'
576 tablevalues: 'tableValues'
579 textlength: 'textLength'
581 viewtarget: 'viewTarget'
582 xchannelselector: 'xChannelSelector'
583 ychannelselector: 'yChannelSelector'
584 zoomandpan: 'zoomAndPan'
586 foreign_attr_fixes = {
587 'xlink:actuate': 'xlink actuate'
588 'xlink:arcrole': 'xlink arcrole'
589 'xlink:href': 'xlink href'
590 'xlink:role': 'xlink role'
591 'xlink:show': 'xlink show'
592 'xlink:title': 'xlink title'
593 'xlink:type': 'xlink type'
594 'xml:base': 'xml base'
595 'xml:lang': 'xml lang'
596 'xml:space': 'xml space'
598 'xmlns:xlink': 'xmlns xlink'
# Restore the mixed-case spelling of the MathML attribute name
# 'definitionurl' on token t.
# NOTE(review): the loop header is not visible in this excerpt; presumably
# `a` iterates over the token's [key, value] attribute pairs — confirm.
600 adjust_mathml_attributes = (t) ->
# a[0] is the attribute name; it is rewritten in place
602 if a[0] is 'definitionurl'
603 a[0] = 'definitionURL'
# Rewrite attribute names on token t that have an entry in
# svg_attribute_fixes to the replacement spelling stored there.
# NOTE(review): the loop header is not visible in this excerpt; presumably
# `a` iterates over the token's [key, value] attribute pairs — confirm.
605 adjust_svg_attributes = (t) ->
# a[0] is the attribute name; it is rewritten in place
607 if svg_attribute_fixes[a[0]]?
608 a[0] = svg_attribute_fixes[a[0]]
# Rewrite attribute names on token t that have an entry in
# foreign_attr_fixes to the replacement stored there.
# NOTE(review): the loop header is not visible in this excerpt; presumably
# `a` iterates over the token's [key, value] attribute pairs — confirm.
610 adjust_foreign_attributes = (t) ->
# a[0] is the attribute name; it is rewritten in place
613 if foreign_attr_fixes[a[0]]?
614 a[0] = foreign_attr_fixes[a[0]]
617 # decode_named_char_ref()
619 # The list of named character references is _huge_ so ask the browser to decode
620 # for us instead of wasting bandwidth/space on including the table here.
622 # Pass without the "&" but with the ";" examples:
623 # for "&amp;" pass "amp;"
624 # for "′" pass "x2032;"
627 textarea: document.createElement('textarea')
629 # TODO test this in IE8
# Decode a character reference by letting the browser parse it, caching
# successful results in g_dncr.cache.
# Returns the decoded text, or null when the browser left the text unchanged.
# NOTE(review): one line between the signature and the cache lookup is not
# visible in this excerpt.
630 decode_named_char_ref = (txt) ->
# reuse an earlier result when there is one
632 decoded = g_dncr.cache[txt]
633 return decoded if decoded?
# parse txt as the markup of the scratch textarea and read the text back
634 g_dncr.textarea.innerHTML = txt
635 decoded = g_dncr.textarea.value
# unchanged text means nothing was decoded; this outcome is not cached
636 return null if decoded is txt
# remember the decoded text and return it
637 return g_dncr.cache[txt] = decoded
639 parse_html = (args) ->
641 cur = null # index of next char in txt to be parsed
642 # declare doc and tokenizer variables so they're in scope below
644 open_els = null # stack of open elements
645 afe = null # active formatting elements
646 template_ins_modes = null
648 original_ins_mode = null
650 tok_cur_tag = null # partially parsed tag
651 flag_scripting = null
652 flag_frameset_ok = null
654 flag_foster_parenting = null
655 form_element_pointer = null
656 temporary_buffer = null
657 pending_table_character_tokens = null
658 head_element_pointer = null
659 flag_fragment_parsing = null
660 context_element = null
669 console.log "Parse error at character #{cur} of #{txt.length}"
671 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
672 # "Noah's Ark clause" but with three
# Add new_el to the list of active formatting elements, first comparing it
# against existing entries by name, namespace and attributes.
# NOTE(review): the loop headers, the counting of matches and the push itself
# are not visible in this excerpt; only the comparisons are.
673 afe_push = (new_el) ->
# a marker entry is tested for (what happens on a match is not visible here)
676 if el.type is TYPE_AFE_MARKER
# candidates must have the same name and namespace...
678 if el.name is new_el.name and el.namespace is new_el.namespace
# ...and attributes are compared in both directions
681 unless new_el.attrs[k] is v
685 for k, v of new_el.attrs
686 unless el.attrs[k] is v
696 afe.unshift new_afe_marker()
698 # the functions below implement the Tree Construction algorithm
699 # http://www.w3.org/TR/html5/syntax.html#tree-construction
701 # But first... the helpers
# Look for an HTML-namespace `template` element.
# NOTE(review): the loop header and return statements are not visible in this
# excerpt; presumably the open elements stack is searched — confirm.
702 template_tag_is_open = ->
704 if el.name is 'template' and el.namespace is NS_HTML
# Generic "has an element in scope" test. `scope` maps tag names to the
# namespace under which they count as scope boundaries; `namespace` may be
# null to match tag_name in any namespace.
# NOTE(review): the loop header and return statements are not visible in this
# excerpt.
707 is_in_scope_x = (tag_name, scope, namespace) ->
# target check: same name, and same namespace unless namespace is null
709 if el.name is tag_name and (namespace is null or namespace is el.namespace)
# boundary check: el is listed in scope under its own namespace
711 if scope[el.name] is el.namespace
# Same test as is_in_scope_x, but with two boundary tables (scope and scope2)
# that are both consulted.
# NOTE(review): the loop header and return statements are not visible in this
# excerpt.
714 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
# target check: same name, and same namespace unless namespace is null
716 if el.name is tag_name and (namespace is null or namespace is el.namespace)
# boundary checks: el is listed in either table under its own namespace
718 if scope[el.name] is el.namespace
720 if scope2[el.name] is el.namespace
724 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
725 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
728 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
729 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
731 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
733 button_scopers = button: NS_HTML
734 li_scopers = ol: NS_HTML, ul: NS_HTML
735 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Scope test using standard_scopers as the only boundary table.
# namespace defaults to null, which matches tag_name in any namespace.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope test using standard_scopers plus button_scopers as boundary tables.
# namespace defaults to null, which matches tag_name in any namespace.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope test using table_scopers as the only boundary table.
# namespace defaults to null, which matches tag_name in any namespace.
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
742 # aka is_in_list_item_scope
# Scope test using standard_scopers plus li_scopers as boundary tables.
# namespace defaults to null, which matches tag_name in any namespace.
is_in_li_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
# "Has an element in select scope" test. namespace defaults to null, which
# matches tag_name in any namespace.
# NOTE(review): the loop header and return statements are not visible in this
# excerpt.
745 is_in_select_scope = (tag_name, namespace = null) ->
# target check: same name, and same namespace unless namespace is null
747 if t.name is tag_name and (namespace is null or namespace is t.namespace)
# NOTE(review): the HTML spec defines select scope as bounded by every element
# type except HTML optgroup and option. As written, this condition is only
# true for elements outside the HTML namespace that are also not named
# optgroup/option, so other HTML elements never satisfy it — confirm whether
# `t.namespace isnt NS_HTML or (...)` was intended.
749 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
752 # this checks for a particular element, not by name
753 # this requires a namespace match
# Scope test for one specific element object (needle) rather than a tag name.
# NOTE(review): the loop header, the identity comparison against needle and
# the return statements are not visible in this excerpt.
754 el_is_in_scope = (needle) ->
# boundary check: el is listed in standard_scopers under its own namespace
758 if standard_scopers[el.name] is el.namespace
762 clear_to_table_stopers = {
# Works on the current node (open_els[0]) until its name is one of the
# entries in clear_to_table_stopers.
# NOTE(review): the loop and the pop are not visible in this excerpt.
# NOTE(review): this test only checks that the name is present in the table;
# unlike clear_stack_to_table_body_context it does not compare the namespace —
# confirm that is intended.
767 clear_stack_to_table_context = ->
769 if clear_to_table_stopers[open_els[0].name]?
773 clear_to_table_body_stopers = {
# Works on the current node (open_els[0]) until it is listed in
# clear_to_table_body_stopers under its own namespace.
# NOTE(review): the loop and the pop are not visible in this excerpt.
780 clear_stack_to_table_body_context = ->
782 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
786 clear_to_table_row_stopers = {
# Works on the current node (open_els[0]) until its name is one of the
# entries in clear_to_table_row_stopers.
# NOTE(review): the loop and the pop are not visible in this excerpt.
# NOTE(review): this test only checks that the name is present in the table;
# unlike clear_stack_to_table_body_context it does not compare the namespace —
# confirm that is intended.
791 clear_stack_to_table_row_context = ->
793 if clear_to_table_row_stopers[open_els[0].name]?
# Works through the list of active formatting elements, testing entries for
# being a marker (TYPE_AFE_MARKER).
# NOTE(review): the loop and the removal of entries are not visible in this
# excerpt; presumably entries are removed up to and including the first
# marker — confirm.
797 clear_afe_to_marker = ->
# nothing to do when the list is empty
799 return unless afe.length > 0 # this happens in fragment case, ?spec error
801 if el.type is TYPE_AFE_MARKER
806 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
808 # 1. Let last be false.
810 # 2. Let node be the last node in the stack of open elements.
812 node = open_els[node_i]
813 # 3. Loop: If node is the first node in the stack of open elements,
814 # then set last to true, and, if the parser was originally created as
815 # part of the HTML fragment parsing algorithm (fragment case) set node
816 # to the context element.
818 if node_i is open_els.length - 1
820 # fixfull (fragment case)
822 # 4. If node is a select element, run these substeps:
823 if node.name is 'select' and node.namespace is NS_HTML
824 # 1. If last is true, jump to the step below labeled done.
826 # 2. Let ancestor be node.
829 # 3. Loop: If ancestor is the first node in the stack of
830 # open elements, jump to the step below labeled done.
832 if ancestor_i is open_els.length - 1
834 # 4. Let ancestor be the node before ancestor in the stack
837 ancestor = open_els[ancestor_i]
838 # 5. If ancestor is a template node, jump to the step below
840 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
842 # 6. If ancestor is a table node, switch the insertion mode
843 # to "in select in table" and abort these steps.
844 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
845 ins_mode = ins_mode_in_select_in_table
847 # 7. Jump back to the step labeled loop.
848 # 8. Done: Switch the insertion mode to "in select" and abort
850 ins_mode = ins_mode_in_select
852 # 5. If node is a td or th element and last is false, then switch
853 # the insertion mode to "in cell" and abort these steps.
854 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
855 ins_mode = ins_mode_in_cell
857 # 6. If node is a tr element, then switch the insertion mode to "in
858 # row" and abort these steps.
859 if node.name is 'tr' and node.namespace is NS_HTML
860 ins_mode = ins_mode_in_row
862 # 7. If node is a tbody, thead, or tfoot element, then switch the
863 # insertion mode to "in table body" and abort these steps.
864 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
865 ins_mode = ins_mode_in_table_body
867 # 8. If node is a caption element, then switch the insertion mode
868 # to "in caption" and abort these steps.
869 if node.name is 'caption' and node.namespace is NS_HTML
870 ins_mode = ins_mode_in_caption
872 # 9. If node is a colgroup element, then switch the insertion mode
873 # to "in column group" and abort these steps.
874 if node.name is 'colgroup' and node.namespace is NS_HTML
875 ins_mode = ins_mode_in_column_group
877 # 10. If node is a table element, then switch the insertion mode to
878 # "in table" and abort these steps.
879 if node.name is 'table' and node.namespace is NS_HTML
880 ins_mode = ins_mode_in_table
882 # 11. If node is a template element, then switch the insertion mode
883 # to the current template insertion mode and abort these steps.
884 if node.name is 'template' and node.namespace is NS_HTML
885 ins_mode = template_ins_modes[0]
887 # 12. If node is a head element and last is true, then switch the
888 # insertion mode to "in body" ("in body"! not "in head"!) and abort
889 # these steps. (fragment case)
890 if node.name is 'head' and node.namespace is NS_HTML and last
891 ins_mode = ins_mode_in_body
893 # 13. If node is a head element and last is false, then switch the
894 # insertion mode to "in head" and abort these steps.
895 if node.name is 'head' and node.namespace is NS_HTML and last is false
896 ins_mode = ins_mode_in_head
898 # 14. If node is a body element, then switch the insertion mode to
899 # "in body" and abort these steps.
900 if node.name is 'body' and node.namespace is NS_HTML
901 ins_mode = ins_mode_in_body
903 # 15. If node is a frameset element, then switch the insertion mode
904 # to "in frameset" and abort these steps. (fragment case)
905 if node.name is 'frameset' and node.namespace is NS_HTML
906 ins_mode = ins_mode_in_frameset
908 # 16. If node is an html element, run these substeps:
909 if node.name is 'html' and node.namespace is NS_HTML
910 # 1. If the head element pointer is null, switch the insertion
911 # mode to "before head" and abort these steps. (fragment case)
912 if head_element_pointer is null
913 ins_mode = ins_mode_before_head
915 # 2. Otherwise, the head element pointer is not null,
916 # switch the insertion mode to "after head" and abort these
918 ins_mode = ins_mode_after_head
920 # 17. If last is true, then switch the insertion mode to "in body"
921 # and abort these steps. (fragment case)
923 ins_mode = ins_mode_in_body
925 # 18. Let node now be the node before node in the stack of open
928 node = open_els[node_i]
929 # 19. Return to the step labeled loop.
933 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
# Returns the context element when fragment parsing is active and exactly one
# element is open.
# NOTE(review): the fall-through return for every other case is not visible
# in this excerpt.
934 adjusted_current_node = ->
935 if open_els.length is 1 and flag_fragment_parsing
936 return context_element
939 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
940 # this implementation is structured (mostly) as described at the link above.
941 # capitalized comments are the "labels" described at the link above.
943 return if afe.length is 0
944 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
949 if i is afe.length - 1
952 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
957 el = insert_html_element afe[i].token
962 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
963 # adoption agency algorithm
965 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
966 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
967 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
968 adoption_agency = (subject) ->
969 debug_log "adoption_agency()"
970 debug_log "tree: #{serialize_els doc.children, false, true}"
971 debug_log "open_els: #{serialize_els open_els, true, true}"
972 debug_log "afe: #{serialize_els afe, true, true}"
973 # this block implements the W3C spec
974 # # 1. If the current node is an HTML element whose tag name is subject,
975 # # then run these substeps:
977 # # 1. Let element be the current node.
979 # # 2. Pop element off the stack of open elements.
981 # # 3. If element is also in the list of active formatting elements,
982 # # remove the element from the list.
984 # # 4. Abort the adoption agency algorithm.
985 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
986 # el = open_els.shift()
987 # # remove it from the list of active formatting elements (if found)
992 # debug_log "aaa: starting off with subject on top of stack, exiting"
994 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
995 # If the current node is an HTML element whose tag name is subject, and
996 # the current node is not in the list of active formatting elements,
997 # then pop the current node off the stack of open elements, and abort
999 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
1000 debug_log "aaa: starting off with subject on top of stack, exiting"
1001 # remove it from the list of active formatting elements (if found)
1004 if el is open_els[0]
1008 debug_log "aaa: ...and not in afe, aaa done"
1018 # 5. Let formatting element be the last element in the list of
1019 # active formatting elements that: is between the end of the list
1020 # and the last scope marker in the list, if any, or the start of
1021 # the list otherwise, and has the tag name subject.
1023 for t, fe_of_afe in afe
1024 if t.type is TYPE_AFE_MARKER
1026 if t.name is subject
1029 # If there is no such element, then abort these steps and instead
1030 # act as described in the "any other end tag" entry above.
1032 debug_log "aaa: fe not found in afe"
1033 in_body_any_other_end_tag subject
1035 # 6. If formatting element is not in the stack of open elements,
1036 # then this is a parse error; remove the element from the list, and
1037 # abort these steps.
1039 for t, fe_of_open_els in open_els
1044 debug_log "aaa: fe not found in open_els"
1046 # "remove it from the list" must mean afe, since it's not in open_els
1047 afe.splice fe_of_afe, 1
1049 # 7. If formatting element is in the stack of open elements, but
1050 # the element is not in scope, then this is a parse error; abort
1052 unless el_is_in_scope fe
1053 debug_log "aaa: fe not in scope"
1056 # 8. If formatting element is not the current node, this is a parse
1057 # error. (But do not abort these steps.)
1058 unless open_els[0] is fe
1061 # 9. Let furthest block be the topmost node in the stack of open
1062 # elements that is lower in the stack than formatting element, and
1063 # is an element in the special category. There might not be one.
1065 fb_of_open_els = null
1066 for t, i in open_els
1072 # and continue, to see if there's one that's more "topmost"
1073 # 10. If there is no furthest block, then the UA must first pop all
1074 # the nodes from the bottom of the stack of open elements, from the
1075 # current node up to and including formatting element, then remove
1076 # formatting element from the list of active formatting elements,
1077 # and finally abort these steps.
1079 debug_log "aaa: no fb"
1081 t = open_els.shift()
1083 afe.splice fe_of_afe, 1
1085 # 11. Let common ancestor be the element immediately above
1086 # formatting element in the stack of open elements.
1087 ca = open_els[fe_of_open_els + 1] # common ancestor
1089 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1090 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1091 bookmark = new_aaa_bookmark()
1094 afe.splice i, 0, bookmark
1096 node = last_node = fb
1100 # 3. Let node be the element immediately above node in the
1101 # stack of open elements, or if node is no longer in the stack
1102 # of open elements (e.g. because it got removed by this
1103 # algorithm), the element that was immediately above node in
1104 # the stack of open elements before node was removed.
1106 for t, i in open_els
1108 node_next = open_els[i + 1]
1110 node = node_next ? node_above
1111 debug_log "inner loop #{inner}"
1112 debug_log "tree: #{serialize_els doc.children, false, true}"
1113 debug_log "open_els: #{serialize_els open_els, true, true}"
1114 debug_log "afe: #{serialize_els afe, true, true}"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "node: #{node.serialize true, true}"
1119 # TODO make sure node_above gets re-set if/when node is removed from open_els
1121 # 4. If node is formatting element, then go to the next step in
1122 # the overall algorithm.
1125 debug_log "the meat"
1126 # 5. If inner loop counter is greater than three and node is in
1127 # the list of active formatting elements, then remove node from
1128 # the list of active formatting elements.
1134 debug_log "max out inner"
1139 # 6. If node is not in the list of active formatting elements,
1140 # then remove node from the stack of open elements and then go
1141 # back to the step labeled inner loop.
1143 debug_log "not in afe"
1144 for t, i in open_els
1146 node_above = open_els[i + 1]
1147 open_els.splice i, 1
1150 debug_log "the bones"
1151 # 7. create an element for the token for which the element node
1152 # was created, in the HTML namespace, with common ancestor as
1153 # the intended parent; replace the entry for node in the list
1154 # of active formatting elements with an entry for the new
1155 # element, replace the entry for node in the stack of open
1156 # elements with an entry for the new element, and let node be
1158 new_node = token_to_element node.token, NS_HTML, ca
1162 debug_log "replaced in afe"
1164 for t, i in open_els
1166 node_above = open_els[i + 1]
1167 open_els[i] = new_node
1168 debug_log "replaced in open_els"
1171 # 8. If last node is furthest block, then move the
1172 # aforementioned bookmark to be immediately after the new node
1173 # in the list of active formatting elements.
1178 debug_log "removed bookmark"
1182 # "after" means lower
1183 afe.splice i, 0, bookmark # "after as <-
1184 debug_log "placed bookmark after node"
1185 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1187 # 9. Insert last node into node, first removing it from its
1188 # previous parent node if any.
1189 if last_node.parent?
1190 debug_log "last_node has parent"
1191 for c, i in last_node.parent.children
1193 debug_log "removing last_node from parent"
1194 last_node.parent.children.splice i, 1
1196 node.children.push last_node
1197 last_node.parent = node
1198 # 10. Let last node be node.
1201 # 11. Return to the step labeled inner loop.
1202 # 14. Insert whatever last node ended up being in the previous step
1203 # at the appropriate place for inserting a node, but using common
1204 # ancestor as the override target.
1206 # In the case where fe is immediately followed by fb:
1207 # * inner loop exits out early (node==fe)
1209 # * last_node is still in the tree (not a duplicate)
1210 if last_node.parent?
1211 debug_log "FEFIRST? last_node has parent"
1212 for c, i in last_node.parent.children
1214 debug_log "removing last_node from parent"
1215 last_node.parent.children.splice i, 1
1218 debug_log "after aaa inner loop"
1219 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1220 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1221 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1222 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1223 debug_log "tree: #{serialize_els doc.children, false, true}"
1228 # can't use standard insert token thing, because it's already in
1229 # open_els and must stay at its current position in open_els
1230 dest = adjusted_insertion_location ca
1231 dest[0].children.splice dest[1], 0, last_node
1232 last_node.parent = dest[0]
1235 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1236 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1237 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1238 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1239 debug_log "tree: #{serialize_els doc.children, false, true}"
1241 # 15. Create an element for the token for which formatting element
1242 # was created, in the HTML namespace, with furthest block as the
1244 new_element = token_to_element fe.token, NS_HTML, fb
1245 # 16. Take all of the child nodes of furthest block and append them
1246 # to the element created in the last step.
1247 while fb.children.length
1248 t = fb.children.shift()
1249 t.parent = new_element
1250 new_element.children.push t
1251 # 17. Append that new element to furthest block.
1252 new_element.parent = fb
1253 fb.children.push new_element
1254 # 18. Remove formatting element from the list of active formatting
1255 # elements, and insert the new element into the list of active
1256 # formatting elements at the position of the aforementioned
1264 afe[i] = new_element
1266 # 19. Remove formatting element from the stack of open elements,
1267 # and insert the new element into the stack of open elements
1268 # immediately below the position of furthest block in that stack.
1269 for t, i in open_els
1271 open_els.splice i, 1
1273 for t, i in open_els
1275 open_els.splice i, 0, new_element
1277 # 20. Jump back to the step labeled outer loop.
1278 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1279 debug_log "tree: #{serialize_els doc.children, false, true}"
1280 debug_log "open_els: #{serialize_els open_els, true, true}"
1281 debug_log "afe: #{serialize_els afe, true, true}"
1282 debug_log "AAA DONE"
1284 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1285 close_p_element = ->
1286 generate_implied_end_tags 'p' # arg is exception
1287 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1289 while open_els.length > 1 # just in case
1290 el = open_els.shift()
1291 if el.name is 'p' and el.namespace is NS_HTML
1293 close_p_if_in_button_scope = ->
1294 if is_in_button_scope 'p', NS_HTML
1297 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1298 # aka insert_a_character = (t) ->
1299 insert_character = (t) ->
1300 dest = adjusted_insertion_location()
1301 # fixfull check for Document node
1303 prev = dest[0].children[dest[1] - 1]
1304 if prev.type is TYPE_TEXT
1307 dest[0].children.splice dest[1], 0, t
1310 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1311 process_token = (t) ->
1312 acn = adjusted_current_node()
1316 if acn.namespace is NS_HTML
1319 if is_mathml_text_integration_point(acn)
1320 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1323 if t.type is TYPE_TEXT
1326 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1329 if is_html_integration acn
1330 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1333 if t.type is TYPE_EOF
1336 in_foreign_content t
1340 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1341 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1342 adjusted_insertion_location = (override_target = null) ->
1343 # 1. If there was an override target specified, then let target be the
1346 target = override_target
1347 else # Otherwise, let target be the current node.
1348 target = open_els[0]
1349 # 2. Determine the adjusted insertion location using the first matching
1350 # steps from the following list:
1352 # If foster parenting is enabled and target is a table, tbody, tfoot,
1353 # thead, or tr element Foster parenting happens when content is
1354 # misnested in tables.
1355 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1356 loop # once. this is here so we can ``break`` to "abort these substeps"
1357 # 1. Let last template be the last template element in the
1358 # stack of open elements, if any.
1359 last_template = null
1360 last_template_i = null
1361 for el, i in open_els
1362 if el.name is 'template' and el.namespace is NS_HTML
1366 # 2. Let last table be the last table element in the stack of
1367 # open elements, if any.
1370 for el, i in open_els
1371 if el.name is 'table' and el.namespace is NS_HTML
1375 # 3. If there is a last template and either there is no last
1376 # table, or there is one, but last template is lower (more
1377 # recently added) than last table in the stack of open
1378 # elements, then: let adjusted insertion location be inside
1379 # last template's template contents, after its last child (if
1380 # any), and abort these substeps.
1381 if last_template and (last_table is null or last_template_i < last_table_i)
1382 target = last_template # fixfull should be it's contents
1383 target_i = target.children.length
1385 # 4. If there is no last table, then let adjusted insertion
1386 # location be inside the first element in the stack of open
1387 # elements (the html element), after its last child (if any),
1388 # and abort these substeps. (fragment case)
1389 if last_table is null
1391 target = open_els[open_els.length - 1]
1392 target_i = target.children.length
1394 # 5. If last table has a parent element, then let adjusted
1395 # insertion location be inside last table's parent element,
1396 # immediately before last table, and abort these substeps.
1397 if last_table.parent?
1398 for c, i in last_table.parent.children
1400 target = last_table.parent
1404 # 6. Let previous element be the element immediately above last
1405 # table in the stack of open elements.
1407 # huh? how could it not have a parent?
1408 previous_element = open_els[last_table_i + 1]
1409 # 7. Let adjusted insertion location be inside previous
1410 # element, after its last child (if any).
1411 target = previous_element
1412 target_i = target.children.length
1413 # Note: These steps are involved in part because it's possible
1414 # for elements, the table element in this case in particular,
1415 # to have been moved by a script around in the DOM, or indeed
1416 # removed from the DOM entirely, after the element was inserted
1418 break # don't really loop
1420 # Otherwise Let adjusted insertion location be inside target, after
1421 # its last child (if any).
1422 target_i = target.children.length
1424 # 3. If the adjusted insertion location is inside a template element,
1425 # let it instead be inside the template element's template contents,
1426 # after its last child (if any).
1427 # fixfull (template)
1429 # 4. Return the adjusted insertion location.
1430 return [target, target_i]
1432 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1433 # aka create_an_element_for_token
1434 token_to_element = (t, namespace, intended_parent) ->
1435 # convert attributes into a hash
1438 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1439 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1441 # TODO 2. If the newly created element has an xmlns attribute in the
1442 # XMLNS namespace whose value is not exactly the same as the element's
1443 # namespace, that is a parse error. Similarly, if the newly created
1444 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1445 # value is not the XLink Namespace, that is a parse error.
1447 # fixfull: the spec says stuff about form pointers and ownerDocument
1451 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1452 insert_foreign_element = (token, namespace) ->
1453 ail = adjusted_insertion_location()
1456 el = token_to_element token, namespace, ail_el
1457 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1459 ail_el.children.splice ail_i, 0, el
1462 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Insert an element for `token` in the HTML namespace. This is a thin wrapper
# that forwards to insert_foreign_element with NS_HTML as the namespace.
# Callers in this file assign the result (e.g. `el = insert_html_element t`),
# so the value of the forwarded call is relied on as the return value.
1463 insert_html_element = (token) ->
1464 insert_foreign_element token, NS_HTML
1466 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1467 # position should be [node, index_within_children]
1468 insert_comment = (t, position = null) ->
1469 position ?= adjusted_insertion_location()
1470 position[0].children.splice position[1], 0, t
1473 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing for start tag token `t`:
#   1. insert an HTML element for the token,
#   2. point the tokenizer state (tok_state) at tok_state_rawtext,
#   3. save the current insertion mode in original_ins_mode,
#   4. switch the insertion mode to ins_mode_text.
# All three variables assigned here live in an enclosing scope (they are not
# declared in this function).
1474 parse_generic_raw_text = (t) ->
1475 insert_html_element t
1476 tok_state = tok_state_rawtext
1477 original_ins_mode = ins_mode # saved before the switch below; presumably restored when text mode ends (not visible here)
1478 ins_mode = ins_mode_text
# RCDATA counterpart of parse_generic_raw_text: identical steps, except the
# tokenizer state is set to tok_state_rcdata instead of tok_state_rawtext.
# `t` is the start tag token to insert an HTML element for.
1479 parse_generic_rcdata_text = (t) ->
1480 insert_html_element t
1481 tok_state = tok_state_rcdata
1482 original_ins_mode = ins_mode # saved before the switch below; presumably restored when text mode ends (not visible here)
1483 ins_mode = ins_mode_text
1485 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1486 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1487 generate_implied_end_tags = (except = null) ->
1488 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1491 # 8.2.5.4 The rules for parsing tokens in HTML content
1492 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1494 # 8.2.5.4.1 The "initial" insertion mode
1495 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1496 is_quirks_yes_doctype = (t) ->
1497 if t.flag 'force-quirks'
1499 if t.name isnt 'html'
1501 if t.public_identifier?
1502 pi = t.public_identifier.toLowerCase()
1503 for p in quirks_yes_pi_prefixes
1504 if pi.substr(0, p.length) is p
1506 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1508 if t.system_identifier?
1509 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1511 else if t.public_identifier?
1512 # already did this: pi = t.public_identifier.toLowerCase()
1513 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1516 is_quirks_limited_doctype = (t) ->
1517 if t.public_identifier?
1518 pi = t.public_identifier.toLowerCase()
1519 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1521 if t.system_identifier?
1522 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1525 ins_mode_initial = (t) ->
1528 if t.type is TYPE_COMMENT
1532 if t.type is TYPE_DOCTYPE
1533 # fixfull syntax error from first paragraph and following bullets
1534 # fixfull set doc.doctype
1535 # fixfull is the "not an iframe srcdoc" thing relevant?
1536 if is_quirks_yes_doctype t
1537 doc.flag 'quirks mode', QUIRKS_YES
1538 else if is_quirks_limited_doctype t
1539 doc.flag 'quirks mode', QUIRKS_LIMITED
1541 ins_mode = ins_mode_before_html
1544 # fixfull not iframe srcdoc?
1546 doc.flag 'quirks mode', QUIRKS_YES
1547 ins_mode = ins_mode_before_html
1551 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1552 ins_mode_before_html = (t) ->
1553 if t.type is TYPE_DOCTYPE
1556 if t.type is TYPE_COMMENT
1561 if t.type is TYPE_START_TAG and t.name is 'html'
1562 el = token_to_element t, NS_HTML, doc
1563 doc.children.push el
1564 open_els.unshift(el)
1565 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1566 ins_mode = ins_mode_before_head
1568 if t.type is TYPE_END_TAG
1569 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1570 # fall through to "anything else"
1575 el = token_to_element new_open_tag('html'), NS_HTML, doc
1576 doc.children.push el
1579 # ?fixfull browsing context
1580 ins_mode = ins_mode_before_head
1584 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1585 ins_mode_before_head = (t) ->
1588 if t.type is TYPE_COMMENT
1591 if t.type is TYPE_DOCTYPE
1594 if t.type is TYPE_START_TAG and t.name is 'html'
1597 if t.type is TYPE_START_TAG and t.name is 'head'
1598 el = insert_html_element t
1599 head_element_pointer = el
1600 ins_mode = ins_mode_in_head
1602 if t.type is TYPE_END_TAG
1603 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1604 # fall through to Anything else below
1609 el = insert_html_element new_open_tag 'head'
1610 head_element_pointer = el
1611 ins_mode = ins_mode_in_head
1614 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1615 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1616 open_els.shift() # spec says this will be a 'head' node
1617 ins_mode = ins_mode_after_head
1619 ins_mode_in_head = (t) ->
1620 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1623 if t.type is TYPE_COMMENT
1626 if t.type is TYPE_DOCTYPE
1629 if t.type is TYPE_START_TAG and t.name is 'html'
1632 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1633 el = insert_html_element t
1635 t.acknowledge_self_closing()
1637 if t.type is TYPE_START_TAG and t.name is 'meta'
1638 el = insert_html_element t
1640 t.acknowledge_self_closing()
1641 # fixfull encoding stuff
1643 if t.type is TYPE_START_TAG and t.name is 'title'
1644 parse_generic_rcdata_text t
1646 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1647 parse_generic_raw_text t
1649 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1650 insert_html_element t
1651 ins_mode = ins_mode_in_head_noscript
1653 if t.type is TYPE_START_TAG and t.name is 'script'
1654 ail = adjusted_insertion_location()
1655 el = token_to_element t, NS_HTML, ail[0] # intended parent is the target node; ail is the [node, index] pair
1656 el.flag 'parser-inserted', true
1657 # fixfull fragment case
1658 ail[0].children.splice ail[1], 0, el
1660 tok_state = tok_state_script_data
1661 original_ins_mode = ins_mode # make sure orig... is defined
1662 ins_mode = ins_mode_text
1664 if t.type is TYPE_END_TAG and t.name is 'head'
1665 open_els.shift() # will be a head element... spec says so
1666 ins_mode = ins_mode_after_head
1668 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1669 ins_mode_in_head_else t
1671 if t.type is TYPE_START_TAG and t.name is 'template'
1672 insert_html_element t
1674 flag_frameset_ok = false
1675 ins_mode = ins_mode_in_template
1676 template_ins_modes.unshift ins_mode_in_template
1678 if t.type is TYPE_END_TAG and t.name is 'template'
1679 if template_tag_is_open()
1680 generate_implied_end_tags() # parentheses required: a bare name only references the function, it does not call it
1681 if open_els[0].name isnt 'template'
1684 el = open_els.shift()
1685 if el.name is 'template' and el.namespace is NS_HTML
1687 clear_afe_to_marker()
1688 template_ins_modes.shift()
1693 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1696 ins_mode_in_head_else t
1698 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1699 ins_mode_in_head_noscript_else = (t) ->
1702 ins_mode = ins_mode_in_head
1704 ins_mode_in_head_noscript = (t) ->
1705 if t.type is TYPE_DOCTYPE
1708 if t.type is TYPE_START_TAG and t.name is 'html'
1711 if t.type is TYPE_END_TAG and t.name is 'noscript'
1713 ins_mode = ins_mode_in_head
1715 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1718 if t.type is TYPE_END_TAG and t.name is 'br'
1719 ins_mode_in_head_noscript_else t
1721 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1725 ins_mode_in_head_noscript_else t
1730 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1731 ins_mode_after_head_else = (t) ->
1732 body_tok = new_open_tag 'body'
1733 insert_html_element body_tok
1734 ins_mode = ins_mode_in_body
1737 ins_mode_after_head = (t) ->
1741 if t.type is TYPE_COMMENT
1744 if t.type is TYPE_DOCTYPE
1747 if t.type is TYPE_START_TAG and t.name is 'html'
1750 if t.type is TYPE_START_TAG and t.name is 'body'
1751 insert_html_element t
1752 flag_frameset_ok = false
1753 ins_mode = ins_mode_in_body
1755 if t.type is TYPE_START_TAG and t.name is 'frameset'
1756 insert_html_element t
1757 ins_mode = ins_mode_in_frameset
1759 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1761 open_els.unshift head_element_pointer
1763 for el, i in open_els
1764 if el is head_element_pointer
1765 open_els.splice i, 1
1767 console.log "warning: 23904 couldn't find head element in open_els"
1769 if t.type is TYPE_END_TAG and t.name is 'template'
1772 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1773 ins_mode_after_head_else t
1775 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1779 ins_mode_after_head_else t
1781 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1782 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1785 if node.name is name and node.namespace is NS_HTML
1786 generate_implied_end_tags name # arg is exception
1787 unless node is open_els[0]
1790 el = open_els.shift()
1793 if special_elements[node.name] is node.namespace
1796 for el, i in open_els
1798 node = open_els[i + 1]
1801 ins_mode_in_body = (t) ->
1802 if t.type is TYPE_TEXT and t.text is "\u0000"
1809 if t.type is TYPE_TEXT
1812 flag_frameset_ok = false
1814 if t.type is TYPE_COMMENT
1817 if t.type is TYPE_DOCTYPE
1820 if t.type is TYPE_START_TAG and t.name is 'html'
1822 return if template_tag_is_open()
1823 root_attrs = open_els[open_els.length - 1].attrs
1825 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1828 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1831 if t.type is TYPE_START_TAG and t.name is 'body'
1833 return if open_els.length < 2
1834 second = open_els[open_els.length - 2]
1835 return unless second.namespace is NS_HTML
1836 return unless second.name is 'body'
1837 return if template_tag_is_open()
1838 flag_frameset_ok = false
1840 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1842 if t.type is TYPE_START_TAG and t.name is 'frameset'
1844 return if open_els.length < 2
1845 second_i = open_els.length - 2
1846 second = open_els[second_i]
1847 return unless second.namespace is NS_HTML
1848 return unless second.name is 'body'
1849 if flag_frameset_ok is false
1852 for el, i in second.parent.children
1854 second.parent.children.splice i, 1
1856 open_els.splice second_i, 1
1857 # pop everything except the "root html element"
1858 while open_els.length > 1
1860 insert_html_element t
1861 ins_mode = ins_mode_in_frameset
1863 if t.type is TYPE_EOF
1865 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1866 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1867 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1870 unless ok_tags[t.name] is el.namespace
1873 if template_ins_modes.length > 0
1874 ins_mode_in_template t
1878 if t.type is TYPE_END_TAG and t.name is 'body'
1879 unless is_in_scope 'body', NS_HTML
1883 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1884 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1885 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1886 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1890 unless ok_tags[t.name] is el.namespace
1893 ins_mode = ins_mode_after_body
1895 if t.type is TYPE_END_TAG and t.name is 'html'
1896 unless is_in_scope 'body', NS_HTML
1900 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1901 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1902 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1903 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1907 unless ok_tags[t.name] is el.namespace
1910 ins_mode = ins_mode_after_body
1913 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1914 close_p_if_in_button_scope()
1915 insert_html_element t
1917 if t.type is TYPE_START_TAG and h_tags[t.name]?
1918 close_p_if_in_button_scope()
1919 if h_tags[open_els[0].name] is open_els[0].namespace
1922 insert_html_element t
1924 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1925 close_p_if_in_button_scope()
1926 insert_html_element t
1927 eat_next_token_if_newline()
1928 flag_frameset_ok = false
1930 if t.type is TYPE_START_TAG and t.name is 'form'
1931 unless form_element_pointer is null or template_tag_is_open()
1934 close_p_if_in_button_scope()
1935 el = insert_html_element t
1936 unless template_tag_is_open()
1937 form_element_pointer = el
1939 if t.type is TYPE_START_TAG and t.name is 'li'
1940 flag_frameset_ok = false
1941 for node in open_els
1942 if node.name is 'li' and node.namespace is NS_HTML
1943 generate_implied_end_tags 'li' # arg is exception
1944 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1947 el = open_els.shift()
1948 if el.name is 'li' and el.namespace is NS_HTML
1951 if el_is_special_not_adp node
1953 close_p_if_in_button_scope()
1954 insert_html_element t
1956 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1957 flag_frameset_ok = false
1958 for node in open_els
1959 if node.name is 'dd' and node.namespace is NS_HTML
1960 generate_implied_end_tags 'dd' # arg is exception
1961 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1964 el = open_els.shift()
1965 if el.name is 'dd' and el.namespace is NS_HTML
1968 if node.name is 'dt' and node.namespace is NS_HTML
1969 generate_implied_end_tags 'dt' # arg is exception
1970 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1973 el = open_els.shift()
1974 if el.name is 'dt' and el.namespace is NS_HTML
1977 if el_is_special_not_adp node
1979 close_p_if_in_button_scope()
1980 insert_html_element t
1982 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1983 close_p_if_in_button_scope()
1984 insert_html_element t
1985 tok_state = tok_state_plaintext
1987 if t.type is TYPE_START_TAG and t.name is 'button'
1988 if is_in_scope 'button', NS_HTML
1990 generate_implied_end_tags()
1992 el = open_els.shift()
1993 if el.name is 'button' and el.namespace is NS_HTML
1996 insert_html_element t
1997 flag_frameset_ok = false
1999 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
2000 unless is_in_scope t.name, NS_HTML
2003 generate_implied_end_tags()
2004 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2007 el = open_els.shift()
2008 if el.name is t.name and el.namespace is NS_HTML
2011 if t.type is TYPE_END_TAG and t.name is 'form'
2012 unless template_tag_is_open()
2013 node = form_element_pointer
2014 form_element_pointer = null
2015 if node is null or not el_is_in_scope node
2018 generate_implied_end_tags()
2019 if open_els[0] isnt node
2021 for el, i in open_els
2023 open_els.splice i, 1
2026 unless is_in_scope 'form', NS_HTML
2029 generate_implied_end_tags()
2030 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2033 el = open_els.shift()
2034 if el.name is 'form' and el.namespace is NS_HTML
2037 if t.type is TYPE_END_TAG and t.name is 'p'
2038 unless is_in_button_scope 'p', NS_HTML
2040 insert_html_element new_open_tag 'p'
2043 if t.type is TYPE_END_TAG and t.name is 'li'
2044 unless is_in_li_scope 'li', NS_HTML
2047 generate_implied_end_tags 'li' # arg is exception
2048 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2051 el = open_els.shift()
2052 if el.name is 'li' and el.namespace is NS_HTML
2055 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2056 unless is_in_scope t.name, NS_HTML
2059 generate_implied_end_tags t.name # arg is exception
2060 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2063 el = open_els.shift()
2064 if el.name is t.name and el.namespace is NS_HTML
2067 if t.type is TYPE_END_TAG and h_tags[t.name]?
2070 if h_tags[el.name] is el.namespace
2073 if standard_scopers[el.name] is el.namespace
2078 generate_implied_end_tags()
2079 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2082 el = open_els.shift()
2083 if h_tags[el.name] is el.namespace
2087 if t.type is TYPE_START_TAG and t.name is 'a'
2088 # If the list of active formatting elements contains an a element
2089 # between the end of the list and the last marker on the list (or
2090 # the start of the list if there is no marker on the list), then
2091 # this is a parse error; run the adoption agency algorithm for the
2092 # tag name "a", then remove that element from the list of active
2093 # formatting elements and the stack of open elements if the
2094 # adoption agency algorithm didn't already remove it (it might not
2095 # have if the element is not in table scope).
2098 if el.type is TYPE_AFE_MARKER
2100 if el.name is 'a' and el.namespace is NS_HTML
2108 for el, i in open_els
2110 open_els.splice i, 1
2112 el = insert_html_element t
2115 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2117 el = insert_html_element t
2120 if t.type is TYPE_START_TAG and t.name is 'nobr'
2122 if is_in_scope 'nobr', NS_HTML
2124 adoption_agency 'nobr'
2126 el = insert_html_element t
2129 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2130 adoption_agency t.name
2132 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2134 insert_html_element t
2136 flag_frameset_ok = false
2138 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2139 unless is_in_scope t.name, NS_HTML
2142 generate_implied_end_tags()
2143 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2146 el = open_els.shift()
2147 if el.name is t.name and el.namespace is NS_HTML
2149 clear_afe_to_marker()
2151 if t.type is TYPE_START_TAG and t.name is 'table'
2152 unless doc.flag('quirks mode') is QUIRKS_YES
2153 close_p_if_in_button_scope() # test
2154 insert_html_element t
2155 flag_frameset_ok = false
2156 ins_mode = ins_mode_in_table
2158 if t.type is TYPE_END_TAG and t.name is 'br'
2160 # W3C: t.type = TYPE_START_TAG
2161 t = new_open_tag 'br' # WHATWG
2163 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2165 insert_html_element t
2167 t.acknowledge_self_closing()
2168 flag_frameset_ok = false
2170 if t.type is TYPE_START_TAG and t.name is 'input'
2172 insert_html_element t
2174 t.acknowledge_self_closing()
2175 unless is_input_hidden_tok t
2176 flag_frameset_ok = false
2178 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2179 # WHATWG adds 'menuitem' for this block
2180 insert_html_element t
2182 t.acknowledge_self_closing()
2184 if t.type is TYPE_START_TAG and t.name is 'hr'
2185 close_p_if_in_button_scope()
2186 insert_html_element t
2188 t.acknowledge_self_closing()
2189 flag_frameset_ok = false
2191 if t.type is TYPE_START_TAG and t.name is 'image'
2196 if t.type is TYPE_START_TAG and t.name is 'isindex'
2198 if template_tag_is_open() is false and form_element_pointer isnt null
2200 t.acknowledge_self_closing()
2201 flag_frameset_ok = false
2202 close_p_if_in_button_scope()
2203 el = insert_html_element new_open_tag 'form'
2204 unless template_tag_is_open()
2205 form_element_pointer = el
2208 el.attrs['action'] = a[1]
2210 insert_html_element new_open_tag 'hr'
2213 insert_html_element new_open_tag 'label'
2214 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2215 input_el = new_open_tag 'input'
2220 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2221 input_el.attrs_a.push [a[0], a[1]]
2222 input_el.attrs_a.push ['name', 'isindex']
2223 # fixfull this next bit is in english... internationalize?
2224 prompt ?= "This is a searchable index. Enter search keywords: "
2225 insert_character new_character_token prompt # fixfull split
2226 # TODO submit typo "balue" in spec
2227 insert_html_element input_el
2229 # insert_character '' # you can put chars here if prompt attr missing
2231 insert_html_element new_open_tag 'hr'
2234 unless template_tag_is_open()
2235 form_element_pointer = null
2237 if t.type is TYPE_START_TAG and t.name is 'textarea'
2238 insert_html_element t
2239 eat_next_token_if_newline()
2240 tok_state = tok_state_rcdata
2241 original_ins_mode = ins_mode
2242 flag_frameset_ok = false
2243 ins_mode = ins_mode_text
2245 if t.type is TYPE_START_TAG and t.name is 'xmp'
2246 close_p_if_in_button_scope()
2248 flag_frameset_ok = false
2249 parse_generic_raw_text t
2251 if t.type is TYPE_START_TAG and t.name is 'iframe'
2252 flag_frameset_ok = false
2253 parse_generic_raw_text t
2255 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2256 parse_generic_raw_text t
2258 if t.type is TYPE_START_TAG and t.name is 'select'
2260 insert_html_element t
2261 flag_frameset_ok = false
2262 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2263 ins_mode = ins_mode_in_select_in_table
2265 ins_mode = ins_mode_in_select
2267 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2268 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2271 insert_html_element t
2273 # this comment block implements the W3C spec
2274 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2275 # if is_in_scope 'ruby', NS_HTML
2276 # generate_implied_end_tags()
2277 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2279 # insert_html_element t
2281 # if t.type is TYPE_START_TAG and t.name is 'rt'
2282 # if is_in_scope 'ruby', NS_HTML
2283 # generate_implied_end_tags 'rtc' # arg is exception
2284 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2286 # insert_html_element t
2288 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2289 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2290 if is_in_scope 'ruby', NS_HTML
2291 generate_implied_end_tags()
2292 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2294 insert_html_element t
2296 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2297 if is_in_scope 'ruby', NS_HTML
2298 generate_implied_end_tags 'rtc'
2299 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2301 insert_html_element t
2304 if t.type is TYPE_START_TAG and t.name is 'math'
2306 adjust_mathml_attributes t
2307 adjust_foreign_attributes t
2308 insert_foreign_element t, NS_MATHML
2309 if t.flag 'self-closing'
2311 t.acknowledge_self_closing()
2313 if t.type is TYPE_START_TAG and t.name is 'svg'
2315 adjust_svg_attributes t
2316 adjust_foreign_attributes t
2317 insert_foreign_element t, NS_SVG
2318 if t.flag 'self-closing'
2320 t.acknowledge_self_closing()
2322 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2325 if t.type is TYPE_START_TAG # any other start tag
2327 insert_html_element t
2329 if t.type is TYPE_END_TAG # any other end tag
2330 in_body_any_other_end_tag t.name
2334 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2335 ins_mode_text = (t) ->
2336 if t.type is TYPE_TEXT
2339 if t.type is TYPE_EOF
2341 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2342 open_els[0].flag 'already started', true
2344 ins_mode = original_ins_mode
2347 if t.type is TYPE_END_TAG and t.name is 'script'
2349 ins_mode = original_ins_mode
2350 # fixfull the spec seems to assume that I'm going to run the script
2351 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2353 if t.type is TYPE_END_TAG
2355 ins_mode = original_ins_mode
2357 console.log 'warning: end of ins_mode_text reached'
2359 # the functions below implement the tokenizer states described here:
2360 # http://www.w3.org/TR/html5/syntax.html#tokenization
2362 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2363 ins_mode_in_table_else = (t) ->
2365 flag_foster_parenting = true
2367 flag_foster_parenting = false
2369 ins_mode_in_table = (t) ->
2372 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2373 pending_table_character_tokens = []
2374 original_ins_mode = ins_mode
2375 ins_mode = ins_mode_in_table_text
2378 ins_mode_in_table_else t
2386 clear_stack_to_table_context()
2388 insert_html_element t
2389 ins_mode = ins_mode_in_caption
2391 clear_stack_to_table_context()
2392 insert_html_element t
2393 ins_mode = ins_mode_in_column_group
2395 clear_stack_to_table_context()
2396 insert_html_element new_open_tag 'colgroup'
2397 ins_mode = ins_mode_in_column_group
2399 when 'tbody', 'tfoot', 'thead'
2400 clear_stack_to_table_context()
2401 insert_html_element t
2402 ins_mode = ins_mode_in_table_body
2403 when 'td', 'th', 'tr'
2404 clear_stack_to_table_context()
2405 insert_html_element new_open_tag 'tbody'
2406 ins_mode = ins_mode_in_table_body
2410 if is_in_table_scope 'table', NS_HTML
2412 el = open_els.shift()
2413 if el.name is 'table' and el.namespace is NS_HTML
2417 when 'style', 'script', 'template'
2420 unless is_input_hidden_tok t
2421 ins_mode_in_table_else t
2424 el = insert_html_element t
2426 t.acknowledge_self_closing()
2429 if form_element_pointer?
2431 if template_tag_is_open()
2433 form_element_pointer = insert_html_element t
2436 ins_mode_in_table_else t
2440 if is_in_table_scope 'table', NS_HTML
2442 el = open_els.shift()
2443 if el.name is 'table' and el.namespace is NS_HTML
2448 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2453 ins_mode_in_table_else t
2457 ins_mode_in_table_else t
2460 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2461 ins_mode_in_table_text = (t) ->
2462 if t.type is TYPE_TEXT and t.text is "\u0000"
2466 if t.type is TYPE_TEXT
2467 pending_table_character_tokens.push t
2471 for old in pending_table_character_tokens
2472 unless is_space_tok old
2476 for old in pending_table_character_tokens
2477 insert_character old
2479 for old in pending_table_character_tokens
2480 ins_mode_in_table_else old
2481 pending_table_character_tokens = []
2482 ins_mode = original_ins_mode
2485 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2486 ins_mode_in_caption = (t) ->
2487 if t.type is TYPE_END_TAG and t.name is 'caption'
2488 if is_in_table_scope 'caption', NS_HTML
2489 generate_implied_end_tags()
2490 if open_els[0].name isnt 'caption'
2493 el = open_els.shift()
2494 if el.name is 'caption' and el.namespace is NS_HTML
2496 clear_afe_to_marker()
2497 ins_mode = ins_mode_in_table
2502 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2504 if is_in_table_scope 'caption', NS_HTML
2506 el = open_els.shift()
2507 if el.name is 'caption' and el.namespace is NS_HTML
2509 clear_afe_to_marker()
2510 ins_mode = ins_mode_in_table
2512 # else fragment case
2514 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2520 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2521 ins_mode_in_column_group = (t) ->
2525 if t.type is TYPE_COMMENT
2528 if t.type is TYPE_DOCTYPE
2531 if t.type is TYPE_START_TAG and t.name is 'html'
2534 if t.type is TYPE_START_TAG and t.name is 'col'
2535 el = insert_html_element t
2537 t.acknowledge_self_closing()
2539 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2540 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML # current node must be an HTML colgroup
2542 ins_mode = ins_mode_in_table
2546 if t.type is TYPE_END_TAG and t.name is 'col'
2549 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2552 if t.type is TYPE_EOF
2556 if open_els[0].name isnt 'colgroup'
2560 ins_mode = ins_mode_in_table
2564 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2565 ins_mode_in_table_body = (t) ->
2566 if t.type is TYPE_START_TAG and t.name is 'tr'
2567 clear_stack_to_table_body_context()
2568 insert_html_element t
2569 ins_mode = ins_mode_in_row
2571 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2573 clear_stack_to_table_body_context()
2574 insert_html_element new_open_tag 'tr'
2575 ins_mode = ins_mode_in_row
2578 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2579 unless is_in_table_scope t.name, NS_HTML
2582 clear_stack_to_table_body_context()
2584 ins_mode = ins_mode_in_table
2586 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2589 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2592 if table_scopers[el.name] is el.namespace
2597 clear_stack_to_table_body_context()
2599 ins_mode = ins_mode_in_table
2602 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2608 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2609 ins_mode_in_row = (t) ->
2610 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2611 clear_stack_to_table_row_context()
2612 insert_html_element t
2613 ins_mode = ins_mode_in_cell
2616 if t.type is TYPE_END_TAG and t.name is 'tr'
2617 if is_in_table_scope 'tr', NS_HTML
2618 clear_stack_to_table_row_context()
2620 ins_mode = ins_mode_in_table_body
2624 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2625 if is_in_table_scope 'tr', NS_HTML
2626 clear_stack_to_table_row_context()
2628 ins_mode = ins_mode_in_table_body
2633 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2634 if is_in_table_scope t.name, NS_HTML
2635 if is_in_table_scope 'tr', NS_HTML
2636 clear_stack_to_table_row_context()
2638 ins_mode = ins_mode_in_table_body
2643 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2649 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2651 generate_implied_end_tags()
2652 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML # current node should be an HTML td/th
2655 el = open_els.shift()
2656 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2658 clear_afe_to_marker()
2659 ins_mode = ins_mode_in_row
2661 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2662 ins_mode_in_cell = (t) ->
2663 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2664 if is_in_table_scope t.name, NS_HTML
2665 generate_implied_end_tags()
2666 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2669 el = open_els.shift()
2670 if el.name is t.name and el.namespace is NS_HTML
2672 clear_afe_to_marker()
2673 ins_mode = ins_mode_in_row
2677 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2680 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2683 if table_scopers[el.name] is el.namespace
2691 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2694 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2695 if is_in_table_scope t.name, NS_HTML
2704 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2705 ins_mode_in_select = (t) ->
2706 if t.type is TYPE_TEXT and t.text is "\u0000"
2709 if t.type is TYPE_TEXT
2712 if t.type is TYPE_COMMENT
2715 if t.type is TYPE_DOCTYPE
2718 if t.type is TYPE_START_TAG and t.name is 'html'
2721 if t.type is TYPE_START_TAG and t.name is 'option'
2722 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2724 insert_html_element t
2726 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2727 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2729 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2731 insert_html_element t
2733 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2734 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2735 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML # node just before the current option must be an HTML optgroup
2737 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2742 if t.type is TYPE_END_TAG and t.name is 'option'
2743 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2748 if t.type is TYPE_END_TAG and t.name is 'select'
2749 if is_in_select_scope 'select', NS_HTML
2751 el = open_els.shift()
2752 if el.name is 'select' and el.namespace is NS_HTML
2758 if t.type is TYPE_START_TAG and t.name is 'select'
2761 el = open_els.shift()
2762 if el.name is 'select' and el.namespace is NS_HTML
2765 # spec says that this is the same as </select> but it doesn't say
2766 # to check scope first
2768 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2770 unless is_in_select_scope 'select', NS_HTML
2773 el = open_els.shift()
2774 if el.name is 'select' and el.namespace is NS_HTML
2779 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2782 if t.type is TYPE_EOF
2789 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2790 ins_mode_in_select_in_table = (t) ->
2791 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2794 el = open_els.shift()
2795 if el.name is 'select' and el.namespace is NS_HTML
2800 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2802 unless is_in_table_scope t.name, NS_HTML
2805 el = open_els.shift()
2806 if el.name is 'select' and el.namespace is NS_HTML
2812 ins_mode_in_select t
2815 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2816 ins_mode_in_template = (t) ->
2817 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2820 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2823 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2824 template_ins_modes.shift()
2825 template_ins_modes.unshift ins_mode_in_table
2826 ins_mode = ins_mode_in_table
2829 if t.type is TYPE_START_TAG and t.name is 'col'
2830 template_ins_modes.shift()
2831 template_ins_modes.unshift ins_mode_in_column_group
2832 ins_mode = ins_mode_in_column_group
2835 if t.type is TYPE_START_TAG and t.name is 'tr'
2836 template_ins_modes.shift()
2837 template_ins_modes.unshift ins_mode_in_table_body
2838 ins_mode = ins_mode_in_table_body
2841 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2842 template_ins_modes.shift()
2843 template_ins_modes.unshift ins_mode_in_row
2844 ins_mode = ins_mode_in_row
2847 if t.type is TYPE_START_TAG
2848 template_ins_modes.shift()
2849 template_ins_modes.unshift ins_mode_in_body
2850 ins_mode = ins_mode_in_body
2853 if t.type is TYPE_END_TAG
2856 if t.type is TYPE_EOF
2857 unless template_tag_is_open()
2862 el = open_els.shift()
2863 if el.name is 'template' and el.namespace is NS_HTML
2865 clear_afe_to_marker()
2866 template_ins_modes.shift()
2870 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2871 ins_mode_after_body = (t) ->
2875 if t.type is TYPE_COMMENT
2876 first = open_els[open_els.length - 1]
2877 insert_comment t, [first, first.children.length]
2879 if t.type is TYPE_DOCTYPE
2882 if t.type is TYPE_START_TAG and t.name is 'html'
2885 if t.type is TYPE_END_TAG and t.name is 'html'
2886 if flag_fragment_parsing
2889 ins_mode = ins_mode_after_after_body
2891 if t.type is TYPE_EOF
2896 ins_mode = ins_mode_in_body
2899 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2900 ins_mode_in_frameset = (t) ->
2904 if t.type is TYPE_COMMENT
2907 if t.type is TYPE_DOCTYPE
2910 if t.type is TYPE_START_TAG and t.name is 'html'
2913 if t.type is TYPE_START_TAG and t.name is 'frameset'
2914 insert_html_element t
2916 if t.type is TYPE_END_TAG and t.name is 'frameset'
2917 if open_els.length is 1
2919 return # fragment case
2921 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2922 ins_mode = ins_mode_after_frameset
2924 if t.type is TYPE_START_TAG and t.name is 'frame'
2925 insert_html_element t
2927 t.acknowledge_self_closing()
2929 if t.type is TYPE_START_TAG and t.name is 'noframes'
2932 if t.type is TYPE_EOF
2933 if open_els.length isnt 1
2941 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2942 ins_mode_after_frameset = (t) ->
2946 if t.type is TYPE_COMMENT
2949 if t.type is TYPE_DOCTYPE
2952 if t.type is TYPE_START_TAG and t.name is 'html'
2955 if t.type is TYPE_END_TAG and t.name is 'html'
2956 ins_mode = ins_mode_after_after_frameset
2958 if t.type is TYPE_START_TAG and t.name is 'noframes'
2961 if t.type is TYPE_EOF
2968 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2969 ins_mode_after_after_body = (t) ->
2970 if t.type is TYPE_COMMENT
2971 insert_comment t, [doc, doc.children.length]
2973 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2976 if t.type is TYPE_EOF
2981 ins_mode = ins_mode_in_body
2985 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2986 ins_mode_after_after_frameset = (t) ->
2987 if t.type is TYPE_COMMENT
2988 insert_comment t, [doc, doc.children.length]
2990 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2993 if t.type is TYPE_EOF
2996 if t.type is TYPE_START_TAG and t.name is 'noframes'
3003 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
3004 has_color_face_or_size = (t) ->
3006 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
3009 in_foreign_content_end_script = ->
3013 in_foreign_content_other_start = (t) ->
3014 acn = adjusted_current_node()
3015 if acn.namespace is NS_MATHML
3016 adjust_mathml_attributes t
3017 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3018 t.name = svg_name_fixes[t.name]
3019 if acn.namespace is NS_SVG
3020 adjust_svg_attributes t
3021 adjust_foreign_attributes t
3022 insert_foreign_element t, acn.namespace
3023 if t.flag 'self-closing'
3024 if t.name is 'script' and acn.namespace is NS_SVG # the element was just inserted with acn.namespace; only SVG scripts get the script end-tag handling
3025 t.acknowledge_self_closing()
3026 in_foreign_content_end_script()
3030 t.acknowledge_self_closing()
3032 in_foreign_content = (t) ->
3033 if t.type is TYPE_TEXT and t.text is "\u0000"
3035 insert_character new_character_token "\ufffd"
3040 if t.type is TYPE_TEXT
3041 flag_frameset_ok = false
3044 if t.type is TYPE_COMMENT
3047 if t.type is TYPE_DOCTYPE
3050 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3052 if flag_fragment_parsing
3053 in_foreign_content_other_start t
3055 loop # is this safe?
3057 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3061 if t.type is TYPE_START_TAG
3062 in_foreign_content_other_start t
3064 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3065 in_foreign_content_end_script()
3067 if t.type is TYPE_END_TAG
3070 if node.name.toLowerCase() isnt t.name
3073 if node is open_els[open_els.length - 1]
3075 if node.name.toLowerCase() is t.name
3077 el = open_els.shift()
3082 if node.namespace is NS_HTML
3084 ins_mode t # explicitly call HTML insertion mode
3087 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
3089 switch c = txt.charAt(cur++)
3091 return new_text_node parse_character_reference()
3093 tok_state = tok_state_tag_open
3096 return new_text_node c
3098 return new_eof_token()
3100 return new_text_node c
3103 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3104 # not needed: tok_state_character_reference_in_data = ->
3105 # just call parse_character_reference()
3107 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
3108 tok_state_rcdata = ->
3109 switch c = txt.charAt(cur++)
3111 return new_text_node parse_character_reference()
3113 tok_state = tok_state_rcdata_less_than_sign
3116 return new_character_token "\ufffd"
3118 return new_eof_token()
3120 return new_character_token c
3123 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3124 # not needed: tok_state_character_reference_in_rcdata = ->
3125 # just call parse_character_reference()
3127 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
3128 tok_state_rawtext = ->
3129 switch c = txt.charAt(cur++)
3131 tok_state = tok_state_rawtext_less_than_sign
3134 return new_character_token "\ufffd"
3136 return new_eof_token()
3138 return new_character_token c
3141 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
3142 tok_state_script_data = ->
3143 switch c = txt.charAt(cur++)
3145 tok_state = tok_state_script_data_less_than_sign
3148 return new_character_token "\ufffd"
3150 return new_eof_token()
3152 return new_character_token c
3155 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3156 tok_state_plaintext = ->
3157 switch c = txt.charAt(cur++)
3160 return new_character_token "\ufffd"
3162 return new_eof_token()
3164 return new_character_token c
3168 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3169 tok_state_tag_open = ->
3170 c = txt.charAt(cur++)
3172 tok_state = tok_state_markup_declaration_open
3175 tok_state = tok_state_end_tag_open
3178 tok_cur_tag = new_open_tag c.toLowerCase()
3179 tok_state = tok_state_tag_name
3182 tok_cur_tag = new_open_tag c
3183 tok_state = tok_state_tag_name
3187 tok_cur_tag = new_comment_token '?' # FIXME right?
3188 tok_state = tok_state_bogus_comment
3192 tok_state = tok_state_data
3193 cur -= 1 # we didn't parse/handle the char after <
3194 return new_text_node '<'
3196 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3197 tok_state_end_tag_open = ->
3198 c = txt.charAt(cur++)
3200 tok_cur_tag = new_end_tag c.toLowerCase()
3201 tok_state = tok_state_tag_name
3204 tok_cur_tag = new_end_tag c
3205 tok_state = tok_state_tag_name
3209 tok_state = tok_state_data
3213 tok_state = tok_state_data
3214 return new_text_node '</'
3217 tok_cur_tag = new_comment_token c
3218 tok_state = tok_state_bogus_comment
3221 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3222 tok_state_tag_name = ->
3223 switch c = txt.charAt(cur++)
3224 when "\t", "\n", "\u000c", ' '
3225 tok_state = tok_state_before_attribute_name
3227 tok_state = tok_state_self_closing_start_tag
3229 tok_state = tok_state_data
3235 tok_cur_tag.name += "\ufffd"
3238 tok_state = tok_state_data
3241 tok_cur_tag.name += c.toLowerCase()
3243 tok_cur_tag.name += c
3246 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3247 tok_state_rcdata_less_than_sign = ->
3248 c = txt.charAt(cur++)
3250 temporary_buffer = ''
3251 tok_state = tok_state_rcdata_end_tag_open
3254 tok_state = tok_state_rcdata
3255 cur -= 1 # reconsume the input character
3256 return new_character_token '<'
3258 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3259 tok_state_rcdata_end_tag_open = ->
3260 c = txt.charAt(cur++)
3262 tok_cur_tag = new_end_tag c.toLowerCase()
3263 temporary_buffer += c
3264 tok_state = tok_state_rcdata_end_tag_name
3267 tok_cur_tag = new_end_tag c
3268 temporary_buffer += c
3269 tok_state = tok_state_rcdata_end_tag_name
3272 tok_state = tok_state_rcdata
3273 cur -= 1 # reconsume the input character
3274 return new_character_token "</" # fixfull separate these
3276 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3277 is_appropriate_end_tag = (t) ->
3278 # Returns true when t is an end tag whose name equals the name of the
3279 # current node (open_els[0]). The spec instead compares against "the tag
3280 # name of the last start tag to have been emitted from this tokenizer";
3281 # since this is only called from the various "raw" states, it's hopefully
3282 # ok to use open_els[0].name. TODO: verify this after the script data states are implemented
3283 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3284 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3286 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Accumulates an end tag name inside RCDATA. When tok_cur_tag is an appropriate
# end tag (see is_appropriate_end_tag) the first three branches hand off to the
# before-attribute-name, self-closing-start-tag or data state. The letter
# branches extend both the tag name (lowercased) and temporary_buffer (raw).
# Otherwise RCDATA resumes, the character is reconsumed and the buffered
# "</name" text is emitted as a character token.
3287 tok_state_rcdata_end_tag_name = ->
3288 c = txt.charAt(cur++)
3289 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3290 if is_appropriate_end_tag tok_cur_tag
3291 tok_state = tok_state_before_attribute_name
3293 # else fall through to "Anything else"
3295 if is_appropriate_end_tag tok_cur_tag
3296 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3298 # else fall through to "Anything else"
3300 if is_appropriate_end_tag tok_cur_tag
3301 tok_state = tok_state_data
3303 # else fall through to "Anything else"
3305 tok_cur_tag.name += c.toLowerCase()
3306 temporary_buffer += c
3309 tok_cur_tag.name += c
3310 temporary_buffer += c
3313 tok_state = tok_state_rcdata
3314 cur -= 1 # reconsume the input character
3315 return new_character_token '</' + temporary_buffer # fixfull separate these
3317 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Handles the character after '<' in RAWTEXT: one branch clears temporary_buffer
# and moves on to the RAWTEXT end tag open state; the fallback returns to
# RAWTEXT, reconsumes the character and emits '<' as a character token.
3318 tok_state_rawtext_less_than_sign = ->
3319 c = txt.charAt(cur++)
3321 temporary_buffer = ''
3322 tok_state = tok_state_rawtext_end_tag_open
3325 tok_state = tok_state_rawtext
3326 cur -= 1 # reconsume the input character
3327 return new_character_token '<'
3329 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Handles the character after "</" in RAWTEXT: the first two branches start a
# new end tag token from c (lowercased in the first), remember the raw character
# in temporary_buffer and go to the end tag name state; the fallback returns to
# RAWTEXT, reconsumes the character and emits "</" as text.
3330 tok_state_rawtext_end_tag_open = ->
3331 c = txt.charAt(cur++)
3333 tok_cur_tag = new_end_tag c.toLowerCase()
3334 temporary_buffer += c
3335 tok_state = tok_state_rawtext_end_tag_name
3338 tok_cur_tag = new_end_tag c
3339 temporary_buffer += c
3340 tok_state = tok_state_rawtext_end_tag_name
3343 tok_state = tok_state_rawtext
3344 cur -= 1 # reconsume the input character
3345 return new_character_token "</" # fixfull separate these
3347 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Accumulates an end tag name inside RAWTEXT. When tok_cur_tag is an appropriate
# end tag the first three branches hand off to the before-attribute-name,
# self-closing-start-tag or data state. The letter branches extend both the tag
# name (lowercased) and temporary_buffer (raw). Otherwise RAWTEXT resumes, the
# character is reconsumed and the buffered "</name" text is emitted.
3348 tok_state_rawtext_end_tag_name = ->
3349 c = txt.charAt(cur++)
3350 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3351 if is_appropriate_end_tag tok_cur_tag
3352 tok_state = tok_state_before_attribute_name
3354 # else fall through to "Anything else"
3356 if is_appropriate_end_tag tok_cur_tag
3357 tok_state = tok_state_self_closing_start_tag
3359 # else fall through to "Anything else"
3361 if is_appropriate_end_tag tok_cur_tag
3362 tok_state = tok_state_data
3364 # else fall through to "Anything else"
3366 tok_cur_tag.name += c.toLowerCase()
3367 temporary_buffer += c
3370 tok_cur_tag.name += c
3371 temporary_buffer += c
3374 tok_state = tok_state_rawtext
3375 cur -= 1 # reconsume the input character
3376 return new_character_token '</' + temporary_buffer # fixfull separate these
3378 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Handles the character after '<' in script data. Three outcomes are visible:
# start a candidate end tag (temporary_buffer cleared), enter the escape-start
# state while emitting "<!", or fall back to script data, reconsuming the
# character and emitting '<'.
3379 tok_state_script_data_less_than_sign = ->
3380 c = txt.charAt(cur++)
3382 temporary_buffer = ''
3383 tok_state = tok_state_script_data_end_tag_open
3386 tok_state = tok_state_script_data_escape_start
3387 return new_character_token '<!' # fixfull split
3389 tok_state = tok_state_script_data
3390 cur -= 1 # Reconsume
3391 return new_character_token '<'
3393 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Handles the character after "</" in script data: the first two branches start
# a new end tag token from c (lowercased in the first), remember the raw
# character in temporary_buffer and go to the end tag name state; the fallback
# returns to script data, reconsumes the character and emits "</" as text.
3394 tok_state_script_data_end_tag_open = ->
3395 c = txt.charAt(cur++)
3397 tok_cur_tag = new_end_tag c.toLowerCase()
3398 temporary_buffer += c
3399 tok_state = tok_state_script_data_end_tag_name
3402 tok_cur_tag = new_end_tag c
3403 temporary_buffer += c
3404 tok_state = tok_state_script_data_end_tag_name
3407 tok_state = tok_state_script_data
3408 cur -= 1 # Reconsume
3409 return new_character_token '</'
3411 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Accumulates an end tag name inside script data. When tok_cur_tag is an
# appropriate end tag the first three branches hand off to the
# before-attribute-name, self-closing-start-tag or data state. The letter
# branches extend both the tag name (lowercased) and temporary_buffer (raw).
# Otherwise script data resumes, the character is reconsumed and the buffered
# "</name" text is emitted.
3412 tok_state_script_data_end_tag_name = ->
3413 c = txt.charAt(cur++)
3414 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3415 if is_appropriate_end_tag tok_cur_tag
3416 tok_state = tok_state_before_attribute_name
3420 if is_appropriate_end_tag tok_cur_tag
3421 tok_state = tok_state_self_closing_start_tag
3425 if is_appropriate_end_tag tok_cur_tag
3426 tok_state = tok_state_data
3430 tok_cur_tag.name += c.toLowerCase()
3431 temporary_buffer += c
3434 tok_cur_tag.name += c
3435 temporary_buffer += c
3438 tok_state = tok_state_script_data
3439 cur -= 1 # Reconsume
3440 return new_character_token "</#{temporary_buffer}" # fixfull split
3442 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# First character after "<!" in script data: one branch emits '-' and advances
# to the escape-start-dash state; otherwise the character is reconsumed in
# plain script data.
3443 tok_state_script_data_escape_start = ->
3444 c = txt.charAt(cur++)
3446 tok_state = tok_state_script_data_escape_start_dash
3447 return new_character_token '-'
3449 tok_state = tok_state_script_data
3450 cur -= 1 # Reconsume
3453 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Second character after "<!" in script data: one branch emits '-' and enters
# the escaped-dash-dash state; otherwise the character is reconsumed in plain
# script data.
3454 tok_state_script_data_escape_start_dash = ->
3455 c = txt.charAt(cur++)
3457 tok_state = tok_state_script_data_escaped_dash_dash
3458 return new_character_token '-'
3460 tok_state = tok_state_script_data
3461 cur -= 1 # Reconsume
3464 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Main loop of escaped script data: routes to the escaped-dash state (emitting
# '-') or the escaped-less-than-sign state, emits U+FFFD in one branch, returns
# to the data state with a reconsume in another, and otherwise emits the
# character unchanged.
3465 tok_state_script_data_escaped = ->
3466 c = txt.charAt(cur++)
3468 tok_state = tok_state_script_data_escaped_dash
3469 return new_character_token '-'
3471 tok_state = tok_state_script_data_escaped_less_than_sign
3475 return new_character_token "\ufffd"
3477 tok_state = tok_state_data
3479 cur -= 1 # Reconsume
3482 return new_character_token c
3484 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Escaped script data after a single '-': another '-' is emitted on the way to
# the escaped-dash-dash state, one branch goes to the escaped-less-than-sign
# state, and one returns to the data state with a reconsume. The two remaining
# branches go back to the escaped state, emitting U+FFFD or the character.
3485 tok_state_script_data_escaped_dash = ->
3486 c = txt.charAt(cur++)
3488 tok_state = tok_state_script_data_escaped_dash_dash
3489 return new_character_token '-'
3491 tok_state = tok_state_script_data_escaped_less_than_sign
3495 tok_state = tok_state_script_data_escaped
3496 return new_character_token "\ufffd"
3498 tok_state = tok_state_data
3500 cur -= 1 # Reconsume
3503 tok_state = tok_state_script_data_escaped
3504 return new_character_token c
3506 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Escaped script data after "--": one branch stays here emitting '-', one goes
# to the escaped-less-than-sign state, one emits '>' and leaves the escape by
# returning to plain script data, and one returns to the data state with a
# reconsume. The two remaining branches go back to the escaped state, emitting
# U+FFFD or the character.
3507 tok_state_script_data_escaped_dash_dash = ->
3508 c = txt.charAt(cur++)
3510 return new_character_token '-'
3512 tok_state = tok_state_script_data_escaped_less_than_sign
3515 tok_state = tok_state_script_data
3516 return new_character_token '>'
3519 tok_state = tok_state_script_data_escaped
3520 return new_character_token "\ufffd"
3523 tok_state = tok_state_data
3524 cur -= 1 # Reconsume
3527 tok_state = tok_state_script_data_escaped
3528 return new_character_token c
3530 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Handles the character after '<' in escaped script data. One branch starts a
# candidate end tag (temporary_buffer cleared). The two letter branches seed
# temporary_buffer with c (lowercased in the first), emit "<" plus the original
# character and go to the double-escape-start state. The fallback returns to
# the escaped state, reconsumes and emits '<'.
3531 tok_state_script_data_escaped_less_than_sign = ->
3532 c = txt.charAt(cur++)
3534 temporary_buffer = ''
3535 tok_state = tok_state_script_data_escaped_end_tag_open
3538 temporary_buffer = c.toLowerCase() # yes, really
3539 tok_state = tok_state_script_data_double_escape_start
3540 return new_character_token "<#{c}" # fixfull split
3542 temporary_buffer = c
3543 tok_state = tok_state_script_data_double_escape_start
3544 return new_character_token "<#{c}" # fixfull split
3546 tok_state = tok_state_script_data_escaped
3547 cur -= 1 # Reconsume
3548 return new_character_token '<'
3550 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Handles the character after "</" in escaped script data: the first two
# branches start a new end tag token from c (lowercased in the first), remember
# the raw character in temporary_buffer and go to the escaped end tag name
# state; the fallback returns to the escaped state, reconsumes and emits "</".
3551 tok_state_script_data_escaped_end_tag_open = ->
3552 c = txt.charAt(cur++)
3554 tok_cur_tag = new_end_tag c.toLowerCase()
3555 temporary_buffer += c
3556 tok_state = tok_state_script_data_escaped_end_tag_name
3559 tok_cur_tag = new_end_tag c
3560 temporary_buffer += c
3561 tok_state = tok_state_script_data_escaped_end_tag_name
3564 tok_state = tok_state_script_data_escaped
3565 cur -= 1 # Reconsume
3566 return new_character_token '</' # fixfull split
3568 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Accumulates an end tag name inside escaped script data. When tok_cur_tag is an
# appropriate end tag the first three branches hand off to the
# before-attribute-name, self-closing-start-tag or data state. The letter
# branches extend the tag name (lowercased) and temporary_buffer. Otherwise the
# escaped state resumes, the character is reconsumed and the buffered "</name"
# text is emitted.
3569 tok_state_script_data_escaped_end_tag_name = ->
3570 c = txt.charAt(cur++)
3571 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3572 if is_appropriate_end_tag tok_cur_tag
3573 tok_state = tok_state_before_attribute_name
3577 if is_appropriate_end_tag tok_cur_tag
3578 tok_state = tok_state_self_closing_start_tag
3582 if is_appropriate_end_tag tok_cur_tag
3583 tok_state = tok_state_data
3587 tok_cur_tag.name += c.toLowerCase()
# temporary_buffer is re-emitted verbatim as text below, so it must keep the
# original case of the input (only the tag name is lowercased)
3588 temporary_buffer += c
3591 tok_cur_tag.name += c
3592 temporary_buffer += c
3595 tok_state = tok_state_script_data_escaped
3596 cur -= 1 # Reconsume
3597 return new_character_token "</#{temporary_buffer}" # fixfull split
3599 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Collects the word following '<' in escaped script data into temporary_buffer
# (always lowercased), echoing every character as a token. On whitespace, '/'
# or '>' the buffer is compared with 'script': a match enters the double-escaped
# state, anything else returns to the escaped state. A character that is
# neither a delimiter nor handled by the letter branches is reconsumed in the
# escaped state.
3600 tok_state_script_data_double_escape_start = ->
3601 c = txt.charAt(cur++)
3602 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3603 if temporary_buffer is 'script'
3604 tok_state = tok_state_script_data_double_escaped
3606 tok_state = tok_state_script_data_escaped
3607 return new_character_token c
3609 temporary_buffer += c.toLowerCase() # yes, really lowercase
3610 return new_character_token c
3612 temporary_buffer += c
3613 return new_character_token c
3615 tok_state = tok_state_script_data_escaped
3616 cur -= 1 # Reconsume
3619 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Main loop of double-escaped script data: emits '-' on the way to the
# double-escaped-dash state, emits '<' on the way to the
# double-escaped-less-than-sign state, emits U+FFFD in one branch, returns to
# the data state with a reconsume in another, and otherwise emits the character.
3620 tok_state_script_data_double_escaped = ->
3621 c = txt.charAt(cur++)
3623 tok_state = tok_state_script_data_double_escaped_dash
3624 return new_character_token '-'
3626 tok_state = tok_state_script_data_double_escaped_less_than_sign
3627 return new_character_token '<'
3630 return new_character_token "\ufffd"
3633 tok_state = tok_state_data
3634 cur -= 1 # Reconsume
3637 return new_character_token c
3639 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Double-escaped script data after a single '-': another '-' is emitted on the
# way to the dash-dash state, '<' is emitted on the way to the less-than-sign
# state, and one branch returns to the data state with a reconsume. The two
# remaining branches go back to the double-escaped state, emitting U+FFFD or
# the character.
3640 tok_state_script_data_double_escaped_dash = ->
3641 c = txt.charAt(cur++)
3643 tok_state = tok_state_script_data_double_escaped_dash_dash
3644 return new_character_token '-'
3646 tok_state = tok_state_script_data_double_escaped_less_than_sign
3647 return new_character_token '<'
3650 tok_state = tok_state_script_data_double_escaped
3651 return new_character_token "\ufffd"
3654 tok_state = tok_state_data
3655 cur -= 1 # Reconsume
3658 tok_state = tok_state_script_data_double_escaped
3659 return new_character_token c
3661 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Double-escaped script data after "--": one branch stays here emitting '-',
# one emits '<' on the way to the less-than-sign state, one emits '>' and
# returns to plain script data, and one returns to the data state with a
# reconsume. The two remaining branches go back to the double-escaped state,
# emitting U+FFFD or the character.
3662 tok_state_script_data_double_escaped_dash_dash = ->
3663 c = txt.charAt(cur++)
3665 return new_character_token '-'
3667 tok_state = tok_state_script_data_double_escaped_less_than_sign
3668 return new_character_token '<'
3670 tok_state = tok_state_script_data
3671 return new_character_token '>'
3674 tok_state = tok_state_script_data_double_escaped
3675 return new_character_token "\ufffd"
3678 tok_state = tok_state_data
3679 cur -= 1 # Reconsume
3682 tok_state = tok_state_script_data_double_escaped
3683 return new_character_token c
3685 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Handles the character after '<' in double-escaped script data: one branch
# clears temporary_buffer, emits '/' and enters the double-escape-end state;
# otherwise the character is reconsumed in the double-escaped state.
3686 tok_state_script_data_double_escaped_less_than_sign = ->
3687 c = txt.charAt(cur++)
3689 temporary_buffer = ''
3690 tok_state = tok_state_script_data_double_escape_end
3691 return new_character_token '/'
3693 tok_state = tok_state_script_data_double_escaped
3694 cur -= 1 # Reconsume
3697 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Mirror image of the double-escape-start state: collects the word following
# "</" into temporary_buffer (lowercased), echoing every character. On
# whitespace, '/' or '>' a buffer equal to 'script' drops back to the (single)
# escaped state; anything else stays double-escaped. A character not handled
# by the delimiter or letter branches is reconsumed in the double-escaped state.
3698 tok_state_script_data_double_escape_end = ->
3699 c = txt.charAt(cur++)
3700 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3701 if temporary_buffer is 'script'
3702 tok_state = tok_state_script_data_escaped
3704 tok_state = tok_state_script_data_double_escaped
3705 return new_character_token c
3707 temporary_buffer += c.toLowerCase() # yes, really lowercase
3708 return new_character_token c
3710 temporary_buffer += c
3711 return new_character_token c
3713 tok_state = tok_state_script_data_double_escaped
3714 cur -= 1 # Reconsume
3717 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Skips whitespace before an attribute name, leaves the tag via the
# self-closing-start-tag or data state, or starts a new attribute. A new
# attribute is stored as a [name, value] pair unshifted onto
# tok_cur_tag.attrs_a, so the attribute being built is always attrs_a[0].
3718 tok_state_before_attribute_name = ->
3720 switch c = txt.charAt(cur++)
3721 when "\t", "\n", "\u000c", ' '
3724 tok_state = tok_state_self_closing_start_tag
3727 tok_state = tok_state_data
3733 attr_name = "\ufffd"
3734 when '"', "'", '<', '='
3739 tok_state = tok_state_data
3742 attr_name = c.toLowerCase()
3746 tok_cur_tag.attrs_a.unshift [attr_name, '']
3747 tok_state = tok_state_attribute_name
3750 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Extends the name of the attribute under construction (attrs_a[0][0]), adding
# U+FFFD, the raw character or its lowercase form depending on the branch.
# Whitespace moves to the after-attribute-name state; other branches move to
# the self-closing-start-tag, before-attribute-value or data state.
3751 tok_state_attribute_name = ->
3752 switch c = txt.charAt(cur++)
3753 when "\t", "\n", "\u000c", ' '
3754 tok_state = tok_state_after_attribute_name
3756 tok_state = tok_state_self_closing_start_tag
3758 tok_state = tok_state_before_attribute_value
3760 tok_state = tok_state_data
3766 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3769 tok_cur_tag.attrs_a[0][0] += c
3772 tok_state = tok_state_data
3775 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3777 tok_cur_tag.attrs_a[0][0] += c
3780 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Runs after an attribute name followed by whitespace: branches lead to the
# self-closing-start-tag, before-attribute-value or data state, or begin a new
# attribute (empty value) whose name starts with the lowercased character,
# U+FFFD or the raw character. One data-state branch reconsumes the character.
3781 tok_state_after_attribute_name = ->
3782 c = txt.charAt(cur++)
3783 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3786 tok_state = tok_state_self_closing_start_tag
3789 tok_state = tok_state_before_attribute_value
3792 tok_state = tok_state_data
3795 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3796 tok_state = tok_state_attribute_name
3800 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3801 tok_state = tok_state_attribute_name
3805 tok_state = tok_state_data
3806 cur -= 1 # reconsume
3808 if c is '"' or c is "'" or c is '<'
3810 # fall through to Anything else
3812 tok_cur_tag.attrs_a.unshift [c, '']
3813 tok_state = tok_state_attribute_name
3815 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Chooses how the value of the current attribute (attrs_a[0][1]) will be read:
# branches select the double-quoted, single-quoted or unquoted value state, or
# leave the tag via the data state. Two branches append U+FFFD or the raw
# character to the value before entering the unquoted state.
3816 tok_state_before_attribute_value = ->
3817 switch c = txt.charAt(cur++)
3818 when "\t", "\n", "\u000c", ' '
3821 tok_state = tok_state_attribute_value_double_quoted
3823 tok_state = tok_state_attribute_value_unquoted
3826 tok_state = tok_state_attribute_value_single_quoted
3829 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3830 tok_state = tok_state_attribute_value_unquoted
3833 tok_state = tok_state_data
3839 tok_state = tok_state_data
3841 tok_cur_tag.attrs_a[0][1] += c
3842 tok_state = tok_state_attribute_value_unquoted
3845 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Reads a double-quoted attribute value into attrs_a[0][1]. One branch appends
# the result of parse_character_reference, called with '"' as the additional
# allowed character and in_attr = true. Other branches append U+FFFD or the
# raw character, move to the after-attribute-value (quoted) state, or return
# to the data state.
3846 tok_state_attribute_value_double_quoted = ->
3847 switch c = txt.charAt(cur++)
3849 tok_state = tok_state_after_attribute_value_quoted
3851 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3854 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3857 tok_state = tok_state_data
3859 tok_cur_tag.attrs_a[0][1] += c
3862 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Reads a single-quoted attribute value into attrs_a[0][1]. One branch appends
# the result of parse_character_reference, called with "'" as the additional
# allowed character and in_attr = true. Other branches append U+FFFD or the
# raw character, move to the after-attribute-value (quoted) state, or return
# to the data state.
3863 tok_state_attribute_value_single_quoted = ->
3864 switch c = txt.charAt(cur++)
3866 tok_state = tok_state_after_attribute_value_quoted
3868 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3871 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3874 tok_state = tok_state_data
3876 tok_cur_tag.attrs_a[0][1] += c
3879 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Reads an unquoted attribute value into attrs_a[0][1]. Whitespace ends the
# value and returns to the before-attribute-name state. One branch appends the
# result of parse_character_reference, called with '>' as the additional
# allowed character. Other branches return to the data state or append U+FFFD
# or the raw character.
3880 tok_state_attribute_value_unquoted = ->
3881 switch c = txt.charAt(cur++)
3882 when "\t", "\n", "\u000c", ' '
3883 tok_state = tok_state_before_attribute_name
3885 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3887 tok_state = tok_state_data
3892 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3895 tok_state = tok_state_data
3897 # Parse Error if ', <, = or ` (backtick)
3898 tok_cur_tag.attrs_a[0][1] += c
3901 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Runs right after the closing quote of an attribute value: whitespace goes to
# the before-attribute-name state, and other branches go to the
# self-closing-start-tag or data state. The last branch un-reads the character
# and lets the before-attribute-name state handle it.
3902 tok_state_after_attribute_value_quoted = ->
3903 switch c = txt.charAt(cur++)
3904 when "\t", "\n", "\u000c", ' '
3905 tok_state = tok_state_before_attribute_name
3907 tok_state = tok_state_self_closing_start_tag
3909 tok_state = tok_state_data
3915 tok_state = tok_state_data
3918 tok_state = tok_state_before_attribute_name
3919 cur -= 1 # we didn't handle that char
3922 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Handles the character after '/' inside a tag: one branch sets the
# 'self-closing' flag on tok_cur_tag and returns to the data state; the others
# reconsume the character in the data state or the before-attribute-name state.
3923 tok_state_self_closing_start_tag = ->
3924 c = txt.charAt(cur++)
3926 tok_cur_tag.flag 'self-closing', true
3927 tok_state = tok_state_data
3931 tok_state = tok_state_data
3932 cur -= 1 # Reconsume
3936 tok_state = tok_state_before_attribute_name
3937 cur -= 1 # Reconsume
3940 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3941 # WARNING: put a comment token in tok_cur_tag before setting this state
# Consumes a bogus comment in one step rather than one character at a time:
# takes the text from cur up to the next '>' (or the rest of the input when
# there is none), replaces every U+0000 with U+FFFD, appends the result to
# tok_cur_tag.text and returns to the data state.
3942 tok_state_bogus_comment = ->
3943 next_gt = txt.indexOf '>', cur
3945 val = txt.substr cur
3948 val = txt.substr cur, (next_gt - cur)
3950 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3951 tok_cur_tag.text += val
3952 tok_state = tok_state_data
3955 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Decides what follows "<!" by looking ahead in txt: "--" starts a comment,
# a case-insensitive "doctype" starts a DOCTYPE, and "[CDATA[" starts a CDATA
# section, but only when the adjusted current node exists and is outside the
# HTML namespace. Anything else becomes a bogus comment with empty initial text.
3956 tok_state_markup_declaration_open = ->
3957 if txt.substr(cur, 2) is '--'
3959 tok_cur_tag = new_comment_token ''
3960 tok_state = tok_state_comment_start
3962 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3964 tok_state = tok_state_doctype
3966 acn = adjusted_current_node()
3967 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3969 tok_state = tok_state_cdata_section
3973 tok_cur_tag = new_comment_token ''
3974 tok_state = tok_state_bogus_comment
3977 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# First character of a comment body (right after "<!--"): branches go to the
# comment-start-dash state, end the comment via the data state (one of them
# reconsuming), or append to the comment's text and enter the comment state.
3978 tok_state_comment_start = ->
3979 switch c = txt.charAt(cur++)
3981 tok_state = tok_state_comment_start_dash
3984 tok_state = tok_state_comment
# the replacement character belongs to the comment's data (as in the other
# comment states); it must not be emitted as a separate character token
3985 tok_cur_tag.text += "\ufffd"
3988 tok_state = tok_state_data
3992 tok_state = tok_state_data
3993 cur -= 1 # Reconsume
3996 tok_cur_tag.text += c
3997 tok_state = tok_state_comment
4000 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Comment body began with a single '-': one branch goes to the comment-end
# state, two return to the data state (one reconsuming), and the remaining two
# append the pending '-' plus U+FFFD or the character to the comment text
# before entering the comment state.
4001 tok_state_comment_start_dash = ->
4002 switch c = txt.charAt(cur++)
4004 tok_state = tok_state_comment_end
4007 tok_cur_tag.text += "-\ufffd"
4008 tok_state = tok_state_comment
4011 tok_state = tok_state_data
4015 tok_state = tok_state_data
4016 cur -= 1 # Reconsume
4019 tok_cur_tag.text += "-#{c}"
4020 tok_state = tok_state_comment
4023 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Main loop of a comment: appends U+FFFD or the character to tok_cur_tag.text,
# goes to the comment-end-dash state, or returns to the data state with a
# reconsume.
4024 tok_state_comment = ->
4025 switch c = txt.charAt(cur++)
4027 tok_state = tok_state_comment_end_dash
4030 tok_cur_tag.text += "\ufffd"
4033 tok_state = tok_state_data
4034 cur -= 1 # Reconsume
4037 tok_cur_tag.text += c
4040 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Inside a comment after a single '-': one branch goes to the comment-end
# state and one returns to the data state with a reconsume. Otherwise the
# pending '-' plus U+FFFD or the character is appended to the comment text and
# the comment state resumes.
4041 tok_state_comment_end_dash = ->
4042 switch c = txt.charAt(cur++)
4044 tok_state = tok_state_comment_end
4047 tok_cur_tag.text += "-\ufffd"
4048 tok_state = tok_state_comment
4051 tok_state = tok_state_data
4052 cur -= 1 # Reconsume
4055 tok_cur_tag.text += "-#{c}"
4056 tok_state = tok_state_comment
4059 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Inside a comment after "--": branches return to the data state (one
# reconsuming), go to the comment-end-bang state, or append a single '-' and
# stay here. The remaining two append the pending "--" plus U+FFFD or the
# character and resume the comment state.
4060 tok_state_comment_end = ->
4061 switch c = txt.charAt(cur++)
4063 tok_state = tok_state_data
4067 tok_cur_tag.text += "--\ufffd"
4068 tok_state = tok_state_comment
4071 tok_state = tok_state_comment_end_bang
4074 tok_cur_tag.text += '-'
4077 tok_state = tok_state_data
4078 cur -= 1 # Reconsume
4082 tok_cur_tag.text += "--#{c}"
4083 tok_state = tok_state_comment
4086 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Inside a comment after "--!": the first branch hands off to the
# comment-end-dash state and two others return to the data state (one
# reconsuming). The remaining two append the pending "--!" plus U+FFFD or the
# character and resume the comment state.
4087 tok_state_comment_end_bang = ->
4088 switch c = txt.charAt(cur++)
# append only the pending "--!" here: the comment-end-dash state adds the '-'
# itself, so including the current character would duplicate it
4090 tok_cur_tag.text += "--!"
4091 tok_state = tok_state_comment_end_dash
4093 tok_state = tok_state_data
4097 tok_cur_tag.text += "--!\ufffd"
4098 tok_state = tok_state_comment
4101 tok_state = tok_state_data
4102 cur -= 1 # Reconsume
4105 tok_cur_tag.text += "--!#{c}"
4106 tok_state = tok_state_comment
4109 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# First character after "<!DOCTYPE": whitespace moves on to the
# before-DOCTYPE-name state. One branch returns to the data state with a
# reconsume after creating an empty-named doctype token with 'force-quirks'
# set. Otherwise the character is reconsumed in the before-DOCTYPE-name state.
4110 tok_state_doctype = ->
4111 switch c = txt.charAt(cur++)
4112 when "\t", "\u000a", "\u000c", ' '
4113 tok_state = tok_state_before_doctype_name
4116 tok_state = tok_state_data
4117 el = new_doctype_token ''
4118 el.flag 'force-quirks', true
4119 cur -= 1 # Reconsume
4123 tok_state = tok_state_before_doctype_name
4124 cur -= 1 # Reconsume
4127 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Starts the DOCTYPE name: tok_cur_tag becomes a new doctype token whose name
# begins with the lowercased character, U+FFFD or the raw character, and the
# DOCTYPE-name state takes over. Two branches instead create an empty-named
# token with 'force-quirks' set and return to the data state, one of them
# reconsuming.
4128 tok_state_before_doctype_name = ->
4129 c = txt.charAt(cur++)
4130 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4133 tok_cur_tag = new_doctype_token c.toLowerCase()
4134 tok_state = tok_state_doctype_name
4138 tok_cur_tag = new_doctype_token "\ufffd"
4139 tok_state = tok_state_doctype_name
4143 el = new_doctype_token ''
4144 el.flag 'force-quirks', true
4145 tok_state = tok_state_data
4149 tok_state = tok_state_data
4150 el = new_doctype_token ''
4151 el.flag 'force-quirks', true
4152 cur -= 1 # Reconsume
4155 tok_cur_tag = new_doctype_token c
4156 tok_state = tok_state_doctype_name
4159 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Extends tok_cur_tag.name with the lowercased character, U+FFFD or the raw
# character. Whitespace moves to the after-DOCTYPE-name state. Two branches
# return to the data state; the one that reconsumes also sets 'force-quirks'.
4160 tok_state_doctype_name = ->
4161 c = txt.charAt(cur++)
4162 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4163 tok_state = tok_state_after_doctype_name
4166 tok_state = tok_state_data
4169 tok_cur_tag.name += c.toLowerCase()
4173 tok_cur_tag.name += "\ufffd"
4177 tok_state = tok_state_data
4178 tok_cur_tag.flag 'force-quirks', true
4179 cur -= 1 # Reconsume
4182 tok_cur_tag.name += c
4185 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# After the DOCTYPE name, looks for the PUBLIC or SYSTEM keyword. The
# six-character comparisons start at cur - 1 because c has already been
# consumed, and they ignore case. Otherwise 'force-quirks' is set and the
# bogus-DOCTYPE state takes over. Two earlier branches return to the data
# state; the one that reconsumes also sets 'force-quirks'.
4186 tok_state_after_doctype_name = ->
4187 c = txt.charAt(cur++)
4188 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4191 tok_state = tok_state_data
4195 tok_state = tok_state_data
4196 tok_cur_tag.flag 'force-quirks', true
4197 cur -= 1 # Reconsume
4200 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4202 tok_state = tok_state_after_doctype_public_keyword
4204 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4206 tok_state = tok_state_after_doctype_system_keyword
4209 tok_cur_tag.flag 'force-quirks', true
4210 tok_state = tok_state_bogus_doctype
4213 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Runs right after the PUBLIC keyword: whitespace moves to the
# before-public-identifier state. Two branches reset public_identifier to ''
# and enter the double- or single-quoted identifier state. The remaining
# branches set 'force-quirks' and go to the data state (one reconsuming) or
# the bogus-DOCTYPE state.
4214 tok_state_after_doctype_public_keyword = ->
4215 c = txt.charAt(cur++)
4216 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4217 tok_state = tok_state_before_doctype_public_identifier
4221 tok_cur_tag.public_identifier = ''
4222 tok_state = tok_state_doctype_public_identifier_double_quoted
4226 tok_cur_tag.public_identifier = ''
4227 tok_state = tok_state_doctype_public_identifier_single_quoted
4231 tok_cur_tag.flag 'force-quirks', true
4232 tok_state = tok_state_data
4236 tok_state = tok_state_data
4237 tok_cur_tag.flag 'force-quirks', true
4238 cur -= 1 # Reconsume
4242 tok_cur_tag.flag 'force-quirks', true
4243 tok_state = tok_state_bogus_doctype
4246 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Runs after PUBLIC plus whitespace: two branches reset public_identifier to
# '' and enter the double- or single-quoted identifier state. The remaining
# branches set 'force-quirks' and go to the data state (one reconsuming) or
# the bogus-DOCTYPE state.
4247 tok_state_before_doctype_public_identifier = ->
4248 c = txt.charAt(cur++)
4249 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4253 tok_cur_tag.public_identifier = ''
4254 tok_state = tok_state_doctype_public_identifier_double_quoted
4258 tok_cur_tag.public_identifier = ''
4259 tok_state = tok_state_doctype_public_identifier_single_quoted
4263 tok_cur_tag.flag 'force-quirks', true
4264 tok_state = tok_state_data
4268 tok_state = tok_state_data
4269 tok_cur_tag.flag 'force-quirks', true
4270 cur -= 1 # Reconsume
4274 tok_cur_tag.flag 'force-quirks', true
4275 tok_state = tok_state_bogus_doctype
4279 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Reads a double-quoted public identifier: appends U+FFFD or the character to
# tok_cur_tag.public_identifier, or moves to the after-public-identifier
# state. Two branches set 'force-quirks' and return to the data state, one of
# them reconsuming.
4280 tok_state_doctype_public_identifier_double_quoted = ->
4281 c = txt.charAt(cur++)
4283 tok_state = tok_state_after_doctype_public_identifier
4287 tok_cur_tag.public_identifier += "\ufffd"
4291 tok_cur_tag.flag 'force-quirks', true
4292 tok_state = tok_state_data
4296 tok_state = tok_state_data
4297 tok_cur_tag.flag 'force-quirks', true
4298 cur -= 1 # Reconsume
4301 tok_cur_tag.public_identifier += c
4304 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Reads a single-quoted public identifier: appends U+FFFD or the character to
# tok_cur_tag.public_identifier, or moves to the after-public-identifier
# state. Two branches set 'force-quirks' and return to the data state, one of
# them reconsuming.
4305 tok_state_doctype_public_identifier_single_quoted = ->
4306 c = txt.charAt(cur++)
4308 tok_state = tok_state_after_doctype_public_identifier
4312 tok_cur_tag.public_identifier += "\ufffd"
4316 tok_cur_tag.flag 'force-quirks', true
4317 tok_state = tok_state_data
4321 tok_state = tok_state_data
4322 tok_cur_tag.flag 'force-quirks', true
4323 cur -= 1 # Reconsume
4326 tok_cur_tag.public_identifier += c
4329 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Runs after the closing quote of the public identifier: whitespace moves to
# the between-identifiers state and one branch returns to the data state. Two
# branches reset system_identifier to '' and enter the double- or
# single-quoted system identifier state. The last two set 'force-quirks' and
# go to the data state (reconsuming) or the bogus-DOCTYPE state.
4330 tok_state_after_doctype_public_identifier = ->
4331 c = txt.charAt(cur++)
4332 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4333 tok_state = tok_state_between_doctype_public_and_system_identifiers
4336 tok_state = tok_state_data
4340 tok_cur_tag.system_identifier = ''
4341 tok_state = tok_state_doctype_system_identifier_double_quoted
4345 tok_cur_tag.system_identifier = ''
4346 tok_state = tok_state_doctype_system_identifier_single_quoted
4350 tok_state = tok_state_data
4351 tok_cur_tag.flag 'force-quirks', true
4352 cur -= 1 # Reconsume
4356 tok_cur_tag.flag 'force-quirks', true
4357 tok_state = tok_state_bogus_doctype
4360 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Runs in the whitespace between the public and system identifiers: one branch
# returns to the data state. Two branches reset system_identifier to '' and
# enter the double- or single-quoted system identifier state. The last two set
# 'force-quirks' and go to the data state (reconsuming) or the bogus-DOCTYPE
# state.
4361 tok_state_between_doctype_public_and_system_identifiers = ->
4362 c = txt.charAt(cur++)
4363 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4366 tok_state = tok_state_data
4370 tok_cur_tag.system_identifier = ''
4371 tok_state = tok_state_doctype_system_identifier_double_quoted
4375 tok_cur_tag.system_identifier = ''
4376 tok_state = tok_state_doctype_system_identifier_single_quoted
4380 tok_state = tok_state_data
4381 tok_cur_tag.flag 'force-quirks', true
4382 cur -= 1 # Reconsume
4386 tok_cur_tag.flag 'force-quirks', true
4387 tok_state = tok_state_bogus_doctype
4390 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Runs right after the SYSTEM keyword: whitespace moves to the
# before-system-identifier state. Two branches reset system_identifier to ''
# and enter the double- or single-quoted identifier state. The remaining
# branches set 'force-quirks' and go to the data state (one reconsuming) or
# the bogus-DOCTYPE state.
4391 tok_state_after_doctype_system_keyword = ->
4392 c = txt.charAt(cur++)
4393 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4394 tok_state = tok_state_before_doctype_system_identifier
4398 tok_cur_tag.system_identifier = ''
4399 tok_state = tok_state_doctype_system_identifier_double_quoted
4403 tok_cur_tag.system_identifier = ''
4404 tok_state = tok_state_doctype_system_identifier_single_quoted
4408 tok_cur_tag.flag 'force-quirks', true
4409 tok_state = tok_state_data
4413 tok_state = tok_state_data
4414 tok_cur_tag.flag 'force-quirks', true
4415 cur -= 1 # Reconsume
4419 tok_cur_tag.flag 'force-quirks', true
4420 tok_state = tok_state_bogus_doctype
4423 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Runs after SYSTEM plus whitespace: two branches reset system_identifier to
# '' and enter the double- or single-quoted identifier state. The remaining
# branches set 'force-quirks' and go to the data state (one reconsuming) or
# the bogus-DOCTYPE state.
4424 tok_state_before_doctype_system_identifier = ->
4425 c = txt.charAt(cur++)
4426 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4429 tok_cur_tag.system_identifier = ''
4430 tok_state = tok_state_doctype_system_identifier_double_quoted
4433 tok_cur_tag.system_identifier = ''
4434 tok_state = tok_state_doctype_system_identifier_single_quoted
4438 tok_cur_tag.flag 'force-quirks', true
4439 tok_state = tok_state_data
4443 tok_state = tok_state_data
4444 tok_cur_tag.flag 'force-quirks', true
4445 cur -= 1 # Reconsume
4449 tok_cur_tag.flag 'force-quirks', true
4450 tok_state = tok_state_bogus_doctype
4453 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Reads a double-quoted system identifier: appends U+FFFD or the character to
# tok_cur_tag.system_identifier, or moves to the after-system-identifier
# state. Two branches set 'force-quirks' and return to the data state, one of
# them reconsuming.
4454 tok_state_doctype_system_identifier_double_quoted = ->
4455 c = txt.charAt(cur++)
4457 tok_state = tok_state_after_doctype_system_identifier
4461 tok_cur_tag.system_identifier += "\ufffd"
4465 tok_cur_tag.flag 'force-quirks', true
4466 tok_state = tok_state_data
4470 tok_state = tok_state_data
4471 tok_cur_tag.flag 'force-quirks', true
4472 cur -= 1 # Reconsume
4475 tok_cur_tag.system_identifier += c
4478 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Reads a single-quoted system identifier: appends U+FFFD or the character to
# tok_cur_tag.system_identifier, or moves to the after-system-identifier
# state. Two branches set 'force-quirks' and return to the data state, one of
# them reconsuming.
4479 tok_state_doctype_system_identifier_single_quoted = ->
4480 c = txt.charAt(cur++)
4482 tok_state = tok_state_after_doctype_system_identifier
4486 tok_cur_tag.system_identifier += "\ufffd"
4490 tok_cur_tag.flag 'force-quirks', true
4491 tok_state = tok_state_data
4495 tok_state = tok_state_data
4496 tok_cur_tag.flag 'force-quirks', true
4497 cur -= 1 # Reconsume
4500 tok_cur_tag.system_identifier += c
4503 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Runs after the closing quote of the system identifier: one branch returns to
# the data state, and another does so with 'force-quirks' set and a reconsume.
# The fallback enters the bogus-DOCTYPE state and deliberately leaves
# 'force-quirks' untouched.
4504 tok_state_after_doctype_system_identifier = ->
4505 c = txt.charAt(cur++)
4506 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4509 tok_state = tok_state_data
4513 tok_state = tok_state_data
4514 tok_cur_tag.flag 'force-quirks', true
4515 cur -= 1 # Reconsume
4519 # do _not_ tok_cur_tag.flag 'force-quirks', true
4520 tok_state = tok_state_bogus_doctype
4523 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Handles the remainder of a malformed DOCTYPE: both visible branches return
# to the data state, the second after un-reading the character.
4524 tok_state_bogus_doctype = ->
4525 c = txt.charAt(cur++)
4527 tok_state = tok_state_data
4530 tok_state = tok_state_data
4531 cur -= 1 # Reconsume
4536 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Consumes a whole CDATA section in one step. The state is switched back to
# data up front. The text from cur up to the next "]]>" (or the rest of the
# input when there is none) has every U+0000 replaced with U+FFFD and is
# emitted as a single character token.
# (next_gt holds the index of "]]>" despite its name.)
4537 tok_state_cdata_section = ->
4538 tok_state = tok_state_data
4539 next_gt = txt.indexOf ']]>', cur
4541 val = txt.substr cur
4544 val = txt.substr cur, (next_gt - cur)
4546 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
4548 return new_character_token val # fixfull split
4551 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4552 # Don't set this as a state, just call it
4553 # returns a string (NOT a text node)
# Decodes the character reference that starts at cur (just after the '&').
#   allowed_char - extra character that, like whitespace, '<' and '&', means
#                  "not a reference"; the attribute-value states pass their
#                  closing delimiter here
#   in_attr      - true when called from an attribute value state
# Numeric references are parsed with parseInt in `base`, remapped through
# unicode_fixes, checked against the surrogate / out-of-range / disallowed
# code point lists, and converted with from_code_point. Named references are
# looked up with decode_named_char_ref when ';'-terminated, and in
# legacy_char_refs otherwise.
4554 parse_character_reference = (allowed_char = null, in_attr = false) ->
4555 if cur >= txt.length
4557 switch c = txt.charAt(cur)
4558 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4559 # explicitly not a parse error
4562 # there has to be "one or more" alnums between & and ; to be a parse error
4565 if cur + 1 >= txt.length
4567 if txt.charAt(cur + 1).toLowerCase() is 'x'
4576 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4581 if txt.charAt(start + i) is ';'
4585 code_point = txt.substr(start, i)
# strip leading zeros (keeping at least one digit) before parsing
4586 while code_point.charAt(0) is '0' and code_point.length > 1
4587 code_point = code_point.substr 1
4588 code_point = parseInt(code_point, base)
4589 if unicode_fixes[code_point]?
4591 return unicode_fixes[code_point]
4593 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4597 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4599 return from_code_point code_point
4603 if alnum.indexOf(txt.charAt(cur + i)) is -1
4606 # exit early, because parse_error() below needs at least one alnum
4608 if txt.charAt(cur + i) is ';'
4609 i += 1 # include ';' terminator in value
4610 decoded = decode_named_char_ref txt.substr(cur, i)
4617 # no ';' terminator (only legacy char refs)
4619 for i in [2..max] # no prefix matches, so ok to check shortest first
4620 c = legacy_char_refs[txt.substr(cur, i)]
4623 if txt.charAt(cur + i) is '='
4624 # "because some legacy user agents will
4625 # misinterpret the markup in those cases"
4628 if alnum.indexOf(txt.charAt(cur + i)) > -1
4629 # this makes attributes forgiving about url args
4631 # ok, and besides the weird exceptions for attributes...
4632 # return the matching char
4633 cur += i # consume entity chars
4634 parse_error() # because no terminating ";"
4638 return # never reached
# Helper that, going by its name, skips the upcoming token when that token is
# a newline. Takes no arguments; the visible checks test `t`, `cur` and
# `old_cur`.
# NOTE(review): the statements that set `t`/`old_cur` and the ones that act on
# a match are not visible in this excerpt -- confirm against the full body.
4640 eat_next_token_if_newline = ->
# only text tokens are examined
4645 if t.type is TYPE_TEXT
4646 # definition of a newline depends on whether it was a character ref or not
# the cursor advanced by exactly one character
4647 if cur - old_cur is 1
4648 # not a character reference
# here either CR (U+000D) or LF (U+000A) is accepted
4649 if t.text is "\u000d" or t.text is "\u000a"
# this check accepts LF (U+000A) only; presumably the character-reference
# case -- TODO confirm
4652 if t.text is "\u000a"
# Initial parser state: the tree constructor's variables, the tokenizer's
# starting state, and normalization of line endings in the input text.
# NOTE(review): some statements of this section are not visible in this
# excerpt.
4658 # tree constructor initialization
4659 # see comments on TYPE_TAG/etc for the structure of this data
# root node: an 'html' tag node in the HTML namespace
4662 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4663 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4665 afe = [] # active formatting elements
4666 template_ins_modes = [] # starts empty
# insertion mode starts at ins_mode_initial
4667 ins_mode = ins_mode_initial
4668 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to true when args.scripting is null/undefined
4669 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4670 flag_frameset_ok = true
4672 flag_foster_parenting = false
4673 form_element_pointer = null # nothing recorded yet
4674 temporary_buffer = null
4675 pending_table_character_tokens = []
4676 head_element_pointer = null # nothing recorded yet
4677 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4678 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4679 prev_node_id = 0 # just for debugging
4681 # tokenizer initialization
4682 tok_state = tok_state_data
4684 # text pre-processing
4685 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# CRLF pairs are replaced first so that each pair yields a single "\n";
# the second pass then converts any remaining lone "\r"
4686 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4687 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# NOTE(review): special case keyed on one specific args.name value; its body
# is not visible in this excerpt -- looks like a per-test hook, confirm
4689 if args.name is "webkit01.dat #12"
4692 # http://www.w3.org/TR/html5/syntax.html#tree-construction
# Takes no arguments.
# NOTE(review): the body is not visible in this excerpt; going by the name
# this is the parser's main loop -- confirm against the full source.
4693 parse_main_loop = ->
4698 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds up a string (`serialized`) out of the results of serialize() calls.
#   els:               the items to serialize -- presumably a list of nodes,
#                      each providing serialize(); TODO confirm
#   shallow, show_ids: passed through unchanged to serialize()
# NOTE(review): the iteration over `els` and the return statement are not
# visible in this excerpt.
4702 serialize_els = (els, shallow, show_ids) ->
# append this item's serialization to the accumulated string
4708 serialized += t.serialize shallow, show_ids
# Public interface of this module: the parse_html entry point, the debug-log
# helpers, and the node-type (TYPE_*), namespace (NS_*) and quirks-mode
# (QUIRKS_*) constants. The names are gathered in one table and then copied
# onto module.exports in the order listed.
public_api = {
  parse_html: parse_html
  debug_log_reset: debug_log_reset
  debug_log_each: debug_log_each
  TYPE_TAG: TYPE_TAG
  TYPE_TEXT: TYPE_TEXT
  TYPE_COMMENT: TYPE_COMMENT
  TYPE_DOCTYPE: TYPE_DOCTYPE
  NS_HTML: NS_HTML
  NS_MATHML: NS_MATHML
  NS_SVG: NS_SVG
  QUIRKS_NO: QUIRKS_NO
  QUIRKS_LIMITED: QUIRKS_LIMITED
  QUIRKS_YES: QUIRKS_YES
}
module.exports[export_name] = export_value for own export_name, export_value of public_api