1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WHATWG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # the spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
87 # quirks mode constants
97 debug_log_each = (cb) ->
98 for str in g_debug_log
103 constructor: (type, args = {}) ->
104 @type = type # one of the TYPE_* constants above
105 @name = args.name ? '' # tag name
106 @text = args.text ? '' # contents for text/comment nodes
107 @attrs = args.attrs ? {}
108 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
109 @children = args.children ? []
110 @namespace = args.namespace ? NS_HTML
111 @parent = args.parent ? null
112 @token = args.token ? null
113 @flags = args.flags ? {}
117 @id = "#{++prev_node_id}"
118 acknowledge_self_closing: ->
120 @token.flag 'did_self_close', true
122 @flag 'did_self_close', true
123 flag: (key, value = null) ->
128 serialize: (shallow = false, show_ids = false) -> # for unit tests
133 ret += JSON.stringify @name
148 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
154 ret += c.serialize shallow, show_ids
158 ret += JSON.stringify @text
161 ret += JSON.stringify @text
163 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
166 when TYPE_AAA_BOOKMARK
167 ret += 'aaa_bookmark'
170 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
173 # helpers: (only take args that are normally known when parser creates nodes)
# Factory: a TYPE_START_TAG Node carrying only its tag name; every other
# field is left to the Node constructor's defaults.
new_open_tag = (name) ->
  new Node TYPE_START_TAG, {name}
# Factory: a TYPE_END_TAG Node carrying only its tag name.
new_end_tag = (name) ->
  new Node TYPE_END_TAG, {name}
# Factory: a TYPE_TAG Node (a tree element) with the given tag name.
new_element = (name) ->
  new Node(TYPE_TAG, {name})
# Factory: a TYPE_TEXT Node whose text is `txt`.
new_text_node = (txt) ->
  new Node TYPE_TEXT, text: txt
# Alias: new_character_token is the very same factory function.
new_character_token = new_text_node
# Factory: a TYPE_COMMENT Node whose text is `txt`.
new_comment_token = (txt) ->
  new Node(TYPE_COMMENT, text: txt)
# Factory: a TYPE_DOCTYPE Node with the given name.
new_doctype_token = (name) ->
  new Node TYPE_DOCTYPE, {name}
188 return new Node TYPE_EOF
190 return new Node TYPE_AFE_MARKER
# Factory: a TYPE_AAA_BOOKMARK Node (no arguments, all Node defaults).
new_aaa_bookmark = ->
  new Node(TYPE_AAA_BOOKMARK)
# Character-class strings. Membership is tested elsewhere with indexOf().
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
# letters (lower case first, then upper case) followed by the digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# digits followed by the hex letters in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True only when `str` is exactly one character long and that character
# appears in uc_alpha.
is_uc_alpha = (str) ->
  return false unless str.length is 1
  uc_alpha.indexOf(str) isnt -1
# True only when `str` is exactly one character long and that character
# appears in lc_alpha.
is_lc_alpha = (str) ->
  return false unless str.length is 1
  lc_alpha.indexOf(str) isnt -1
# Characters accepted in tag names: everything in alnum plus the dash
# (some SVG elements have dashes in their names).
tag_name_chars = "#{alnum}-"
# The HTML "space characters", in this order:
# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = [
  "\u0009" # character tabulation
  "\u000a" # line feed
  "\u000c" # form feed
  "\u000d" # carriage return
  "\u0020" # space
].join ''
211 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is exactly one
# character and that character is listed in space_chars.
is_space_tok = (t) ->
  return false unless t.type is TYPE_TEXT
  return false unless t.text.length is 1
  space_chars.indexOf(t.text) isnt -1
215 is_input_hidden_tok = (t) ->
216 return false unless t.type is TYPE_START_TAG
219 if a[1].toLowerCase() is 'hidden'
# Unicode whitespace characters, grouped by code-point range for readability;
# the joined string is the same single string as before.
# https://en.wikipedia.org/wiki/Whitespace_character#Unicode
whitespace_chars = [
  "\u0009\u000a\u000b\u000c\u000d\u0020"
  "\u0085\u00a0\u1680"
  "\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a"
  "\u2028\u2029\u202f\u205f\u3000"
].join ''
228 unicode_fixes[0x00] = "\uFFFD"
229 unicode_fixes[0x80] = "\u20AC"
230 unicode_fixes[0x82] = "\u201A"
231 unicode_fixes[0x83] = "\u0192"
232 unicode_fixes[0x84] = "\u201E"
233 unicode_fixes[0x85] = "\u2026"
234 unicode_fixes[0x86] = "\u2020"
235 unicode_fixes[0x87] = "\u2021"
236 unicode_fixes[0x88] = "\u02C6"
237 unicode_fixes[0x89] = "\u2030"
238 unicode_fixes[0x8A] = "\u0160"
239 unicode_fixes[0x8B] = "\u2039"
240 unicode_fixes[0x8C] = "\u0152"
241 unicode_fixes[0x8E] = "\u017D"
242 unicode_fixes[0x91] = "\u2018"
243 unicode_fixes[0x92] = "\u2019"
244 unicode_fixes[0x93] = "\u201C"
245 unicode_fixes[0x94] = "\u201D"
246 unicode_fixes[0x95] = "\u2022"
247 unicode_fixes[0x96] = "\u2013"
248 unicode_fixes[0x97] = "\u2014"
249 unicode_fixes[0x98] = "\u02DC"
250 unicode_fixes[0x99] = "\u2122"
251 unicode_fixes[0x9A] = "\u0161"
252 unicode_fixes[0x9B] = "\u203A"
253 unicode_fixes[0x9C] = "\u0153"
254 unicode_fixes[0x9E] = "\u017E"
255 unicode_fixes[0x9F] = "\u0178"
257 quirks_yes_pi_prefixes = [
258 "+//silmaril//dtd html pro v0r11 19970101//"
259 "-//as//dtd html 3.0 aswedit + extensions//"
260 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
261 "-//ietf//dtd html 2.0 level 1//"
262 "-//ietf//dtd html 2.0 level 2//"
263 "-//ietf//dtd html 2.0 strict level 1//"
264 "-//ietf//dtd html 2.0 strict level 2//"
265 "-//ietf//dtd html 2.0 strict//"
266 "-//ietf//dtd html 2.0//"
267 "-//ietf//dtd html 2.1e//"
268 "-//ietf//dtd html 3.0//"
269 "-//ietf//dtd html 3.2 final//"
270 "-//ietf//dtd html 3.2//"
271 "-//ietf//dtd html 3//"
272 "-//ietf//dtd html level 0//"
273 "-//ietf//dtd html level 1//"
274 "-//ietf//dtd html level 2//"
275 "-//ietf//dtd html level 3//"
276 "-//ietf//dtd html strict level 0//"
277 "-//ietf//dtd html strict level 1//"
278 "-//ietf//dtd html strict level 2//"
279 "-//ietf//dtd html strict level 3//"
280 "-//ietf//dtd html strict//"
281 "-//ietf//dtd html//"
282 "-//metrius//dtd metrius presentational//"
283 "-//microsoft//dtd internet explorer 2.0 html strict//"
284 "-//microsoft//dtd internet explorer 2.0 html//"
285 "-//microsoft//dtd internet explorer 2.0 tables//"
286 "-//microsoft//dtd internet explorer 3.0 html strict//"
287 "-//microsoft//dtd internet explorer 3.0 html//"
288 "-//microsoft//dtd internet explorer 3.0 tables//"
289 "-//netscape comm. corp.//dtd html//"
290 "-//netscape comm. corp.//dtd strict html//"
291 "-//o'reilly and associates//dtd html 2.0//"
292 "-//o'reilly and associates//dtd html extended 1.0//"
293 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
294 "-//sq//dtd html 2.0 hotmetal + extensions//"
295 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
296 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
297 "-//spyglass//dtd html 2.0 extended//"
298 "-//sun microsystems corp.//dtd hotjava html//"
299 "-//sun microsystems corp.//dtd hotjava strict html//"
300 "-//w3c//dtd html 3 1995-03-24//"
301 "-//w3c//dtd html 3.2 draft//"
302 "-//w3c//dtd html 3.2 final//"
303 "-//w3c//dtd html 3.2//"
304 "-//w3c//dtd html 3.2s draft//"
305 "-//w3c//dtd html 4.0 frameset//"
306 "-//w3c//dtd html 4.0 transitional//"
307 "-//w3c//dtd html experimental 19960712//"
308 "-//w3c//dtd html experimental 970421//"
309 "-//w3c//dtd w3 html//"
310 "-//w3o//dtd w3 html 3.0//"
311 "-//webtechs//dtd mozilla html 2.0//"
312 "-//webtechs//dtd mozilla html//"
315 # These are the character references that don't need a terminating semicolon
316 # min length: 2, max: 6, none are a prefix of any other.
318 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
319 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
320 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
321 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
322 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
323 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
324 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
325 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
326 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
327 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
328 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
329 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
330 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
331 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
332 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
333 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
334 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists for three element kinds (void, raw text, escapable raw text).
void_elements = [
  'area', 'base', 'br', 'col', 'embed'
  'hr', 'img', 'input', 'keygen', 'link'
  'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
  'script'
  'style'
]
escapable_raw_text_elements = [
  'textarea'
  'title'
]
341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
343 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
344 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
345 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
346 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
347 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
348 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
349 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
350 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
351 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
352 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
353 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
354 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
355 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
356 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
362 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
363 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
364 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
365 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
366 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
367 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
368 'determinant', 'diff', 'divergence', 'divide', 'domain',
369 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
370 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
371 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
372 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
373 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
374 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
375 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
376 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
377 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
378 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
379 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
380 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
381 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
382 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
383 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
384 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
385 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
386 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
387 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
388 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
389 'vectorproduct', 'xor'
391 # foreign_elements = [svg_elements..., mathml_elements...]
392 #normal_elements = All other allowed HTML elements are normal elements.
396 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
397 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
398 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
399 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
400 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
401 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
402 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
403 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
404 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
405 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
406 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
408 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
410 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
411 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
412 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
413 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
414 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
415 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
416 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
419 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
420 'annotation-xml':NS_MATHML,
423 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
426 formatting_elements = {
427 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
428 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
432 mathml_text_integration = {
433 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when the namespace recorded for el's name in mathml_text_integration
# is el's own namespace.
is_mathml_text_integration_point = (el) ->
  {name, namespace} = el
  mathml_text_integration[name] is namespace
437 is_html_integration = (el) -> # DON'T PASS A TOKEN
438 if el.namespace is NS_MATHML
439 if el.name is 'annotation-xml'
440 if el.attrs.encoding?
441 if el.attrs.encoding.toLowerCase() is 'text/html'
443 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
446 if el.namespace is NS_SVG
447 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
452 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
455 foster_parenting_targets = {
# True when special_elements maps e's name to e's own namespace.
el_is_special = (e) ->
  {name, namespace} = e
  special_elements[name] is namespace
# The three HTML elements (address, div, p) that el_is_special_not_adp
# excludes.
adp_els =
  address: NS_HTML
  div: NS_HTML
  p: NS_HTML

# True when `el` is listed in special_elements under its own namespace and
# is not one of the adp_els entries under that namespace.
el_is_special_not_adp = (el) ->
  return false unless special_elements[el.name] is el.namespace
  adp_els[el.name] isnt el.namespace
485 altglyphdef: 'altGlyphDef'
486 altglyphitem: 'altGlyphItem'
487 animatecolor: 'animateColor'
488 animatemotion: 'animateMotion'
489 animatetransform: 'animateTransform'
492 fecolormatrix: 'feColorMatrix'
493 fecomponenttransfer: 'feComponentTransfer'
494 fecomposite: 'feComposite'
495 feconvolvematrix: 'feConvolveMatrix'
496 fediffuselighting: 'feDiffuseLighting'
497 fedisplacementmap: 'feDisplacementMap'
498 fedistantlight: 'feDistantLight'
499 fedropshadow: 'feDropShadow'
505 fegaussianblur: 'feGaussianBlur'
508 femergenode: 'feMergeNode'
509 femorphology: 'feMorphology'
511 fepointlight: 'fePointLight'
512 fespecularlighting: 'feSpecularLighting'
513 fespotlight: 'feSpotLight'
515 feturbulence: 'feTurbulence'
516 foreignobject: 'foreignObject'
518 lineargradient: 'linearGradient'
519 radialgradient: 'radialGradient'
522 svg_attribute_fixes = {
523 attributename: 'attributeName'
524 attributetype: 'attributeType'
525 basefrequency: 'baseFrequency'
526 baseprofile: 'baseProfile'
528 clippathunits: 'clipPathUnits'
529 contentscripttype: 'contentScriptType'
530 contentstyletype: 'contentStyleType'
531 diffuseconstant: 'diffuseConstant'
533 externalresourcesrequired: 'externalResourcesRequired'
534 # WHATWG removes this: filterres: 'filterRes'
535 filterunits: 'filterUnits'
537 gradienttransform: 'gradientTransform'
538 gradientunits: 'gradientUnits'
539 kernelmatrix: 'kernelMatrix'
540 kernelunitlength: 'kernelUnitLength'
541 keypoints: 'keyPoints'
542 keysplines: 'keySplines'
544 lengthadjust: 'lengthAdjust'
545 limitingconeangle: 'limitingConeAngle'
546 markerheight: 'markerHeight'
547 markerunits: 'markerUnits'
548 markerwidth: 'markerWidth'
549 maskcontentunits: 'maskContentUnits'
550 maskunits: 'maskUnits'
551 numoctaves: 'numOctaves'
552 pathlength: 'pathLength'
553 patterncontentunits: 'patternContentUnits'
554 patterntransform: 'patternTransform'
555 patternunits: 'patternUnits'
556 pointsatx: 'pointsAtX'
557 pointsaty: 'pointsAtY'
558 pointsatz: 'pointsAtZ'
559 preservealpha: 'preserveAlpha'
560 preserveaspectratio: 'preserveAspectRatio'
561 primitiveunits: 'primitiveUnits'
564 repeatcount: 'repeatCount'
565 repeatdur: 'repeatDur'
566 requiredextensions: 'requiredExtensions'
567 requiredfeatures: 'requiredFeatures'
568 specularconstant: 'specularConstant'
569 specularexponent: 'specularExponent'
570 spreadmethod: 'spreadMethod'
571 startoffset: 'startOffset'
572 stddeviation: 'stdDeviation'
573 stitchtiles: 'stitchTiles'
574 surfacescale: 'surfaceScale'
575 systemlanguage: 'systemLanguage'
576 tablevalues: 'tableValues'
579 textlength: 'textLength'
581 viewtarget: 'viewTarget'
582 xchannelselector: 'xChannelSelector'
583 ychannelselector: 'yChannelSelector'
584 zoomandpan: 'zoomAndPan'
586 foreign_attr_fixes = {
587 'xlink:actuate': 'xlink actuate'
588 'xlink:arcrole': 'xlink arcrole'
589 'xlink:href': 'xlink href'
590 'xlink:role': 'xlink role'
591 'xlink:show': 'xlink show'
592 'xlink:title': 'xlink title'
593 'xlink:type': 'xlink type'
594 'xml:base': 'xml base'
595 'xml:lang': 'xml lang'
596 'xml:space': 'xml space'
598 'xmlns:xlink': 'xmlns xlink'
600 adjust_mathml_attributes = (t) ->
602 if a[0] is 'definitionurl'
603 a[0] = 'definitionURL'
605 adjust_svg_attributes = (t) ->
607 if svg_attribute_fixes[a[0]]?
608 a[0] = svg_attribute_fixes[a[0]]
610 adjust_foreign_attributes = (t) ->
613 if foreign_attr_fixes[a[0]]?
614 a[0] = foreign_attr_fixes[a[0]]
617 # decode_named_char_ref()
619 # The list of named character references is _huge_ so ask the browser to decode
620 # for us instead of wasting bandwidth/space on including the table here.
622 # Pass without the "&" but with the ";" examples:
623 # for "&" pass "amp;"
624 # for "′" pass "x2032;"
627 textarea: document.createElement('textarea')
629 # TODO test this in IE8
630 decode_named_char_ref = (txt) ->
632 decoded = g_dncr.cache[txt]
633 return decoded if decoded?
634 g_dncr.textarea.innerHTML = txt
635 decoded = g_dncr.textarea.value
636 return null if decoded is txt
637 return g_dncr.cache[txt] = decoded
639 parse_html = (args) ->
641 cur = null # index of next char in txt to be parsed
642 # declare doc and tokenizer variables so they're in scope below
644 open_els = null # stack of open elements
645 afe = null # active formatting elements
646 template_ins_modes = null
648 original_ins_mode = null
650 tok_cur_tag = null # partially parsed tag
651 flag_scripting = null
652 flag_frameset_ok = null
654 flag_foster_parenting = null
655 form_element_pointer = null
656 temporary_buffer = null
657 pending_table_character_tokens = null
658 head_element_pointer = null
659 flag_fragment_parsing = null
660 context_element = null
669 console.log "Parse error at character #{cur} of #{txt.length}"
671 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
672 # "Noah's Ark clause" but with three
673 afe_push = (new_el) ->
676 if el.type is TYPE_AFE_MARKER
678 if el.name is new_el.name and el.namespace is new_el.namespace
681 unless new_el.attrs[k] is v
685 for k, v of new_el.attrs
686 unless el.attrs[k] is v
696 afe.unshift new_afe_marker()
698 # the functions below implement the Tree Construction algorithm
699 # http://www.w3.org/TR/html5/syntax.html#tree-construction
701 # But first... the helpers
702 template_tag_is_open = ->
704 if el.name is 'template' and el.namespace is NS_HTML
707 is_in_scope_x = (tag_name, scope, namespace) ->
709 if el.name is tag_name and (namespace is null or namespace is el.namespace)
711 if scope[el.name] is el.namespace
714 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
716 if el.name is tag_name and (namespace is null or namespace is el.namespace)
718 if scope[el.name] is el.namespace
720 if scope2[el.name] is el.namespace
724 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
725 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
728 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
729 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
731 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope-boundary tables (tag name -> namespace) handed to the is_in_*_scope
# helpers: button_scopers and li_scopers are passed together with
# standard_scopers, table_scopers is passed on its own.
button_scopers = {
  button: NS_HTML
}
li_scopers = {
  ol: NS_HTML
  ul: NS_HTML
}
table_scopers = {
  html: NS_HTML
  table: NS_HTML
  template: NS_HTML
}
# Scope test for `tag_name` using standard_scopers as the boundary table.
# namespace: null (the default) accepts a match in any namespace.
# Returns whatever is_in_scope_x returns.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope test for `tag_name` bounded by standard_scopers plus button_scopers.
# namespace: null (the default) accepts a match in any namespace.
# Returns whatever is_in_scope_x_y returns.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope test for `tag_name` using only table_scopers as the boundary table.
# namespace: null (the default) accepts a match in any namespace.
# Returns whatever is_in_scope_x returns.
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope test for `tag_name` bounded by standard_scopers plus li_scopers.
# namespace: null (the default) accepts a match in any namespace.
# Returns whatever is_in_scope_x_y returns.
is_in_li_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
745 is_in_select_scope = (tag_name, namespace = null) ->
747 if t.name is tag_name and (namespace is null or namespace is t.namespace)
749 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
752 # this checks for a particular element, not by name
753 # this requires a namespace match
754 el_is_in_scope = (needle) ->
758 if standard_scopers[el.name] is el.namespace
762 clear_to_table_stopers = {
767 clear_stack_to_table_context = ->
769 if clear_to_table_stopers[open_els[0].name]?
773 clear_to_table_body_stopers = {
780 clear_stack_to_table_body_context = ->
782 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
786 clear_to_table_row_stopers = {
791 clear_stack_to_table_row_context = ->
793 if clear_to_table_row_stopers[open_els[0].name]?
797 clear_afe_to_marker = ->
799 return unless afe.length > 0 # this happens in fragment case, ?spec error
801 if el.type is TYPE_AFE_MARKER
806 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
808 # 1. Let last be false.
810 # 2. Let node be the last node in the stack of open elements.
812 node = open_els[node_i]
813 # 3. Loop: If node is the first node in the stack of open elements,
814 # then set last to true, and, if the parser was originally created as
815 # part of the HTML fragment parsing algorithm (fragment case) set node
816 # to the context element.
818 if node_i is open_els.length - 1
820 # fixfull (fragment case)
822 # 4. If node is a select element, run these substeps:
823 if node.name is 'select' and node.namespace is NS_HTML
824 # 1. If last is true, jump to the step below labeled done.
826 # 2. Let ancestor be node.
829 # 3. Loop: If ancestor is the first node in the stack of
830 # open elements, jump to the step below labeled done.
832 if ancestor_i is open_els.length - 1
834 # 4. Let ancestor be the node before ancestor in the stack
837 ancestor = open_els[ancestor_i]
838 # 5. If ancestor is a template node, jump to the step below
840 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
842 # 6. If ancestor is a table node, switch the insertion mode
843 # to "in select in table" and abort these steps.
844 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
845 ins_mode = ins_mode_in_select_in_table
847 # 7. Jump back to the step labeled loop.
848 # 8. Done: Switch the insertion mode to "in select" and abort
850 ins_mode = ins_mode_in_select
852 # 5. If node is a td or th element and last is false, then switch
853 # the insertion mode to "in cell" and abort these steps.
854 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
855 ins_mode = ins_mode_in_cell
857 # 6. If node is a tr element, then switch the insertion mode to "in
858 # row" and abort these steps.
859 if node.name is 'tr' and node.namespace is NS_HTML
860 ins_mode = ins_mode_in_row
862 # 7. If node is a tbody, thead, or tfoot element, then switch the
863 # insertion mode to "in table body" and abort these steps.
864 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
865 ins_mode = ins_mode_in_table_body
867 # 8. If node is a caption element, then switch the insertion mode
868 # to "in caption" and abort these steps.
869 if node.name is 'caption' and node.namespace is NS_HTML
870 ins_mode = ins_mode_in_caption
872 # 9. If node is a colgroup element, then switch the insertion mode
873 # to "in column group" and abort these steps.
874 if node.name is 'colgroup' and node.namespace is NS_HTML
875 ins_mode = ins_mode_in_column_group
877 # 10. If node is a table element, then switch the insertion mode to
878 # "in table" and abort these steps.
879 if node.name is 'table' and node.namespace is NS_HTML
880 ins_mode = ins_mode_in_table
882 # 11. If node is a template element, then switch the insertion mode
883 # to the current template insertion mode and abort these steps.
884 if node.name is 'template' and node.namespace is NS_HTML
885 ins_mode = template_ins_modes[0]
887 # 12. If node is a head element and last is true, then switch the
888 # insertion mode to "in body" ("in body"! not "in head"!) and abort
889 # these steps. (fragment case)
890 if node.name is 'head' and node.namespace is NS_HTML and last
891 ins_mode = ins_mode_in_body
893 # 13. If node is a head element and last is false, then switch the
894 # insertion mode to "in head" and abort these steps.
895 if node.name is 'head' and node.namespace is NS_HTML and last is false
896 ins_mode = ins_mode_in_head
898 # 14. If node is a body element, then switch the insertion mode to
899 # "in body" and abort these steps.
900 if node.name is 'body' and node.namespace is NS_HTML
901 ins_mode = ins_mode_in_body
903 # 15. If node is a frameset element, then switch the insertion mode
904 # to "in frameset" and abort these steps. (fragment case)
905 if node.name is 'frameset' and node.namespace is NS_HTML
906 ins_mode = ins_mode_in_frameset
908 # 16. If node is an html element, run these substeps:
909 if node.name is 'html' and node.namespace is NS_HTML
910 # 1. If the head element pointer is null, switch the insertion
911 # mode to "before head" and abort these steps. (fragment case)
912 if head_element_pointer is null
913 ins_mode = ins_mode_before_head
915 # 2. Otherwise, the head element pointer is not null,
916 # switch the insertion mode to "after head" and abort these
918 ins_mode = ins_mode_after_head
920 # 17. If last is true, then switch the insertion mode to "in body"
921 # and abort these steps. (fragment case)
923 ins_mode = ins_mode_in_body
925 # 18. Let node now be the node before node in the stack of open
928 node = open_els[node_i]
929 # 19. Return to the step labeled loop.
933 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
934 adjusted_current_node = ->
935 if open_els.length is 1 and flag_fragment_parsing
936 return context_element
939 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
940 # this implementation is structured (mostly) as described at the link above.
941 # capitalized comments are the "labels" described at the link above.
943 return if afe.length is 0
944 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
949 if i is afe.length - 1
952 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
957 el = insert_html_element afe[i].token
962 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
963 # adoption agency algorithm
965 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
966 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
967 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
968 adoption_agency = (subject) ->
969 debug_log "adoption_agency()"
970 debug_log "tree: #{serialize_els doc.children, false, true}"
971 debug_log "open_els: #{serialize_els open_els, true, true}"
972 debug_log "afe: #{serialize_els afe, true, true}"
973 # this block implements the W3C spec
974 # # 1. If the current node is an HTML element whose tag name is subject,
975 # # then run these substeps:
977 # # 1. Let element be the current node.
979 # # 2. Pop element off the stack of open elements.
981 # # 3. If element is also in the list of active formatting elements,
982 # # remove the element from the list.
984 # # 4. Abort the adoption agency algorithm.
985 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
986 # el = open_els.shift()
987 # # remove it from the list of active formatting elements (if found)
992 # debug_log "aaa: starting off with subject on top of stack, exiting"
994 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
995 # If the current node is an HTML element whose tag name is subject, and
996 # the current node is not in the list of active formatting elements,
997 # then pop the current node off the stack of open elements, and abort
999 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
1000 debug_log "aaa: starting off with subject on top of stack, exiting"
1001 # remove it from the list of active formatting elements (if found)
1004 if el is open_els[0]
1008 debug_log "aaa: ...and not in afe, aaa done"
1018 # 5. Let formatting element be the last element in the list of
1019 # active formatting elements that: is between the end of the list
1020 # and the last scope marker in the list, if any, or the start of
1021 # the list otherwise, and has the tag name subject.
1023 for t, fe_of_afe in afe
1024 if t.type is TYPE_AFE_MARKER
1026 if t.name is subject
1029 # If there is no such element, then abort these steps and instead
1030 # act as described in the "any other end tag" entry above.
1032 debug_log "aaa: fe not found in afe"
1033 in_body_any_other_end_tag subject
1035 # 6. If formatting element is not in the stack of open elements,
1036 # then this is a parse error; remove the element from the list, and
1037 # abort these steps.
1039 for t, fe_of_open_els in open_els
1044 debug_log "aaa: fe not found in open_els"
1046 # "remove it from the list" must mean afe, since it's not in open_els
1047 afe.splice fe_of_afe, 1
1049 # 7. If formatting element is in the stack of open elements, but
1050 # the element is not in scope, then this is a parse error; abort
1052 unless el_is_in_scope fe
1053 debug_log "aaa: fe not in scope"
1056 # 8. If formatting element is not the current node, this is a parse
1057 # error. (But do not abort these steps.)
1058 unless open_els[0] is fe
1061 # 9. Let furthest block be the topmost node in the stack of open
1062 # elements that is lower in the stack than formatting element, and
1063 # is an element in the special category. There might not be one.
1065 fb_of_open_els = null
1066 for t, i in open_els
1072 # and continue, to see if there's one that's more "topmost"
1073 # 10. If there is no furthest block, then the UA must first pop all
1074 # the nodes from the bottom of the stack of open elements, from the
1075 # current node up to and including formatting element, then remove
1076 # formatting element from the list of active formatting elements,
1077 # and finally abort these steps.
1079 debug_log "aaa: no fb"
1081 t = open_els.shift()
1083 afe.splice fe_of_afe, 1
1085 # 11. Let common ancestor be the element immediately above
1086 # formatting element in the stack of open elements.
1087 ca = open_els[fe_of_open_els + 1] # common ancestor
1089 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1090 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1091 bookmark = new_aaa_bookmark()
1094 afe.splice i, 0, bookmark
1096 node = last_node = fb
1100 # 3. Let node be the element immediately above node in the
1101 # stack of open elements, or if node is no longer in the stack
1102 # of open elements (e.g. because it got removed by this
1103 # algorithm), the element that was immediately above node in
1104 # the stack of open elements before node was removed.
1106 for t, i in open_els
1108 node_next = open_els[i + 1]
1110 node = node_next ? node_above
1111 debug_log "inner loop #{inner}"
1112 debug_log "tree: #{serialize_els doc.children, false, true}"
1113 debug_log "open_els: #{serialize_els open_els, true, true}"
1114 debug_log "afe: #{serialize_els afe, true, true}"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "node: #{node.serialize true, true}"
1119 # TODO make sure node_above gets re-set if/when node is removed from open_els
1121 # 4. If node is formatting element, then go to the next step in
1122 # the overall algorithm.
1125 debug_log "the meat"
1126 # 5. If inner loop counter is greater than three and node is in
1127 # the list of active formatting elements, then remove node from
1128 # the list of active formatting elements.
1134 debug_log "max out inner"
1139 # 6. If node is not in the list of active formatting elements,
1140 # then remove node from the stack of open elements and then go
1141 # back to the step labeled inner loop.
1143 debug_log "not in afe"
1144 for t, i in open_els
1146 node_above = open_els[i + 1]
1147 open_els.splice i, 1
1150 debug_log "the bones"
1151 # 7. create an element for the token for which the element node
1152 # was created, in the HTML namespace, with common ancestor as
1153 # the intended parent; replace the entry for node in the list
1154 # of active formatting elements with an entry for the new
1155 # element, replace the entry for node in the stack of open
1156 # elements with an entry for the new element, and let node be
1158 new_node = token_to_element node.token, NS_HTML, ca
1162 debug_log "replaced in afe"
1164 for t, i in open_els
1166 node_above = open_els[i + 1]
1167 open_els[i] = new_node
1168 debug_log "replaced in open_els"
1171 # 8. If last node is furthest block, then move the
1172 # aforementioned bookmark to be immediately after the new node
1173 # in the list of active formatting elements.
1178 debug_log "removed bookmark"
1182 # "after" means lower
1183 afe.splice i, 0, bookmark # "after as <-
1184 debug_log "placed bookmark after node"
1185 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1187 # 9. Insert last node into node, first removing it from its
1188 # previous parent node if any.
1189 if last_node.parent?
1190 debug_log "last_node has parent"
1191 for c, i in last_node.parent.children
1193 debug_log "removing last_node from parent"
1194 last_node.parent.children.splice i, 1
1196 node.children.push last_node
1197 last_node.parent = node
1198 # 10. Let last node be node.
1201 # 11. Return to the step labeled inner loop.
1202 # 14. Insert whatever last node ended up being in the previous step
1203 # at the appropriate place for inserting a node, but using common
1204 # ancestor as the override target.
1206 # In the case where fe is immediately followed by fb:
1207 # * inner loop exits out early (node==fe)
1209 # * last_node is still in the tree (not a duplicate)
1210 if last_node.parent?
1211 debug_log "FEFIRST? last_node has parent"
1212 for c, i in last_node.parent.children
1214 debug_log "removing last_node from parent"
1215 last_node.parent.children.splice i, 1
1218 debug_log "after aaa inner loop"
1219 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1220 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1221 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1222 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1223 debug_log "tree: #{serialize_els doc.children, false, true}"
1228 # can't use standard insert token thing, because it's already in
1229 # open_els and must stay at it's current position in open_els
1230 dest = adjusted_insertion_location ca
1231 dest[0].children.splice dest[1], 0, last_node
1232 last_node.parent = dest[0]
1235 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1236 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1237 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1238 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1239 debug_log "tree: #{serialize_els doc.children, false, true}"
1241 # 15. Create an element for the token for which formatting element
1242 # was created, in the HTML namespace, with furthest block as the
1244 new_element = token_to_element fe.token, NS_HTML, fb
1245 # 16. Take all of the child nodes of furthest block and append them
1246 # to the element created in the last step.
1247 while fb.children.length
1248 t = fb.children.shift()
1249 t.parent = new_element
1250 new_element.children.push t
1251 # 17. Append that new element to furthest block.
1252 new_element.parent = fb
1253 fb.children.push new_element
1254 # 18. Remove formatting element from the list of active formatting
1255 # elements, and insert the new element into the list of active
1256 # formatting elements at the position of the aforementioned
1264 afe[i] = new_element
1266 # 19. Remove formatting element from the stack of open elements,
1267 # and insert the new element into the stack of open elements
1268 # immediately below the position of furthest block in that stack.
1269 for t, i in open_els
1271 open_els.splice i, 1
1273 for t, i in open_els
1275 open_els.splice i, 0, new_element
1277 # 20. Jump back to the step labeled outer loop.
1278 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1279 debug_log "tree: #{serialize_els doc.children, false, true}"
1280 debug_log "open_els: #{serialize_els open_els, true, true}"
1281 debug_log "afe: #{serialize_els afe, true, true}"
1282 debug_log "AAA DONE"
1284 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# close_p_element(): takes no arguments.
# Generates implied end tags (leaving 'p' alone), checks whether the current
# node (open_els[0]) is an HTML 'p', then shifts elements off open_els while
# looking for an HTML 'p'.
# The loop never shifts the last remaining entry of open_els.
1285 close_p_element = ->
1286 generate_implied_end_tags 'p' # arg is exception
1287 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1289 while open_els.length > 1 # just in case
1290 el = open_els.shift()
1291 if el.name is 'p' and el.namespace is NS_HTML
# close_p_if_in_button_scope(): takes no arguments.
# Guarded by is_in_button_scope 'p', NS_HTML; presumably closes the open 'p'
# element when that test passes — confirm against the guarded statement.
1293 close_p_if_in_button_scope = ->
1294 if is_in_button_scope 'p', NS_HTML
1297 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1298 # aka insert_a_character = (t) ->
# insert_character(t)
#   t: the token/node to insert (it is spliced into the tree as-is).
# The destination comes from adjusted_insertion_location(), a
# [parent, index] pair. The node just before the insertion point is examined
# to see whether it is a text node.
# NOTE(review): presumably adjacent text is merged into `prev` in that case —
# confirm.
1299 insert_character = (t) ->
1300 dest = adjusted_insertion_location()
1301 # fixfull check for Document node
1303 prev = dest[0].children[dest[1] - 1]
1304 if prev.type is TYPE_TEXT
1307 dest[0].children.splice dest[1], 0, t
1310 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# process_token(t)
#   t: the token to dispatch; its .type and .name are inspected.
# Decides how the token is handled from the adjusted current node (acn) and
# the token type: HTML namespace, MathML text integration point,
# annotation-xml with an svg start tag, HTML integration point, and EOF are
# each tested. in_foreign_content t is the handler visible at the end.
# NOTE(review): presumably the tested cases go to the current insertion mode
# and everything else to in_foreign_content — confirm.
1311 process_token = (t) ->
1312 acn = adjusted_current_node()
1316 if acn.namespace is NS_HTML
1319 if is_mathml_text_integration_point(acn)
1320 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1323 if t.type is TYPE_TEXT
1326 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1329 if is_html_integration acn
1330 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1333 if t.type is TYPE_EOF
1336 in_foreign_content t
1340 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1341 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# adjusted_insertion_location(override_target = null)
#   override_target: optional node used as the starting target; without it
#     the current node (open_els[0]) is used.
# Returns a two-element array [target, target_i]: the node to insert into
# and the index within target.children at which to insert.
# The foster-parenting branch is entered only when flag_foster_parenting is
# set and foster_parenting_targets lists the target's name with the target's
# namespace.
1342 adjusted_insertion_location = (override_target = null) ->
1343 # 1. If there was an override target specified, then let target be the
1346 target = override_target
1347 else # Otherwise, let target be the current node.
1348 target = open_els[0]
1349 # 2. Determine the adjusted insertion location using the first matching
1350 # steps from the following list:
1352 # If foster parenting is enabled and target is a table, tbody, tfoot,
1353 # thead, or tr element Foster parenting happens when content is
1354 # misnested in tables.
1355 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1356 loop # once. this is here so we can ``break`` to "abort these substeps"
1357 # 1. Let last template be the last template element in the
1358 # stack of open elements, if any.
1359 last_template = null
1360 last_template_i = null
1361 for el, i in open_els
1362 if el.name is 'template' and el.namespace is NS_HTML
1366 # 2. Let last table be the last table element in the stack of
1367 # open elements, if any.
1370 for el, i in open_els
1371 if el.name is 'table' and el.namespace is NS_HTML
1375 # 3. If there is a last template and either there is no last
1376 # table, or there is one, but last template is lower (more
1377 # recently added) than last table in the stack of open
1378 # elements, then: let adjusted insertion location be inside
1379 # last template's template contents, after its last child (if
1380 # any), and abort these substeps.
# a smaller index in open_els means lower / more recently added
1381 if last_template and (last_table is null or last_template_i < last_table_i)
1382 target = last_template # fixfull should be its contents
1383 target_i = target.children.length
1385 # 4. If there is no last table, then let adjusted insertion
1386 # location be inside the first element in the stack of open
1387 # elements (the html element), after its last child (if any),
1388 # and abort these substeps. (fragment case)
1389 if last_table is null
# the "first" element of the stack lives at the highest index
1391 target = open_els[open_els.length - 1]
1392 target_i = target.children.length
1394 # 5. If last table has a parent element, then let adjusted
1395 # insertion location be inside last table's parent element,
1396 # immediately before last table, and abort these substeps.
1397 if last_table.parent?
1398 for c, i in last_table.parent.children
1400 target = last_table.parent
1404 # 6. Let previous element be the element immediately above last
1405 # table in the stack of open elements.
1407 # huh? how could it not have a parent?
1408 previous_element = open_els[last_table_i + 1]
1409 # 7. Let adjusted insertion location be inside previous
1410 # element, after its last child (if any).
1411 target = previous_element
1412 target_i = target.children.length
1413 # Note: These steps are involved in part because it's possible
1414 # for elements, the table element in this case in particular,
1415 # to have been moved by a script around in the DOM, or indeed
1416 # removed from the DOM entirely, after the element was inserted
1418 break # don't really loop
1420 # Otherwise Let adjusted insertion location be inside target, after
1421 # its last child (if any).
1422 target_i = target.children.length
1424 # 3. If the adjusted insertion location is inside a template element,
1425 # let it instead be inside the template element's template contents,
1426 # after its last child (if any).
1427 # fixfull (template)
1429 # 4. Return the adjusted insertion location.
1430 return [target, target_i]
1432 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1433 # aka create_an_element_for_token
# token_to_element(t, namespace, intended_parent)
#   t: tag token; t.name becomes the element name and the token itself is
#     kept on the element as `token`.
#   namespace: namespace stored on the new element.
#   intended_parent: node the element is meant to be inserted into.
#     NOTE(review): it is not referenced by any statement visible here.
# Builds a Node of TYPE_TAG whose attrs is a name -> value hash; entries are
# assigned as attrs[a[0]] = a[1], so `a` is a [name, value] pair.
1434 token_to_element = (t, namespace, intended_parent) ->
1435 # convert attributes into a hash
1438 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1439 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1441 # TODO 2. If the newly created element has an xmlns attribute in the
1442 # XMLNS namespace whose value is not exactly the same as the element's
1443 # namespace, that is a parse error. Similarly, if the newly created
1444 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1445 # value is not the XLink Namespace, that is a parse error.
1447 # fixfull: the spec says stuff about form pointers and ownerDocument
1451 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# insert_foreign_element(token, namespace)
#   token: tag token to build the element from (via token_to_element).
#   namespace: namespace for the new element.
# Looks up the adjusted insertion location and splices the new element into
# ail_el.children at index ail_i.
# NOTE(review): ail_el / ail_i are presumably the two halves of `ail` —
# confirm.
1452 insert_foreign_element = (token, namespace) ->
1453 ail = adjusted_insertion_location()
1456 el = token_to_element token, namespace, ail_el
1457 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1459 ail_el.children.splice ail_i, 0, el
1462 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# insert_html_element(token): thin wrapper that calls
# insert_foreign_element with the HTML namespace (NS_HTML).
1463 insert_html_element = (token) ->
1464 insert_foreign_element token, NS_HTML
1466 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1467 # position should be [node, index_within_children]
# insert_comment(t, position = null)
#   t: the comment token/node; it is spliced into the tree as-is.
#   position: optional [node, index] pair; when null/undefined it defaults
#     to adjusted_insertion_location().
1468 insert_comment = (t, position = null) ->
1469 position ?= adjusted_insertion_location()
1470 position[0].children.splice position[1], 0, t
1473 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# parse_generic_raw_text(t)
#   t: start tag token of the raw-text element.
# Inserts an HTML element for t, switches the tokenizer state to
# tok_state_rawtext, saves the current insertion mode in original_ins_mode
# and then switches the insertion mode to ins_mode_text.
# The save must come before the switch.
1474 parse_generic_raw_text = (t) ->
1475 insert_html_element t
1476 tok_state = tok_state_rawtext
1477 original_ins_mode = ins_mode
1478 ins_mode = ins_mode_text
# parse_generic_rcdata_text(t)
#   t: start tag token of the RCDATA element.
# Same sequence as parse_generic_raw_text, except that the tokenizer state
# becomes tok_state_rcdata: insert the element, switch the tokenizer, save
# the current insertion mode in original_ins_mode, switch to ins_mode_text.
1479 parse_generic_rcdata_text = (t) ->
1480 insert_html_element t
1481 tok_state = tok_state_rcdata
1482 original_ins_mode = ins_mode
1483 ins_mode = ins_mode_text
1485 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1486 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# generate_implied_end_tags(except = null)
#   except: optional tag name that stops the loop when it is the current
#     node's name.
# Loops while the current node (open_els[0]) is listed in end_tag_implied
# with a matching namespace and is not named `except`.
# NOTE(review): open_els[0] is read without an emptiness check — this assumes
# the stack is never empty here; confirm.
1487 generate_implied_end_tags = (except = null) ->
1488 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1491 # 8.2.5.4 The rules for parsing tokens in HTML content
1492 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1494 # 8.2.5.4.1 The "initial" insertion mode
1495 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# is_quirks_yes_doctype(t)
#   t: DOCTYPE token; reads t.flag 'force-quirks', t.name,
#     t.public_identifier and t.system_identifier.
# Predicate used by ins_mode_initial to choose QUIRKS_YES.
# Identifiers are lower-cased before comparison, so matching is
# case-insensitive. Public identifiers are tested as prefixes against
# quirks_yes_pi_prefixes and as exact matches against three literal values.
1496 is_quirks_yes_doctype = (t) ->
1497 if t.flag 'force-quirks'
1499 if t.name isnt 'html'
1501 if t.public_identifier?
1502 pi = t.public_identifier.toLowerCase()
1503 for p in quirks_yes_pi_prefixes
1504 if pi.substr(0, p.length) is p
1506 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1508 if t.system_identifier?
1509 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
# no system identifier: the HTML 4.01 frameset/transitional prefixes are checked
1511 else if t.public_identifier?
1512 # already did this: pi = t.public_identifier.toLowerCase()
1513 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# is_quirks_limited_doctype(t)
#   t: DOCTYPE token; reads t.public_identifier and t.system_identifier.
# Predicate used by ins_mode_initial to choose QUIRKS_LIMITED.
# Tests the lower-cased public identifier against the XHTML 1.0
# frameset/transitional prefixes, and against the HTML 4.01
# frameset/transitional prefixes when a system identifier is present.
# NOTE(review): `pi` is assigned only under the public_identifier check but
# is also read under the system_identifier check; confirm the latter is
# nested inside the former, otherwise a doctype with only a system
# identifier would call .substr on undefined.
1516 is_quirks_limited_doctype = (t) ->
1517 if t.public_identifier?
1518 pi = t.public_identifier.toLowerCase()
1519 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1521 if t.system_identifier?
1522 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# ins_mode_initial(t): handler for the "initial" insertion mode.
#   t: the token being processed.
# For a DOCTYPE token the document's 'quirks mode' flag is set to QUIRKS_YES
# or QUIRKS_LIMITED according to is_quirks_yes_doctype /
# is_quirks_limited_doctype, and the insertion mode becomes
# ins_mode_before_html.
# The trailing statements also set QUIRKS_YES and switch to
# ins_mode_before_html.
# NOTE(review): presumably those are the "anything else" case — confirm.
1525 ins_mode_initial = (t) ->
1528 if t.type is TYPE_COMMENT
1532 if t.type is TYPE_DOCTYPE
1533 # fixfull syntax error from first paragraph and following bullets
1534 # fixfull set doc.doctype
1535 # fixfull is the "not an iframe srcdoc" thing relevant?
1536 if is_quirks_yes_doctype t
1537 doc.flag 'quirks mode', QUIRKS_YES
1538 else if is_quirks_limited_doctype t
1539 doc.flag 'quirks mode', QUIRKS_LIMITED
1541 ins_mode = ins_mode_before_html
1544 # fixfull not iframe srcdoc?
1546 doc.flag 'quirks mode', QUIRKS_YES
1547 ins_mode = ins_mode_before_html
1551 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# ins_mode_before_html(t): handler for the "before html" insertion mode.
#   t: the token being processed.
# An <html> start tag becomes the root element: it is created with doc as the
# intended parent, appended to doc.children and pushed onto open_els, then
# the mode switches to ins_mode_before_head.
# Further down, an html element is synthesised from new_open_tag('html') and
# the mode switches to ins_mode_before_head as well.
1552 ins_mode_before_html = (t) ->
1553 if t.type is TYPE_DOCTYPE
1556 if t.type is TYPE_COMMENT
1561 if t.type is TYPE_START_TAG and t.name is 'html'
1562 el = token_to_element t, NS_HTML, doc
1563 doc.children.push el
1564 open_els.unshift(el)
1565 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1566 ins_mode = ins_mode_before_head
1568 if t.type is TYPE_END_TAG
1569 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1570 # fall through to "anything else"
1575 el = token_to_element new_open_tag('html'), NS_HTML, doc
1576 doc.children.push el
1579 # ?fixfull browsing context
1580 ins_mode = ins_mode_before_head
1584 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# ins_mode_before_head(t): handler for the "before head" insertion mode.
#   t: the token being processed.
# A <head> start tag is inserted, remembered in head_element_pointer, and the
# mode switches to ins_mode_in_head.
# Further down, a head element is synthesised from new_open_tag 'head' with
# the same bookkeeping.
1585 ins_mode_before_head = (t) ->
1588 if t.type is TYPE_COMMENT
1591 if t.type is TYPE_DOCTYPE
1594 if t.type is TYPE_START_TAG and t.name is 'html'
1597 if t.type is TYPE_START_TAG and t.name is 'head'
1598 el = insert_html_element t
1599 head_element_pointer = el
1600 ins_mode = ins_mode_in_head
1602 if t.type is TYPE_END_TAG
1603 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1604 # fall through to Anything else below
1609 el = insert_html_element new_open_tag 'head'
1610 head_element_pointer = el
1611 ins_mode = ins_mode_in_head
1614 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# ins_mode_in_head_else(t): the "anything else" branch of the "in head" mode.
#   t: the token being processed.
# Pops the current node off open_els and switches to ins_mode_after_head.
# NOTE(review): `t` is not referenced in the statements visible here;
# presumably it is reprocessed in the new mode — confirm.
1615 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1616 open_els.shift() # spec says this will be a 'head' node
1617 ins_mode = ins_mode_after_head
# ins_mode_in_head(t): handler for the "in head" insertion mode.
#   t: the token being processed; its .type, .name (and .text for text
#     tokens) are inspected.
# Each `if` below is one entry of the spec's "in head" rules; anything not
# matched ends up in ins_mode_in_head_else.
# Side effects visible here: inserts elements, pushes/pops open_els, switches
# ins_mode / tok_state, and maintains template_ins_modes and
# flag_frameset_ok.
1619 ins_mode_in_head = (t) ->
1620 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1623 if t.type is TYPE_COMMENT
1626 if t.type is TYPE_DOCTYPE
1629 if t.type is TYPE_START_TAG and t.name is 'html'
1632 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1633 el = insert_html_element t
1635 t.acknowledge_self_closing()
1637 if t.type is TYPE_START_TAG and t.name is 'meta'
1638 el = insert_html_element t
1640 t.acknowledge_self_closing()
1641 # fixfull encoding stuff
1643 if t.type is TYPE_START_TAG and t.name is 'title'
1644 parse_generic_rcdata_text t
1646 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1647 parse_generic_raw_text t
1649 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1650 insert_html_element t
1651 ins_mode = ins_mode_in_head_noscript
1653 if t.type is TYPE_START_TAG and t.name is 'script'
1654 ail = adjusted_insertion_location()
# ail is a [node, index] pair; the intended parent is the node, not the pair
1655 el = token_to_element t, NS_HTML, ail[0]
1656 el.flag 'parser-inserted', true
1657 # fixfull fragment case
1658 ail[0].children.splice ail[1], 0, el
1660 tok_state = tok_state_script_data
1661 original_ins_mode = ins_mode # make sure orig... is defined
1662 ins_mode = ins_mode_text
1664 if t.type is TYPE_END_TAG and t.name is 'head'
1665 open_els.shift() # will be a head element... spec says so
1666 ins_mode = ins_mode_after_head
1668 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1669 ins_mode_in_head_else t
1671 if t.type is TYPE_START_TAG and t.name is 'template'
1672 insert_html_element t
1674 flag_frameset_ok = false
1675 ins_mode = ins_mode_in_template
1676 template_ins_modes.unshift ins_mode_in_template
1678 if t.type is TYPE_END_TAG and t.name is 'template'
1679 if template_tag_is_open()
# parentheses are required: a bare name is only a reference, not a call
1680 generate_implied_end_tags()
1681 if open_els[0].name isnt 'template'
1684 el = open_els.shift()
1685 if el.name is 'template' and el.namespace is NS_HTML
1687 clear_afe_to_marker()
1688 template_ins_modes.shift()
1693 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1696 ins_mode_in_head_else t
1698 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# ins_mode_in_head_noscript_else(t): the "anything else" branch of the
# "in head noscript" mode.
#   t: the token being processed.
# Switches the insertion mode back to ins_mode_in_head.
1699 ins_mode_in_head_noscript_else = (t) ->
1702 ins_mode = ins_mode_in_head
# ins_mode_in_head_noscript(t): handler for the "in head noscript" mode.
#   t: the token being processed.
# A </noscript> end tag returns the parser to ins_mode_in_head.
# A </br> end tag and unmatched tokens are routed to
# ins_mode_in_head_noscript_else.
# Whitespace, comments and the listed head-level start tags share one branch.
1704 ins_mode_in_head_noscript = (t) ->
1705 if t.type is TYPE_DOCTYPE
1708 if t.type is TYPE_START_TAG and t.name is 'html'
1711 if t.type is TYPE_END_TAG and t.name is 'noscript'
1713 ins_mode = ins_mode_in_head
1715 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1718 if t.type is TYPE_END_TAG and t.name is 'br'
1719 ins_mode_in_head_noscript_else t
1721 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1725 ins_mode_in_head_noscript_else t
1730 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# ins_mode_after_head_else(t): the "anything else" branch of the
# "after head" mode.
#   t: the token being processed.
# Inserts an HTML element built from a synthetic 'body' start tag
# (new_open_tag 'body') and switches to ins_mode_in_body.
1731 ins_mode_after_head_else = (t) ->
1732 body_tok = new_open_tag 'body'
1733 insert_html_element body_tok
1734 ins_mode = ins_mode_in_body
# ins_mode_after_head(t): handler for the "after head" insertion mode.
#   t: the token being processed.
# <body> is inserted, clears flag_frameset_ok and enters ins_mode_in_body.
# <frameset> is inserted and enters ins_mode_in_frameset.
# For head-level start tags (base, link, meta, script, ...) the head element
# is temporarily put back on open_els and afterwards removed again by an
# identity search. A console warning is logged from that branch.
# NOTE(review): presumably the warning fires when the head element is not
# found — confirm.
1737 ins_mode_after_head = (t) ->
1741 if t.type is TYPE_COMMENT
1744 if t.type is TYPE_DOCTYPE
1747 if t.type is TYPE_START_TAG and t.name is 'html'
1750 if t.type is TYPE_START_TAG and t.name is 'body'
1751 insert_html_element t
1752 flag_frameset_ok = false
1753 ins_mode = ins_mode_in_body
1755 if t.type is TYPE_START_TAG and t.name is 'frameset'
1756 insert_html_element t
1757 ins_mode = ins_mode_in_frameset
1759 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1761 open_els.unshift head_element_pointer
1763 for el, i in open_els
1764 if el is head_element_pointer
1765 open_els.splice i, 1
1767 console.log "warning: 23904 couldn't find head element in open_els"
1769 if t.type is TYPE_END_TAG and t.name is 'template'
1772 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1773 ins_mode_after_head_else t
1775 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1779 ins_mode_after_head_else t
1781 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# in_body_any_other_end_tag(name): the "any other end tag" rule of the
# "in body" mode.
#   name: tag name (string) of the end tag.
# Examines nodes of open_els in turn. On an HTML element named `name` it
# generates implied end tags (leaving `name` alone) and shifts elements off
# open_els. A node listed in special_elements with a matching namespace is
# tested separately.
# NOTE(review): presumably the special-element test ends the search —
# confirm.
1782 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1785 if node.name is name and node.namespace is NS_HTML
1786 generate_implied_end_tags name # arg is exception
1787 unless node is open_els[0]
1790 el = open_els.shift()
1793 if special_elements[node.name] is node.namespace
# advance to the element immediately above the current one in the stack
1796 for el, i in open_els
1798 node = open_els[i + 1]
1801 ins_mode_in_body = (t) ->
1802 if t.type is TYPE_TEXT and t.text is "\u0000"
1809 if t.type is TYPE_TEXT
1812 flag_frameset_ok = false
1814 if t.type is TYPE_COMMENT
1817 if t.type is TYPE_DOCTYPE
1820 if t.type is TYPE_START_TAG and t.name is 'html'
1822 return if template_tag_is_open()
1823 root_attrs = open_els[open_els.length - 1].attrs
1825 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1828 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1831 if t.type is TYPE_START_TAG and t.name is 'body'
1833 return if open_els.length < 2
1834 second = open_els[open_els.length - 2]
1835 return unless second.namespace is NS_HTML
1836 return unless second.name is 'body'
1837 return if template_tag_is_open()
1838 flag_frameset_ok = false
1840 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1842 if t.type is TYPE_START_TAG and t.name is 'frameset'
1844 return if open_els.length < 2
1845 second_i = open_els.length - 2
1846 second = open_els[second_i]
1847 return unless second.namespace is NS_HTML
1848 return unless second.name is 'body'
1849 if flag_frameset_ok is false
1852 for el, i in second.parent.children
1854 second.parent.children.splice i, 1
1856 open_els.splice second_i, 1
1857 # pop everything except the "root html element"
1858 while open_els.length > 1
1860 insert_html_element t
1861 ins_mode = ins_mode_in_frameset
1863 if t.type is TYPE_EOF
1865 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1866 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1867 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1870 unless ok_tags[t.name] is el.namespace
1873 if template_ins_modes.length > 0
1874 ins_mode_in_template t
1878 if t.type is TYPE_END_TAG and t.name is 'body'
1879 unless is_in_scope 'body', NS_HTML
1883 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1884 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1885 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1886 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1890 unless ok_tags[t.name] is el.namespace
1893 ins_mode = ins_mode_after_body
1895 if t.type is TYPE_END_TAG and t.name is 'html'
1896 unless is_in_scope 'body', NS_HTML
1900 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1901 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1902 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1903 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1907 unless ok_tags[t.name] is el.namespace
1910 ins_mode = ins_mode_after_body
1913 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1914 close_p_if_in_button_scope()
1915 insert_html_element t
1917 if t.type is TYPE_START_TAG and h_tags[t.name]?
1918 close_p_if_in_button_scope()
1919 if h_tags[open_els[0].name] is open_els[0].namespace
1922 insert_html_element t
1924 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1925 close_p_if_in_button_scope()
1926 insert_html_element t
1927 eat_next_token_if_newline()
1928 flag_frameset_ok = false
1930 if t.type is TYPE_START_TAG and t.name is 'form'
1931 unless form_element_pointer is null or template_tag_is_open()
1934 close_p_if_in_button_scope()
1935 el = insert_html_element t
1936 unless template_tag_is_open()
1937 form_element_pointer = el
1939 if t.type is TYPE_START_TAG and t.name is 'li'
1940 flag_frameset_ok = false
1941 for node in open_els
1942 if node.name is 'li' and node.namespace is NS_HTML
1943 generate_implied_end_tags 'li' # arg is exception
1944 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1947 el = open_els.shift()
1948 if el.name is 'li' and el.namespace is NS_HTML
1951 if el_is_special_not_adp node
1953 close_p_if_in_button_scope()
1954 insert_html_element t
1956 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1957 flag_frameset_ok = false
1958 for node in open_els
1959 if node.name is 'dd' and node.namespace is NS_HTML
1960 generate_implied_end_tags 'dd' # arg is exception
1961 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1964 el = open_els.shift()
1965 if el.name is 'dd' and el.namespace is NS_HTML
1968 if node.name is 'dt' and node.namespace is NS_HTML
1969 generate_implied_end_tags 'dt' # arg is exception
1970 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1973 el = open_els.shift()
1974 if el.name is 'dt' and el.namespace is NS_HTML
1977 if el_is_special_not_adp node
1979 close_p_if_in_button_scope()
1980 insert_html_element t
1982 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1983 close_p_if_in_button_scope()
1984 insert_html_element t
1985 tok_state = tok_state_plaintext
1987 if t.type is TYPE_START_TAG and t.name is 'button'
1988 if is_in_scope 'button', NS_HTML
1990 generate_implied_end_tags()
1992 el = open_els.shift()
1993 if el.name is 'button' and el.namespace is NS_HTML
1996 insert_html_element t
1997 flag_frameset_ok = false
1999 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
2000 unless is_in_scope t.name, NS_HTML
2003 generate_implied_end_tags()
2004 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2007 el = open_els.shift()
2008 if el.name is t.name and el.namespace is NS_HTML
2011 if t.type is TYPE_END_TAG and t.name is 'form'
2012 unless template_tag_is_open()
2013 node = form_element_pointer
2014 form_element_pointer = null
2015 if node is null or not el_is_in_scope node
2018 generate_implied_end_tags()
2019 if open_els[0] isnt node
2021 for el, i in open_els
2023 open_els.splice i, 1
2026 unless is_in_scope 'form', NS_HTML
2029 generate_implied_end_tags()
2030 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2033 el = open_els.shift()
2034 if el.name is 'form' and el.namespace is NS_HTML
2037 if t.type is TYPE_END_TAG and t.name is 'p'
2038 unless is_in_button_scope 'p', NS_HTML
2040 insert_html_element new_open_tag 'p'
2043 if t.type is TYPE_END_TAG and t.name is 'li'
2044 unless is_in_li_scope 'li', NS_HTML
2047 generate_implied_end_tags 'li' # arg is exception
2048 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2051 el = open_els.shift()
2052 if el.name is 'li' and el.namespace is NS_HTML
2055 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2056 unless is_in_scope t.name, NS_HTML
2059 generate_implied_end_tags t.name # arg is exception
2060 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2063 el = open_els.shift()
2064 if el.name is t.name and el.namespace is NS_HTML
2067 if t.type is TYPE_END_TAG and h_tags[t.name]?
2070 if h_tags[el.name] is el.namespace
2073 if standard_scopers[el.name] is el.namespace
2078 generate_implied_end_tags()
2079 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2082 el = open_els.shift()
2083 if h_tags[el.name] is el.namespace
2087 if t.type is TYPE_START_TAG and t.name is 'a'
2088 # If the list of active formatting elements contains an a element
2089 # between the end of the list and the last marker on the list (or
2090 # the start of the list if there is no marker on the list), then
2091 # this is a parse error; run the adoption agency algorithm for the
2092 # tag name "a", then remove that element from the list of active
2093 # formatting elements and the stack of open elements if the
2094 # adoption agency algorithm didn't already remove it (it might not
2095 # have if the element is not in table scope).
2098 if el.type is TYPE_AFE_MARKER
2100 if el.name is 'a' and el.namespace is NS_HTML
2108 for el, i in open_els
2110 open_els.splice i, 1
2112 el = insert_html_element t
2115 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2117 el = insert_html_element t
2120 if t.type is TYPE_START_TAG and t.name is 'nobr'
2122 if is_in_scope 'nobr', NS_HTML
2124 adoption_agency 'nobr'
2126 el = insert_html_element t
2129 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2130 adoption_agency t.name
2132 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2134 insert_html_element t
2136 flag_frameset_ok = false
2138 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2139 unless is_in_scope t.name, NS_HTML
2142 generate_implied_end_tags()
2143 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2146 el = open_els.shift()
2147 if el.name is t.name and el.namespace is NS_HTML
2149 clear_afe_to_marker()
2151 if t.type is TYPE_START_TAG and t.name is 'table'
2152 unless doc.flag('quirks mode') is QUIRKS_YES
2153 close_p_if_in_button_scope() # test
2154 insert_html_element t
2155 flag_frameset_ok = false
2156 ins_mode = ins_mode_in_table
2158 if t.type is TYPE_END_TAG and t.name is 'br'
2160 t.type = TYPE_START_TAG
2162 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2164 insert_html_element t
2166 t.acknowledge_self_closing()
2167 flag_frameset_ok = false
2169 if t.type is TYPE_START_TAG and t.name is 'input'
2171 insert_html_element t
2173 t.acknowledge_self_closing()
2174 unless is_input_hidden_tok t
2175 flag_frameset_ok = false
2177 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2178 # WHATWG adds 'menuitem' for this block
2179 insert_html_element t
2181 t.acknowledge_self_closing()
2183 if t.type is TYPE_START_TAG and t.name is 'hr'
2184 close_p_if_in_button_scope()
2185 insert_html_element t
2187 t.acknowledge_self_closing()
2188 flag_frameset_ok = false
2190 if t.type is TYPE_START_TAG and t.name is 'image'
2195 if t.type is TYPE_START_TAG and t.name is 'isindex'
2197 if template_tag_is_open() is false and form_element_pointer isnt null
2199 t.acknowledge_self_closing()
2200 flag_frameset_ok = false
2201 close_p_if_in_button_scope()
2202 el = insert_html_element new_open_tag 'form'
2203 unless template_tag_is_open()
2204 form_element_pointer = el
2207 el.attrs['action'] = a[1]
2209 insert_html_element new_open_tag 'hr'
2212 insert_html_element new_open_tag 'label'
2213 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2214 input_el = new_open_tag 'input'
2219 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2220 input_el.attrs_a.push [a[0], a[1]]
2221 input_el.attrs_a.push ['name', 'isindex']
2222 # fixfull this next bit is in english... internationalize?
2223 prompt ?= "This is a searchable index. Enter search keywords: "
2224 insert_character new_character_token prompt # fixfull split
2225 # TODO submit typo "balue" in spec
2226 insert_html_element input_el
2228 # insert_character '' # you can put chars here if prompt attr missing
2230 insert_html_element new_open_tag 'hr'
2233 unless template_tag_is_open()
2234 form_element_pointer = null
2236 if t.type is TYPE_START_TAG and t.name is 'textarea'
2237 insert_html_element t
2238 eat_next_token_if_newline()
2239 tok_state = tok_state_rcdata
2240 original_ins_mode = ins_mode
2241 flag_frameset_ok = false
2242 ins_mode = ins_mode_text
2244 if t.type is TYPE_START_TAG and t.name is 'xmp'
2245 close_p_if_in_button_scope()
2247 flag_frameset_ok = false
2248 parse_generic_raw_text t
2250 if t.type is TYPE_START_TAG and t.name is 'iframe'
2251 flag_frameset_ok = false
2252 parse_generic_raw_text t
2254 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2255 parse_generic_raw_text t
2257 if t.type is TYPE_START_TAG and t.name is 'select'
2259 insert_html_element t
2260 flag_frameset_ok = false
2261 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2262 ins_mode = ins_mode_in_select_in_table
2264 ins_mode = ins_mode_in_select
2266 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2267 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2270 insert_html_element t
2272 # this comment block implements the W3C spec
2273 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2274 # if is_in_scope 'ruby', NS_HTML
2275 # generate_implied_end_tags()
2276 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2278 # insert_html_element t
2280 # if t.type is TYPE_START_TAG and t.name is 'rt'
2281 # if is_in_scope 'ruby', NS_HTML
2282 # generate_implied_end_tags 'rtc' # arg is exception
2283 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2285 # insert_html_element t
2287 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2288 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2289 if is_in_scope 'ruby', NS_HTML
2290 generate_implied_end_tags()
2291 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2293 insert_html_element t
2295 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2296 if is_in_scope 'ruby', NS_HTML
2297 generate_implied_end_tags 'rtc'
2298 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2300 insert_html_element t
2303 if t.type is TYPE_START_TAG and t.name is 'math'
2305 adjust_mathml_attributes t
2306 adjust_foreign_attributes t
2307 insert_foreign_element t, NS_MATHML
2308 if t.flag 'self-closing'
2310 t.acknowledge_self_closing()
2312 if t.type is TYPE_START_TAG and t.name is 'svg'
2314 adjust_svg_attributes t
2315 adjust_foreign_attributes t
2316 insert_foreign_element t, NS_SVG
2317 if t.flag 'self-closing'
2319 t.acknowledge_self_closing()
2321 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2324 if t.type is TYPE_START_TAG # any other start tag
2326 insert_html_element t
2328 if t.type is TYPE_END_TAG # any other end tag
2329 in_body_any_other_end_tag t.name
2333 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Insertion mode "text". t is the token to process.
# Each of the three non-text branches visible below (EOF, </script>, any other
# end tag) restores ins_mode from original_ins_mode.
# NOTE(review): this listing omits some body lines (see the gaps in the
# embedded line numbers); only the visible statements are described.
2334 ins_mode_text = (t) ->
2335 if t.type is TYPE_TEXT
2338 if t.type is TYPE_EOF
# EOF while the current node (open_els[0]) is an HTML <script>: flag it as "already started"
2340 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2341 open_els[0].flag 'already started', true
2343 ins_mode = original_ins_mode
2346 if t.type is TYPE_END_TAG and t.name is 'script'
2348 ins_mode = original_ins_mode
2349 # fixfull the spec seems to assume that I'm going to run the script
2350 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2352 if t.type is TYPE_END_TAG
2354 ins_mode = original_ins_mode
# diagnostic; presumably only reached when no branch above handled the token
2356 console.log 'warning: end of ins_mode_text reached'
2358 # the tokenizer states described here are implemented further below (see the tok_state_* functions):
2359 # http://www.w3.org/TR/html5/syntax.html#tokenization
2361 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Shared "anything else" handler used by the "in table" modes. t is the token.
# Sets flag_foster_parenting to true and later back to false; the statement(s)
# executed in between are not visible in this listing.
2362 ins_mode_in_table_else = (t) ->
2364 flag_foster_parenting = true
2366 flag_foster_parenting = false
# Insertion mode "in table". t is the token to process.
# Tokens with no dedicated branch are passed to ins_mode_in_table_else.
# NOTE(review): the `switch`/`when` labels for several branches and some
# control-flow lines are missing from this listing.
2368 ins_mode_in_table = (t) ->
# when the current node is an HTML table/tbody/tfoot/thead/tr: start collecting
# pending table character tokens and switch to the "in table text" mode,
# remembering the current mode in original_ins_mode
2371 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2372 pending_table_character_tokens = []
2373 original_ins_mode = ins_mode
2374 ins_mode = ins_mode_in_table_text
2377 ins_mode_in_table_else t
# each of the following branches clears the stack back to a table context,
# inserts an element (the token itself, or a synthesized colgroup/tbody) and
# switches to the matching insertion mode
2385 clear_stack_to_table_context()
2387 insert_html_element t
2388 ins_mode = ins_mode_in_caption
2390 clear_stack_to_table_context()
2391 insert_html_element t
2392 ins_mode = ins_mode_in_column_group
2394 clear_stack_to_table_context()
2395 insert_html_element new_open_tag 'colgroup'
2396 ins_mode = ins_mode_in_column_group
2398 when 'tbody', 'tfoot', 'thead'
2399 clear_stack_to_table_context()
2400 insert_html_element t
2401 ins_mode = ins_mode_in_table_body
2402 when 'td', 'th', 'tr'
2403 clear_stack_to_table_context()
2404 insert_html_element new_open_tag 'tbody'
2405 ins_mode = ins_mode_in_table_body
# only if an HTML <table> is in table scope: pop from open_els, testing each
# popped element for HTML <table> (the enclosing loop/break is not visible here)
2409 if is_in_table_scope 'table', NS_HTML
2411 el = open_els.shift()
2412 if el.name is 'table' and el.namespace is NS_HTML
2416 when 'style', 'script', 'template'
# tokens that fail is_input_hidden_tok get the generic "anything else" handling
2419 unless is_input_hidden_tok t
2420 ins_mode_in_table_else t
2423 el = insert_html_element t
2425 t.acknowledge_self_closing()
2428 if form_element_pointer?
2430 if template_tag_is_open()
# the inserted element becomes the form element pointer
2432 form_element_pointer = insert_html_element t
2435 ins_mode_in_table_else t
2439 if is_in_table_scope 'table', NS_HTML
2441 el = open_els.shift()
2442 if el.name is 'table' and el.namespace is NS_HTML
2447 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2452 ins_mode_in_table_else t
2456 ins_mode_in_table_else t
2459 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Insertion mode "in table text". t is the token to process.
# Text tokens are queued in pending_table_character_tokens. The queue is later
# scanned for a non-space token, replayed either through insert_character or
# through ins_mode_in_table_else, then emptied, and ins_mode is restored from
# original_ins_mode.
# NOTE(review): the lines that choose between the two replay loops are not
# visible in this listing.
2460 ins_mode_in_table_text = (t) ->
2461 if t.type is TYPE_TEXT and t.text is "\u0000"
2465 if t.type is TYPE_TEXT
2466 pending_table_character_tokens.push t
# look for any queued token that is not whitespace
2470 for old in pending_table_character_tokens
2471 unless is_space_tok old
2475 for old in pending_table_character_tokens
2476 insert_character old
2478 for old in pending_table_character_tokens
2479 ins_mode_in_table_else old
2480 pending_table_character_tokens = []
2481 ins_mode = original_ins_mode
2484 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Insertion mode "in caption". t is the token to process.
# Both visible closing paths require an HTML <caption> in table scope, pop from
# open_els while testing for that caption, clear the active formatting elements
# up to the last marker, and switch to ins_mode_in_table.
# NOTE(review): some body lines (loops, else branches) are missing from this listing.
2485 ins_mode_in_caption = (t) ->
2486 if t.type is TYPE_END_TAG and t.name is 'caption'
2487 if is_in_table_scope 'caption', NS_HTML
2488 generate_implied_end_tags()
# NOTE(review): unlike the checks at 2493/2506 this one does not test the namespace — confirm intended
2489 if open_els[0].name isnt 'caption'
2492 el = open_els.shift()
2493 if el.name is 'caption' and el.namespace is NS_HTML
2495 clear_afe_to_marker()
2496 ins_mode = ins_mode_in_table
# table-structure start tags and </table> close the caption in the same way
2501 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2503 if is_in_table_scope 'caption', NS_HTML
2505 el = open_els.shift()
2506 if el.name is 'caption' and el.namespace is NS_HTML
2508 clear_afe_to_marker()
2509 ins_mode = ins_mode_in_table
2511 # else fragment case
2513 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2519 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Insertion mode "in column group". t is the token to process.
# Visible behavior: <col> is inserted and its self-closing flag acknowledged;
# </colgroup> switches to ins_mode_in_table when the current node is an HTML
# <colgroup>; the final fall-through also switches to ins_mode_in_table.
# NOTE(review): some body lines are missing from this listing.
2520 ins_mode_in_column_group = (t) ->
2524 if t.type is TYPE_COMMENT
2527 if t.type is TYPE_DOCTYPE
2530 if t.type is TYPE_START_TAG and t.name is 'html'
2533 if t.type is TYPE_START_TAG and t.name is 'col'
2534 el = insert_html_element t
2536 t.acknowledge_self_closing()
2538 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# fixed: the namespace must be read from the current node (open_els[0]); the
# old test read `open_els.namespace`, a property of the array itself, which is
# always undefined, so this branch could never be taken
2539 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2541 ins_mode = ins_mode_in_table
2545 if t.type is TYPE_END_TAG and t.name is 'col'
2548 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2551 if t.type is TYPE_EOF
# anything else: the visible lines test whether the current node is a colgroup
# and then switch to "in table"
2555 if open_els[0].name isnt 'colgroup'
2559 ins_mode = ins_mode_in_table
2563 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Insertion mode "in table body". t is the token to process.
# <tr> is inserted directly; <th>/<td> first get a synthesized <tr>; both then
# switch to ins_mode_in_row. The closing branches clear the stack back to a
# table body context and switch to ins_mode_in_table.
# NOTE(review): some body lines are missing from this listing.
2564 ins_mode_in_table_body = (t) ->
2565 if t.type is TYPE_START_TAG and t.name is 'tr'
2566 clear_stack_to_table_body_context()
2567 insert_html_element t
2568 ins_mode = ins_mode_in_row
2570 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2572 clear_stack_to_table_body_context()
# a cell without a row: act as if a <tr> start tag had been seen
2573 insert_html_element new_open_tag 'tr'
2574 ins_mode = ins_mode_in_row
2577 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2578 unless is_in_table_scope t.name, NS_HTML
2581 clear_stack_to_table_body_context()
2583 ins_mode = ins_mode_in_table
2585 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# tests an element for HTML tbody/tfoot/thead, or for membership in
# table_scopers (the loop supplying `el` is not visible here)
2588 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2591 if table_scopers[el.name] is el.namespace
2596 clear_stack_to_table_body_context()
2598 ins_mode = ins_mode_in_table
2601 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2607 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Insertion mode "in row". t is the token to process.
# <th>/<td> are inserted and the mode becomes ins_mode_in_cell. The three
# closing branches require an HTML <tr> in table scope, clear the stack back
# to a table row context and switch to ins_mode_in_table_body.
# NOTE(review): some body lines are missing from this listing.
2608 ins_mode_in_row = (t) ->
2609 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2610 clear_stack_to_table_row_context()
2611 insert_html_element t
2612 ins_mode = ins_mode_in_cell
2615 if t.type is TYPE_END_TAG and t.name is 'tr'
2616 if is_in_table_scope 'tr', NS_HTML
2617 clear_stack_to_table_row_context()
2619 ins_mode = ins_mode_in_table_body
2623 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2624 if is_in_table_scope 'tr', NS_HTML
2625 clear_stack_to_table_row_context()
2627 ins_mode = ins_mode_in_table_body
2632 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# the named section must itself be in table scope before the row is closed
2633 if is_in_table_scope t.name, NS_HTML
2634 if is_in_table_scope 'tr', NS_HTML
2635 clear_stack_to_table_row_context()
2637 ins_mode = ins_mode_in_table_body
2642 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2648 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# NOTE(review): presumably the body of the "close the cell" helper; its header
# line is not visible in this listing.
# Generates implied end tags, checks that the current node is an HTML td/th,
# pops from open_els while testing for an HTML td/th, clears the active
# formatting elements up to the last marker, and switches to ins_mode_in_row.
2650 generate_implied_end_tags()
# fixed: compare the current node's *name* with 'th'; the old code compared the
# element object itself (open_els[0] is 'th'), which is never true, so a <th>
# current node was always treated as a mismatch
2651 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2654 el = open_els.shift()
2655 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2657 clear_afe_to_marker()
2658 ins_mode = ins_mode_in_row
2660 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Insertion mode "in cell". t is the token to process.
# </td> and </th> (when in table scope) generate implied end tags, pop from
# open_els while testing for the matching HTML cell, clear the active
# formatting elements up to the last marker, and switch to ins_mode_in_row.
# NOTE(review): some body lines are missing from this listing.
2661 ins_mode_in_cell = (t) ->
2662 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2663 if is_in_table_scope t.name, NS_HTML
2664 generate_implied_end_tags()
2665 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2668 el = open_els.shift()
2669 if el.name is t.name and el.namespace is NS_HTML
2671 clear_afe_to_marker()
2672 ins_mode = ins_mode_in_row
2676 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# tests an element for HTML td/th, or for membership in table_scopers
# (the loop supplying `el` is not visible here)
2679 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2682 if table_scopers[el.name] is el.namespace
2690 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2693 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2694 if is_in_table_scope t.name, NS_HTML
2703 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Insertion mode "in select". t is the token to process.
# <option>/<optgroup> start tags are inserted after testing whether the current
# node is an open option/optgroup; </select>, a nested <select> and
# <input>/<keygen>/<textarea> pop from open_els while testing for the HTML
# <select>.
# NOTE(review): some body lines are missing from this listing.
2704 ins_mode_in_select = (t) ->
2705 if t.type is TYPE_TEXT and t.text is "\u0000"
2708 if t.type is TYPE_TEXT
2711 if t.type is TYPE_COMMENT
2714 if t.type is TYPE_DOCTYPE
2717 if t.type is TYPE_START_TAG and t.name is 'html'
2720 if t.type is TYPE_START_TAG and t.name is 'option'
2721 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2723 insert_html_element t
2725 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2726 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2728 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2730 insert_html_element t
2732 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2733 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# fixed: both the name and the namespace must come from the node just above
# the current one (open_els[1]); the old code re-tested open_els[0].namespace,
# which the enclosing condition had already established
2734 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2736 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2741 if t.type is TYPE_END_TAG and t.name is 'option'
2742 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2747 if t.type is TYPE_END_TAG and t.name is 'select'
2748 if is_in_select_scope 'select', NS_HTML
2750 el = open_els.shift()
2751 if el.name is 'select' and el.namespace is NS_HTML
2757 if t.type is TYPE_START_TAG and t.name is 'select'
2760 el = open_els.shift()
2761 if el.name is 'select' and el.namespace is NS_HTML
2764 # spec says that this is the same as </select> but it doesn't say
2765 # to check scope first
2767 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2769 unless is_in_select_scope 'select', NS_HTML
2772 el = open_els.shift()
2773 if el.name is 'select' and el.namespace is NS_HTML
2778 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2781 if t.type is TYPE_EOF
2788 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Insertion mode "in select in table". t is the token to process.
# Table-structure start tags, and table-structure end tags that are in table
# scope, pop from open_els while testing for the HTML <select>; the final
# visible line delegates to ins_mode_in_select.
# NOTE(review): some body lines are missing from this listing.
2789 ins_mode_in_select_in_table = (t) ->
2790 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2793 el = open_els.shift()
2794 if el.name is 'select' and el.namespace is NS_HTML
2799 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
# the end tag's own element has to be in table scope
2801 unless is_in_table_scope t.name, NS_HTML
2804 el = open_els.shift()
2805 if el.name is 'select' and el.namespace is NS_HTML
2811 ins_mode_in_select t
2814 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Insertion mode "in template". t is the token to process.
# Each start-tag branch replaces the entry at the front of template_ins_modes
# (shift, then unshift) with the mode suited to the tag and makes that mode the
# current ins_mode. On EOF with a template open, elements are popped from
# open_els while testing for the HTML <template>, the active formatting
# elements are cleared to the last marker, and the front template mode is
# dropped.
# NOTE(review): some body lines are missing from this listing.
2815 ins_mode_in_template = (t) ->
2816 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2819 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2822 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2823 template_ins_modes.shift()
2824 template_ins_modes.unshift ins_mode_in_table
2825 ins_mode = ins_mode_in_table
2828 if t.type is TYPE_START_TAG and t.name is 'col'
2829 template_ins_modes.shift()
2830 template_ins_modes.unshift ins_mode_in_column_group
2831 ins_mode = ins_mode_in_column_group
2834 if t.type is TYPE_START_TAG and t.name is 'tr'
2835 template_ins_modes.shift()
2836 template_ins_modes.unshift ins_mode_in_table_body
2837 ins_mode = ins_mode_in_table_body
2840 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2841 template_ins_modes.shift()
2842 template_ins_modes.unshift ins_mode_in_row
2843 ins_mode = ins_mode_in_row
# any other start tag is handled as body content
2846 if t.type is TYPE_START_TAG
2847 template_ins_modes.shift()
2848 template_ins_modes.unshift ins_mode_in_body
2849 ins_mode = ins_mode_in_body
2852 if t.type is TYPE_END_TAG
2855 if t.type is TYPE_EOF
2856 unless template_tag_is_open()
2861 el = open_els.shift()
2862 if el.name is 'template' and el.namespace is NS_HTML
2864 clear_afe_to_marker()
2865 template_ins_modes.shift()
2869 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Insertion mode "after body". t is the token to process.
# Comments are appended to the first (topmost) element of open_els; </html>
# switches to ins_mode_after_after_body; the final visible line switches back
# to ins_mode_in_body.
# NOTE(review): some body lines are missing from this listing.
2870 ins_mode_after_body = (t) ->
2874 if t.type is TYPE_COMMENT
# open_els grows downward, so the last index holds the first element pushed
2875 first = open_els[open_els.length - 1]
# insert position: after that element's current last child
2876 insert_comment t, [first, first.children.length]
2878 if t.type is TYPE_DOCTYPE
2881 if t.type is TYPE_START_TAG and t.name is 'html'
2884 if t.type is TYPE_END_TAG and t.name is 'html'
2885 if flag_fragment_parsing
2888 ins_mode = ins_mode_after_after_body
2890 if t.type is TYPE_EOF
2895 ins_mode = ins_mode_in_body
2898 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Insertion mode "in frameset". t is the token to process.
# <frameset> and <frame> are inserted (<frame> also acknowledges its
# self-closing flag). </frameset> returns early when open_els holds a single
# element, and otherwise may switch to ins_mode_after_frameset.
# NOTE(review): some body lines are missing from this listing.
2899 ins_mode_in_frameset = (t) ->
2903 if t.type is TYPE_COMMENT
2906 if t.type is TYPE_DOCTYPE
2909 if t.type is TYPE_START_TAG and t.name is 'html'
2912 if t.type is TYPE_START_TAG and t.name is 'frameset'
2913 insert_html_element t
2915 if t.type is TYPE_END_TAG and t.name is 'frameset'
2916 if open_els.length is 1
2918 return # fragment case
# outside fragment parsing, leave this mode once the current node is no longer a frameset
2920 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2921 ins_mode = ins_mode_after_frameset
2923 if t.type is TYPE_START_TAG and t.name is 'frame'
2924 insert_html_element t
2926 t.acknowledge_self_closing()
2928 if t.type is TYPE_START_TAG and t.name is 'noframes'
2931 if t.type is TYPE_EOF
2932 if open_els.length isnt 1
2940 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion mode "after frameset". t is the token to process.
# The only state change visible here is </html>, which switches to
# ins_mode_after_after_frameset.
# NOTE(review): the bodies of the other branches are missing from this listing.
2941 ins_mode_after_frameset = (t) ->
2945 if t.type is TYPE_COMMENT
2948 if t.type is TYPE_DOCTYPE
2951 if t.type is TYPE_START_TAG and t.name is 'html'
2954 if t.type is TYPE_END_TAG and t.name is 'html'
2955 ins_mode = ins_mode_after_after_frameset
2957 if t.type is TYPE_START_TAG and t.name is 'noframes'
2960 if t.type is TYPE_EOF
2967 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Insertion mode "after after body". t is the token to process.
# Comments are inserted as the last child of doc; the final visible line
# switches back to ins_mode_in_body.
# NOTE(review): some body lines are missing from this listing.
2968 ins_mode_after_after_body = (t) ->
2969 if t.type is TYPE_COMMENT
2970 insert_comment t, [doc, doc.children.length]
2972 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2975 if t.type is TYPE_EOF
2980 ins_mode = ins_mode_in_body
2984 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Insertion mode "after after frameset". t is the token to process.
# Comments are inserted as the last child of doc.
# NOTE(review): the bodies of the other branches are missing from this listing.
2985 ins_mode_after_after_frameset = (t) ->
2986 if t.type is TYPE_COMMENT
2987 insert_comment t, [doc, doc.children.length]
2989 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2992 if t.type is TYPE_EOF
2995 if t.type is TYPE_START_TAG and t.name is 'noframes'
3002 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Tests attribute pairs `a` (a[0] is the attribute name) for 'color', 'face'
# or 'size'. Used by in_foreign_content to classify <font> start tags.
# NOTE(review): the loop header and the return statements are missing from
# this listing; presumably it iterates over the attributes of token t.
3003 has_color_face_or_size = (t) ->
3005 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Called for a self-closing <script> start tag in foreign content and for an
# SVG </script> end tag (see the call sites in in_foreign_content_other_start
# and in_foreign_content).
# NOTE(review): the body is not visible in this listing.
3008 in_foreign_content_end_script = ->
# Handles "any other start tag" in foreign content. t is the start tag token.
# Adjusts the token according to the namespace of the adjusted current node
# (MathML attribute fix-ups, or SVG tag-name and attribute fix-ups), applies
# the foreign attribute fix-ups, inserts the element in that namespace, and
# acknowledges a self-closing flag.
# NOTE(review): some body lines are missing from this listing.
3012 in_foreign_content_other_start = (t) ->
3013 acn = adjusted_current_node()
3014 if acn.namespace is NS_MATHML
3015 adjust_mathml_attributes t
# SVG tag names found in the svg_name_fixes table are replaced by the table's value
3016 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3017 t.name = svg_name_fixes[t.name]
3018 if acn.namespace is NS_SVG
3019 adjust_svg_attributes t
3020 adjust_foreign_attributes t
3021 insert_foreign_element t, acn.namespace
3022 if t.flag 'self-closing'
# a self-closing <script> is additionally run through the end-script handling
3023 if t.name is 'script'
3024 t.acknowledge_self_closing()
3025 in_foreign_content_end_script()
3029 t.acknowledge_self_closing()
# Token handling for foreign (non-HTML) content. t is the token to process.
# A NUL text token is replaced by U+FFFD; other text clears flag_frameset_ok.
# Start tags from the long HTML-element list break out of foreign content,
# except while fragment parsing, where they are treated like any other start
# tag. End tags are matched against open_els case-insensitively via
# node.name.toLowerCase().
# NOTE(review): some body lines are missing from this listing.
3031 in_foreign_content = (t) ->
3032 if t.type is TYPE_TEXT and t.text is "\u0000"
3034 insert_character new_character_token "\ufffd"
3039 if t.type is TYPE_TEXT
3040 flag_frameset_ok = false
3043 if t.type is TYPE_COMMENT
3046 if t.type is TYPE_DOCTYPE
3049 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3051 if flag_fragment_parsing
3052 in_foreign_content_other_start t
3054 loop # is this safe?
# loop test: the current node is a MathML text integration point, an HTML
# integration point, or an HTML element
3056 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3060 if t.type is TYPE_START_TAG
3061 in_foreign_content_other_start t
3063 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3064 in_foreign_content_end_script()
3066 if t.type is TYPE_END_TAG
3069 if node.name.toLowerCase() isnt t.name
# the last index of open_els is the first (topmost) element
3072 if node is open_els[open_els.length - 1]
3074 if node.name.toLowerCase() is t.name
3076 el = open_els.shift()
# once an HTML-namespace node is reached, the token goes to the current HTML insertion mode
3081 if node.namespace is NS_HTML
3083 ins_mode t # explicitly call HTML insertion mode
3086 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# NOTE(review): presumably the body of tok_state_data; its header line and the
# `when` labels are not visible in this listing.
# Consumes one character of txt at cur, then either returns a token (character
# reference text, U+FFFD, EOF, or the character itself) or switches tok_state
# to tok_state_tag_open.
3088 switch c = txt.charAt(cur++)
3090 return new_text_node parse_character_reference()
3092 tok_state = tok_state_tag_open
3095 return new_text_node "\ufffd"
3097 return new_eof_token()
3099 return new_text_node c
3102 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3103 # not needed: tok_state_character_reference_in_data = ->
3104 # just call parse_character_reference()
3106 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer RCDATA state. Consumes one character of txt at cur, then either
# returns a token or switches to tok_state_rcdata_less_than_sign.
# NOTE(review): the `when` labels are missing from this listing. The character
# reference result is wrapped with new_text_node while the other branches use
# new_character_token — confirm the two are interchangeable.
3107 tok_state_rcdata = ->
3108 switch c = txt.charAt(cur++)
3110 return new_text_node parse_character_reference()
3112 tok_state = tok_state_rcdata_less_than_sign
3115 return new_character_token "\ufffd"
3117 return new_eof_token()
3119 return new_character_token c
3122 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3123 # not needed: tok_state_character_reference_in_rcdata = ->
3124 # just call parse_character_reference()
3126 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer RAWTEXT state. Consumes one character of txt at cur, then either
# switches to tok_state_rawtext_less_than_sign or returns a U+FFFD character
# token, an EOF token, or the character itself.
# NOTE(review): the `when` labels are missing from this listing.
3127 tok_state_rawtext = ->
3128 switch c = txt.charAt(cur++)
3130 tok_state = tok_state_rawtext_less_than_sign
3133 return new_character_token "\ufffd"
3135 return new_eof_token()
3137 return new_character_token c
3140 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer script data state. Consumes one character of txt at cur, then
# either switches to tok_state_script_data_less_than_sign or returns a U+FFFD
# character token, an EOF token, or the character itself.
# NOTE(review): the `when` labels are missing from this listing.
3141 tok_state_script_data = ->
3142 switch c = txt.charAt(cur++)
3144 tok_state = tok_state_script_data_less_than_sign
3147 return new_character_token "\ufffd"
3149 return new_eof_token()
3151 return new_character_token c
3154 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer PLAINTEXT state. Consumes one character of txt at cur and returns
# a U+FFFD character token, an EOF token, or the character itself; no visible
# branch changes tok_state.
# NOTE(review): the `when` labels are missing from this listing.
3155 tok_state_plaintext = ->
3156 switch c = txt.charAt(cur++)
3159 return new_character_token "\ufffd"
3161 return new_eof_token()
3163 return new_character_token c
3167 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer tag open state (the character after '<'). Consumes one character
# and either moves to the markup declaration / end tag open states, starts a
# new open tag in tok_cur_tag, starts a bogus comment, or gives up and emits a
# literal '<' as text.
# NOTE(review): the conditions selecting each branch are missing from this listing.
3168 tok_state_tag_open = ->
3169 c = txt.charAt(cur++)
3171 tok_state = tok_state_markup_declaration_open
3174 tok_state = tok_state_end_tag_open
# tag names are accumulated in lower case
3177 tok_cur_tag = new_open_tag c.toLowerCase()
3178 tok_state = tok_state_tag_name
3181 tok_cur_tag = new_open_tag c
3182 tok_state = tok_state_tag_name
3186 tok_cur_tag = new_comment_token '?' # FIXME right?
3187 tok_state = tok_state_bogus_comment
# fallback: back to the data state, un-consume the character, emit '<' as text
3191 tok_state = tok_state_data
3192 cur -= 1 # we didn't parse/handle the char after <
3193 return new_text_node '<'
3195 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer end tag open state (after '</'). Consumes one character and either
# starts a new end tag in tok_cur_tag, returns to the data state (in one
# branch emitting a literal '</' as text), or starts a bogus comment with the
# character.
# NOTE(review): the conditions selecting each branch are missing from this listing.
3196 tok_state_end_tag_open = ->
3197 c = txt.charAt(cur++)
3199 tok_cur_tag = new_end_tag c.toLowerCase()
3200 tok_state = tok_state_tag_name
3203 tok_cur_tag = new_end_tag c
3204 tok_state = tok_state_tag_name
3208 tok_state = tok_state_data
3212 tok_state = tok_state_data
3213 return new_text_node '</'
3216 tok_cur_tag = new_comment_token c
3217 tok_state = tok_state_bogus_comment
3220 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer tag name state. Consumes one character: whitespace moves to
# tok_state_before_attribute_name; other branches move to the self-closing
# start tag state or back to the data state, or append to tok_cur_tag.name
# (U+FFFD, a lower-cased character, or the character as is).
# NOTE(review): most `when` labels are missing from this listing.
3221 tok_state_tag_name = ->
3222 switch c = txt.charAt(cur++)
3223 when "\t", "\n", "\u000c", ' '
3224 tok_state = tok_state_before_attribute_name
3226 tok_state = tok_state_self_closing_start_tag
3228 tok_state = tok_state_data
3234 tok_cur_tag.name += "\ufffd"
3237 tok_state = tok_state_data
3240 tok_cur_tag.name += c.toLowerCase()
3242 tok_cur_tag.name += c
3245 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer RCDATA less-than sign state. Consumes one character: one branch
# empties temporary_buffer and moves to tok_state_rcdata_end_tag_open; the
# other returns to tok_state_rcdata, un-consumes the character and emits a
# literal '<'.
# NOTE(review): the conditions selecting each branch are missing from this listing.
3246 tok_state_rcdata_less_than_sign = ->
3247 c = txt.charAt(cur++)
3249 temporary_buffer = ''
3250 tok_state = tok_state_rcdata_end_tag_open
3253 tok_state = tok_state_rcdata
3254 cur -= 1 # reconsume the input character
3255 return new_character_token '<'
3257 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer RCDATA end tag open state. Consumes one character: the two
# tag-starting branches create an end tag in tok_cur_tag (name lower-cased in
# the first), append the raw character to temporary_buffer and move to
# tok_state_rcdata_end_tag_name; otherwise it returns to tok_state_rcdata,
# un-consumes the character and emits a literal "</".
# NOTE(review): the conditions selecting each branch are missing from this listing.
3258 tok_state_rcdata_end_tag_open = ->
3259 c = txt.charAt(cur++)
3261 tok_cur_tag = new_end_tag c.toLowerCase()
# the buffer keeps the character as typed, so it can be re-emitted verbatim
3262 temporary_buffer += c
3263 tok_state = tok_state_rcdata_end_tag_name
3266 tok_cur_tag = new_end_tag c
3267 temporary_buffer += c
3268 tok_state = tok_state_rcdata_end_tag_name
3271 tok_state = tok_state_rcdata
3272 cur -= 1 # reconsume the input character
3273 return new_character_token "</" # fixfull separate these
3275 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
#
# Decides whether token `t` is an "appropriate end tag token".
#
# The spec compares against the tag name of the last start tag emitted by the
# tokenizer. This implementation compares against the current node
# (open_els[0]) instead, on the assumption that it is only called from the
# "raw" tokenizer states. TODO: verify this after the script data states are
# implemented.
#
# Returns true only when `t` is an end tag whose name equals the current
# node's name.
is_appropriate_end_tag = (t) ->
	debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
	return false unless t.type is TYPE_END_TAG
	return open_els[0].name is t.name
3285 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer RCDATA end tag name state. Consumes one character. When
# tok_cur_tag is an appropriate end tag, the visible branches move to the
# before-attribute-name, self-closing-start-tag or data state; name characters
# are appended to tok_cur_tag.name (lower-cased in one branch) and, as typed,
# to temporary_buffer. The fallback returns to tok_state_rcdata, un-consumes
# the character and emits '</' plus the buffered text.
# NOTE(review): the conditions selecting several branches are missing from
# this listing.
3286 tok_state_rcdata_end_tag_name = ->
3287 c = txt.charAt(cur++)
3288 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3289 if is_appropriate_end_tag tok_cur_tag
3290 tok_state = tok_state_before_attribute_name
3292 # else fall through to "Anything else"
3294 if is_appropriate_end_tag tok_cur_tag
# NOTE(review): tok_state_rawtext_end_tag_name makes the same transition,
# and the HTML5 spec text for this state appears to name the self-closing
# start tag state as well, so this is likely not a typo — confirm
3295 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3297 # else fall through to "Anything else"
3299 if is_appropriate_end_tag tok_cur_tag
3300 tok_state = tok_state_data
3302 # else fall through to "Anything else"
3304 tok_cur_tag.name += c.toLowerCase()
3305 temporary_buffer += c
3308 tok_cur_tag.name += c
3309 temporary_buffer += c
3312 tok_state = tok_state_rcdata
3313 cur -= 1 # reconsume the input character
3314 return new_character_token '</' + temporary_buffer # fixfull separate these
3316 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# RAWTEXT less-than sign state handler. Consumes one character; one branch
# clears temporary_buffer and moves to the RAWTEXT end tag open state, the
# other returns to the RAWTEXT state, reconsumes the character and emits '<'.
3317 tok_state_rawtext_less_than_sign = ->
3318 c = txt.charAt(cur++)
3320 temporary_buffer = ''
3321 tok_state = tok_state_rawtext_end_tag_open
3324 tok_state = tok_state_rawtext
3325 cur -= 1 # reconsume the input character
3326 return new_character_token '<'
3328 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# RAWTEXT end tag open state handler. Two branches start a new end tag token
# from c (lowercased in the first), record c in temporary_buffer and move to
# the RAWTEXT end tag name state. The remaining branch returns to the RAWTEXT
# state, reconsumes c and emits "</" as text.
3329 tok_state_rawtext_end_tag_open = ->
3330 c = txt.charAt(cur++)
3332 tok_cur_tag = new_end_tag c.toLowerCase()
3333 temporary_buffer += c
3334 tok_state = tok_state_rawtext_end_tag_name
3337 tok_cur_tag = new_end_tag c
3338 temporary_buffer += c
3339 tok_state = tok_state_rawtext_end_tag_name
3342 tok_state = tok_state_rawtext
3343 cur -= 1 # reconsume the input character
3344 return new_character_token "</" # fixfull separate these
3346 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# RAWTEXT end tag name state handler. Mirrors the RCDATA end tag name
# handler: three branches (the first for whitespace) change state only when
# is_appropriate_end_tag accepts tok_cur_tag; two branches extend
# tok_cur_tag.name (lowercased in the first) and temporary_buffer; the
# fallback returns to the RAWTEXT state, reconsumes the character and emits
# '</' followed by temporary_buffer as text.
3347 tok_state_rawtext_end_tag_name = ->
3348 c = txt.charAt(cur++)
3349 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3350 if is_appropriate_end_tag tok_cur_tag
3351 tok_state = tok_state_before_attribute_name
3353 # else fall through to "Anything else"
3355 if is_appropriate_end_tag tok_cur_tag
3356 tok_state = tok_state_self_closing_start_tag
3358 # else fall through to "Anything else"
3360 if is_appropriate_end_tag tok_cur_tag
3361 tok_state = tok_state_data
3363 # else fall through to "Anything else"
3365 tok_cur_tag.name += c.toLowerCase()
3366 temporary_buffer += c
3369 tok_cur_tag.name += c
3370 temporary_buffer += c
3373 tok_state = tok_state_rawtext
3374 cur -= 1 # reconsume the input character
3375 return new_character_token '</' + temporary_buffer # fixfull separate these
3377 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Script data less-than sign state handler. Visible outcomes: clear
# temporary_buffer and move to the script data end tag open state; move to
# the script data escape start state and emit '<!' as text; or return to the
# script data state, reconsume the character and emit '<'.
3378 tok_state_script_data_less_than_sign = ->
3379 c = txt.charAt(cur++)
3381 temporary_buffer = ''
3382 tok_state = tok_state_script_data_end_tag_open
3385 tok_state = tok_state_script_data_escape_start
3386 return new_character_token '<!' # fixfull split
3388 tok_state = tok_state_script_data
3389 cur -= 1 # Reconsume
3390 return new_character_token '<'
3392 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Script data end tag open state handler. Two branches start a new end tag
# token from c (lowercased in the first), record c in temporary_buffer and
# move to the script data end tag name state. The remaining branch returns to
# the script data state, reconsumes c and emits '</' as text.
3393 tok_state_script_data_end_tag_open = ->
3394 c = txt.charAt(cur++)
3396 tok_cur_tag = new_end_tag c.toLowerCase()
3397 temporary_buffer += c
3398 tok_state = tok_state_script_data_end_tag_name
3401 tok_cur_tag = new_end_tag c
3402 temporary_buffer += c
3403 tok_state = tok_state_script_data_end_tag_name
3406 tok_state = tok_state_script_data
3407 cur -= 1 # Reconsume
3408 return new_character_token '</'
3410 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Script data end tag name state handler. Three branches (the first for
# whitespace) change state only when is_appropriate_end_tag accepts
# tok_cur_tag (to before attribute name, self-closing start tag, or data).
# Two branches extend tok_cur_tag.name (lowercased in the first) and append
# the raw character to temporary_buffer. The fallback returns to the script
# data state, reconsumes the character and emits "</" + temporary_buffer.
3411 tok_state_script_data_end_tag_name = ->
3412 c = txt.charAt(cur++)
3413 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3414 if is_appropriate_end_tag tok_cur_tag
3415 tok_state = tok_state_before_attribute_name
3419 if is_appropriate_end_tag tok_cur_tag
3420 tok_state = tok_state_self_closing_start_tag
3424 if is_appropriate_end_tag tok_cur_tag
3425 tok_state = tok_state_data
3429 tok_cur_tag.name += c.toLowerCase()
3430 temporary_buffer += c
3433 tok_cur_tag.name += c
3434 temporary_buffer += c
3437 tok_state = tok_state_script_data
3438 cur -= 1 # Reconsume
3439 return new_character_token "</#{temporary_buffer}" # fixfull split
3441 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Script data escape start state handler. One branch moves to the escape
# start dash state and emits '-'; the other returns to the script data state
# and reconsumes the character.
3442 tok_state_script_data_escape_start = ->
3443 c = txt.charAt(cur++)
3445 tok_state = tok_state_script_data_escape_start_dash
3446 return new_character_token '-'
3448 tok_state = tok_state_script_data
3449 cur -= 1 # Reconsume
3452 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Script data escape start dash state handler. One branch moves to the
# escaped dash dash state and emits '-'; the other returns to the script data
# state and reconsumes the character.
3453 tok_state_script_data_escape_start_dash = ->
3454 c = txt.charAt(cur++)
3456 tok_state = tok_state_script_data_escaped_dash_dash
3457 return new_character_token '-'
3459 tok_state = tok_state_script_data
3460 cur -= 1 # Reconsume
3463 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Script data escaped state handler. Visible outcomes: move to the escaped
# dash state and emit '-'; move to the escaped less-than sign state; emit
# U+FFFD; switch to the data state and reconsume; or emit the character
# itself while staying in this state.
3464 tok_state_script_data_escaped = ->
3465 c = txt.charAt(cur++)
3467 tok_state = tok_state_script_data_escaped_dash
3468 return new_character_token '-'
3470 tok_state = tok_state_script_data_escaped_less_than_sign
3474 return new_character_token "\ufffd"
3476 tok_state = tok_state_data
3478 cur -= 1 # Reconsume
3481 return new_character_token c
3483 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Script data escaped dash state handler. Visible outcomes: move to the
# escaped dash dash state and emit '-'; move to the escaped less-than sign
# state; return to the escaped state emitting U+FFFD; switch to the data
# state and reconsume; or return to the escaped state emitting the character.
3484 tok_state_script_data_escaped_dash = ->
3485 c = txt.charAt(cur++)
3487 tok_state = tok_state_script_data_escaped_dash_dash
3488 return new_character_token '-'
3490 tok_state = tok_state_script_data_escaped_less_than_sign
3494 tok_state = tok_state_script_data_escaped
3495 return new_character_token "\ufffd"
3497 tok_state = tok_state_data
3499 cur -= 1 # Reconsume
3502 tok_state = tok_state_script_data_escaped
3503 return new_character_token c
3505 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Script data escaped dash dash state handler. Visible outcomes: emit '-'
# without changing state; move to the escaped less-than sign state; leave
# the escaped section (back to script data) emitting '>'; return to the
# escaped state emitting U+FFFD; switch to the data state and reconsume; or
# return to the escaped state emitting the character.
3506 tok_state_script_data_escaped_dash_dash = ->
3507 c = txt.charAt(cur++)
3509 return new_character_token '-'
3511 tok_state = tok_state_script_data_escaped_less_than_sign
3514 tok_state = tok_state_script_data
3515 return new_character_token '>'
3518 tok_state = tok_state_script_data_escaped
3519 return new_character_token "\ufffd"
3522 tok_state = tok_state_data
3523 cur -= 1 # Reconsume
3526 tok_state = tok_state_script_data_escaped
3527 return new_character_token c
3529 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Script data escaped less-than sign state handler. Visible outcomes: clear
# temporary_buffer and move to the escaped end tag open state; seed
# temporary_buffer with c (lowercased in the first of two branches), move to
# the double escape start state and emit "<" + c with its original case; or
# return to the escaped state, reconsume and emit '<'.
3530 tok_state_script_data_escaped_less_than_sign = ->
3531 c = txt.charAt(cur++)
3533 temporary_buffer = ''
3534 tok_state = tok_state_script_data_escaped_end_tag_open
3537 temporary_buffer = c.toLowerCase() # yes, really
3538 tok_state = tok_state_script_data_double_escape_start
3539 return new_character_token "<#{c}" # fixfull split
3541 temporary_buffer = c
3542 tok_state = tok_state_script_data_double_escape_start
3543 return new_character_token "<#{c}" # fixfull split
3545 tok_state = tok_state_script_data_escaped
3546 cur -= 1 # Reconsume
3547 return new_character_token '<'
3549 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Script data escaped end tag open state handler. Two branches start a new
# end tag token from c (lowercased in the first), record the raw c in
# temporary_buffer and move to the escaped end tag name state. The remaining
# branch returns to the escaped state, reconsumes c and emits '</' as text.
3550 tok_state_script_data_escaped_end_tag_open = ->
3551 c = txt.charAt(cur++)
3553 tok_cur_tag = new_end_tag c.toLowerCase()
3554 temporary_buffer += c
3555 tok_state = tok_state_script_data_escaped_end_tag_name
3558 tok_cur_tag = new_end_tag c
3559 temporary_buffer += c
3560 tok_state = tok_state_script_data_escaped_end_tag_name
3563 tok_state = tok_state_script_data_escaped
3564 cur -= 1 # Reconsume
3565 return new_character_token '</' # fixfull split
3567 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Script data escaped end tag name state handler. Three branches (the first
# for whitespace) change state only when is_appropriate_end_tag accepts
# tok_cur_tag (to before attribute name, self-closing start tag, or data).
# Two branches extend tok_cur_tag.name (lowercased in the first) and append
# the character to temporary_buffer. The fallback returns to the escaped
# state, reconsumes the character and emits "</" + temporary_buffer as text.
#
# temporary_buffer must keep the characters exactly as they appeared in the
# input: only the tag name is case-folded. The buffer is re-emitted verbatim
# by the fallback, so lowercasing it would rewrite the document's text (e.g.
# "</FOO" inside an escaped script would come out as "</foo"). This matches
# the other *_end_tag_name handlers, which append the raw character.
3568 tok_state_script_data_escaped_end_tag_name = ->
3569 c = txt.charAt(cur++)
3570 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3571 if is_appropriate_end_tag tok_cur_tag
3572 tok_state = tok_state_before_attribute_name
3576 if is_appropriate_end_tag tok_cur_tag
3577 tok_state = tok_state_self_closing_start_tag
3581 if is_appropriate_end_tag tok_cur_tag
3582 tok_state = tok_state_data
3586 tok_cur_tag.name += c.toLowerCase()
3587 temporary_buffer += c # raw character: this buffer is re-emitted as text
3590 tok_cur_tag.name += c
3591 temporary_buffer += c
3594 tok_state = tok_state_script_data_escaped
3595 cur -= 1 # Reconsume
3596 return new_character_token "</#{temporary_buffer}" # fixfull split
3598 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Script data double escape start state handler. On whitespace, '/' or '>'
# it enters the double escaped state when temporary_buffer is exactly
# 'script', otherwise the escaped state, and emits the character. Two
# further branches append c to temporary_buffer (lowercased in the first)
# and emit c unchanged. The fallback returns to the escaped state and
# reconsumes the character.
3599 tok_state_script_data_double_escape_start = ->
3600 c = txt.charAt(cur++)
3601 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3602 if temporary_buffer is 'script'
3603 tok_state = tok_state_script_data_double_escaped
3605 tok_state = tok_state_script_data_escaped
3606 return new_character_token c
3608 temporary_buffer += c.toLowerCase() # yes, really lowercase
3609 return new_character_token c
3611 temporary_buffer += c
3612 return new_character_token c
3614 tok_state = tok_state_script_data_escaped
3615 cur -= 1 # Reconsume
3618 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Script data double escaped state handler. Visible outcomes: move to the
# double escaped dash state emitting '-'; move to the double escaped
# less-than sign state emitting '<'; emit U+FFFD; switch to the data state
# and reconsume; or emit the character itself.
3619 tok_state_script_data_double_escaped = ->
3620 c = txt.charAt(cur++)
3622 tok_state = tok_state_script_data_double_escaped_dash
3623 return new_character_token '-'
3625 tok_state = tok_state_script_data_double_escaped_less_than_sign
3626 return new_character_token '<'
3629 return new_character_token "\ufffd"
3632 tok_state = tok_state_data
3633 cur -= 1 # Reconsume
3636 return new_character_token c
3638 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Script data double escaped dash state handler. Visible outcomes: move to
# the double escaped dash dash state emitting '-'; move to the double escaped
# less-than sign state emitting '<'; return to the double escaped state
# emitting U+FFFD; switch to the data state and reconsume; or return to the
# double escaped state emitting the character.
3639 tok_state_script_data_double_escaped_dash = ->
3640 c = txt.charAt(cur++)
3642 tok_state = tok_state_script_data_double_escaped_dash_dash
3643 return new_character_token '-'
3645 tok_state = tok_state_script_data_double_escaped_less_than_sign
3646 return new_character_token '<'
3649 tok_state = tok_state_script_data_double_escaped
3650 return new_character_token "\ufffd"
3653 tok_state = tok_state_data
3654 cur -= 1 # Reconsume
3657 tok_state = tok_state_script_data_double_escaped
3658 return new_character_token c
3660 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Script data double escaped dash dash state handler. Visible outcomes: emit
# '-' without changing state; move to the double escaped less-than sign
# state emitting '<'; return to the script data state emitting '>'; return
# to the double escaped state emitting U+FFFD; switch to the data state and
# reconsume; or return to the double escaped state emitting the character.
3661 tok_state_script_data_double_escaped_dash_dash = ->
3662 c = txt.charAt(cur++)
3664 return new_character_token '-'
3666 tok_state = tok_state_script_data_double_escaped_less_than_sign
3667 return new_character_token '<'
3669 tok_state = tok_state_script_data
3670 return new_character_token '>'
3673 tok_state = tok_state_script_data_double_escaped
3674 return new_character_token "\ufffd"
3677 tok_state = tok_state_data
3678 cur -= 1 # Reconsume
3681 tok_state = tok_state_script_data_double_escaped
3682 return new_character_token c
3684 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Script data double escaped less-than sign state handler. One branch clears
# temporary_buffer, moves to the double escape end state and emits '/'; the
# other returns to the double escaped state and reconsumes the character.
3685 tok_state_script_data_double_escaped_less_than_sign = ->
3686 c = txt.charAt(cur++)
3688 temporary_buffer = ''
3689 tok_state = tok_state_script_data_double_escape_end
3690 return new_character_token '/'
3692 tok_state = tok_state_script_data_double_escaped
3693 cur -= 1 # Reconsume
3696 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Script data double escape end state handler. On whitespace, '/' or '>' it
# drops back to the (single) escaped state when temporary_buffer is exactly
# 'script', otherwise stays double escaped, and emits the character. Two
# further branches append c to temporary_buffer (lowercased in the first)
# and emit c unchanged. The fallback returns to the double escaped state and
# reconsumes the character.
3697 tok_state_script_data_double_escape_end = ->
3698 c = txt.charAt(cur++)
3699 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3700 if temporary_buffer is 'script'
3701 tok_state = tok_state_script_data_escaped
3703 tok_state = tok_state_script_data_double_escaped
3704 return new_character_token c
3706 temporary_buffer += c.toLowerCase() # yes, really lowercase
3707 return new_character_token c
3709 temporary_buffer += c
3710 return new_character_token c
3712 tok_state = tok_state_script_data_double_escaped
3713 cur -= 1 # Reconsume
3716 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Before attribute name state handler. Some branches switch to the
# self-closing start tag or data state; the others pick an initial attr_name
# (U+FFFD in one branch, the lowercased character in another).
# The visible tail unshifts [attr_name, ''] onto tok_cur_tag.attrs_a, so the
# attribute under construction sits at attrs_a[0], and moves to the attribute
# name state.
3717 tok_state_before_attribute_name = ->
3719 switch c = txt.charAt(cur++)
3720 when "\t", "\n", "\u000c", ' '
3723 tok_state = tok_state_self_closing_start_tag
3726 tok_state = tok_state_data
3732 attr_name = "\ufffd"
3733 when '"', "'", '<', '='
3738 tok_state = tok_state_data
3741 attr_name = c.toLowerCase()
3745 tok_cur_tag.attrs_a.unshift [attr_name, '']
3746 tok_state = tok_state_attribute_name
3749 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Attribute name state handler. Whitespace moves to the after attribute name
# state; other branches move to the self-closing start tag, before attribute
# value or data state. The remaining branches extend the name of the
# attribute under construction, tok_cur_tag.attrs_a[0][0], with U+FFFD, the
# raw character, or its lowercase form.
3750 tok_state_attribute_name = ->
3751 switch c = txt.charAt(cur++)
3752 when "\t", "\n", "\u000c", ' '
3753 tok_state = tok_state_after_attribute_name
3755 tok_state = tok_state_self_closing_start_tag
3757 tok_state = tok_state_before_attribute_value
3759 tok_state = tok_state_data
3765 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3768 tok_cur_tag.attrs_a[0][0] += c
3771 tok_state = tok_state_data
3774 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3776 tok_cur_tag.attrs_a[0][0] += c
3779 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# After attribute name state handler. Visible outcomes: switch to the
# self-closing start tag, before attribute value or data state (one data
# branch reconsumes the character); or start a new attribute by unshifting
# [name, ''] onto tok_cur_tag.attrs_a -- with the lowercased character,
# U+FFFD, or the raw character as the initial name -- and move to the
# attribute name state. '"', "'" and '<' get separate handling (not shown
# here) before falling through to the raw-character case.
3780 tok_state_after_attribute_name = ->
3781 c = txt.charAt(cur++)
3782 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3785 tok_state = tok_state_self_closing_start_tag
3788 tok_state = tok_state_before_attribute_value
3791 tok_state = tok_state_data
3794 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3795 tok_state = tok_state_attribute_name
3799 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3800 tok_state = tok_state_attribute_name
3804 tok_state = tok_state_data
3805 cur -= 1 # reconsume
3807 if c is '"' or c is "'" or c is '<'
3809 # fall through to Anything else
3811 tok_cur_tag.attrs_a.unshift [c, '']
3812 tok_state = tok_state_attribute_name
3814 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Before attribute value state handler. Visible outcomes: enter the
# double-quoted, single-quoted or unquoted attribute value state; append
# U+FFFD or the raw character to the current attribute's value
# (tok_cur_tag.attrs_a[0][1]) and continue in the unquoted state; or switch
# to the data state.
3815 tok_state_before_attribute_value = ->
3816 switch c = txt.charAt(cur++)
3817 when "\t", "\n", "\u000c", ' '
3820 tok_state = tok_state_attribute_value_double_quoted
3822 tok_state = tok_state_attribute_value_unquoted
3825 tok_state = tok_state_attribute_value_single_quoted
3828 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3829 tok_state = tok_state_attribute_value_unquoted
3832 tok_state = tok_state_data
3838 tok_state = tok_state_data
3840 tok_cur_tag.attrs_a[0][1] += c
3841 tok_state = tok_state_attribute_value_unquoted
3844 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Attribute value (double-quoted) state handler. Visible outcomes: move to
# the after attribute value (quoted) state; append the result of
# parse_character_reference (called with '"' as the allowed extra character
# and in_attr = true) to the current attribute value; append U+FFFD; switch
# to the data state; or append the raw character.
3845 tok_state_attribute_value_double_quoted = ->
3846 switch c = txt.charAt(cur++)
3848 tok_state = tok_state_after_attribute_value_quoted
3850 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3853 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3856 tok_state = tok_state_data
3858 tok_cur_tag.attrs_a[0][1] += c
3861 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Attribute value (single-quoted) state handler. Same shape as the
# double-quoted handler, except that parse_character_reference is called
# with "'" as the allowed extra character.
3862 tok_state_attribute_value_single_quoted = ->
3863 switch c = txt.charAt(cur++)
3865 tok_state = tok_state_after_attribute_value_quoted
3867 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3870 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3873 tok_state = tok_state_data
3875 tok_cur_tag.attrs_a[0][1] += c
3878 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Attribute value (unquoted) state handler. Whitespace ends the value and
# moves to the before attribute name state. Other visible outcomes: append
# the result of parse_character_reference (allowed extra character '>',
# in_attr = true); switch to the data state; append U+FFFD; or append the
# raw character.
3879 tok_state_attribute_value_unquoted = ->
3880 switch c = txt.charAt(cur++)
3881 when "\t", "\n", "\u000c", ' '
3882 tok_state = tok_state_before_attribute_name
3884 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3886 tok_state = tok_state_data
3891 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3894 tok_state = tok_state_data
3896 # Parse Error if ', <, = or ` (backtick)
3897 tok_cur_tag.attrs_a[0][1] += c
3900 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After attribute value (quoted) state handler. Whitespace moves to the
# before attribute name state; other branches move to the self-closing start
# tag or data state. The last visible branch moves to the before attribute
# name state and un-consumes the character so that state sees it.
3901 tok_state_after_attribute_value_quoted = ->
3902 switch c = txt.charAt(cur++)
3903 when "\t", "\n", "\u000c", ' '
3904 tok_state = tok_state_before_attribute_name
3906 tok_state = tok_state_self_closing_start_tag
3908 tok_state = tok_state_data
3914 tok_state = tok_state_data
3917 tok_state = tok_state_before_attribute_name
3918 cur -= 1 # we didn't handle that char
3921 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Self-closing start tag state handler. One branch sets the 'self-closing'
# flag on tok_cur_tag and switches to the data state; another switches to
# the data state and reconsumes; the last returns to the before attribute
# name state and reconsumes the character.
3922 tok_state_self_closing_start_tag = ->
3923 c = txt.charAt(cur++)
3925 tok_cur_tag.flag 'self-closing', true
3926 tok_state = tok_state_data
3930 tok_state = tok_state_data
3931 cur -= 1 # Reconsume
3935 tok_state = tok_state_before_attribute_name
3936 cur -= 1 # Reconsume
3939 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3940 # WARNING: put a comment token in tok_cur_tag before setting this state
# Bogus comment state handler. Takes the text from cur up to the next '>'
# (or the rest of txt in the other branch), replaces every U+0000 in it with
# U+FFFD, appends it to tok_cur_tag.text and switches to the data state.
# tok_cur_tag must already hold a comment token when this state is entered.
3941 tok_state_bogus_comment = ->
3942 next_gt = txt.indexOf '>', cur
3944 val = txt.substr cur
3947 val = txt.substr cur, (next_gt - cur)
3949 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3950 tok_cur_tag.text += val
3951 tok_state = tok_state_data
3954 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Markup declaration open state handler. Looks ahead in txt without a
# per-character read:
#   '--'                       -> new empty comment token, comment start state
#   'doctype' (any case)       -> DOCTYPE state
#   '[CDATA[' (exact case) when the adjusted current node exists and is not
#   in the HTML namespace      -> CDATA section state
#   otherwise                  -> new empty comment token, bogus comment state
3955 tok_state_markup_declaration_open = ->
3956 if txt.substr(cur, 2) is '--'
3958 tok_cur_tag = new_comment_token ''
3959 tok_state = tok_state_comment_start
3961 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3963 tok_state = tok_state_doctype
3965 acn = adjusted_current_node()
3966 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3968 tok_state = tok_state_cdata_section
3972 tok_cur_tag = new_comment_token ''
3973 tok_state = tok_state_bogus_comment
3976 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Comment start state handler. Visible outcomes: move to the comment start
# dash state; append U+FFFD to the comment's text and move to the comment
# state; switch to the data state (one branch reconsuming); or append the
# character to the comment's text and move to the comment state.
#
# The U+FFFD case appends to tok_cur_tag.text instead of returning a
# character token: the replacement character belongs to the comment's data
# (as in the comment start dash / comment / comment end states), not to the
# surrounding text.
# NOTE(review): the input pre-processing already rewrites U+0000 to U+FFFD
# before tokenizing, so this branch is presumably unreachable today -- confirm.
3977 tok_state_comment_start = ->
3978 switch c = txt.charAt(cur++)
3980 tok_state = tok_state_comment_start_dash
3983 tok_state = tok_state_comment
3984 tok_cur_tag.text += "\ufffd"
3987 tok_state = tok_state_data
3991 tok_state = tok_state_data
3992 cur -= 1 # Reconsume
3995 tok_cur_tag.text += c
3996 tok_state = tok_state_comment
3999 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Comment start dash state handler. Visible outcomes: move to the comment
# end state; append "-" + U+FFFD to the comment text and move to the comment
# state; switch to the data state (one branch reconsuming); or append "-"
# plus the character and move to the comment state. The extra "-" is the
# dash that led into this state.
4000 tok_state_comment_start_dash = ->
4001 switch c = txt.charAt(cur++)
4003 tok_state = tok_state_comment_end
4006 tok_cur_tag.text += "-\ufffd"
4007 tok_state = tok_state_comment
4010 tok_state = tok_state_data
4014 tok_state = tok_state_data
4015 cur -= 1 # Reconsume
4018 tok_cur_tag.text += "-#{c}"
4019 tok_state = tok_state_comment
4022 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Comment state handler. Visible outcomes: move to the comment end dash
# state; append U+FFFD to the comment text; switch to the data state and
# reconsume; or append the character to the comment text.
4023 tok_state_comment = ->
4024 switch c = txt.charAt(cur++)
4026 tok_state = tok_state_comment_end_dash
4029 tok_cur_tag.text += "\ufffd"
4032 tok_state = tok_state_data
4033 cur -= 1 # Reconsume
4036 tok_cur_tag.text += c
4039 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Comment end dash state handler. Visible outcomes: move to the comment end
# state; append "-" + U+FFFD and return to the comment state; switch to the
# data state and reconsume; or append "-" plus the character and return to
# the comment state. The "-" prefix is the pending dash that led here.
4040 tok_state_comment_end_dash = ->
4041 switch c = txt.charAt(cur++)
4043 tok_state = tok_state_comment_end
4046 tok_cur_tag.text += "-\ufffd"
4047 tok_state = tok_state_comment
4050 tok_state = tok_state_data
4051 cur -= 1 # Reconsume
4054 tok_cur_tag.text += "-#{c}"
4055 tok_state = tok_state_comment
4058 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Comment end state handler (two dashes already seen). Visible outcomes:
# switch to the data state; append "--" + U+FFFD and return to the comment
# state; move to the comment end bang state; append a single '-' and stay
# here; switch to the data state and reconsume; or append "--" plus the
# character and return to the comment state.
4059 tok_state_comment_end = ->
4060 switch c = txt.charAt(cur++)
4062 tok_state = tok_state_data
4066 tok_cur_tag.text += "--\ufffd"
4067 tok_state = tok_state_comment
4070 tok_state = tok_state_comment_end_bang
4073 tok_cur_tag.text += '-'
4076 tok_state = tok_state_data
4077 cur -= 1 # Reconsume
4081 tok_cur_tag.text += "--#{c}"
4082 tok_state = tok_state_comment
4085 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Comment end bang state handler ("--!" already seen). Visible outcomes:
# append "--!" and move to the comment end dash state; switch to the data
# state; append "--!" + U+FFFD and return to the comment state; switch to
# the data state and reconsume; or append "--!" plus the character and
# return to the comment state.
#
# The branch that moves to the comment end dash state appends only "--!":
# the dash that triggered it is accounted for by the comment end dash state
# itself (which either closes the comment or appends "-" + next character).
# Appending the character here as well would put one '-' too many into the
# comment text.
4086 tok_state_comment_end_bang = ->
4087 switch c = txt.charAt(cur++)
4089 tok_cur_tag.text += "--!"
4090 tok_state = tok_state_comment_end_dash
4092 tok_state = tok_state_data
4096 tok_cur_tag.text += "--!\ufffd"
4097 tok_state = tok_state_comment
4100 tok_state = tok_state_data
4101 cur -= 1 # Reconsume
4104 tok_cur_tag.text += "--!#{c}"
4105 tok_state = tok_state_comment
4108 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# DOCTYPE state handler. Whitespace moves to the before DOCTYPE name state.
# One branch switches to the data state, creates an empty doctype token with
# the 'force-quirks' flag set and reconsumes the character. The last branch
# moves to the before DOCTYPE name state and reconsumes the character.
4109 tok_state_doctype = ->
4110 switch c = txt.charAt(cur++)
4111 when "\t", "\u000a", "\u000c", ' '
4112 tok_state = tok_state_before_doctype_name
4115 tok_state = tok_state_data
4116 el = new_doctype_token ''
4117 el.flag 'force-quirks', true
4118 cur -= 1 # Reconsume
4122 tok_state = tok_state_before_doctype_name
4123 cur -= 1 # Reconsume
4126 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Before DOCTYPE name state handler. Visible outcomes: start a doctype token
# in tok_cur_tag named with the lowercased character, U+FFFD, or the raw
# character, and move to the DOCTYPE name state; or create an empty doctype
# token with 'force-quirks' set and switch to the data state (one of those
# two branches reconsumes the character).
4127 tok_state_before_doctype_name = ->
4128 c = txt.charAt(cur++)
4129 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4132 tok_cur_tag = new_doctype_token c.toLowerCase()
4133 tok_state = tok_state_doctype_name
4137 tok_cur_tag = new_doctype_token "\ufffd"
4138 tok_state = tok_state_doctype_name
4142 el = new_doctype_token ''
4143 el.flag 'force-quirks', true
4144 tok_state = tok_state_data
4148 tok_state = tok_state_data
4149 el = new_doctype_token ''
4150 el.flag 'force-quirks', true
4151 cur -= 1 # Reconsume
4154 tok_cur_tag = new_doctype_token c
4155 tok_state = tok_state_doctype_name
4158 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# DOCTYPE name state handler. Whitespace ends the name and moves to the
# after DOCTYPE name state. Other visible outcomes: switch to the data
# state; append the lowercased character, U+FFFD, or the raw character to
# tok_cur_tag.name; or switch to the data state with 'force-quirks' set and
# the character reconsumed.
4159 tok_state_doctype_name = ->
4160 c = txt.charAt(cur++)
4161 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4162 tok_state = tok_state_after_doctype_name
4165 tok_state = tok_state_data
4168 tok_cur_tag.name += c.toLowerCase()
4172 tok_cur_tag.name += "\ufffd"
4176 tok_state = tok_state_data
4177 tok_cur_tag.flag 'force-quirks', true
4178 cur -= 1 # Reconsume
4181 tok_cur_tag.name += c
4184 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# After DOCTYPE name state handler. Visible outcomes: switch to the data
# state (one branch also sets 'force-quirks' and reconsumes); recognise the
# keywords 'public' / 'system' case-insensitively, starting at the character
# just consumed (hence cur - 1), and move to the matching after-keyword
# state; or set 'force-quirks' and move to the bogus DOCTYPE state.
4185 tok_state_after_doctype_name = ->
4186 c = txt.charAt(cur++)
4187 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4190 tok_state = tok_state_data
4194 tok_state = tok_state_data
4195 tok_cur_tag.flag 'force-quirks', true
4196 cur -= 1 # Reconsume
4199 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4201 tok_state = tok_state_after_doctype_public_keyword
4203 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4205 tok_state = tok_state_after_doctype_system_keyword
4208 tok_cur_tag.flag 'force-quirks', true
4209 tok_state = tok_state_bogus_doctype
4212 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# After DOCTYPE public keyword state handler. Whitespace moves to the before
# public identifier state. Two branches reset tok_cur_tag.public_identifier
# to '' and enter the double- / single-quoted public identifier states. The
# remaining branches set 'force-quirks' and go to the data state (one of
# them reconsuming) or to the bogus DOCTYPE state.
4213 tok_state_after_doctype_public_keyword = ->
4214 c = txt.charAt(cur++)
4215 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4216 tok_state = tok_state_before_doctype_public_identifier
4220 tok_cur_tag.public_identifier = ''
4221 tok_state = tok_state_doctype_public_identifier_double_quoted
4225 tok_cur_tag.public_identifier = ''
4226 tok_state = tok_state_doctype_public_identifier_single_quoted
4230 tok_cur_tag.flag 'force-quirks', true
4231 tok_state = tok_state_data
4235 tok_state = tok_state_data
4236 tok_cur_tag.flag 'force-quirks', true
4237 cur -= 1 # Reconsume
4241 tok_cur_tag.flag 'force-quirks', true
4242 tok_state = tok_state_bogus_doctype
4245 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Before DOCTYPE public identifier state handler. Two branches reset
# tok_cur_tag.public_identifier to '' and enter the double- / single-quoted
# public identifier states. The remaining visible branches set
# 'force-quirks' and go to the data state (one of them reconsuming) or to
# the bogus DOCTYPE state.
4246 tok_state_before_doctype_public_identifier = ->
4247 c = txt.charAt(cur++)
4248 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4252 tok_cur_tag.public_identifier = ''
4253 tok_state = tok_state_doctype_public_identifier_double_quoted
4257 tok_cur_tag.public_identifier = ''
4258 tok_state = tok_state_doctype_public_identifier_single_quoted
4262 tok_cur_tag.flag 'force-quirks', true
4263 tok_state = tok_state_data
4267 tok_state = tok_state_data
4268 tok_cur_tag.flag 'force-quirks', true
4269 cur -= 1 # Reconsume
4273 tok_cur_tag.flag 'force-quirks', true
4274 tok_state = tok_state_bogus_doctype
4278 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# DOCTYPE public identifier (double-quoted) state handler. Visible outcomes:
# move to the after public identifier state; append U+FFFD or the raw
# character to tok_cur_tag.public_identifier; or set 'force-quirks' and
# switch to the data state (one of those two branches reconsumes).
4279 tok_state_doctype_public_identifier_double_quoted = ->
4280 c = txt.charAt(cur++)
4282 tok_state = tok_state_after_doctype_public_identifier
4286 tok_cur_tag.public_identifier += "\ufffd"
4290 tok_cur_tag.flag 'force-quirks', true
4291 tok_state = tok_state_data
4295 tok_state = tok_state_data
4296 tok_cur_tag.flag 'force-quirks', true
4297 cur -= 1 # Reconsume
4300 tok_cur_tag.public_identifier += c
4303 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# DOCTYPE public identifier (single-quoted) state handler. Visible outcomes:
# move to the after public identifier state; append U+FFFD or the raw
# character to tok_cur_tag.public_identifier; or set 'force-quirks' and
# switch to the data state (one of those two branches reconsumes).
4304 tok_state_doctype_public_identifier_single_quoted = ->
4305 c = txt.charAt(cur++)
4307 tok_state = tok_state_after_doctype_public_identifier
4311 tok_cur_tag.public_identifier += "\ufffd"
4315 tok_cur_tag.flag 'force-quirks', true
4316 tok_state = tok_state_data
4320 tok_state = tok_state_data
4321 tok_cur_tag.flag 'force-quirks', true
4322 cur -= 1 # Reconsume
4325 tok_cur_tag.public_identifier += c
4328 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# After DOCTYPE public identifier state handler. Whitespace moves to the
# between public and system identifiers state. Other visible outcomes:
# switch to the data state; reset tok_cur_tag.system_identifier to '' and
# enter the double- / single-quoted system identifier states; switch to the
# data state with 'force-quirks' set and the character reconsumed; or set
# 'force-quirks' and move to the bogus DOCTYPE state.
4329 tok_state_after_doctype_public_identifier = ->
4330 c = txt.charAt(cur++)
4331 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4332 tok_state = tok_state_between_doctype_public_and_system_identifiers
4335 tok_state = tok_state_data
4339 tok_cur_tag.system_identifier = ''
4340 tok_state = tok_state_doctype_system_identifier_double_quoted
4344 tok_cur_tag.system_identifier = ''
4345 tok_state = tok_state_doctype_system_identifier_single_quoted
4349 tok_state = tok_state_data
4350 tok_cur_tag.flag 'force-quirks', true
4351 cur -= 1 # Reconsume
4355 tok_cur_tag.flag 'force-quirks', true
4356 tok_state = tok_state_bogus_doctype
4359 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Between DOCTYPE public and system identifiers state handler. Visible
# outcomes: switch to the data state; reset tok_cur_tag.system_identifier to
# '' and enter the double- / single-quoted system identifier states; switch
# to the data state with 'force-quirks' set and the character reconsumed; or
# set 'force-quirks' and move to the bogus DOCTYPE state.
4360 tok_state_between_doctype_public_and_system_identifiers = ->
4361 c = txt.charAt(cur++)
4362 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4365 tok_state = tok_state_data
4369 tok_cur_tag.system_identifier = ''
4370 tok_state = tok_state_doctype_system_identifier_double_quoted
4374 tok_cur_tag.system_identifier = ''
4375 tok_state = tok_state_doctype_system_identifier_single_quoted
4379 tok_state = tok_state_data
4380 tok_cur_tag.flag 'force-quirks', true
4381 cur -= 1 # Reconsume
4385 tok_cur_tag.flag 'force-quirks', true
4386 tok_state = tok_state_bogus_doctype
4389 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# After DOCTYPE system keyword state handler. Whitespace moves to the before
# system identifier state. Two branches reset tok_cur_tag.system_identifier
# to '' and enter the double- / single-quoted system identifier states. The
# remaining branches set 'force-quirks' and go to the data state (one of
# them reconsuming) or to the bogus DOCTYPE state.
4390 tok_state_after_doctype_system_keyword = ->
4391 c = txt.charAt(cur++)
4392 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4393 tok_state = tok_state_before_doctype_system_identifier
4397 tok_cur_tag.system_identifier = ''
4398 tok_state = tok_state_doctype_system_identifier_double_quoted
4402 tok_cur_tag.system_identifier = ''
4403 tok_state = tok_state_doctype_system_identifier_single_quoted
4407 tok_cur_tag.flag 'force-quirks', true
4408 tok_state = tok_state_data
4412 tok_state = tok_state_data
4413 tok_cur_tag.flag 'force-quirks', true
4414 cur -= 1 # Reconsume
4418 tok_cur_tag.flag 'force-quirks', true
4419 tok_state = tok_state_bogus_doctype
4422 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Before DOCTYPE system identifier state handler. Two branches reset
# tok_cur_tag.system_identifier to '' and enter the double- / single-quoted
# system identifier states. The remaining visible branches set
# 'force-quirks' and go to the data state (one of them reconsuming) or to
# the bogus DOCTYPE state.
4423 tok_state_before_doctype_system_identifier = ->
4424 c = txt.charAt(cur++)
4425 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4428 tok_cur_tag.system_identifier = ''
4429 tok_state = tok_state_doctype_system_identifier_double_quoted
4432 tok_cur_tag.system_identifier = ''
4433 tok_state = tok_state_doctype_system_identifier_single_quoted
4437 tok_cur_tag.flag 'force-quirks', true
4438 tok_state = tok_state_data
4442 tok_state = tok_state_data
4443 tok_cur_tag.flag 'force-quirks', true
4444 cur -= 1 # Reconsume
4448 tok_cur_tag.flag 'force-quirks', true
4449 tok_state = tok_state_bogus_doctype
4452 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# DOCTYPE system identifier (double-quoted) state handler. Visible outcomes:
# move to the after system identifier state; append U+FFFD or the raw
# character to tok_cur_tag.system_identifier; or set 'force-quirks' and
# switch to the data state (one of those two branches reconsumes).
4453 tok_state_doctype_system_identifier_double_quoted = ->
4454 c = txt.charAt(cur++)
4456 tok_state = tok_state_after_doctype_system_identifier
4460 tok_cur_tag.system_identifier += "\ufffd"
4464 tok_cur_tag.flag 'force-quirks', true
4465 tok_state = tok_state_data
4469 tok_state = tok_state_data
4470 tok_cur_tag.flag 'force-quirks', true
4471 cur -= 1 # Reconsume
4474 tok_cur_tag.system_identifier += c
4477 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# DOCTYPE system identifier (single-quoted) state handler. Visible outcomes:
# move to the after system identifier state; append U+FFFD or the raw
# character to tok_cur_tag.system_identifier; or set 'force-quirks' and
# switch to the data state (one of those two branches reconsumes).
4478 tok_state_doctype_system_identifier_single_quoted = ->
4479 c = txt.charAt(cur++)
4481 tok_state = tok_state_after_doctype_system_identifier
4485 tok_cur_tag.system_identifier += "\ufffd"
4489 tok_cur_tag.flag 'force-quirks', true
4490 tok_state = tok_state_data
4494 tok_state = tok_state_data
4495 tok_cur_tag.flag 'force-quirks', true
4496 cur -= 1 # Reconsume
4499 tok_cur_tag.system_identifier += c
4502 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# After DOCTYPE system identifier state handler. Visible outcomes: switch to
# the data state; switch to the data state with 'force-quirks' set and the
# character reconsumed; or move to the bogus DOCTYPE state -- deliberately
# without setting 'force-quirks' in that last case.
4503 tok_state_after_doctype_system_identifier = ->
4504 c = txt.charAt(cur++)
4505 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4508 tok_state = tok_state_data
4512 tok_state = tok_state_data
4513 tok_cur_tag.flag 'force-quirks', true
4514 cur -= 1 # Reconsume
4518 # do _not_ tok_cur_tag.flag 'force-quirks', true
4519 tok_state = tok_state_bogus_doctype
4522 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Bogus DOCTYPE state handler. Consumes one character; both visible branches
# switch to the data state, the second one also reconsuming the character.
4523 tok_state_bogus_doctype = ->
4524 c = txt.charAt(cur++)
4526 tok_state = tok_state_data
4529 tok_state = tok_state_data
4530 cur -= 1 # Reconsume
4535 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# CDATA section state handler. Switches back to the data state, then takes
# the text from cur up to the next ']]>' (or the rest of txt in the other
# branch) and returns it as one character token.
# NOTE(review): next_gt holds the index of ']]>' here, not of a lone '>'.
4536 tok_state_cdata_section = ->
4537 tok_state = tok_state_data
4538 next_gt = txt.indexOf ']]>', cur
4540 val = txt.substr cur
4543 val = txt.substr cur, (next_gt - cur)
4546 return new_character_token val # fixfull split
4549 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4550 # Don't set this as a state, just call it
4551 # returns a string (NOT a text node)
# Consumes a character reference at cur (the text following an '&').
#   allowed_char -- optional extra character that, like whitespace, '<', '&'
#                   and end of input, means "this is not a character
#                   reference" (callers pass the attribute's closing
#                   delimiter)
#   in_attr      -- true when called while tokenizing an attribute value;
#                   presumably enables the legacy-reference exceptions below
#                   for a following '=' or alphanumeric -- confirm
# Visible processing:
#   * numeric references: the digits are collected from a base-specific
#     charset ('x' / 'X' after the first character selects the other base),
#     an optional ';' is checked for, leading zeros are stripped and the
#     rest is parsed with parseInt; unicode_fixes overrides are applied
#     first, then surrogates / values above 0x10FFFF and the listed
#     control / noncharacter code points are singled out before the code
#     point is converted with from_code_point.
#   * named references: a run of alphanumerics, optionally followed by ';',
#     is looked up with decode_named_char_ref; without ';' only entries of
#     legacy_char_refs are considered, shortest prefix first.
# NOTE(review): the return statements of most branches are not visible in
# this listing.
4552 parse_character_reference = (allowed_char = null, in_attr = false) ->
4553 if cur >= txt.length
4555 switch c = txt.charAt(cur)
4556 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4557 # explicitly not a parse error
4560 # there has to be "one or more" alnums between & and ; to be a parse error
4563 if cur + 1 >= txt.length
4565 if txt.charAt(cur + 1).toLowerCase() is 'x'
4574 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4579 if txt.charAt(start + i) is ';'
4583 code_point = txt.substr(start, i)
4584 while code_point.charAt(0) is '0' and code_point.length > 1
4585 code_point = code_point.substr 1
4586 code_point = parseInt(code_point, base)
4587 if unicode_fixes[code_point]?
4589 return unicode_fixes[code_point]
4591 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4595 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4597 return from_code_point code_point
4601 if alnum.indexOf(txt.charAt(cur + i)) is -1
4604 # exit early, because parse_error() below needs at least one alnum
4606 if txt.charAt(cur + i) is ';'
4607 i += 1 # include ';' terminator in value
4608 decoded = decode_named_char_ref txt.substr(cur, i)
4615 # no ';' terminator (only legacy char refs)
4617 for i in [2..max] # no prefix matches, so ok to check shortest first
4618 c = legacy_char_refs[txt.substr(cur, i)]
4621 if txt.charAt(cur + i) is '='
4622 # "because some legacy user agents will
4623 # misinterpret the markup in those cases"
4626 if alnum.indexOf(txt.charAt(cur + i)) > -1
4627 # this makes attributes forgiving about url args
4629 # ok, and besides the weird exceptions for attributes...
4630 # return the matching char
4631 cur += i # consume entity chars
4632 parse_error() # because no terminating ";"
4636 return # never reached
# eat_next_token_if_newline: inspects a token `t` and, when it is a text token,
# tests whether its text is a newline. What counts as a newline depends on
# how many input characters the token consumed (cur - old_cur).
# NOTE(review): the lines that obtain `t`, record `old_cur`, and act on the
# outcome of these tests are not visible in this excerpt -- confirm against
# the full file before relying on this summary.
4638 eat_next_token_if_newline = ->
# tokens of any other type are never treated as a newline by the checks below
4643 if t.type is TYPE_TEXT
4644 # definition of a newline depends on whether it was a character ref or not
# exactly one input character lies between old_cur and cur
4645 if cur - old_cur is 1
4646 # not a character reference
# a literal U+000D (CR) or U+000A (LF) both qualify
4647 if t.text is "\u000d" or t.text is "\u000a"
# presumably the character-reference case (its branch header is not visible
# here -- TODO confirm): only U+000A (LF) is tested
4650 if t.text is "\u000a"
4656 # tree constructor initialization
4657 # see comments on TYPE_TAG/etc for the structure of this data
# doc: a tag node named 'html' in the HTML namespace, flagged as "no quirks"
4660 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4661 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4663 afe = [] # active formatting elements
# template_ins_modes starts empty; ins_mode starts as ins_mode_initial and
# original_ins_mode is seeded with that same value
4664 template_ins_modes = []
4665 ins_mode = ins_mode_initial
4666 original_ins_mode = ins_mode # TODO check spec
# `?` is CoffeeScript's existential operator: scripting is treated as enabled
# (true) unless the caller supplied a non-null args.scripting
4667 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4668 flag_frameset_ok = true
4670 flag_foster_parenting = false
# element pointers and buffers start out unset/empty
4671 form_element_pointer = null
4672 temporary_buffer = null
4673 pending_table_character_tokens = []
4674 head_element_pointer = null
4675 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4676 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4677 prev_node_id = 0 # just for debugging
4679 # tokenizer initialization
# the tokenizer's state variable starts as tok_state_data
4680 tok_state = tok_state_data
4682 # text pre-processing
4683 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# every U+0000 (NUL) in the input becomes U+FFFD (replacement character)
4684 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
# newline normalization: CRLF pairs are collapsed first so each yields a single
# LF; any CR still remaining after that is a lone CR and is also mapped to LF
4685 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4686 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# NOTE(review): special-cases one named test input; the body of this branch is
# not visible in this excerpt -- looks like leftover debugging, confirm
4688 if args.name is "tests23.dat #1"
4691 # http://www.w3.org/TR/html5/syntax.html#tree-construction
# parse_main_loop: takes no arguments.
# NOTE(review): only the header and one comment of this function are visible
# in this excerpt; the loop body itself is elided -- consult the full file.
4692 parse_main_loop = ->
4697 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# serialize_els: accumulates text into `serialized` by appending the result of
# t.serialize(shallow, show_ids); `shallow` and `show_ids` are passed through
# unchanged to each serialize call.
# NOTE(review): the initialisation of `serialized`, the iteration that binds
# `t` (presumably over `els` -- TODO confirm), and the return are not visible
# in this excerpt.
4701 serialize_els = (els, shallow, show_ids) ->
4707 serialized += t.serialize shallow, show_ids
# Public API: attach the parser entry point, the debug-log helpers, and the
# node-type / namespace / quirks-mode constants to module.exports.
# When no module.exports exists (the file header handles this case), `module`
# is a local stand-in whose exports object is window.wheic.
# entry point and debug helpers
4710 module.exports.parse_html = parse_html
4711 module.exports.debug_log_reset = debug_log_reset
4712 module.exports.debug_log_each = debug_log_each
# node type constants
4713 module.exports.TYPE_TAG = TYPE_TAG
4714 module.exports.TYPE_TEXT = TYPE_TEXT
4715 module.exports.TYPE_COMMENT = TYPE_COMMENT
4716 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
# namespace constants
4717 module.exports.NS_HTML = NS_HTML
4718 module.exports.NS_MATHML = NS_MATHML
4719 module.exports.NS_SVG = NS_SVG
# quirks-mode constants (QUIRKS_NO is the value flagged on `doc` at parser init)
4720 module.exports.QUIRKS_NO = QUIRKS_NO
4721 module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4722 module.exports.QUIRKS_YES = QUIRKS_YES