1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WHATWG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
34 # the spec uses many different words to indicate which ends of lists/stacks
35 # they are talking about (and relative movement within the lists/stacks). This
36 # section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
61 from_code_point = (x) ->
62 if String.fromCodePoint?
63 return String.fromCodePoint x
66 return String.fromCharCode x
68 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
70 # Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
75 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
87 # quirks mode constants
97 debug_log_each = (cb) ->
98 for str in g_debug_log
103 constructor: (type, args = {}) ->
104 @type = type # one of the TYPE_* constants above
105 @name = args.name ? '' # tag name
106 @text = args.text ? '' # contents for text/comment nodes
107 @attrs = args.attrs ? {}
108 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
109 @children = args.children ? []
110 @namespace = args.namespace ? NS_HTML
111 @parent = args.parent ? null
112 @token = args.token ? null
113 @flags = args.flags ? {}
117 @id = "#{++prev_node_id}"
118 acknowledge_self_closing: ->
120 @token.flag 'did_self_close', true
122 @flag 'did_self_close', true
123 flag: (key, value = null) ->
128 serialize: (shallow = false, show_ids = false) -> # for unit tests
133 ret += JSON.stringify @name
148 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
154 ret += c.serialize shallow, show_ids
158 ret += JSON.stringify @text
161 ret += JSON.stringify @text
163 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
166 when TYPE_AAA_BOOKMARK
167 ret += 'aaa_bookmark'
170 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
173 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token: a Node of type TYPE_START_TAG that carries only
# the tag name. Every other field keeps the default the Node constructor
# assigns.
174 new_open_tag = (name) ->
175 return new Node TYPE_START_TAG, name: name
# Build an end-tag token: a Node of type TYPE_END_TAG that carries only the
# tag name.
176 new_end_tag = (name) ->
177 return new Node TYPE_END_TAG, name: name
# Build an element: a Node of type TYPE_TAG with the given tag name. No
# attributes, children or namespace are passed, so those fall back to the
# Node constructor defaults.
178 new_element = (name) ->
179 return new Node TYPE_TAG, name: name
# Build a Node of type TYPE_TEXT whose text is txt.
180 new_text_node = (txt) ->
181 return new Node TYPE_TEXT, text: txt
# Character tokens are represented as TYPE_TEXT Nodes too, so this is an
# alias of new_text_node rather than a separate constructor.
182 new_character_token = new_text_node
# Build a comment token: a Node of type TYPE_COMMENT whose text is txt.
183 new_comment_token = (txt) ->
184 return new Node TYPE_COMMENT, text: txt
# Build a doctype token: a Node of type TYPE_DOCTYPE with the given name.
# Only the name is passed in here.
185 new_doctype_token = (name) ->
186 return new Node TYPE_DOCTYPE, name: name
188 return new Node TYPE_EOF
190 return new Node TYPE_AFE_MARKER
# Build a Node of type TYPE_AAA_BOOKMARK (the placeholder type declared for
# the adoption agency algorithm). Takes no arguments, so every field keeps
# its Node constructor default.
191 new_aaa_bookmark = ->
192 return new Node TYPE_AAA_BOOKMARK
# Character-class strings. Membership is tested with indexOf on these
# strings (see is_uc_alpha / is_lc_alpha).
#   lc_alpha  - ASCII lowercase letters
#   uc_alpha  - ASCII uppercase letters
#   digits    - ASCII decimal digits
#   alnum     - all of the above concatenated
#   hex_chars - decimal digits plus a-f in both cases
194 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
195 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
196 digits = "0123456789"
197 alnum = lc_alpha + uc_alpha + digits
198 hex_chars = digits + "abcdefABCDEF"
# True only when str is exactly one character long and that character is an
# ASCII uppercase letter. The length check is needed because indexOf alone
# would also accept multi-character runs such as "AB" (and the empty string).
200 is_uc_alpha = (str) ->
201 return str.length is 1 and uc_alpha.indexOf(str) > -1
# True only when str is exactly one character long and that character is an
# ASCII lowercase letter. The length check is needed because indexOf alone
# would also accept multi-character runs such as "ab" (and the empty string).
202 is_lc_alpha = (str) ->
203 return str.length is 1 and lc_alpha.indexOf(str) > -1
205 # some SVG elements have dashes in them
206 tag_name_chars = alnum + "-"
208 # http://www.w3.org/TR/html5/infrastructure.html#space-character
209 space_chars = "\u0009\u000a\u000c\u000d\u0020"
211 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a TYPE_TEXT node whose text is exactly one character and
# that character is one of space_chars. Any other node type, or text of any
# other length, yields false.
212 is_space_tok = (t) ->
213 return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
215 is_input_hidden_tok = (t) ->
216 return false unless t.type is TYPE_START_TAG
219 if a[1].toLowerCase() is 'hidden'
224 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
225 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
228 unicode_fixes[0x00] = "\uFFFD"
229 unicode_fixes[0x80] = "\u20AC"
230 unicode_fixes[0x82] = "\u201A"
231 unicode_fixes[0x83] = "\u0192"
232 unicode_fixes[0x84] = "\u201E"
233 unicode_fixes[0x85] = "\u2026"
234 unicode_fixes[0x86] = "\u2020"
235 unicode_fixes[0x87] = "\u2021"
236 unicode_fixes[0x88] = "\u02C6"
237 unicode_fixes[0x89] = "\u2030"
238 unicode_fixes[0x8A] = "\u0160"
239 unicode_fixes[0x8B] = "\u2039"
240 unicode_fixes[0x8C] = "\u0152"
241 unicode_fixes[0x8E] = "\u017D"
242 unicode_fixes[0x91] = "\u2018"
243 unicode_fixes[0x92] = "\u2019"
244 unicode_fixes[0x93] = "\u201C"
245 unicode_fixes[0x94] = "\u201D"
246 unicode_fixes[0x95] = "\u2022"
247 unicode_fixes[0x96] = "\u2013"
248 unicode_fixes[0x97] = "\u2014"
249 unicode_fixes[0x98] = "\u02DC"
250 unicode_fixes[0x99] = "\u2122"
251 unicode_fixes[0x9A] = "\u0161"
252 unicode_fixes[0x9B] = "\u203A"
253 unicode_fixes[0x9C] = "\u0153"
254 unicode_fixes[0x9E] = "\u017E"
255 unicode_fixes[0x9F] = "\u0178"
257 quirks_yes_pi_prefixes = [
258 "+//silmaril//dtd html pro v0r11 19970101//"
259 "-//as//dtd html 3.0 aswedit + extensions//"
260 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
261 "-//ietf//dtd html 2.0 level 1//"
262 "-//ietf//dtd html 2.0 level 2//"
263 "-//ietf//dtd html 2.0 strict level 1//"
264 "-//ietf//dtd html 2.0 strict level 2//"
265 "-//ietf//dtd html 2.0 strict//"
266 "-//ietf//dtd html 2.0//"
267 "-//ietf//dtd html 2.1e//"
268 "-//ietf//dtd html 3.0//"
269 "-//ietf//dtd html 3.2 final//"
270 "-//ietf//dtd html 3.2//"
271 "-//ietf//dtd html 3//"
272 "-//ietf//dtd html level 0//"
273 "-//ietf//dtd html level 1//"
274 "-//ietf//dtd html level 2//"
275 "-//ietf//dtd html level 3//"
276 "-//ietf//dtd html strict level 0//"
277 "-//ietf//dtd html strict level 1//"
278 "-//ietf//dtd html strict level 2//"
279 "-//ietf//dtd html strict level 3//"
280 "-//ietf//dtd html strict//"
281 "-//ietf//dtd html//"
282 "-//metrius//dtd metrius presentational//"
283 "-//microsoft//dtd internet explorer 2.0 html strict//"
284 "-//microsoft//dtd internet explorer 2.0 html//"
285 "-//microsoft//dtd internet explorer 2.0 tables//"
286 "-//microsoft//dtd internet explorer 3.0 html strict//"
287 "-//microsoft//dtd internet explorer 3.0 html//"
288 "-//microsoft//dtd internet explorer 3.0 tables//"
289 "-//netscape comm. corp.//dtd html//"
290 "-//netscape comm. corp.//dtd strict html//"
291 "-//o'reilly and associates//dtd html 2.0//"
292 "-//o'reilly and associates//dtd html extended 1.0//"
293 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
294 "-//sq//dtd html 2.0 hotmetal + extensions//"
295 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
296 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
297 "-//spyglass//dtd html 2.0 extended//"
298 "-//sun microsystems corp.//dtd hotjava html//"
299 "-//sun microsystems corp.//dtd hotjava strict html//"
300 "-//w3c//dtd html 3 1995-03-24//"
301 "-//w3c//dtd html 3.2 draft//"
302 "-//w3c//dtd html 3.2 final//"
303 "-//w3c//dtd html 3.2//"
304 "-//w3c//dtd html 3.2s draft//"
305 "-//w3c//dtd html 4.0 frameset//"
306 "-//w3c//dtd html 4.0 transitional//"
307 "-//w3c//dtd html experimental 19960712//"
308 "-//w3c//dtd html experimental 970421//"
309 "-//w3c//dtd w3 html//"
310 "-//w3o//dtd w3 html 3.0//"
311 "-//webtechs//dtd mozilla html 2.0//"
312 "-//webtechs//dtd mozilla html//"
315 # These are the character references that don't need a terminating semicolon
316 # min length: 2, max: 6, none are a prefix of any other.
318 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
319 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
320 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
321 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
322 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
323 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
324 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
325 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
326 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
327 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
328 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
329 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
330 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
331 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
332 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
333 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
334 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists, one per element category named by the variable.
# NOTE(review): no use of these three lists is visible in this part of the
# file -- confirm where (or whether) they are consulted before editing them.
338 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
339 raw_text_elements = ['script', 'style']
340 escapable_raw_text_elements = ['textarea', 'title']
341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
343 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
344 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
345 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
346 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
347 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
348 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
349 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
350 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
351 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
352 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
353 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
354 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
355 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
356 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
362 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
363 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
364 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
365 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
366 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
367 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
368 'determinant', 'diff', 'divergence', 'divide', 'domain',
369 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
370 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
371 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
372 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
373 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
374 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
375 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
376 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
377 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
378 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
379 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
380 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
381 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
382 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
383 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
384 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
385 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
386 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
387 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
388 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
389 'vectorproduct', 'xor'
391 # foreign_elements = [svg_elements..., mathml_elements...]
392 #normal_elements = All other allowed HTML elements are normal elements.
396 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
397 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
398 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
399 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
400 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
401 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
402 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
403 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
404 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
405 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
406 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
408 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
410 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
411 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
412 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
413 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
414 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
415 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
416 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
419 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
420 'annotation-xml':NS_MATHML,
423 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
426 formatting_elements = {
427 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
428 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
432 mathml_text_integration = {
433 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el.name is a key of mathml_text_integration and el.namespace is
# the namespace stored for that key. A same-named element in any other
# namespace does not match.
435 is_mathml_text_integration_point = (el) ->
436 return mathml_text_integration[el.name] is el.namespace
437 is_html_integration = (el) -> # DON'T PASS A TOKEN
438 if el.namespace is NS_MATHML
439 if el.name is 'annotation-xml'
440 if el.attrs.encoding?
441 if el.attrs.encoding.toLowerCase() is 'text/html'
443 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
446 if el.namespace is NS_SVG
447 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
452 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
455 foster_parenting_targets = {
# True when special_elements has an entry for e.name and that entry equals
# e.namespace. Comparing against the stored namespace (rather than just
# testing for the key) keeps a same-named element from another namespace
# from matching.
476 el_is_special = (e) ->
477 return special_elements[e.name] is e.namespace
# "adp" = address, div, p (all in the HTML namespace).
479 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
# Same test as el_is_special, except that HTML address, div and p elements
# are excluded: true only when el is listed in special_elements for its
# namespace AND is not listed in adp_els for its namespace.
480 el_is_special_not_adp = (el) ->
481 return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
485 altglyphdef: 'altGlyphDef'
486 altglyphitem: 'altGlyphItem'
487 animatecolor: 'animateColor'
488 animatemotion: 'animateMotion'
489 animatetransform: 'animateTransform'
492 fecolormatrix: 'feColorMatrix'
493 fecomponenttransfer: 'feComponentTransfer'
494 fecomposite: 'feComposite'
495 feconvolvematrix: 'feConvolveMatrix'
496 fediffuselighting: 'feDiffuseLighting'
497 fedisplacementmap: 'feDisplacementMap'
498 fedistantlight: 'feDistantLight'
499 fedropshadow: 'feDropShadow'
505 fegaussianblur: 'feGaussianBlur'
508 femergenode: 'feMergeNode'
509 femorphology: 'feMorphology'
511 fepointlight: 'fePointLight'
512 fespecularlighting: 'feSpecularLighting'
513 fespotlight: 'feSpotLight'
515 feturbulence: 'feTurbulence'
516 foreignobject: 'foreignObject'
518 lineargradient: 'linearGradient'
519 radialgradient: 'radialGradient'
522 svg_attribute_fixes = {
523 attributename: 'attributeName'
524 attributetype: 'attributeType'
525 basefrequency: 'baseFrequency'
526 baseprofile: 'baseProfile'
528 clippathunits: 'clipPathUnits'
529 contentscripttype: 'contentScriptType'
530 contentstyletype: 'contentStyleType'
531 diffuseconstant: 'diffuseConstant'
533 externalresourcesrequired: 'externalResourcesRequired'
534 # WHATWG removes this: filterres: 'filterRes'
535 filterunits: 'filterUnits'
537 gradienttransform: 'gradientTransform'
538 gradientunits: 'gradientUnits'
539 kernelmatrix: 'kernelMatrix'
540 kernelunitlength: 'kernelUnitLength'
541 keypoints: 'keyPoints'
542 keysplines: 'keySplines'
544 lengthadjust: 'lengthAdjust'
545 limitingconeangle: 'limitingConeAngle'
546 markerheight: 'markerHeight'
547 markerunits: 'markerUnits'
548 markerwidth: 'markerWidth'
549 maskcontentunits: 'maskContentUnits'
550 maskunits: 'maskUnits'
551 numoctaves: 'numOctaves'
552 pathlength: 'pathLength'
553 patterncontentunits: 'patternContentUnits'
554 patterntransform: 'patternTransform'
555 patternunits: 'patternUnits'
556 pointsatx: 'pointsAtX'
557 pointsaty: 'pointsAtY'
558 pointsatz: 'pointsAtZ'
559 preservealpha: 'preserveAlpha'
560 preserveaspectratio: 'preserveAspectRatio'
561 primitiveunits: 'primitiveUnits'
564 repeatcount: 'repeatCount'
565 repeatdur: 'repeatDur'
566 requiredextensions: 'requiredExtensions'
567 requiredfeatures: 'requiredFeatures'
568 specularconstant: 'specularConstant'
569 specularexponent: 'specularExponent'
570 spreadmethod: 'spreadMethod'
571 startoffset: 'startOffset'
572 stddeviation: 'stdDeviation'
573 stitchtiles: 'stitchTiles'
574 surfacescale: 'surfaceScale'
575 systemlanguage: 'systemLanguage'
576 tablevalues: 'tableValues'
579 textlength: 'textLength'
581 viewtarget: 'viewTarget'
582 xchannelselector: 'xChannelSelector'
583 ychannelselector: 'yChannelSelector'
584 zoomandpan: 'zoomAndPan'
586 foreign_attr_fixes = {
587 'xlink:actuate': 'xlink actuate'
588 'xlink:arcrole': 'xlink arcrole'
589 'xlink:href': 'xlink href'
590 'xlink:role': 'xlink role'
591 'xlink:show': 'xlink show'
592 'xlink:title': 'xlink title'
593 'xlink:type': 'xlink type'
594 'xml:base': 'xml base'
595 'xml:lang': 'xml lang'
596 'xml:space': 'xml space'
598 'xmlns:xlink': 'xmlns xlink'
600 adjust_mathml_attributes = (t) ->
602 if a[0] is 'definitionurl'
603 a[0] = 'definitionURL'
605 adjust_svg_attributes = (t) ->
607 if svg_attribute_fixes[a[0]]?
608 a[0] = svg_attribute_fixes[a[0]]
610 adjust_foreign_attributes = (t) ->
613 if foreign_attr_fixes[a[0]]?
614 a[0] = foreign_attr_fixes[a[0]]
617 # decode_named_char_ref()
619 # The list of named character references is _huge_ so ask the browser to decode
620 # for us instead of wasting bandwidth/space on including the table here.
622 # Pass without the "&" but with the ";" examples:
623 # for "&" pass "amp;"
624 # for "′" pass "x2032;"
627 textarea: document.createElement('textarea')
629 # TODO test this in IE8
630 decode_named_char_ref = (txt) ->
632 decoded = g_dncr.cache[txt]
633 return decoded if decoded?
634 g_dncr.textarea.innerHTML = txt
635 decoded = g_dncr.textarea.value
636 return null if decoded is txt
637 return g_dncr.cache[txt] = decoded
639 parse_html = (args) ->
641 cur = null # index of next char in txt to be parsed
642 # declare doc and tokenizer variables so they're in scope below
644 open_els = null # stack of open elements
645 afe = null # active formatting elements
646 template_ins_modes = null
648 original_ins_mode = null
650 tok_cur_tag = null # partially parsed tag
651 flag_scripting = null
652 flag_frameset_ok = null
654 flag_foster_parenting = null
655 form_element_pointer = null
656 temporary_buffer = null
657 pending_table_character_tokens = null
658 head_element_pointer = null
659 flag_fragment_parsing = null
660 context_element = null
669 console.log "Parse error at character #{cur} of #{txt.length}"
671 afe_push = (new_el) ->
674 if el.name is new_el.name and el.namespace is new_el.namespace
676 continue unless new_el.attrs[k] is v
677 for k, v of new_el.attrs
678 continue unless el.attrs[k] is v
685 afe.unshift new_afe_marker()
687 # the functions below implement the Tree Construction algorithm
688 # http://www.w3.org/TR/html5/syntax.html#tree-construction
690 # But first... the helpers
691 template_tag_is_open = ->
693 if el.name is 'template' and el.namespace is NS_HTML
696 is_in_scope_x = (tag_name, scope, namespace) ->
698 if el.name is tag_name and (namespace is null or namespace is el.namespace)
700 if scope[el.name] is el.namespace
703 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
705 if el.name is tag_name and (namespace is null or namespace is el.namespace)
707 if scope[el.name] is el.namespace
709 if scope2[el.name] is el.namespace
713 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
714 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
717 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
718 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
720 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Scope-boundary tables (tag name -> namespace) for the is_in_*_scope
# helpers. button_scopers and li_scopers are passed together with
# standard_scopers (is_in_button_scope / is_in_li_scope), while table_scopers
# is passed on its own (is_in_table_scope).
722 button_scopers = button: NS_HTML
723 li_scopers = ol: NS_HTML, ul: NS_HTML
724 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Scope check for tag_name using standard_scopers as the boundary set;
# delegates to is_in_scope_x. namespace defaults to null, which
# is_in_scope_x treats as "match tag_name in any namespace".
725 is_in_scope = (tag_name, namespace = null) ->
726 return is_in_scope_x tag_name, standard_scopers, namespace
# Scope check for tag_name where the boundary set is standard_scopers plus
# button_scopers; delegates to is_in_scope_x_y. namespace defaults to null,
# which is_in_scope_x_y treats as "match tag_name in any namespace".
727 is_in_button_scope = (tag_name, namespace = null) ->
728 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
# Scope check for tag_name where the boundary set is table_scopers only
# (standard_scopers is NOT included); delegates to is_in_scope_x. namespace
# defaults to null, which is_in_scope_x treats as "match tag_name in any
# namespace".
729 is_in_table_scope = (tag_name, namespace = null) ->
730 return is_in_scope_x tag_name, table_scopers, namespace
# Scope check for tag_name where the boundary set is standard_scopers plus
# li_scopers (ol, ul); delegates to is_in_scope_x_y. namespace defaults to
# null, which is_in_scope_x_y treats as "match tag_name in any namespace".
731 # aka is_in_list_item_scope
732 is_in_li_scope = (tag_name, namespace = null) ->
733 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
734 is_in_select_scope = (tag_name, namespace = null) ->
736 if t.name is tag_name and (namespace is null or namespace is t.namespace)
738 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
741 # this checks for a particular element, not by name
742 # this requires a namespace match
743 el_is_in_scope = (needle) ->
747 if standard_scopers[el.name] is el.namespace
751 clear_to_table_stopers = {
756 clear_stack_to_table_context = ->
758 if clear_to_table_stopers[open_els[0].name]?
762 clear_to_table_body_stopers = {
769 clear_stack_to_table_body_context = ->
771 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
775 clear_to_table_row_stopers = {
780 clear_stack_to_table_row_context = ->
782 if clear_to_table_row_stopers[open_els[0].name]?
786 clear_afe_to_marker = ->
788 return unless afe.length > 0 # this happens in fragment case, ?spec error
790 if el.type is TYPE_AFE_MARKER
795 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
797 # 1. Let last be false.
799 # 2. Let node be the last node in the stack of open elements.
801 node = open_els[node_i]
802 # 3. Loop: If node is the first node in the stack of open elements,
803 # then set last to true, and, if the parser was originally created as
804 # part of the HTML fragment parsing algorithm (fragment case) set node
805 # to the context element.
807 if node_i is open_els.length - 1
809 # fixfull (fragment case)
811 # 4. If node is a select element, run these substeps:
812 if node.name is 'select' and node.namespace is NS_HTML
813 # 1. If last is true, jump to the step below labeled done.
815 # 2. Let ancestor be node.
818 # 3. Loop: If ancestor is the first node in the stack of
819 # open elements, jump to the step below labeled done.
821 if ancestor_i is open_els.length - 1
823 # 4. Let ancestor be the node before ancestor in the stack
826 ancestor = open_els[ancestor_i]
827 # 5. If ancestor is a template node, jump to the step below
829 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
831 # 6. If ancestor is a table node, switch the insertion mode
832 # to "in select in table" and abort these steps.
833 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
834 ins_mode = ins_mode_in_select_in_table
836 # 7. Jump back to the step labeled loop.
837 # 8. Done: Switch the insertion mode to "in select" and abort
839 ins_mode = ins_mode_in_select
841 # 5. If node is a td or th element and last is false, then switch
842 # the insertion mode to "in cell" and abort these steps.
843 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
844 ins_mode = ins_mode_in_cell
846 # 6. If node is a tr element, then switch the insertion mode to "in
847 # row" and abort these steps.
848 if node.name is 'tr' and node.namespace is NS_HTML
849 ins_mode = ins_mode_in_row
851 # 7. If node is a tbody, thead, or tfoot element, then switch the
852 # insertion mode to "in table body" and abort these steps.
853 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
854 ins_mode = ins_mode_in_table_body
856 # 8. If node is a caption element, then switch the insertion mode
857 # to "in caption" and abort these steps.
858 if node.name is 'caption' and node.namespace is NS_HTML
859 ins_mode = ins_mode_in_caption
861 # 9. If node is a colgroup element, then switch the insertion mode
862 # to "in column group" and abort these steps.
863 if node.name is 'colgroup' and node.namespace is NS_HTML
864 ins_mode = ins_mode_in_column_group
866 # 10. If node is a table element, then switch the insertion mode to
867 # "in table" and abort these steps.
868 if node.name is 'table' and node.namespace is NS_HTML
869 ins_mode = ins_mode_in_table
871 # 11. If node is a template element, then switch the insertion mode
872 # to the current template insertion mode and abort these steps.
873 if node.name is 'template' and node.namespace is NS_HTML
874 ins_mode = template_ins_modes[0]
876 # 12. If node is a head element and last is true, then switch the
877 # insertion mode to "in body" ("in body"! not "in head"!) and abort
878 # these steps. (fragment case)
879 if node.name is 'head' and node.namespace is NS_HTML and last
880 ins_mode = ins_mode_in_body
882 # 13. If node is a head element and last is false, then switch the
883 # insertion mode to "in head" and abort these steps.
884 if node.name is 'head' and node.namespace is NS_HTML and last is false
885 ins_mode = ins_mode_in_head
887 # 14. If node is a body element, then switch the insertion mode to
888 # "in body" and abort these steps.
889 if node.name is 'body' and node.namespace is NS_HTML
890 ins_mode = ins_mode_in_body
892 # 15. If node is a frameset element, then switch the insertion mode
893 # to "in frameset" and abort these steps. (fragment case)
894 if node.name is 'frameset' and node.namespace is NS_HTML
895 ins_mode = ins_mode_in_frameset
897 # 16. If node is an html element, run these substeps:
898 if node.name is 'html' and node.namespace is NS_HTML
899 # 1. If the head element pointer is null, switch the insertion
900 # mode to "before head" and abort these steps. (fragment case)
901 if head_element_pointer is null
902 ins_mode = ins_mode_before_head
904 # 2. Otherwise, the head element pointer is not null,
905 # switch the insertion mode to "after head" and abort these
907 ins_mode = ins_mode_after_head
909 # 17. If last is true, then switch the insertion mode to "in body"
910 # and abort these steps. (fragment case)
912 ins_mode = ins_mode_in_body
914 # 18. Let node now be the node before node in the stack of open
917 node = open_els[node_i]
918 # 19. Return to the step labeled loop.
922 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
923 adjusted_current_node = ->
924 if open_els.length is 1 and flag_fragment_parsing
925 return context_element
928 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
929 # this implementation is structured (mostly) as described at the link above.
930 # capitalized comments are the "labels" described at the link above.
932 return if afe.length is 0
933 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
938 if i is afe.length - 1
941 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
946 el = insert_html_element afe[i].token
951 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
952 # adoption agency algorithm
954 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
955 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
956 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
957 adoption_agency = (subject) ->
958 debug_log "adoption_agency()"
959 debug_log "tree: #{serialize_els doc.children, false, true}"
960 debug_log "open_els: #{serialize_els open_els, true, true}"
961 debug_log "afe: #{serialize_els afe, true, true}"
962 # this block implements the W3C spec
963 # # 1. If the current node is an HTML element whose tag name is subject,
964 # # then run these substeps:
966 # # 1. Let element be the current node.
968 # # 2. Pop element off the stack of open elements.
970 # # 3. If element is also in the list of active formatting elements,
971 # # remove the element from the list.
973 # # 4. Abort the adoption agency algorithm.
974 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
975 # el = open_els.shift()
976 # # remove it from the list of active formatting elements (if found)
981 # debug_log "aaa: starting off with subject on top of stack, exiting"
983 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
984 # If the current node is an HTML element whose tag name is subject, and
985 # the current node is not in the list of active formatting elements,
986 # then pop the current node off the stack of open elements, and abort
988 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
989 debug_log "aaa: starting off with subject on top of stack, exiting"
990 # remove it from the list of active formatting elements (if found)
997 debug_log "aaa: ...and not in afe, aaa done"
1007 # 5. Let formatting element be the last element in the list of
1008 # active formatting elements that: is between the end of the list
1009 # and the last scope marker in the list, if any, or the start of
1010 # the list otherwise, and has the tag name subject.
1012 for t, fe_of_afe in afe
1013 if t.type is TYPE_AFE_MARKER
1015 if t.name is subject
1018 # If there is no such element, then abort these steps and instead
1019 # act as described in the "any other end tag" entry above.
1021 debug_log "aaa: fe not found in afe"
1022 in_body_any_other_end_tag subject
1024 # 6. If formatting element is not in the stack of open elements,
1025 # then this is a parse error; remove the element from the list, and
1026 # abort these steps.
1028 for t, fe_of_open_els in open_els
1033 debug_log "aaa: fe not found in open_els"
1035 # "remove it from the list" must mean afe, since it's not in open_els
1036 afe.splice fe_of_afe, 1
1038 # 7. If formatting element is in the stack of open elements, but
1039 # the element is not in scope, then this is a parse error; abort
1041 unless el_is_in_scope fe
1042 debug_log "aaa: fe not in scope"
1045 # 8. If formatting element is not the current node, this is a parse
1046 # error. (But do not abort these steps.)
1047 unless open_els[0] is fe
1050 # 9. Let furthest block be the topmost node in the stack of open
1051 # elements that is lower in the stack than formatting element, and
1052 # is an element in the special category. There might not be one.
1054 fb_of_open_els = null
1055 for t, i in open_els
1061 # and continue, to see if there's one that's more "topmost"
1062 # 10. If there is no furthest block, then the UA must first pop all
1063 # the nodes from the bottom of the stack of open elements, from the
1064 # current node up to and including formatting element, then remove
1065 # formatting element from the list of active formatting elements,
1066 # and finally abort these steps.
1068 debug_log "aaa: no fb"
1070 t = open_els.shift()
1072 afe.splice fe_of_afe, 1
1074 # 11. Let common ancestor be the element immediately above
1075 # formatting element in the stack of open elements.
1076 ca = open_els[fe_of_open_els + 1] # common ancestor
1078 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1079 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1080 bookmark = new_aaa_bookmark()
1083 afe.splice i, 0, bookmark
1085 node = last_node = fb
1089 # 3. Let node be the element immediately above node in the
1090 # stack of open elements, or if node is no longer in the stack
1091 # of open elements (e.g. because it got removed by this
1092 # algorithm), the element that was immediately above node in
1093 # the stack of open elements before node was removed.
1095 for t, i in open_els
1097 node_next = open_els[i + 1]
1099 node = node_next ? node_above
1100 debug_log "inner loop #{inner}"
1101 debug_log "tree: #{serialize_els doc.children, false, true}"
1102 debug_log "open_els: #{serialize_els open_els, true, true}"
1103 debug_log "afe: #{serialize_els afe, true, true}"
1104 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1105 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1106 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1107 debug_log "node: #{node.serialize true, true}"
1108 # TODO make sure node_above gets re-set if/when node is removed from open_els
1110 # 4. If node is formatting element, then go to the next step in
1111 # the overall algorithm.
1114 debug_log "the meat"
1115 # 5. If inner loop counter is greater than three and node is in
1116 # the list of active formatting elements, then remove node from
1117 # the list of active formatting elements.
1123 debug_log "max out inner"
1128 # 6. If node is not in the list of active formatting elements,
1129 # then remove node from the stack of open elements and then go
1130 # back to the step labeled inner loop.
1132 debug_log "not in afe"
1133 for t, i in open_els
1135 node_above = open_els[i + 1]
1136 open_els.splice i, 1
1139 debug_log "the bones"
1140 # 7. create an element for the token for which the element node
1141 # was created, in the HTML namespace, with common ancestor as
1142 # the intended parent; replace the entry for node in the list
1143 # of active formatting elements with an entry for the new
1144 # element, replace the entry for node in the stack of open
1145 # elements with an entry for the new element, and let node be
1147 new_node = token_to_element node.token, NS_HTML, ca
1151 debug_log "replaced in afe"
1153 for t, i in open_els
1155 node_above = open_els[i + 1]
1156 open_els[i] = new_node
1157 debug_log "replaced in open_els"
1160 # 8. If last node is furthest block, then move the
1161 # aforementioned bookmark to be immediately after the new node
1162 # in the list of active formatting elements.
1167 debug_log "removed bookmark"
1171 # "after" means lower
1172 afe.splice i, 0, bookmark # "after as <-
1173 debug_log "placed bookmark after node"
1174 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1176 # 9. Insert last node into node, first removing it from its
1177 # previous parent node if any.
1178 if last_node.parent?
1179 debug_log "last_node has parent"
1180 for c, i in last_node.parent.children
1182 debug_log "removing last_node from parent"
1183 last_node.parent.children.splice i, 1
1185 node.children.push last_node
1186 last_node.parent = node
1187 # 10. Let last node be node.
1190 # 11. Return to the step labeled inner loop.
1191 # 14. Insert whatever last node ended up being in the previous step
1192 # at the appropriate place for inserting a node, but using common
1193 # ancestor as the override target.
1195 # In the case where fe is immediately followed by fb:
1196 # * inner loop exits out early (node==fe)
1198 # * last_node is still in the tree (not a duplicate)
1199 if last_node.parent?
1200 debug_log "FEFIRST? last_node has parent"
1201 for c, i in last_node.parent.children
1203 debug_log "removing last_node from parent"
1204 last_node.parent.children.splice i, 1
1207 debug_log "after aaa inner loop"
1208 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1209 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1210 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1211 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1212 debug_log "tree: #{serialize_els doc.children, false, true}"
1217 # can't use standard insert token thing, because it's already in
1218 # open_els and must stay at it's current position in open_els
1219 dest = adjusted_insertion_location ca
1220 dest[0].children.splice dest[1], 0, last_node
1221 last_node.parent = dest[0]
1224 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1225 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1226 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1227 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1228 debug_log "tree: #{serialize_els doc.children, false, true}"
1230 # 15. Create an element for the token for which formatting element
1231 # was created, in the HTML namespace, with furthest block as the
1233 new_element = token_to_element fe.token, NS_HTML, fb
1234 # 16. Take all of the child nodes of furthest block and append them
1235 # to the element created in the last step.
1236 while fb.children.length
1237 t = fb.children.shift()
1238 t.parent = new_element
1239 new_element.children.push t
1240 # 17. Append that new element to furthest block.
1241 new_element.parent = fb
1242 fb.children.push new_element
1243 # 18. Remove formatting element from the list of active formatting
1244 # elements, and insert the new element into the list of active
1245 # formatting elements at the position of the aforementioned
1253 afe[i] = new_element
1255 # 19. Remove formatting element from the stack of open elements,
1256 # and insert the new element into the stack of open elements
1257 # immediately below the position of furthest block in that stack.
1258 for t, i in open_els
1260 open_els.splice i, 1
1262 for t, i in open_els
1264 open_els.splice i, 0, new_element
1266 # 20. Jump back to the step labeled outer loop.
1267 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1268 debug_log "tree: #{serialize_els doc.children, false, true}"
1269 debug_log "open_els: #{serialize_els open_els, true, true}"
1270 debug_log "afe: #{serialize_els afe, true, true}"
1271 debug_log "AAA DONE"
1273 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# "Close a p element": generates implied end tags (with 'p' as the
# exception), checks whether the current node is an HTML 'p', then pops
# elements off open_els, testing each popped element for an HTML 'p'
# (presumably stopping at the first one, per the spec -- confirm).
# The `open_els.length > 1` guard never pops the last remaining entry.
1274 close_p_element = ->
1275 generate_implied_end_tags 'p' # arg is exception
1276 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1278 while open_els.length > 1 # just in case
1279 el = open_els.shift()
1280 if el.name is 'p' and el.namespace is NS_HTML
# Convenience guard used by the "in body" start-tag handlers: tests whether
# an HTML 'p' element is in button scope (presumably closing it via
# close_p_element when it is -- confirm).
1282 close_p_if_in_button_scope = ->
1283 if is_in_button_scope 'p', NS_HTML
1286 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1287 # aka insert_a_character = (t) ->
# Insert character token t at the adjusted insertion location.
# dest is the [parent, index] pair from adjusted_insertion_location().
# The sibling just before the insertion point is inspected for being a
# text node (presumably so adjacent text can be merged -- confirm);
# otherwise t is spliced into the parent's children at that index.
1288 insert_character = (t) ->
1289 dest = adjusted_insertion_location()
1290 # fixfull check for Document node
1292 prev = dest[0].children[dest[1] - 1]
1293 if prev.type is TYPE_TEXT
1296 dest[0].children.splice dest[1], 0, t
1299 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for token t. Looks at the adjusted current
# node (acn) and the token type to decide between HTML-content handling
# and in_foreign_content. The checks below mirror the spec's list: HTML
# namespace, MathML text integration points, annotation-xml with an svg
# start tag, HTML integration points, and EOF.
1300 process_token = (t) ->
1301 acn = adjusted_current_node()
1305 if acn.namespace is NS_HTML
1308 if is_mathml_text_integration_point(acn)
1309 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1312 if t.type is TYPE_TEXT
1315 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1318 if is_html_integration acn
1319 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1322 if t.type is TYPE_EOF
1325 in_foreign_content t
1329 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1330 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# "Appropriate place for inserting a node".
#
# override_target: optional node to use instead of the current node
#                  (open_els[0]) as the starting target.
# Returns a two-element array [target, target_i]: the parent node and the
# index within target.children at which the new node should be spliced.
#
# When foster parenting is enabled and the target is listed in
# foster_parenting_targets, the location is derived from the last
# template / last table on the stack of open elements instead.
1331 adjusted_insertion_location = (override_target = null) ->
1332 # 1. If there was an override target specified, then let target be the
1335 target = override_target
1336 else # Otherwise, let target be the current node.
1337 target = open_els[0]
1338 # 2. Determine the adjusted insertion location using the first matching
1339 # steps from the following list:
1341 # If foster parenting is enabled and target is a table, tbody, tfoot,
1342 # thead, or tr element Foster parenting happens when content is
1343 # misnested in tables.
1344 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1345 loop # once. this is here so we can ``break`` to "abort these substeps"
1346 # 1. Let last template be the last template element in the
1347 # stack of open elements, if any.
1348 last_template = null
1349 last_template_i = null
1350 for el, i in open_els
1351 if el.name is 'template' and el.namespace is NS_HTML
1355 # 2. Let last table be the last table element in the stack of
1356 # open elements, if any.
1359 for el, i in open_els
1360 if el.name is 'table' and el.namespace is NS_HTML
1364 # 3. If there is a last template and either there is no last
1365 # table, or there is one, but last template is lower (more
1366 # recently added) than last table in the stack of open
1367 # elements, then: let adjusted insertion location be inside
1368 # last template's template contents, after its last child (if
1369 # any), and abort these substeps.
# (index 0 is the current node, so "lower" means a smaller index)
1370 if last_template and (last_table is null or last_template_i < last_table_i)
1371 target = last_template # fixfull should be its contents
1372 target_i = target.children.length
1374 # 4. If there is no last table, then let adjusted insertion
1375 # location be inside the first element in the stack of open
1376 # elements (the html element), after its last child (if any),
1377 # and abort these substeps. (fragment case)
1378 if last_table is null
1380 target = open_els[open_els.length - 1]
1381 target_i = target.children.length
1383 # 5. If last table has a parent element, then let adjusted
1384 # insertion location be inside last table's parent element,
1385 # immediately before last table, and abort these substeps.
1386 if last_table.parent?
1387 for c, i in last_table.parent.children
1389 target = last_table.parent
1393 # 6. Let previous element be the element immediately above last
1394 # table in the stack of open elements.
1396 # huh? how could it not have a parent?
1397 previous_element = open_els[last_table_i + 1]
1398 # 7. Let adjusted insertion location be inside previous
1399 # element, after its last child (if any).
1400 target = previous_element
1401 target_i = target.children.length
1402 # Note: These steps are involved in part because it's possible
1403 # for elements, the table element in this case in particular,
1404 # to have been moved by a script around in the DOM, or indeed
1405 # removed from the DOM entirely, after the element was inserted
1407 break # don't really loop
1409 # Otherwise Let adjusted insertion location be inside target, after
1410 # its last child (if any).
1411 target_i = target.children.length
1413 # 3. If the adjusted insertion location is inside a template element,
1414 # let it instead be inside the template element's template contents,
1415 # after its last child (if any).
1416 # fixfull (template)
1418 # 4. Return the adjusted insertion location.
1419 return [target, target_i]
1421 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1422 # aka create_an_element_for_token
# "Create an element for the token".
#
# t:               tag token; its name becomes the element name and the
#                  token itself is kept on the node as `token`
# namespace:       namespace stored on the new node
# intended_parent: accepted for parity with the spec; not referenced in
#                  the lines visible here (see the fixfull note below)
#
# Builds a Node of TYPE_TAG whose attrs hash is filled from the token's
# [name, value] attribute pairs.
1423 token_to_element = (t, namespace, intended_parent) ->
1424 # convert attributes into a hash
1427 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1428 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1430 # TODO 2. If the newly created element has an xmlns attribute in the
1431 # XMLNS namespace whose value is not exactly the same as the element's
1432 # namespace, that is a parse error. Similarly, if the newly created
1433 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1434 # value is not the XLink Namespace, that is a parse error.
1436 # fixfull: the spec says stuff about form pointers and ownerDocument
1440 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# "Insert a foreign element": creates an element for `token` in
# `namespace` and splices it into the children of the adjusted insertion
# location (ail_el at index ail_i). Callers such as ins_mode_before_head
# use the return value as the newly inserted element.
1441 insert_foreign_element = (token, namespace) ->
1442 ail = adjusted_insertion_location()
1445 el = token_to_element token, namespace, ail_el
1446 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1448 ail_el.children.splice ail_i, 0, el
1451 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# "Insert an HTML element": insert_foreign_element with the HTML
# namespace. The result of that call is this function's (implicit)
# return value.
1452 insert_html_element = (token) ->
1453 insert_foreign_element token, NS_HTML
1455 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1456 # position should be [node, index_within_children]
# "Insert a comment": splices comment token/node t into the tree.
# position: optional [node, index_within_children] pair; defaults to the
#           adjusted insertion location when null/omitted.
1457 insert_comment = (t, position = null) ->
1458 position ?= adjusted_insertion_location()
1459 position[0].children.splice position[1], 0, t
1462 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: inserts an HTML element for start tag
# t, switches the tokenizer to the RAWTEXT state, remembers the current
# insertion mode in original_ins_mode and switches to the "text" mode.
1463 parse_generic_raw_text = (t) ->
1464 insert_html_element t
1465 tok_state = tok_state_rawtext
1466 original_ins_mode = ins_mode
1467 ins_mode = ins_mode_text
# Generic RCDATA element parsing: same as parse_generic_raw_text except
# the tokenizer is switched to the RCDATA state.
1468 parse_generic_rcdata_text = (t) ->
1469 insert_html_element t
1470 tok_state = tok_state_rcdata
1471 original_ins_mode = ins_mode
1472 ins_mode = ins_mode_text
1474 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1475 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# "Generate implied end tags": loops while the current node (open_els[0])
# is listed in end_tag_implied for its namespace and its name is not
# `except`.
# except: optional tag name that must not be implicitly closed.
1476 generate_implied_end_tags = (except = null) ->
1477 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1480 # 8.2.5.4 The rules for parsing tokens in HTML content
1481 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1483 # 8.2.5.4.1 The "initial" insertion mode
1484 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Tests DOCTYPE token t against the spec's conditions for full quirks
# mode: the force-quirks flag, a name other than 'html', a public
# identifier matching one of quirks_yes_pi_prefixes or one of three exact
# values, the IBM system identifier, or (when the system identifier is
# absent) an HTML 4.01 Frameset/Transitional public identifier.
# Identifiers are lower-cased before comparison.
1485 is_quirks_yes_doctype = (t) ->
1486 if t.flag 'force-quirks'
1488 if t.name isnt 'html'
1490 if t.public_identifier?
1491 pi = t.public_identifier.toLowerCase()
1492 for p in quirks_yes_pi_prefixes
1493 if pi.substr(0, p.length) is p
1495 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1497 if t.system_identifier?
1498 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1500 else if t.public_identifier?
1501 # already did this: pi = t.public_identifier.toLowerCase()
1502 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Tests DOCTYPE token t against the spec's conditions for limited quirks
# mode: an XHTML 1.0 Frameset/Transitional public identifier, or a system
# identifier together with an HTML 4.01 Frameset/Transitional public
# identifier. The public identifier is lower-cased before comparison.
#
# NOTE(review): `pi` is only assigned when t.public_identifier is present.
# Confirm the system_identifier check is nested under that guard;
# otherwise pi.substr would throw for a doctype that has only a system
# identifier.
1505 is_quirks_limited_doctype = (t) ->
1506 if t.public_identifier?
1507 pi = t.public_identifier.toLowerCase()
1508 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1510 if t.system_identifier?
1511 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# The "initial" insertion mode. Handles comment and DOCTYPE tokens; for a
# DOCTYPE it records the quirks mode on doc (QUIRKS_YES or QUIRKS_LIMITED)
# and moves to "before html". The final branch sets QUIRKS_YES and also
# moves to "before html" (the spec's "anything else" case).
1514 ins_mode_initial = (t) ->
1517 if t.type is TYPE_COMMENT
1521 if t.type is TYPE_DOCTYPE
1522 # fixfull syntax error from first paragraph and following bullets
1523 # fixfull set doc.doctype
1524 # fixfull is the "not an iframe srcdoc" thing relevant?
1525 if is_quirks_yes_doctype t
1526 doc.flag 'quirks mode', QUIRKS_YES
1527 else if is_quirks_limited_doctype t
1528 doc.flag 'quirks mode', QUIRKS_LIMITED
1530 ins_mode = ins_mode_before_html
1533 # fixfull not iframe srcdoc?
1535 doc.flag 'quirks mode', QUIRKS_YES
1536 ins_mode = ins_mode_before_html
1540 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# The "before html" insertion mode. An explicit <html> start tag becomes
# the root element: it is appended to doc.children and pushed onto
# open_els. In the fall-through case an <html> element is synthesised
# from new_open_tag('html') instead. Both paths switch to "before head".
1541 ins_mode_before_html = (t) ->
1542 if t.type is TYPE_DOCTYPE
1545 if t.type is TYPE_COMMENT
1550 if t.type is TYPE_START_TAG and t.name is 'html'
1551 el = token_to_element t, NS_HTML, doc
1552 doc.children.push el
1553 open_els.unshift(el)
1554 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1555 ins_mode = ins_mode_before_head
1557 if t.type is TYPE_END_TAG
1558 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1559 # fall through to "anything else"
1564 el = token_to_element new_open_tag('html'), NS_HTML, doc
1565 doc.children.push el
1568 # ?fixfull browsing context
1569 ins_mode = ins_mode_before_head
1573 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# The "before head" insertion mode. A <head> start tag is inserted and
# remembered in head_element_pointer; in the fall-through case a <head>
# element is synthesised from new_open_tag 'head'. Both paths switch to
# "in head".
1574 ins_mode_before_head = (t) ->
1577 if t.type is TYPE_COMMENT
1580 if t.type is TYPE_DOCTYPE
1583 if t.type is TYPE_START_TAG and t.name is 'html'
1586 if t.type is TYPE_START_TAG and t.name is 'head'
1587 el = insert_html_element t
1588 head_element_pointer = el
1589 ins_mode = ins_mode_in_head
1591 if t.type is TYPE_END_TAG
1592 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1593 # fall through to Anything else below
1598 el = insert_html_element new_open_tag 'head'
1599 head_element_pointer = el
1600 ins_mode = ins_mode_in_head
1603 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode: pops the current
# node (expected to be the head element) and switches to "after head".
1604 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1605 open_els.shift() # spec says this will be a 'head' node
1606 ins_mode = ins_mode_after_head
# The "in head" insertion mode: handles token t while the head element is
# the current node.
#
# Visible branches: whitespace text, comment, DOCTYPE, <html>, void head
# elements (base/basefont/bgsound/link/meta), <title> (RCDATA),
# noscript/noframes/style (raw text), <noscript> with scripting off,
# <script>, </head>, </body>/</html>/</br>, <template> and </template>.
# Anything unhandled goes through ins_mode_in_head_else.
1608 ins_mode_in_head = (t) ->
1609 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1612 if t.type is TYPE_COMMENT
1615 if t.type is TYPE_DOCTYPE
1618 if t.type is TYPE_START_TAG and t.name is 'html'
1621 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1622 el = insert_html_element t
1624 t.acknowledge_self_closing()
1626 if t.type is TYPE_START_TAG and t.name is 'meta'
1627 el = insert_html_element t
1629 t.acknowledge_self_closing()
1630 # fixfull encoding stuff
1632 if t.type is TYPE_START_TAG and t.name is 'title'
1633 parse_generic_rcdata_text t
1635 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1636 parse_generic_raw_text t
1638 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1639 insert_html_element t
1640 ins_mode = ins_mode_in_head_noscript
1642 if t.type is TYPE_START_TAG and t.name is 'script'
1643 ail = adjusted_insertion_location()
# ail is a [node, index] pair; the intended parent is the node itself
1644 el = token_to_element t, NS_HTML, ail[0]
1645 el.flag 'parser-inserted', true
1646 # fixfull fragment case
1647 ail[0].children.splice ail[1], 0, el
1649 tok_state = tok_state_script_data
1650 original_ins_mode = ins_mode # make sure orig... is defined
1651 ins_mode = ins_mode_text
1653 if t.type is TYPE_END_TAG and t.name is 'head'
1654 open_els.shift() # will be a head element... spec says so
1655 ins_mode = ins_mode_after_head
1657 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1658 ins_mode_in_head_else t
1660 if t.type is TYPE_START_TAG and t.name is 'template'
1661 insert_html_element t
1663 flag_frameset_ok = false
1664 ins_mode = ins_mode_in_template
1665 template_ins_modes.unshift ins_mode_in_template
1667 if t.type is TYPE_END_TAG and t.name is 'template'
1668 if template_tag_is_open()
# parentheses are required: without them (and without arguments) this
# would be a bare reference and the function would never be called
1669 generate_implied_end_tags()
1670 if open_els[0].name isnt 'template'
1673 el = open_els.shift()
1674 if el.name is 'template' and el.namespace is NS_HTML
1676 clear_afe_to_marker()
1677 template_ins_modes.shift()
1682 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1685 ins_mode_in_head_else t
1687 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" insertion mode:
# switches the insertion mode back to "in head".
1688 ins_mode_in_head_noscript_else = (t) ->
1691 ins_mode = ins_mode_in_head
# The "in head noscript" insertion mode. Recognises DOCTYPE, <html>,
# </noscript> (back to "in head"), tokens that the spec handles with the
# "in head" rules (whitespace, comments and a fixed set of start tags),
# </br>, and stray <head>/<noscript> start tags or other end tags.
# Everything else goes through ins_mode_in_head_noscript_else.
1693 ins_mode_in_head_noscript = (t) ->
1694 if t.type is TYPE_DOCTYPE
1697 if t.type is TYPE_START_TAG and t.name is 'html'
1700 if t.type is TYPE_END_TAG and t.name is 'noscript'
1702 ins_mode = ins_mode_in_head
1704 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1707 if t.type is TYPE_END_TAG and t.name is 'br'
1708 ins_mode_in_head_noscript_else t
1710 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1714 ins_mode_in_head_noscript_else t
1719 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode: inserts a
# synthesised <body> element and switches to "in body".
1720 ins_mode_after_head_else = (t) ->
1721 body_tok = new_open_tag 'body'
1722 insert_html_element body_tok
1723 ins_mode = ins_mode_in_body
# The "after head" insertion mode. <body> and <frameset> start tags are
# inserted and switch to "in body" / "in frameset". Head-only start tags
# arriving late temporarily put head_element_pointer back on open_els and
# then remove it again by identity search. Unhandled tokens go through
# ins_mode_after_head_else.
1726 ins_mode_after_head = (t) ->
1730 if t.type is TYPE_COMMENT
1733 if t.type is TYPE_DOCTYPE
1736 if t.type is TYPE_START_TAG and t.name is 'html'
1739 if t.type is TYPE_START_TAG and t.name is 'body'
1740 insert_html_element t
1741 flag_frameset_ok = false
1742 ins_mode = ins_mode_in_body
1744 if t.type is TYPE_START_TAG and t.name is 'frameset'
1745 insert_html_element t
1746 ins_mode = ins_mode_in_frameset
1748 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1750 open_els.unshift head_element_pointer
# the head element may no longer be the current node, so find it by identity
1752 for el, i in open_els
1753 if el is head_element_pointer
1754 open_els.splice i, 1
1756 console.log "warning: 23904 couldn't find head element in open_els"
1758 if t.type is TYPE_END_TAG and t.name is 'template'
1761 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1762 ins_mode_after_head_else t
1764 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1768 ins_mode_after_head_else t
1770 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" insertion mode.
# name: tag name of the end tag being processed.
#
# Walks open_els from the current node (index 0). On the first HTML
# element whose name matches, implied end tags are generated (except for
# `name`) and a parse error is reported if the match is not the current
# node. Each non-matching element is also checked against
# special_elements.
1771 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1772 for el, i in open_els
1773 if el.name is name and el.namespace is NS_HTML
1774 generate_implied_end_tags name # arg is exception
1775 parse_error() unless i is 0
1780 if special_elements[el.name] is el.namespace
1784 ins_mode_in_body = (t) ->
1785 if t.type is TYPE_TEXT and t.text is "\u0000"
1792 if t.type is TYPE_TEXT
1795 flag_frameset_ok = false
1797 if t.type is TYPE_COMMENT
1800 if t.type is TYPE_DOCTYPE
1803 if t.type is TYPE_START_TAG and t.name is 'html'
1805 return if template_tag_is_open()
1806 root_attrs = open_els[open_els.length - 1].attrs
1808 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1811 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1814 if t.type is TYPE_START_TAG and t.name is 'body'
1816 return if open_els.length < 2
1817 second = open_els[open_els.length - 2]
1818 return unless second.namespace is NS_HTML
1819 return unless second.name is 'body'
1820 return if template_tag_is_open()
1821 flag_frameset_ok = false
1823 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1825 if t.type is TYPE_START_TAG and t.name is 'frameset'
1827 return if open_els.length < 2
1828 second_i = open_els.length - 2
1829 second = open_els[second_i]
1830 return unless second.namespace is NS_HTML
1831 return unless second.name is 'body'
1832 if flag_frameset_ok is false
1835 for el, i in second.parent.children
1837 second.parent.children.splice i, 1
1839 open_els.splice second_i, 1
1840 # pop everything except the "root html element"
1841 while open_els.length > 1
1843 insert_html_element t
1844 ins_mode = ins_mode_in_frameset
1846 if t.type is TYPE_EOF
1848 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1849 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1850 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1853 unless ok_tags[t.name] is el.namespace
1856 if template_ins_modes.length > 0
1857 ins_mode_in_template t
1861 if t.type is TYPE_END_TAG and t.name is 'body'
1862 unless is_in_scope 'body', NS_HTML
1866 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1867 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1868 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1869 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1873 unless ok_tags[t.name] is el.namespace
1876 ins_mode = ins_mode_after_body
1878 if t.type is TYPE_END_TAG and t.name is 'html'
1879 unless is_in_scope 'body', NS_HTML
1883 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1884 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1885 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1886 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1890 unless ok_tags[t.name] is el.namespace
1893 ins_mode = ins_mode_after_body
1896 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1897 close_p_if_in_button_scope()
1898 insert_html_element t
1900 if t.type is TYPE_START_TAG and h_tags[t.name]?
1901 close_p_if_in_button_scope()
1902 if h_tags[open_els[0].name] is open_els[0].namespace
1905 insert_html_element t
1907 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1908 close_p_if_in_button_scope()
1909 insert_html_element t
1910 # spec: If the next token is a "LF" (U+000A) character token, then
1911 # ignore that token and move on to the next one. (Newlines at the
1912 # start of pre blocks are ignored as an authoring convenience.)
1913 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1915 flag_frameset_ok = false
1917 if t.type is TYPE_START_TAG and t.name is 'form'
1918 unless form_element_pointer is null or template_tag_is_open()
1921 close_p_if_in_button_scope()
1922 el = insert_html_element t
1923 unless template_tag_is_open()
1924 form_element_pointer = el
1926 if t.type is TYPE_START_TAG and t.name is 'li'
1927 flag_frameset_ok = false
1928 for node in open_els
1929 if node.name is 'li' and node.namespace is NS_HTML
1930 generate_implied_end_tags 'li' # arg is exception
1931 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1934 el = open_els.shift()
1935 if el.name is 'li' and el.namespace is NS_HTML
1938 if el_is_special_not_adp node
1940 close_p_if_in_button_scope()
1941 insert_html_element t
1943 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1944 flag_frameset_ok = false
1945 for node in open_els
1946 if node.name is 'dd' and node.namespace is NS_HTML
1947 generate_implied_end_tags 'dd' # arg is exception
1948 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1951 el = open_els.shift()
1952 if el.name is 'dd' and el.namespace is NS_HTML
1955 if node.name is 'dt' and node.namespace is NS_HTML
1956 generate_implied_end_tags 'dt' # arg is exception
1957 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1960 el = open_els.shift()
1961 if el.name is 'dt' and el.namespace is NS_HTML
1964 if el_is_special_not_adp node
1966 close_p_if_in_button_scope()
1967 insert_html_element t
1969 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1970 close_p_if_in_button_scope()
1971 insert_html_element t
1972 tok_state = tok_state_plaintext
1974 if t.type is TYPE_START_TAG and t.name is 'button'
1975 if is_in_scope 'button', NS_HTML
1977 generate_implied_end_tags()
1979 el = open_els.shift()
1980 if el.name is 'button' and el.namespace is NS_HTML
1983 insert_html_element t
1984 flag_frameset_ok = false
1986 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1987 unless is_in_scope t.name, NS_HTML
1990 generate_implied_end_tags()
1991 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1994 el = open_els.shift()
1995 if el.name is t.name and el.namespace is NS_HTML
1998 if t.type is TYPE_END_TAG and t.name is 'form'
1999 unless template_tag_is_open()
2000 node = form_element_pointer
2001 form_element_pointer = null
2002 if node is null or not el_is_in_scope node
2005 generate_implied_end_tags()
2006 if open_els[0] isnt node
2008 for el, i in open_els
2010 open_els.splice i, 1
2013 unless is_in_scope 'form', NS_HTML
2016 generate_implied_end_tags()
2017 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2020 el = open_els.shift()
2021 if el.name is 'form' and el.namespace is NS_HTML
2024 if t.type is TYPE_END_TAG and t.name is 'p'
2025 unless is_in_button_scope 'p', NS_HTML
2027 insert_html_element new_open_tag 'p'
2030 if t.type is TYPE_END_TAG and t.name is 'li'
2031 unless is_in_li_scope 'li', NS_HTML
2034 generate_implied_end_tags 'li' # arg is exception
2035 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2038 el = open_els.shift()
2039 if el.name is 'li' and el.namespace is NS_HTML
2042 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2043 unless is_in_scope t.name, NS_HTML
2046 generate_implied_end_tags t.name # arg is exception
2047 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2050 el = open_els.shift()
2051 if el.name is t.name and el.namespace is NS_HTML
2054 if t.type is TYPE_END_TAG and h_tags[t.name]?
2057 if h_tags[el.name] is el.namespace
2060 if standard_scopers[el.name] is el.namespace
2065 generate_implied_end_tags()
2066 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2069 el = open_els.shift()
2070 if h_tags[el.name] is el.namespace
2074 if t.type is TYPE_START_TAG and t.name is 'a'
2075 # If the list of active formatting elements contains an a element
2076 # between the end of the list and the last marker on the list (or
2077 # the start of the list if there is no marker on the list), then
2078 # this is a parse error; run the adoption agency algorithm for the
2079 # tag name "a", then remove that element from the list of active
2080 # formatting elements and the stack of open elements if the
2081 # adoption agency algorithm didn't already remove it (it might not
2082 # have if the element is not in table scope).
2085 if el.type is TYPE_AFE_MARKER
2087 if el.name is 'a' and el.namespace is NS_HTML
2095 for el, i in open_els
2097 open_els.splice i, 1
2099 el = insert_html_element t
2102 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2104 el = insert_html_element t
2107 if t.type is TYPE_START_TAG and t.name is 'nobr'
2109 el = insert_html_element t
2112 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2113 adoption_agency t.name
2115 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2117 insert_html_element t
2119 flag_frameset_ok = false
2121 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2122 unless is_in_scope t.name, NS_HTML
2125 generate_implied_end_tags()
2126 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2129 el = open_els.shift()
2130 if el.name is t.name and el.namespace is NS_HTML
2132 clear_afe_to_marker()
2134 if t.type is TYPE_START_TAG and t.name is 'table'
2135 unless doc.flag('quirks mode') is QUIRKS_YES
2136 close_p_if_in_button_scope() # test
2137 insert_html_element t
2138 flag_frameset_ok = false
2139 ins_mode = ins_mode_in_table
2141 if t.type is TYPE_END_TAG and t.name is 'br'
2143 t.type = TYPE_START_TAG
2145 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2147 insert_html_element t
2149 t.acknowledge_self_closing()
2150 flag_frameset_ok = false
2152 if t.type is TYPE_START_TAG and t.name is 'input'
2154 insert_html_element t
2156 t.acknowledge_self_closing()
2157 unless is_input_hidden_tok t
2158 flag_frameset_ok = false
2160 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2161 insert_html_element t
2163 t.acknowledge_self_closing()
2165 if t.type is TYPE_START_TAG and t.name is 'hr'
2166 close_p_if_in_button_scope()
2167 insert_html_element t
2169 t.acknowledge_self_closing()
2170 flag_frameset_ok = false
2172 if t.type is TYPE_START_TAG and t.name is 'image'
2177 if t.type is TYPE_START_TAG and t.name is 'isindex'
2179 if template_tag_is_open() is false and form_element_pointer isnt null
2181 t.acknowledge_self_closing()
2182 flag_frameset_ok = false
2183 close_p_if_in_button_scope()
2184 el = insert_html_element new_open_tag 'form'
2185 unless template_tag_is_open()
2186 form_element_pointer = el
2189 el.attrs['action'] = a[1]
2191 insert_html_element new_open_tag 'hr'
2194 insert_html_element new_open_tag 'label'
2195 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2196 input_el = new_open_tag 'input'
2201 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2202 input_el.attrs_a.push [a[0], a[1]]
2203 input_el.attrs_a.push ['name', 'isindex']
2204 # fixfull this next bit is in english... internationalize?
2205 prompt ?= "This is a searchable index. Enter search keywords: "
2206 insert_character new_character_token prompt # fixfull split
2207 # TODO submit typo "balue" in spec
2208 insert_html_element input_el
2210 # insert_character '' # you can put chars here if prompt attr missing
2212 insert_html_element new_open_tag 'hr'
2215 unless template_tag_is_open()
2216 form_element_pointer = null
2218 if t.type is TYPE_START_TAG and t.name is 'textarea'
2219 insert_html_element t
2220 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2222 tok_state = tok_state_rcdata
2223 original_ins_mode = ins_mode
2224 flag_frameset_ok = false
2225 ins_mode = ins_mode_text
2227 if t.type is TYPE_START_TAG and t.name is 'xmp'
2228 close_p_if_in_button_scope()
2230 flag_frameset_ok = false
2231 parse_generic_raw_text t
2233 if t.type is TYPE_START_TAG and t.name is 'iframe'
2234 flag_frameset_ok = false
2235 parse_generic_raw_text t
2237 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2238 parse_generic_raw_text t
2240 if t.type is TYPE_START_TAG and t.name is 'select'
2242 insert_html_element t
2243 flag_frameset_ok = false
2244 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2245 ins_mode = ins_mode_in_select_in_table
2247 ins_mode = ins_mode_in_select
2249 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2250 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2253 insert_html_element t
2255 # this comment block implements the W3C spec
2256 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2257 # if is_in_scope 'ruby', NS_HTML
2258 # generate_implied_end_tags()
2259 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2261 # insert_html_element t
2263 # if t.type is TYPE_START_TAG and t.name is 'rt'
2264 # if is_in_scope 'ruby', NS_HTML
2265 # generate_implied_end_tags 'rtc' # arg is exception
2266 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2268 # insert_html_element t
2270 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2271 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2272 if is_in_scope 'ruby', NS_HTML
2273 generate_implied_end_tags()
2274 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2276 insert_html_element t
2278 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2279 if is_in_scope 'ruby', NS_HTML
2280 generate_implied_end_tags 'rtc'
2281 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2283 insert_html_element t
2286 if t.type is TYPE_START_TAG and t.name is 'math'
2288 adjust_mathml_attributes t
2289 adjust_foreign_attributes t
2290 insert_foreign_element t, NS_MATHML
2291 if t.flag 'self-closing'
2293 t.acknowledge_self_closing()
2295 if t.type is TYPE_START_TAG and t.name is 'svg'
2297 adjust_svg_attributes t
2298 adjust_foreign_attributes t
2299 insert_foreign_element t, NS_SVG
2300 if t.flag 'self-closing'
2302 t.acknowledge_self_closing()
2304 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2307 if t.type is TYPE_START_TAG # any other start tag
2309 insert_html_element t
2311 if t.type is TYPE_END_TAG # any other end tag
2312 in_body_any_other_end_tag t.name
2316 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Insertion-mode handler for the "text" mode. Takes one token `t` and
# branches on its type: text, EOF, a </script> end tag, or any other end tag.
# ins_mode is restored from original_ins_mode after EOF and after either kind
# of end tag.
2317 ins_mode_text = (t) ->
2318 if t.type is TYPE_TEXT
2321 if t.type is TYPE_EOF
# a current node that is an HTML script element gets its 'already started'
# flag set
2323 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2324 open_els[0].flag 'already started', true
2326 ins_mode = original_ins_mode
2329 if t.type is TYPE_END_TAG and t.name is 'script'
2331 ins_mode = original_ins_mode
2332 # fixfull the spec seems to assume that I'm going to run the script
2333 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2335 if t.type is TYPE_END_TAG
2337 ins_mode = original_ins_mode
# reaching this line is only reported as a warning on the console
2339 console.log 'warning: end of ins_mode_text reached'
2341 # the functions below implement the tokenizer states described here:
2342 # http://www.w3.org/TR/html5/syntax.html#tokenization
2344 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Fallback used by ins_mode_in_table and ins_mode_in_table_text for tokens
# they do not handle themselves. flag_foster_parenting is switched on while
# the token `t` is handled and switched off again afterwards.
2345 ins_mode_in_table_else = (t) ->
2347 flag_foster_parenting = true
2349 flag_foster_parenting = false
# Insertion-mode handler for "in table". Takes one token `t`.
# Table-structure start tags clear the stack to a table context, insert the
# element (or an implied colgroup/tbody) and switch ins_mode to the matching
# caption / column group / table body mode. Tokens that get no special
# treatment are passed to ins_mode_in_table_else.
2351 ins_mode_in_table = (t) ->
# when the current node is an HTML table/tbody/tfoot/thead/tr: start a fresh
# pending_table_character_tokens buffer, remember the current mode in
# original_ins_mode and continue in ins_mode_in_table_text
2354 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2355 pending_table_character_tokens = []
2356 original_ins_mode = ins_mode
2357 ins_mode = ins_mode_in_table_text
2360 ins_mode_in_table_else t
2368 clear_stack_to_table_context()
2370 insert_html_element t
2371 ins_mode = ins_mode_in_caption
2373 clear_stack_to_table_context()
2374 insert_html_element t
2375 ins_mode = ins_mode_in_column_group
# here a new 'colgroup' open tag is inserted rather than the token itself
2377 clear_stack_to_table_context()
2378 insert_html_element new_open_tag 'colgroup'
2379 ins_mode = ins_mode_in_column_group
2381 when 'tbody', 'tfoot', 'thead'
2382 clear_stack_to_table_context()
2383 insert_html_element t
2384 ins_mode = ins_mode_in_table_body
# td/th/tr directly in a table: a new 'tbody' open tag is inserted first
2385 when 'td', 'th', 'tr'
2386 clear_stack_to_table_context()
2387 insert_html_element new_open_tag 'tbody'
2388 ins_mode = ins_mode_in_table_body
# elements are popped off open_els; the test below looks for the HTML table
2392 if is_in_table_scope 'table', NS_HTML
2394 el = open_els.shift()
2395 if el.name is 'table' and el.namespace is NS_HTML
2399 when 'style', 'script', 'template'
# inputs for which is_input_hidden_tok is false take the fallback path
2402 unless is_input_hidden_tok t
2403 ins_mode_in_table_else t
2406 el = insert_html_element t
2408 t.acknowledge_self_closing()
2411 if form_element_pointer?
2413 if template_tag_is_open()
2415 form_element_pointer = insert_html_element t
2418 ins_mode_in_table_else t
2422 if is_in_table_scope 'table', NS_HTML
2424 el = open_els.shift()
2425 if el.name is 'table' and el.namespace is NS_HTML
2430 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2435 ins_mode_in_table_else t
2439 ins_mode_in_table_else t
2442 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Insertion-mode handler for "in table text". Takes one token `t`.
# Text tokens are appended to pending_table_character_tokens (a U+0000 text
# token is tested for separately first). The buffer is later scanned with
# is_space_tok, flushed through insert_character or through
# ins_mode_in_table_else, emptied, and ins_mode is restored from
# original_ins_mode.
# NOTE(review): presumably an all-whitespace buffer goes to insert_character
# and anything else to ins_mode_in_table_else -- confirm against the spec.
2443 ins_mode_in_table_text = (t) ->
2444 if t.type is TYPE_TEXT and t.text is "\u0000"
2448 if t.type is TYPE_TEXT
2449 pending_table_character_tokens.push t
2453 for old in pending_table_character_tokens
2454 unless is_space_tok old
2458 for old in pending_table_character_tokens
2459 insert_character old
2461 for old in pending_table_character_tokens
2462 ins_mode_in_table_else old
2463 pending_table_character_tokens = []
2464 ins_mode = original_ins_mode
2467 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Insertion-mode handler for "in caption". Takes one token `t`.
# Both the </caption> branch and the "table structure start tag or </table>"
# branch require a caption in table scope, pop open_els looking for the HTML
# caption element, clear the active formatting elements up to the last marker
# and switch to ins_mode_in_table.
2468 ins_mode_in_caption = (t) ->
2469 if t.type is TYPE_END_TAG and t.name is 'caption'
2470 if is_in_table_scope 'caption', NS_HTML
2471 generate_implied_end_tags()
# NOTE(review): unlike the pop test below, this check compares only the name
# and not the namespace -- confirm whether that is intended
2472 if open_els[0].name isnt 'caption'
2475 el = open_els.shift()
2476 if el.name is 'caption' and el.namespace is NS_HTML
2478 clear_afe_to_marker()
2479 ins_mode = ins_mode_in_table
2484 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2486 if is_in_table_scope 'caption', NS_HTML
2488 el = open_els.shift()
2489 if el.name is 'caption' and el.namespace is NS_HTML
2491 clear_afe_to_marker()
2492 ins_mode = ins_mode_in_table
2494 # else fragment case
2496 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2502 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Insertion-mode handler for "in column group". Takes one token `t` and
# branches on comment, doctype, <html>, <col>, </colgroup>, </col>,
# <template>/</template> and EOF. Both places that leave this mode switch to
# ins_mode_in_table.
2503 ins_mode_in_column_group = (t) ->
2507 if t.type is TYPE_COMMENT
2510 if t.type is TYPE_DOCTYPE
2513 if t.type is TYPE_START_TAG and t.name is 'html'
2516 if t.type is TYPE_START_TAG and t.name is 'col'
2517 el = insert_html_element t
2519 t.acknowledge_self_closing()
2521 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# the namespace has to be read from the current node (open_els[0]); the
# open_els array itself has no namespace property, so testing it could never
# match and </colgroup> would never be honoured
2522 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2524 ins_mode = ins_mode_in_table
2528 if t.type is TYPE_END_TAG and t.name is 'col'
2531 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2534 if t.type is TYPE_EOF
2538 if open_els[0].name isnt 'colgroup'
2542 ins_mode = ins_mode_in_table
2546 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Insertion-mode handler for "in table body". Takes one token `t`.
# <tr> is inserted after clearing the stack to a table body context and
# switches to ins_mode_in_row; <th>/<td> do the same with a new 'tr' open tag.
# The branches that close the table body section switch to ins_mode_in_table.
2547 ins_mode_in_table_body = (t) ->
2548 if t.type is TYPE_START_TAG and t.name is 'tr'
2549 clear_stack_to_table_body_context()
2550 insert_html_element t
2551 ins_mode = ins_mode_in_row
2553 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2555 clear_stack_to_table_body_context()
2556 insert_html_element new_open_tag 'tr'
2557 ins_mode = ins_mode_in_row
2560 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2561 unless is_in_table_scope t.name, NS_HTML
2564 clear_stack_to_table_body_context()
2566 ins_mode = ins_mode_in_table
2568 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# elements are tested for being an HTML tbody/tfoot/thead, and against the
# table_scopers map (element name -> namespace)
2571 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2574 if table_scopers[el.name] is el.namespace
2579 clear_stack_to_table_body_context()
2581 ins_mode = ins_mode_in_table
2584 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2590 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Insertion-mode handler for "in row". Takes one token `t`.
# <th>/<td> clear the stack to a table row context, insert the cell and
# switch to ins_mode_in_cell. The branches that end the row require a tr in
# table scope, clear the stack to a table row context and switch to
# ins_mode_in_table_body.
2591 ins_mode_in_row = (t) ->
2592 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2593 clear_stack_to_table_row_context()
2594 insert_html_element t
2595 ins_mode = ins_mode_in_cell
2598 if t.type is TYPE_END_TAG and t.name is 'tr'
2599 if is_in_table_scope 'tr', NS_HTML
2600 clear_stack_to_table_row_context()
2602 ins_mode = ins_mode_in_table_body
2606 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2607 if is_in_table_scope 'tr', NS_HTML
2608 clear_stack_to_table_row_context()
2610 ins_mode = ins_mode_in_table_body
# </tbody>, </tfoot>, </thead>: the named element is checked for table scope
# in addition to the tr
2615 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2616 if is_in_table_scope t.name, NS_HTML
2617 if is_in_table_scope 'tr', NS_HTML
2618 clear_stack_to_table_row_context()
2620 ins_mode = ins_mode_in_table_body
2625 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2631 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps: generate implied end tags, check that the current
# node is an HTML td or th, pop open_els looking for an HTML td/th, clear the
# active formatting elements up to the last marker and switch to
# ins_mode_in_row.
2633 generate_implied_end_tags()
# both alternatives must compare the element's name; comparing the element
# object itself to 'th' can never be true, so a th current node would be
# treated as a mismatch
2634 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2637 el = open_els.shift()
2638 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2640 clear_afe_to_marker()
2641 ins_mode = ins_mode_in_row
2643 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Insertion-mode handler for "in cell". Takes one token `t`.
# </td> and </th> (when the named cell is in table scope) generate implied
# end tags, pop open_els looking for the HTML element with the token's name,
# clear the active formatting elements up to the last marker and switch to
# ins_mode_in_row.
2644 ins_mode_in_cell = (t) ->
2645 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2646 if is_in_table_scope t.name, NS_HTML
2647 generate_implied_end_tags()
2648 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2651 el = open_els.shift()
2652 if el.name is t.name and el.namespace is NS_HTML
2654 clear_afe_to_marker()
2655 ins_mode = ins_mode_in_row
2659 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# elements are tested for being an HTML td/th, and against the table_scopers
# map (element name -> namespace)
2662 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2665 if table_scopers[el.name] is el.namespace
2673 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2676 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2677 if is_in_table_scope t.name, NS_HTML
2686 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Insertion-mode handler for "in select". Takes one token `t` and branches on
# U+0000 text, other text, comment, doctype, <html>, <option>, <optgroup>,
# </optgroup>, </option>, </select>, a nested <select>,
# <input>/<keygen>/<textarea>, <script>/<template> and EOF.
# The branches that close the select pop open_els looking for the HTML select
# element.
2687 ins_mode_in_select = (t) ->
2688 if t.type is TYPE_TEXT and t.text is "\u0000"
2691 if t.type is TYPE_TEXT
2694 if t.type is TYPE_COMMENT
2697 if t.type is TYPE_DOCTYPE
2700 if t.type is TYPE_START_TAG and t.name is 'html'
2703 if t.type is TYPE_START_TAG and t.name is 'option'
2704 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2706 insert_html_element t
2708 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2709 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2711 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2713 insert_html_element t
2715 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# current node is an option whose parent on the stack is an optgroup.
# The namespace comparison must be an equality test (`is`); `in` is a
# membership test against NS_HTML and does not compare the two values.
2716 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# name and namespace must both be read from open_els[1], the node above the
# option
2717 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2719 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2724 if t.type is TYPE_END_TAG and t.name is 'option'
2725 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2730 if t.type is TYPE_END_TAG and t.name is 'select'
2731 if is_in_select_scope 'select', NS_HTML
2733 el = open_els.shift()
2734 if el.name is 'select' and el.namespace is NS_HTML
2740 if t.type is TYPE_START_TAG and t.name is 'select'
2743 el = open_els.shift()
2744 if el.name is 'select' and el.namespace is NS_HTML
2747 # spec says that this is the same as </select> but it doesn't say
2748 # to check scope first
2750 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2752 if is_in_select_scope 'select', NS_HTML
2755 el = open_els.shift()
2756 if el.name is 'select' and el.namespace is NS_HTML
2761 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2764 if t.type is TYPE_EOF
2771 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Insertion-mode handler for "in select in table". Takes one token `t`.
# Start and end tags for caption/table/tbody/tfoot/thead/tr/td/th pop
# open_els looking for the HTML select element (the end-tag branch first
# checks that the named element is in table scope). Other tokens are handed
# to ins_mode_in_select.
2772 ins_mode_in_select_in_table = (t) ->
2773 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2776 el = open_els.shift()
2777 if el.name is 'select' and el.namespace is NS_HTML
2782 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2784 unless is_in_table_scope t.name, NS_HTML
2787 el = open_els.shift()
2788 if el.name is 'select' and el.namespace is NS_HTML
2794 ins_mode_in_select t
2797 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Insertion-mode handler for "in template". Takes one token `t`.
# For start tags, the entry at the front of template_ins_modes is replaced
# (shift, then unshift) by the mode that fits the tag, and ins_mode is set to
# that same mode: table for caption/colgroup/tbody/tfoot/thead, column group
# for col, table body for tr, row for td/th, body for any other start tag.
2798 ins_mode_in_template = (t) ->
2799 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2802 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2805 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2806 template_ins_modes.shift()
2807 template_ins_modes.unshift ins_mode_in_table
2808 ins_mode = ins_mode_in_table
2811 if t.type is TYPE_START_TAG and t.name is 'col'
2812 template_ins_modes.shift()
2813 template_ins_modes.unshift ins_mode_in_column_group
2814 ins_mode = ins_mode_in_column_group
2817 if t.type is TYPE_START_TAG and t.name is 'tr'
2818 template_ins_modes.shift()
2819 template_ins_modes.unshift ins_mode_in_table_body
2820 ins_mode = ins_mode_in_table_body
2823 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2824 template_ins_modes.shift()
2825 template_ins_modes.unshift ins_mode_in_row
2826 ins_mode = ins_mode_in_row
2829 if t.type is TYPE_START_TAG
2830 template_ins_modes.shift()
2831 template_ins_modes.unshift ins_mode_in_body
2832 ins_mode = ins_mode_in_body
2835 if t.type is TYPE_END_TAG
# EOF: open_els is popped looking for the HTML template element, the active
# formatting elements are cleared up to the last marker and the front entry
# of template_ins_modes is dropped
2838 if t.type is TYPE_EOF
2839 unless template_tag_is_open()
2844 el = open_els.shift()
2845 if el.name is 'template' and el.namespace is NS_HTML
2847 clear_afe_to_marker()
2848 template_ins_modes.shift()
2852 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Insertion-mode handler for "after body". Takes one token `t` and branches
# on comment, doctype, <html>, </html> and EOF. </html> leads to
# ins_mode_after_after_body (with a separate check for fragment parsing);
# the last assignment sends the parser back to ins_mode_in_body.
2853 ins_mode_after_body = (t) ->
2857 if t.type is TYPE_COMMENT
# the comment is appended as the last child of the element stored at the
# highest index of open_els
2858 first = open_els[open_els.length - 1]
2859 insert_comment t, [first, first.children.length]
2861 if t.type is TYPE_DOCTYPE
2864 if t.type is TYPE_START_TAG and t.name is 'html'
2867 if t.type is TYPE_END_TAG and t.name is 'html'
2868 if flag_fragment_parsing
2871 ins_mode = ins_mode_after_after_body
2873 if t.type is TYPE_EOF
2878 ins_mode = ins_mode_in_body
2881 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Insertion-mode handler for "in frameset". Takes one token `t` and branches
# on comment, doctype, <html>, <frameset>, </frameset>, <frame>, <noframes>
# and EOF.
2882 ins_mode_in_frameset = (t) ->
2886 if t.type is TYPE_COMMENT
2889 if t.type is TYPE_DOCTYPE
2892 if t.type is TYPE_START_TAG and t.name is 'html'
2895 if t.type is TYPE_START_TAG and t.name is 'frameset'
2896 insert_html_element t
2898 if t.type is TYPE_END_TAG and t.name is 'frameset'
2899 if open_els.length is 1
2901 return # fragment case
# outside fragment parsing, once the current node is no longer a frameset the
# parser moves on to ins_mode_after_frameset
2903 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2904 ins_mode = ins_mode_after_frameset
2906 if t.type is TYPE_START_TAG and t.name is 'frame'
2907 insert_html_element t
2909 t.acknowledge_self_closing()
2911 if t.type is TYPE_START_TAG and t.name is 'noframes'
2914 if t.type is TYPE_EOF
2915 if open_els.length isnt 1
2923 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion-mode handler for "after frameset". Takes one token `t` and
# branches on comment, doctype, <html>, </html>, <noframes> and EOF.
# </html> switches to ins_mode_after_after_frameset.
2924 ins_mode_after_frameset = (t) ->
2928 if t.type is TYPE_COMMENT
2931 if t.type is TYPE_DOCTYPE
2934 if t.type is TYPE_START_TAG and t.name is 'html'
2937 if t.type is TYPE_END_TAG and t.name is 'html'
2938 ins_mode = ins_mode_after_after_frameset
2940 if t.type is TYPE_START_TAG and t.name is 'noframes'
2943 if t.type is TYPE_EOF
2950 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Insertion-mode handler for "after after body". Takes one token `t`.
# Comments are appended as the last child of doc. Doctype, whitespace and
# <html> tokens share one branch, and EOF has its own. The last assignment
# sends the parser back to ins_mode_in_body.
2951 ins_mode_after_after_body = (t) ->
2952 if t.type is TYPE_COMMENT
2953 insert_comment t, [doc, doc.children.length]
2955 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2958 if t.type is TYPE_EOF
2963 ins_mode = ins_mode_in_body
2967 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Insertion-mode handler for "after after frameset". Takes one token `t`.
# Comments are appended as the last child of doc. Doctype, whitespace and
# <html> tokens share one branch; EOF and <noframes> each have their own.
2968 ins_mode_after_after_frameset = (t) ->
2969 if t.type is TYPE_COMMENT
2970 insert_comment t, [doc, doc.children.length]
2972 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2975 if t.type is TYPE_EOF
2978 if t.type is TYPE_START_TAG and t.name is 'noframes'
2985 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Helper for in_foreign_content's <font> test: looks at the attributes of
# token `t` for one named 'color', 'face' or 'size' (a[0] is the attribute
# name).
2986 has_color_face_or_size = (t) ->
2988 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Shared handling for the end of a script element in foreign content; called
# by in_foreign_content_other_start (self-closing <script>) and by
# in_foreign_content (</script> while the current node is an SVG script).
2991 in_foreign_content_end_script = ->
# "Any other start tag" handling for foreign content. Takes a start-tag token
# `t`, adjusts its attributes (and, for SVG, its tag name via svg_name_fixes)
# according to the namespace of the adjusted current node, inserts it as a
# foreign element in that namespace and then deals with the self-closing
# flag.
2995 in_foreign_content_other_start = (t) ->
2996 acn = adjusted_current_node()
2997 if acn.namespace is NS_MATHML
2998 adjust_mathml_attributes t
2999 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3000 t.name = svg_name_fixes[t.name]
3001 if acn.namespace is NS_SVG
3002 adjust_svg_attributes t
3003 adjust_foreign_attributes t
3004 insert_foreign_element t, acn.namespace
3005 if t.flag 'self-closing'
# a self-closing script is acknowledged and then treated like a script end
# tag
3006 if t.name is 'script'
3007 t.acknowledge_self_closing()
3008 in_foreign_content_end_script()
3012 t.acknowledge_self_closing()
# Token handler for foreign content (see the spec link above
# has_color_face_or_size). Takes one token `t` and branches on U+0000 text,
# other text, comment, doctype, a fixed list of HTML start tags (plus <font>
# with a color/face/size attribute), any other start tag, </script> on an SVG
# script element, and any other end tag.
3014 in_foreign_content = (t) ->
3015 if t.type is TYPE_TEXT and t.text is "\u0000"
# U+0000 is replaced by U+FFFD
3017 insert_character new_character_token "\ufffd"
3022 if t.type is TYPE_TEXT
3023 flag_frameset_ok = false
3026 if t.type is TYPE_COMMENT
3029 if t.type is TYPE_DOCTYPE
3032 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
# when parsing a fragment the tag is treated like any other start tag
3034 if flag_fragment_parsing
3035 in_foreign_content_other_start t
3037 loop # is this safe?
# the test looks for a current node that is a MathML text integration point,
# an HTML integration point or an HTML element
3039 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3043 if t.type is TYPE_START_TAG
3044 in_foreign_content_other_start t
3046 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3047 in_foreign_content_end_script()
3049 if t.type is TYPE_END_TAG
# node names are lowercased before being compared with the token's name
3052 if node.name.toLowerCase() isnt t.name
3055 if node is open_els[open_els.length - 1]
3057 if node.name.toLowerCase() is t.name
3059 el = open_els.shift()
3064 if node.namespace is NS_HTML
3066 ins_mode t # explicitly call HTML insertion mode
3069 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer "data" state: consumes one character of txt at cur and, depending
# on it, returns a text node for a parsed character reference, switches to
# tok_state_tag_open, returns a U+FFFD text node, returns an EOF token, or
# returns the character itself as a text node.
3071 switch c = txt.charAt(cur++)
3073 return new_text_node parse_character_reference()
3075 tok_state = tok_state_tag_open
3078 return new_text_node "\ufffd"
3080 return new_eof_token()
3082 return new_text_node c
3085 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3086 # not needed: tok_state_character_reference_in_data = ->
3087 # just call parse_character_reference()
3089 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer "RCDATA" state: consumes one character of txt at cur and,
# depending on it, returns a parsed character reference, switches to
# tok_state_rcdata_less_than_sign, returns a U+FFFD character token, returns
# an EOF token, or returns the character itself as a character token.
3090 tok_state_rcdata = ->
3091 switch c = txt.charAt(cur++)
# NOTE(review): this branch builds its result with new_text_node while the
# other branches use new_character_token -- confirm the two are equivalent
3093 return new_text_node parse_character_reference()
3095 tok_state = tok_state_rcdata_less_than_sign
3098 return new_character_token "\ufffd"
3100 return new_eof_token()
3102 return new_character_token c
3105 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3106 # not needed: tok_state_character_reference_in_rcdata = ->
3107 # just call parse_character_reference()
3109 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer "RAWTEXT" state: consumes one character of txt at cur and,
# depending on it, switches to tok_state_rawtext_less_than_sign, returns a
# U+FFFD character token, returns an EOF token, or returns the character
# itself as a character token.
3110 tok_state_rawtext = ->
3111 switch c = txt.charAt(cur++)
3113 tok_state = tok_state_rawtext_less_than_sign
3116 return new_character_token "\ufffd"
3118 return new_eof_token()
3120 return new_character_token c
3123 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer "script data" state: consumes one character of txt at cur and,
# depending on it, switches to tok_state_script_data_less_than_sign, returns
# a U+FFFD character token, returns an EOF token, or returns the character
# itself as a character token.
3124 tok_state_script_data = ->
3125 switch c = txt.charAt(cur++)
3127 tok_state = tok_state_script_data_less_than_sign
3130 return new_character_token "\ufffd"
3132 return new_eof_token()
3134 return new_character_token c
3137 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer "PLAINTEXT" state: consumes one character of txt at cur and
# returns a U+FFFD character token, an EOF token, or the character itself as
# a character token. No branch changes tok_state.
3138 tok_state_plaintext = ->
3139 switch c = txt.charAt(cur++)
3142 return new_character_token "\ufffd"
3144 return new_eof_token()
3146 return new_character_token c
3150 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer "tag open" state (entered after a '<'): consumes one character
# and either moves to the markup-declaration-open or end-tag-open state,
# starts a new open tag in tok_cur_tag and moves to tok_state_tag_name,
# starts a bogus comment, or falls back to the data state and returns '<' as
# text.
3151 tok_state_tag_open = ->
3152 c = txt.charAt(cur++)
3154 tok_state = tok_state_markup_declaration_open
3157 tok_state = tok_state_end_tag_open
# the first character of the tag name is lowercased in this branch and used
# as-is in the next one
3160 tok_cur_tag = new_open_tag c.toLowerCase()
3161 tok_state = tok_state_tag_name
3164 tok_cur_tag = new_open_tag c
3165 tok_state = tok_state_tag_name
3169 tok_cur_tag = new_comment_token '?' # FIXME right?
3170 tok_state = tok_state_bogus_comment
3174 tok_state = tok_state_data
3175 cur -= 1 # we didn't parse/handle the char after <
3176 return new_text_node '<'
3178 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer "end tag open" state (entered after '</'): consumes one character
# and either starts a new end tag in tok_cur_tag and moves to
# tok_state_tag_name, returns to the data state (in one case returning '</'
# as text), or starts a bogus comment that begins with the consumed
# character.
3179 tok_state_end_tag_open = ->
3180 c = txt.charAt(cur++)
# the first character of the tag name is lowercased in this branch and used
# as-is in the next one
3182 tok_cur_tag = new_end_tag c.toLowerCase()
3183 tok_state = tok_state_tag_name
3186 tok_cur_tag = new_end_tag c
3187 tok_state = tok_state_tag_name
3191 tok_state = tok_state_data
3195 tok_state = tok_state_data
3196 return new_text_node '</'
3199 tok_cur_tag = new_comment_token c
3200 tok_state = tok_state_bogus_comment
3203 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer "tag name" state: consumes one character. Tab, newline, form feed
# and space move to tok_state_before_attribute_name; other characters move to
# the self-closing-start-tag state, return to the data state, or are appended
# to tok_cur_tag.name (lowercased, as-is, or as U+FFFD).
3204 tok_state_tag_name = ->
3205 switch c = txt.charAt(cur++)
3206 when "\t", "\n", "\u000c", ' '
3207 tok_state = tok_state_before_attribute_name
3209 tok_state = tok_state_self_closing_start_tag
3211 tok_state = tok_state_data
3217 tok_cur_tag.name += "\ufffd"
3220 tok_state = tok_state_data
3223 tok_cur_tag.name += c.toLowerCase()
3225 tok_cur_tag.name += c
3228 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer "RCDATA less-than sign" state: consumes one character. Either
# temporary_buffer is reset and the tokenizer moves to
# tok_state_rcdata_end_tag_open, or it returns to tok_state_rcdata, un-reads
# the character and returns '<' as a character token.
3229 tok_state_rcdata_less_than_sign = ->
3230 c = txt.charAt(cur++)
3232 temporary_buffer = ''
3233 tok_state = tok_state_rcdata_end_tag_open
3236 tok_state = tok_state_rcdata
3237 cur -= 1 # reconsume the input character
3238 return new_character_token '<'
3240 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer "RCDATA end tag open" state: consumes one character. Either a new
# end tag is started in tok_cur_tag (name lowercased in the first branch), the
# raw character is appended to temporary_buffer and the tokenizer moves to
# tok_state_rcdata_end_tag_name; or it returns to tok_state_rcdata, un-reads
# the character and returns "</" as a character token.
3241 tok_state_rcdata_end_tag_open = ->
3242 c = txt.charAt(cur++)
3244 tok_cur_tag = new_end_tag c.toLowerCase()
3245 temporary_buffer += c
3246 tok_state = tok_state_rcdata_end_tag_name
3249 tok_cur_tag = new_end_tag c
3250 temporary_buffer += c
3251 tok_state = tok_state_rcdata_end_tag_name
3254 tok_state = tok_state_rcdata
3255 cur -= 1 # reconsume the input character
3256 return new_character_token "</" # fixfull separate these
3258 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of the
# current node (open_els[0]). Also writes the token and the serialized
# open_els to the debug log.
3259 is_appropriate_end_tag = (t) ->
3260 # spec says to check against "the tag name of the last start tag to
3261 # have been emitted from this tokenizer", but this is only called from
3262 # the various "raw" states, so it's hopefully ok to assume that
3263 # open_els[0].name will work instead TODO: verify this after the script
3264 # data states are implemented
3265 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3266 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3268 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer "RCDATA end tag name" state: consumes one character. When
# tok_cur_tag is an appropriate end tag, the character can end the tag name
# and move to the before-attribute-name, self-closing-start-tag or data
# state. Tag-name characters are appended to tok_cur_tag.name and
# temporary_buffer. Otherwise the tokenizer returns to tok_state_rcdata,
# un-reads the character and returns '</' plus temporary_buffer as a
# character token.
3269 tok_state_rcdata_end_tag_name = ->
3270 c = txt.charAt(cur++)
3271 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3272 if is_appropriate_end_tag tok_cur_tag
3273 tok_state = tok_state_before_attribute_name
3275 # else fall through to "Anything else"
3277 if is_appropriate_end_tag tok_cur_tag
3278 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3280 # else fall through to "Anything else"
3282 if is_appropriate_end_tag tok_cur_tag
3283 tok_state = tok_state_data
3285 # else fall through to "Anything else"
# the tag name gets the lowercased character, temporary_buffer the raw one
3287 tok_cur_tag.name += c.toLowerCase()
3288 temporary_buffer += c
3291 tok_cur_tag.name += c
3292 temporary_buffer += c
3295 tok_state = tok_state_rcdata
3296 cur -= 1 # reconsume the input character
3297 return new_character_token '</' + temporary_buffer # fixfull separate these
3299 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3300 tok_state_rawtext_less_than_sign = -> # tokenizer state: RAWTEXT less-than sign
3301 c = txt.charAt(cur++) # consume the next input character
3303 temporary_buffer = '' # start an empty buffer; the end-tag states append the candidate tag name to it
3304 tok_state = tok_state_rawtext_end_tag_open
3307 tok_state = tok_state_rawtext # return to the RAWTEXT state
3308 cur -= 1 # reconsume the input character
3309 return new_character_token '<' # the '<' is emitted as literal text
3311 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3312 tok_state_rawtext_end_tag_open = -> # tokenizer state: RAWTEXT end tag open
3313 c = txt.charAt(cur++) # consume the next input character
3315 tok_cur_tag = new_end_tag c.toLowerCase() # start an end tag token named with the lowercased character
3316 temporary_buffer += c # the buffer keeps the character exactly as written
3317 tok_state = tok_state_rawtext_end_tag_name
3320 tok_cur_tag = new_end_tag c # start an end tag token named with the character as-is
3321 temporary_buffer += c
3322 tok_state = tok_state_rawtext_end_tag_name
3325 tok_state = tok_state_rawtext # return to the RAWTEXT state
3326 cur -= 1 # reconsume the input character
3327 return new_character_token "</" # fixfull separate these
3329 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3330 tok_state_rawtext_end_tag_name = -> # tokenizer state: RAWTEXT end tag name; grows tok_cur_tag.name and temporary_buffer
3331 c = txt.charAt(cur++) # consume the next input character
3332 if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
3333 if is_appropriate_end_tag tok_cur_tag
3334 tok_state = tok_state_before_attribute_name
3336 # else fall through to "Anything else"
3338 if is_appropriate_end_tag tok_cur_tag
3339 tok_state = tok_state_self_closing_start_tag
3341 # else fall through to "Anything else"
3343 if is_appropriate_end_tag tok_cur_tag
3344 tok_state = tok_state_data
3346 # else fall through to "Anything else"
3348 tok_cur_tag.name += c.toLowerCase() # the tag name is lowercased ...
3349 temporary_buffer += c # ... while the buffer keeps the original spelling
3352 tok_cur_tag.name += c
3353 temporary_buffer += c
3356 tok_state = tok_state_rawtext # return to the RAWTEXT state
3357 cur -= 1 # reconsume the input character
3358 return new_character_token '</' + temporary_buffer # fixfull separate these
3360 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3361 tok_state_script_data_less_than_sign = -> # tokenizer state: script data less-than sign
3362 c = txt.charAt(cur++) # consume the next input character
3364 temporary_buffer = '' # start an empty buffer; the end-tag states append the candidate tag name to it
3365 tok_state = tok_state_script_data_end_tag_open
3368 tok_state = tok_state_script_data_escape_start
3369 return new_character_token '<!' # fixfull split
3371 tok_state = tok_state_script_data # return to the script data state
3372 cur -= 1 # Reconsume
3373 return new_character_token '<' # the '<' is emitted as literal text
3375 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3376 tok_state_script_data_end_tag_open = -> # tokenizer state: script data end tag open
3377 c = txt.charAt(cur++) # consume the next input character
3379 tok_cur_tag = new_end_tag c.toLowerCase() # start an end tag token named with the lowercased character
3380 temporary_buffer += c # the buffer keeps the character exactly as written
3381 tok_state = tok_state_script_data_end_tag_name
3384 tok_cur_tag = new_end_tag c # start an end tag token named with the character as-is
3385 temporary_buffer += c
3386 tok_state = tok_state_script_data_end_tag_name
3389 tok_state = tok_state_script_data # return to the script data state
3390 cur -= 1 # Reconsume
3391 return new_character_token '</' # both characters are emitted as literal text in one token
3393 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3394 tok_state_script_data_end_tag_name = -> # tokenizer state: script data end tag name; grows tok_cur_tag.name and temporary_buffer
3395 c = txt.charAt(cur++) # consume the next input character
3396 if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
3397 if is_appropriate_end_tag tok_cur_tag
3398 tok_state = tok_state_before_attribute_name
3402 if is_appropriate_end_tag tok_cur_tag
3403 tok_state = tok_state_self_closing_start_tag
3407 if is_appropriate_end_tag tok_cur_tag
3408 tok_state = tok_state_data
3412 tok_cur_tag.name += c.toLowerCase() # the tag name is lowercased ...
3413 temporary_buffer += c # ... while the buffer keeps the original spelling
3416 tok_cur_tag.name += c
3417 temporary_buffer += c
3420 tok_state = tok_state_script_data # return to the script data state
3421 cur -= 1 # Reconsume
3422 return new_character_token "</#{temporary_buffer}" # fixfull split
3424 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3425 tok_state_script_data_escape_start = -> # tokenizer state: script data escape start
3426 c = txt.charAt(cur++) # consume the next input character
3428 tok_state = tok_state_script_data_escape_start_dash
3429 return new_character_token '-' # the '-' is still emitted as text
3431 tok_state = tok_state_script_data # return to the script data state
3432 cur -= 1 # Reconsume
3435 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3436 tok_state_script_data_escape_start_dash = -> # tokenizer state: script data escape start dash
3437 c = txt.charAt(cur++) # consume the next input character
3439 tok_state = tok_state_script_data_escaped_dash_dash
3440 return new_character_token '-' # the '-' is still emitted as text
3442 tok_state = tok_state_script_data # return to the script data state
3443 cur -= 1 # Reconsume
3446 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3447 tok_state_script_data_escaped = -> # tokenizer state: script data escaped
3448 c = txt.charAt(cur++) # consume the next input character
3450 tok_state = tok_state_script_data_escaped_dash
3451 return new_character_token '-'
3453 tok_state = tok_state_script_data_escaped_less_than_sign
3457 return new_character_token "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3459 tok_state = tok_state_data
3461 cur -= 1 # Reconsume
3464 return new_character_token c # emit the consumed character unchanged
3466 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3467 tok_state_script_data_escaped_dash = -> # tokenizer state: script data escaped dash
3468 c = txt.charAt(cur++) # consume the next input character
3470 tok_state = tok_state_script_data_escaped_dash_dash
3471 return new_character_token '-'
3473 tok_state = tok_state_script_data_escaped_less_than_sign
3477 tok_state = tok_state_script_data_escaped
3478 return new_character_token "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3480 tok_state = tok_state_data
3482 cur -= 1 # Reconsume
3485 tok_state = tok_state_script_data_escaped
3486 return new_character_token c # emit the consumed character unchanged
3488 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3489 tok_state_script_data_escaped_dash_dash = -> # tokenizer state: script data escaped dash dash
3490 c = txt.charAt(cur++) # consume the next input character
3492 return new_character_token '-' # tok_state is left unchanged on this path
3494 tok_state = tok_state_script_data_escaped_less_than_sign
3497 tok_state = tok_state_script_data # leaves the escaped states
3498 return new_character_token '>'
3501 tok_state = tok_state_script_data_escaped
3502 return new_character_token "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3505 tok_state = tok_state_data
3506 cur -= 1 # Reconsume
3509 tok_state = tok_state_script_data_escaped
3510 return new_character_token c # emit the consumed character unchanged
3512 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3513 tok_state_script_data_escaped_less_than_sign = -> # tokenizer state: script data escaped less-than sign
3514 c = txt.charAt(cur++) # consume the next input character
3516 temporary_buffer = '' # start an empty buffer; the end-tag states append the candidate tag name to it
3517 tok_state = tok_state_script_data_escaped_end_tag_open
3520 temporary_buffer = c.toLowerCase() # yes, really: the double-escape-start state compares the buffer against lowercase 'script'
3521 tok_state = tok_state_script_data_double_escape_start
3522 return new_character_token "<#{c}" # fixfull split
3524 temporary_buffer = c
3525 tok_state = tok_state_script_data_double_escape_start
3526 return new_character_token "<#{c}" # fixfull split
3528 tok_state = tok_state_script_data_escaped # return to the script data escaped state
3529 cur -= 1 # Reconsume
3530 return new_character_token '<' # the '<' is emitted as literal text
3532 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3533 tok_state_script_data_escaped_end_tag_open = -> # tokenizer state: script data escaped end tag open
3534 c = txt.charAt(cur++) # consume the next input character
3536 tok_cur_tag = new_end_tag c.toLowerCase() # start an end tag token named with the lowercased character
3537 temporary_buffer += c # the buffer keeps the character exactly as written
3538 tok_state = tok_state_script_data_escaped_end_tag_name
3541 tok_cur_tag = new_end_tag c # start an end tag token named with the character as-is
3542 temporary_buffer += c
3543 tok_state = tok_state_script_data_escaped_end_tag_name
3546 tok_state = tok_state_script_data_escaped # return to the script data escaped state
3547 cur -= 1 # Reconsume
3548 return new_character_token '</' # fixfull split
3550 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3551 tok_state_script_data_escaped_end_tag_name = -> # tokenizer state: script data escaped end tag name; grows tok_cur_tag.name and temporary_buffer
3552 c = txt.charAt(cur++) # consume the next input character
3553 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
3554 if is_appropriate_end_tag tok_cur_tag
3555 tok_state = tok_state_before_attribute_name
3559 if is_appropriate_end_tag tok_cur_tag
3560 tok_state = tok_state_self_closing_start_tag
3564 if is_appropriate_end_tag tok_cur_tag
3565 tok_state = tok_state_data
3569 tok_cur_tag.name += c.toLowerCase()
3570 temporary_buffer += c.toLowerCase() # NOTE(review): the other end-tag-name states append c unchanged; lowercasing here changes the text re-emitted from temporary_buffer below - confirm this is intended
3573 tok_cur_tag.name += c
3574 temporary_buffer += c.toLowerCase()
3577 tok_state = tok_state_script_data_escaped # return to the script data escaped state
3578 cur -= 1 # Reconsume
3579 return new_character_token "</#{temporary_buffer}" # fixfull split
3581 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3582 tok_state_script_data_double_escape_start = -> # tokenizer state: script data double escape start
3583 c = txt.charAt(cur++) # consume the next input character
3584 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>' # whitespace, '/' or '>'
3585 if temporary_buffer is 'script' # the buffered name spells "script"
3586 tok_state = tok_state_script_data_double_escaped
3588 tok_state = tok_state_script_data_escaped
3589 return new_character_token c # the consumed character is emitted as text
3591 temporary_buffer += c.toLowerCase() # yes, really lowercase
3592 return new_character_token c # emitted text keeps the original case
3594 temporary_buffer += c
3595 return new_character_token c
3597 tok_state = tok_state_script_data_escaped # return to the script data escaped state
3598 cur -= 1 # Reconsume
3601 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3602 tok_state_script_data_double_escaped = -> # tokenizer state: script data double escaped
3603 c = txt.charAt(cur++) # consume the next input character
3605 tok_state = tok_state_script_data_double_escaped_dash
3606 return new_character_token '-'
3608 tok_state = tok_state_script_data_double_escaped_less_than_sign
3609 return new_character_token '<'
3612 return new_character_token "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3615 tok_state = tok_state_data
3616 cur -= 1 # Reconsume
3619 return new_character_token c # emit the consumed character unchanged
3621 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3622 tok_state_script_data_double_escaped_dash = -> # tokenizer state: script data double escaped dash
3623 c = txt.charAt(cur++) # consume the next input character
3625 tok_state = tok_state_script_data_double_escaped_dash_dash
3626 return new_character_token '-'
3628 tok_state = tok_state_script_data_double_escaped_less_than_sign
3629 return new_character_token '<'
3632 tok_state = tok_state_script_data_double_escaped
3633 return new_character_token "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3636 tok_state = tok_state_data
3637 cur -= 1 # Reconsume
3640 tok_state = tok_state_script_data_double_escaped
3641 return new_character_token c # emit the consumed character unchanged
3643 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3644 tok_state_script_data_double_escaped_dash_dash = -> # tokenizer state: script data double escaped dash dash
3645 c = txt.charAt(cur++) # consume the next input character
3647 return new_character_token '-' # tok_state is left unchanged on this path
3649 tok_state = tok_state_script_data_double_escaped_less_than_sign
3650 return new_character_token '<'
3652 tok_state = tok_state_script_data # leaves the escaped states
3653 return new_character_token '>'
3656 tok_state = tok_state_script_data_double_escaped
3657 return new_character_token "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3660 tok_state = tok_state_data
3661 cur -= 1 # Reconsume
3664 tok_state = tok_state_script_data_double_escaped
3665 return new_character_token c # emit the consumed character unchanged
3667 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3668 tok_state_script_data_double_escaped_less_than_sign = -> # tokenizer state: script data double escaped less-than sign
3669 c = txt.charAt(cur++) # consume the next input character
3671 temporary_buffer = '' # start an empty buffer; the double-escape-end state appends the candidate name to it
3672 tok_state = tok_state_script_data_double_escape_end
3673 return new_character_token '/'
3675 tok_state = tok_state_script_data_double_escaped # return to the double escaped state
3676 cur -= 1 # Reconsume
3679 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3680 tok_state_script_data_double_escape_end = -> # tokenizer state: script data double escape end
3681 c = txt.charAt(cur++) # consume the next input character
3682 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>' # whitespace, '/' or '>'
3683 if temporary_buffer is 'script' # the buffered name spells "script"
3684 tok_state = tok_state_script_data_escaped
3686 tok_state = tok_state_script_data_double_escaped
3687 return new_character_token c # the consumed character is emitted as text
3689 temporary_buffer += c.toLowerCase() # yes, really lowercase
3690 return new_character_token c # emitted text keeps the original case
3692 temporary_buffer += c
3693 return new_character_token c
3695 tok_state = tok_state_script_data_double_escaped # return to the double escaped state
3696 cur -= 1 # Reconsume
3699 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3700 tok_state_before_attribute_name = -> # tokenizer state: before attribute name
3702 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3703 when "\t", "\n", "\u000c", ' ' # tab, line feed, form feed or space
3706 tok_state = tok_state_self_closing_start_tag
3709 tok_state = tok_state_data
3715 attr_name = "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3716 when '"', "'", '<', '='
3721 tok_state = tok_state_data
3724 attr_name = c.toLowerCase() # attribute names are stored lowercased
3728 tok_cur_tag.attrs_a.unshift [attr_name, ''] # new [name, value] pair goes to the front, so attrs_a[0] is the attribute being built
3729 tok_state = tok_state_attribute_name
3732 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3733 tok_state_attribute_name = -> # tokenizer state: attribute name; extends the name held in tok_cur_tag.attrs_a[0][0]
3734 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3735 when "\t", "\n", "\u000c", ' ' # tab, line feed, form feed or space
3736 tok_state = tok_state_after_attribute_name
3738 tok_state = tok_state_self_closing_start_tag
3740 tok_state = tok_state_before_attribute_value
3742 tok_state = tok_state_data
3748 tok_cur_tag.attrs_a[0][0] += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3751 tok_cur_tag.attrs_a[0][0] += c
3754 tok_state = tok_state_data
3757 tok_cur_tag.attrs_a[0][0] += c.toLowerCase() # attribute names are stored lowercased
3759 tok_cur_tag.attrs_a[0][0] += c
3762 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3763 tok_state_after_attribute_name = -> # tokenizer state: after attribute name
3764 c = txt.charAt(cur++) # consume the next input character
3765 if c is "\t" or c is "\n" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
3768 tok_state = tok_state_self_closing_start_tag
3771 tok_state = tok_state_before_attribute_value
3774 tok_state = tok_state_data
3777 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), ''] # start a new attribute (lowercased name, empty value) at the front of attrs_a
3778 tok_state = tok_state_attribute_name
3782 tok_cur_tag.attrs_a.unshift ["\ufffd", ''] # new attribute whose name starts with U+FFFD
3783 tok_state = tok_state_attribute_name
3787 tok_state = tok_state_data
3788 cur -= 1 # reconsume
3790 if c is '"' or c is "'" or c is '<'
3792 # fall through to Anything else
3794 tok_cur_tag.attrs_a.unshift [c, ''] # new attribute whose name starts with c as-is
3795 tok_state = tok_state_attribute_name
3797 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3798 tok_state_before_attribute_value = -> # tokenizer state: before attribute value; picks the quoting style of the value
3799 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3800 when "\t", "\n", "\u000c", ' ' # tab, line feed, form feed or space
3803 tok_state = tok_state_attribute_value_double_quoted
3805 tok_state = tok_state_attribute_value_unquoted
3808 tok_state = tok_state_attribute_value_single_quoted
3811 tok_cur_tag.attrs_a[0][1] += "\ufffd" # value of the current attribute starts with U+FFFD
3812 tok_state = tok_state_attribute_value_unquoted
3815 tok_state = tok_state_data
3821 tok_state = tok_state_data
3823 tok_cur_tag.attrs_a[0][1] += c # c becomes the first character of an unquoted value
3824 tok_state = tok_state_attribute_value_unquoted
3827 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3828 tok_state_attribute_value_double_quoted = -> # tokenizer state: attribute value (double-quoted); extends tok_cur_tag.attrs_a[0][1]
3829 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3831 tok_state = tok_state_after_attribute_value_quoted
3833 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true # allowed_char is the double quote; in_attr is true
3836 tok_cur_tag.attrs_a[0][1] += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3839 tok_state = tok_state_data
3841 tok_cur_tag.attrs_a[0][1] += c
3844 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3845 tok_state_attribute_value_single_quoted = -> # tokenizer state: attribute value (single-quoted); extends tok_cur_tag.attrs_a[0][1]
3846 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3848 tok_state = tok_state_after_attribute_value_quoted
3850 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true # allowed_char is the single quote; in_attr is true
3853 tok_cur_tag.attrs_a[0][1] += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3856 tok_state = tok_state_data
3858 tok_cur_tag.attrs_a[0][1] += c
3861 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3862 tok_state_attribute_value_unquoted = -> # tokenizer state: attribute value (unquoted); extends tok_cur_tag.attrs_a[0][1]
3863 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3864 when "\t", "\n", "\u000c", ' ' # tab, line feed, form feed or space
3865 tok_state = tok_state_before_attribute_name
3867 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true # allowed_char is '>'; in_attr is true
3869 tok_state = tok_state_data
3874 tok_cur_tag.attrs_a[0][1] += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
3877 tok_state = tok_state_data
3879 # Parse Error if ', <, = or ` (backtick)
3880 tok_cur_tag.attrs_a[0][1] += c
3883 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3884 tok_state_after_attribute_value_quoted = -> # tokenizer state: after attribute value (quoted)
3885 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3886 when "\t", "\n", "\u000c", ' ' # tab, line feed, form feed or space
3887 tok_state = tok_state_before_attribute_name
3889 tok_state = tok_state_self_closing_start_tag
3891 tok_state = tok_state_data
3897 tok_state = tok_state_data
3900 tok_state = tok_state_before_attribute_name
3901 cur -= 1 # we didn't handle that char
3904 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3905 tok_state_self_closing_start_tag = -> # tokenizer state: self-closing start tag
3906 c = txt.charAt(cur++) # consume the next input character
3908 tok_cur_tag.flag 'self-closing', true # mark the current tag token as self-closing
3909 tok_state = tok_state_data
3913 tok_state = tok_state_data
3914 cur -= 1 # Reconsume
3918 tok_state = tok_state_before_attribute_name
3919 cur -= 1 # Reconsume
3922 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3923 # WARNING: put a comment token in tok_cur_tag before setting this state
3924 tok_state_bogus_comment = -> # tokenizer state: bogus comment; appends input text to tok_cur_tag.text, then returns to the data state
3925 next_gt = txt.indexOf '>', cur # index of the next '>' at or after cur
3927 val = txt.substr cur # everything from cur to the end of the input (presumably when no '>' was found)
3930 val = txt.substr cur, (next_gt - cur) # the text between cur and the '>'
3932 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # replace every U+0000 with U+FFFD
3933 tok_cur_tag.text += val
3934 tok_state = tok_state_data
3937 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3938 tok_state_markup_declaration_open = -> # tokenizer state: markup declaration open; looks ahead in txt without consuming one character at a time
3939 if txt.substr(cur, 2) is '--' # next two characters are "--"
3941 tok_cur_tag = new_comment_token '' # start a comment token with empty text
3942 tok_state = tok_state_comment_start
3944 if txt.substr(cur, 7).toLowerCase() is 'doctype' # case-insensitive match on the next seven characters
3946 tok_state = tok_state_doctype
3948 acn = adjusted_current_node()
3949 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA[' # CDATA sections are only recognised when the adjusted current node is outside the HTML namespace; match is case-sensitive
3951 tok_state = tok_state_cdata_section
3955 tok_cur_tag = new_comment_token '' # the bogus comment state requires a comment token in tok_cur_tag
3956 tok_state = tok_state_bogus_comment
3959 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3960 tok_state_comment_start = -> # tokenizer state: comment start
3961 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3963 tok_state = tok_state_comment_start_dash
3966 tok_state = tok_state_comment
3967 return new_character_token "\ufffd" # NOTE(review): the other comment states append U+FFFD to tok_cur_tag.text; this emits it as a character token instead - confirm intended
3970 tok_state = tok_state_data
3974 tok_state = tok_state_data
3975 cur -= 1 # Reconsume
3978 tok_cur_tag.text += c # c becomes part of the comment text
3979 tok_state = tok_state_comment
3982 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3983 tok_state_comment_start_dash = -> # tokenizer state: comment start dash
3984 switch c = txt.charAt(cur++) # consume the next input character and branch on it
3986 tok_state = tok_state_comment_end
3989 tok_cur_tag.text += "-\ufffd" # a '-' followed by U+FFFD is added to the comment text
3990 tok_state = tok_state_comment
3993 tok_state = tok_state_data
3997 tok_state = tok_state_data
3998 cur -= 1 # Reconsume
4001 tok_cur_tag.text += "-#{c}"
4002 tok_state = tok_state_comment
4005 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
4006 tok_state_comment = -> # tokenizer state: comment; accumulates the comment text in tok_cur_tag.text
4007 switch c = txt.charAt(cur++) # consume the next input character and branch on it
4009 tok_state = tok_state_comment_end_dash
4012 tok_cur_tag.text += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
4015 tok_state = tok_state_data
4016 cur -= 1 # Reconsume
4019 tok_cur_tag.text += c
4022 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
4023 tok_state_comment_end_dash = -> # tokenizer state: comment end dash
4024 switch c = txt.charAt(cur++) # consume the next input character and branch on it
4026 tok_state = tok_state_comment_end
4029 tok_cur_tag.text += "-\ufffd" # a '-' followed by U+FFFD is added to the comment text
4030 tok_state = tok_state_comment
4033 tok_state = tok_state_data
4034 cur -= 1 # Reconsume
4037 tok_cur_tag.text += "-#{c}"
4038 tok_state = tok_state_comment
4041 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
4042 tok_state_comment_end = -> # tokenizer state: comment end
4043 switch c = txt.charAt(cur++) # consume the next input character and branch on it
4045 tok_state = tok_state_data
4049 tok_cur_tag.text += "--\ufffd" # "--" followed by U+FFFD is added to the comment text
4050 tok_state = tok_state_comment
4053 tok_state = tok_state_comment_end_bang
4056 tok_cur_tag.text += '-' # tok_state is left unchanged on this path
4059 tok_state = tok_state_data
4060 cur -= 1 # Reconsume
4064 tok_cur_tag.text += "--#{c}"
4065 tok_state = tok_state_comment
4068 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4069 tok_state_comment_end_bang = -> # tokenizer state: comment end bang
4070 switch c = txt.charAt(cur++) # consume the next input character and branch on it
4072 tok_cur_tag.text += "--!#{c}"
4073 tok_state = tok_state_comment_end_dash # NOTE(review): the text appended just above includes the consumed character as well as "--!"; if this is the '-' branch, the HTML5 spec appends only "--!" - confirm
4075 tok_state = tok_state_data
4079 tok_cur_tag.text += "--!\ufffd" # "--!" followed by U+FFFD is added to the comment text
4080 tok_state = tok_state_comment
4083 tok_state = tok_state_data
4084 cur -= 1 # Reconsume
4087 tok_cur_tag.text += "--!#{c}"
4088 tok_state = tok_state_comment
4091 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4092 tok_state_doctype = -> # tokenizer state: DOCTYPE
4093 switch c = txt.charAt(cur++) # consume the next input character and branch on it
4094 when "\t", "\u000a", "\u000c", ' ' # tab, line feed, form feed or space
4095 tok_state = tok_state_before_doctype_name
4098 tok_state = tok_state_data
4099 el = new_doctype_token '' # DOCTYPE token with an empty name
4100 el.flag 'force-quirks', true # set the token's force-quirks flag
4101 cur -= 1 # Reconsume
4105 tok_state = tok_state_before_doctype_name
4106 cur -= 1 # Reconsume
4109 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
4110 tok_state_before_doctype_name = -> # tokenizer state: before DOCTYPE name; creates the DOCTYPE token
4111 c = txt.charAt(cur++) # consume the next input character
4112 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4115 tok_cur_tag = new_doctype_token c.toLowerCase() # DOCTYPE token whose name starts with the lowercased character
4116 tok_state = tok_state_doctype_name
4120 tok_cur_tag = new_doctype_token "\ufffd" # DOCTYPE token whose name starts with U+FFFD
4121 tok_state = tok_state_doctype_name
4125 el = new_doctype_token '' # DOCTYPE token with an empty name
4126 el.flag 'force-quirks', true # set the token's force-quirks flag
4127 tok_state = tok_state_data
4131 tok_state = tok_state_data
4132 el = new_doctype_token ''
4133 el.flag 'force-quirks', true
4134 cur -= 1 # Reconsume
4137 tok_cur_tag = new_doctype_token c # DOCTYPE token whose name starts with c as-is
4138 tok_state = tok_state_doctype_name
4141 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4142 tok_state_doctype_name = -> # tokenizer state: DOCTYPE name; extends tok_cur_tag.name
4143 c = txt.charAt(cur++) # consume the next input character
4144 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4145 tok_state = tok_state_after_doctype_name
4148 tok_state = tok_state_data
4151 tok_cur_tag.name += c.toLowerCase() # the name is stored lowercased
4155 tok_cur_tag.name += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
4159 tok_state = tok_state_data
4160 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4161 cur -= 1 # Reconsume
4164 tok_cur_tag.name += c
4167 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4168 tok_state_after_doctype_name = -> # tokenizer state: after DOCTYPE name; looks for the PUBLIC / SYSTEM keywords
4169 c = txt.charAt(cur++) # consume the next input character
4170 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4173 tok_state = tok_state_data
4177 tok_state = tok_state_data
4178 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4179 cur -= 1 # Reconsume
4182 if txt.substr(cur - 1, 6).toLowerCase() is 'public' # cur is already past c, so the six-character window starts at c; match is case-insensitive
4184 tok_state = tok_state_after_doctype_public_keyword
4186 if txt.substr(cur - 1, 6).toLowerCase() is 'system' # same window, case-insensitive
4188 tok_state = tok_state_after_doctype_system_keyword
4191 tok_cur_tag.flag 'force-quirks', true
4192 tok_state = tok_state_bogus_doctype
4195 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4196 tok_state_after_doctype_public_keyword = -> # tokenizer state: after DOCTYPE public keyword
4197 c = txt.charAt(cur++) # consume the next input character
4198 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4199 tok_state = tok_state_before_doctype_public_identifier
4203 tok_cur_tag.public_identifier = '' # the public identifier starts out as the empty string
4204 tok_state = tok_state_doctype_public_identifier_double_quoted
4208 tok_cur_tag.public_identifier = ''
4209 tok_state = tok_state_doctype_public_identifier_single_quoted
4213 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4214 tok_state = tok_state_data
4218 tok_state = tok_state_data
4219 tok_cur_tag.flag 'force-quirks', true
4220 cur -= 1 # Reconsume
4224 tok_cur_tag.flag 'force-quirks', true
4225 tok_state = tok_state_bogus_doctype
4228 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4229 tok_state_before_doctype_public_identifier = -> # tokenizer state: before DOCTYPE public identifier
4230 c = txt.charAt(cur++) # consume the next input character
4231 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4235 tok_cur_tag.public_identifier = '' # the public identifier starts out as the empty string
4236 tok_state = tok_state_doctype_public_identifier_double_quoted
4240 tok_cur_tag.public_identifier = ''
4241 tok_state = tok_state_doctype_public_identifier_single_quoted
4245 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4246 tok_state = tok_state_data
4250 tok_state = tok_state_data
4251 tok_cur_tag.flag 'force-quirks', true
4252 cur -= 1 # Reconsume
4256 tok_cur_tag.flag 'force-quirks', true
4257 tok_state = tok_state_bogus_doctype
4261 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4262 tok_state_doctype_public_identifier_double_quoted = -> # tokenizer state: DOCTYPE public identifier (double-quoted); extends tok_cur_tag.public_identifier
4263 c = txt.charAt(cur++) # consume the next input character
4265 tok_state = tok_state_after_doctype_public_identifier
4269 tok_cur_tag.public_identifier += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
4273 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4274 tok_state = tok_state_data
4278 tok_state = tok_state_data
4279 tok_cur_tag.flag 'force-quirks', true
4280 cur -= 1 # Reconsume
4283 tok_cur_tag.public_identifier += c
4286 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4287 tok_state_doctype_public_identifier_single_quoted = -> # tokenizer state: DOCTYPE public identifier (single-quoted); extends tok_cur_tag.public_identifier
4288 c = txt.charAt(cur++) # consume the next input character
4290 tok_state = tok_state_after_doctype_public_identifier
4294 tok_cur_tag.public_identifier += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
4298 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4299 tok_state = tok_state_data
4303 tok_state = tok_state_data
4304 tok_cur_tag.flag 'force-quirks', true
4305 cur -= 1 # Reconsume
4308 tok_cur_tag.public_identifier += c
4311 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4312 tok_state_after_doctype_public_identifier = -> # tokenizer state: after DOCTYPE public identifier
4313 c = txt.charAt(cur++) # consume the next input character
4314 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4315 tok_state = tok_state_between_doctype_public_and_system_identifiers
4318 tok_state = tok_state_data
4322 tok_cur_tag.system_identifier = '' # the system identifier starts out as the empty string
4323 tok_state = tok_state_doctype_system_identifier_double_quoted
4327 tok_cur_tag.system_identifier = ''
4328 tok_state = tok_state_doctype_system_identifier_single_quoted
4332 tok_state = tok_state_data
4333 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4334 cur -= 1 # Reconsume
4338 tok_cur_tag.flag 'force-quirks', true
4339 tok_state = tok_state_bogus_doctype
4342 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4343 tok_state_between_doctype_public_and_system_identifiers = -> # tokenizer state: between DOCTYPE public and system identifiers
4344 c = txt.charAt(cur++) # consume the next input character
4345 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4348 tok_state = tok_state_data
4352 tok_cur_tag.system_identifier = '' # the system identifier starts out as the empty string
4353 tok_state = tok_state_doctype_system_identifier_double_quoted
4357 tok_cur_tag.system_identifier = ''
4358 tok_state = tok_state_doctype_system_identifier_single_quoted
4362 tok_state = tok_state_data
4363 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4364 cur -= 1 # Reconsume
4368 tok_cur_tag.flag 'force-quirks', true
4369 tok_state = tok_state_bogus_doctype
4372 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4373 tok_state_after_doctype_system_keyword = -> # tokenizer state: after DOCTYPE system keyword
4374 c = txt.charAt(cur++) # consume the next input character
4375 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4376 tok_state = tok_state_before_doctype_system_identifier
4380 tok_cur_tag.system_identifier = '' # the system identifier starts out as the empty string
4381 tok_state = tok_state_doctype_system_identifier_double_quoted
4385 tok_cur_tag.system_identifier = ''
4386 tok_state = tok_state_doctype_system_identifier_single_quoted
4390 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4391 tok_state = tok_state_data
4395 tok_state = tok_state_data
4396 tok_cur_tag.flag 'force-quirks', true
4397 cur -= 1 # Reconsume
4401 tok_cur_tag.flag 'force-quirks', true
4402 tok_state = tok_state_bogus_doctype
4405 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4406 tok_state_before_doctype_system_identifier = -> # tokenizer state: before DOCTYPE system identifier
4407 c = txt.charAt(cur++) # consume the next input character
4408 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4411 tok_cur_tag.system_identifier = '' # the system identifier starts out as the empty string
4412 tok_state = tok_state_doctype_system_identifier_double_quoted
4415 tok_cur_tag.system_identifier = ''
4416 tok_state = tok_state_doctype_system_identifier_single_quoted
4420 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4421 tok_state = tok_state_data
4425 tok_state = tok_state_data
4426 tok_cur_tag.flag 'force-quirks', true
4427 cur -= 1 # Reconsume
4431 tok_cur_tag.flag 'force-quirks', true
4432 tok_state = tok_state_bogus_doctype
4435 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4436 tok_state_doctype_system_identifier_double_quoted = -> # tokenizer state: DOCTYPE system identifier (double-quoted); extends tok_cur_tag.system_identifier
4437 c = txt.charAt(cur++) # consume the next input character
4439 tok_state = tok_state_after_doctype_system_identifier
4443 tok_cur_tag.system_identifier += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
4447 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4448 tok_state = tok_state_data
4452 tok_state = tok_state_data
4453 tok_cur_tag.flag 'force-quirks', true
4454 cur -= 1 # Reconsume
4457 tok_cur_tag.system_identifier += c
4460 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4461 tok_state_doctype_system_identifier_single_quoted = -> # tokenizer state: DOCTYPE system identifier (single-quoted); extends tok_cur_tag.system_identifier
4462 c = txt.charAt(cur++) # consume the next input character
4464 tok_state = tok_state_after_doctype_system_identifier
4468 tok_cur_tag.system_identifier += "\ufffd" # U+FFFD REPLACEMENT CHARACTER
4472 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4473 tok_state = tok_state_data
4477 tok_state = tok_state_data
4478 tok_cur_tag.flag 'force-quirks', true
4479 cur -= 1 # Reconsume
4482 tok_cur_tag.system_identifier += c
4485 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4486 tok_state_after_doctype_system_identifier = -> # tokenizer state: after DOCTYPE system identifier
4487 c = txt.charAt(cur++) # consume the next input character
4488 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' # tab, line feed, form feed or space
4491 tok_state = tok_state_data
4495 tok_state = tok_state_data
4496 tok_cur_tag.flag 'force-quirks', true # set the token's force-quirks flag
4497 cur -= 1 # Reconsume
4501 # do _not_ tok_cur_tag.flag 'force-quirks', true
4502 tok_state = tok_state_bogus_doctype
4505 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4506 tok_state_bogus_doctype = -> # tokenizer state: bogus DOCTYPE
4507 c = txt.charAt(cur++) # consume the next input character
4509 tok_state = tok_state_data
4512 tok_state = tok_state_data
4513 cur -= 1 # Reconsume
4518 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4519 tok_state_cdata_section = -> # tokenizer state: CDATA section; returns the section's text as a single character token
4520 tok_state = tok_state_data # the next token is always read in the data state
4521 next_gt = txt.indexOf ']]>', cur # index of the next "]]>" at or after cur
4523 val = txt.substr cur # everything from cur to the end of the input (presumably when no "]]>" was found)
4526 val = txt.substr cur, (next_gt - cur) # the text between cur and the "]]>"
4528 return new_character_token val # fixfull split
4530 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4531 # Don't set this as a state, just call it
4532 # returns a string (NOT a text node)
4533 parse_character_reference = (allowed_char = null, in_attr = false) -> # allowed_char: extra character matched alongside whitespace, '<' and '&' in the first switch; in_attr: passed as true by the attribute value states
4534 if cur >= txt.length # no input left after the '&'
4536 switch c = txt.charAt(cur) # peek only: cur is not advanced here
4537 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4538 # explicitly not a parse error
4541 # there has to be "one or more" alnums between & and ; to be a parse error
4544 if cur + 1 >= txt.length
4546 if txt.charAt(cur + 1).toLowerCase() is 'x' # 'x' or 'X' in second position (presumably the hexadecimal form of a numeric reference)
4555 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1 # count consecutive characters that belong to charset
4560 if txt.charAt(start + i) is ';'
4564 code_point = txt.substr(start, i)
4565 while code_point.charAt(0) is '0' and code_point.length > 1 # strip leading zeros, keeping at least one digit
4566 code_point = code_point.substr 1
4567 code_point = parseInt(code_point, base) # from here on code_point is a number
4568 if unicode_fixes[code_point]? # unicode_fixes supplies a replacement string for this code point
4570 return unicode_fixes[code_point]
4572 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff # surrogate range, or beyond U+10FFFF
4576 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4578 return from_code_point code_point
4582 if alnum.indexOf(txt.charAt(cur + i)) is -1
4585 # exit early, because parse_error() below needs at least one alnum
4587 if txt.charAt(cur + i) is ';'
4588 i += 1 # include ';' terminator in value
4589 decoded = decode_named_char_ref txt.substr(cur, i)
4596 # no ';' terminator (only legacy char refs)
4598 for i in [2..max] # no prefix matches, so ok to check shortest first
4599 c = legacy_char_refs[txt.substr(cur, i)] # look up the first i characters in the legacy table
4602 if txt.charAt(cur + i) is '='
4603 # "because some legacy user agents will
4604 # misinterpret the markup in those cases"
4607 if alnum.indexOf(txt.charAt(cur + i)) > -1
4608 # this makes attributes forgiving about url args
4610 # ok, and besides the weird exceptions for attributes...
4611 # return the matching char
4612 cur += i # consume entity chars
4613 parse_error() # because no terminating ";"
4617 return # never reached
4619 # tree constructor initialization
4620 # see comments on TYPE_TAG/etc for the structure of this data
4623 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4624 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4626 afe = [] # active formatting elements
4627 template_ins_modes = []
4628 ins_mode = ins_mode_initial
4629 original_ins_mode = ins_mode # TODO check spec
4630 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4631 flag_frameset_ok = true
4633 flag_foster_parenting = false
4634 form_element_pointer = null
4635 temporary_buffer = null
4636 pending_table_character_tokens = []
4637 head_element_pointer = null
4638 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4639 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4640 prev_node_id = 0 # just for debugging
4642 # tokenizer initialization
4643 tok_state = tok_state_data
4645 # text pre-processing
4646 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4647 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4648 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4649 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4651 if args.name is "tests20.dat #22"
4654 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4659 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# serialize_els(els, shallow, show_ids): builds up `serialized` by appending
# the result of t.serialize(shallow, show_ids); both flags are handed through
# unchanged to each serialize() call.
# NOTE(review): presumably there is one call per entry of `els` -- confirm.
4662 serialize_els = (els, shallow, show_ids) ->
4668 serialized += t.serialize shallow, show_ids
# Public API. Everything below is attached to module.exports, which the top of
# this file points at window.wheic when no CommonJS-style `module.exports`
# already exists.
# Exported: the parse_html entry point, the debug_log_* functions, and the
# constants used on parsed nodes (TYPE_* node types, NS_* namespaces and the
# QUIRKS_* values stored in the document's 'quirks mode' flag).
4671 module.exports.parse_html = parse_html
4672 module.exports.debug_log_reset = debug_log_reset
4673 module.exports.debug_log_each = debug_log_each
4674 module.exports.TYPE_TAG = TYPE_TAG
4675 module.exports.TYPE_TEXT = TYPE_TEXT
4676 module.exports.TYPE_COMMENT = TYPE_COMMENT
4677 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4678 module.exports.NS_HTML = NS_HTML
4679 module.exports.NS_MATHML = NS_MATHML
4680 module.exports.NS_SVG = NS_SVG
4681 module.exports.QUIRKS_NO = QUIRKS_NO
4682 module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4683 module.exports.QUIRKS_YES = QUIRKS_YES