1 # Copyright 2015 Jason Woofenden
2 # This file implements an HTML5 parser
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a thorough parser for html5, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
24 # http://www.w3.org/TR/html5/syntax.html
26 # except for some places marked "WHATWG" that are implemented as described here:
28 # https://html.spec.whatwg.org/multipage/syntax.html
30 # This code passes all of the tests in the .dat files at:
32 # https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
35 ##################################
36 ## how to use this code
37 ##################################
39 # See README.md for how to run this file in the browser or in node.js.
# This file exports a single useful function: parse_html, and some constants
42 # (see the bottom of this file for those.)
46 # peach_parser.parse("<p><b>hi</p>")
48 # Or, if you don't want <html><head><body>/etc, do this:
50 # peach_parser.parse("<p><b>hi</p>", {fragment: "body"})
52 # return value is an array of Nodes, see "class Node" below.
# This code is a work in progress, e.g. try searching this file for "fixfull",
60 # Jason was frequently confused by the terminology used to refer to different
61 # parts of the stacks and lists in the spec, so he made this chart to help keep
64 # stacks grow downward (current element is index=0)
66 # example: open_els = [a, b, c, d, e, f, g]
68 # "grows downwards" means it's visualized like this: (index: el "names")
70 # 6: g "start of the list", "topmost", "first"
72 # 4: e "previous" (to d), "above", "before"
73 # 3: d (previous/next are relative to this element)
74 # 2: c "next", "after", "lower", "below"
76 # 0: a "end of the list", "current node", "bottommost", "last"
78 if (typeof module) isnt 'undefined' and module.exports?
80 exports = module.exports
83 window.peach_parser = {}
84 exports = window.peach_parser
86 from_code_point = (x) ->
87 if String.fromCodePoint?
88 return String.fromCodePoint x
91 return String.fromCharCode x
93 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
# Each node is an object of the Node class. Here are the Node types:
96 TYPE_TAG = 'tag' # name, {attributes}, [children]
97 TYPE_TEXT = 'text' # "text"
98 TYPE_COMMENT = 'comment'
99 TYPE_DOCTYPE = 'doctype'
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
101 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
102 TYPE_END_TAG = 5 # name
104 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
105 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
107 # namespace constants
112 # quirks mode constants
114 QUIRKS_LIMITED = 'limited'
117 # queue up debug logs, so eg they can be shown only for tests that fail
125 debug_log_each = (cb) ->
126 for str in g_debug_log
132 constructor: (type, args = {}) ->
133 @type = type # one of the TYPE_* constants above
134 @name = args.name ? '' # tag name
135 @text = args.text ? '' # contents for text/comment nodes
136 @attrs = args.attrs ? {}
137 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
138 @children = args.children ? []
139 @namespace = args.namespace ? NS_HTML
140 @parent = args.parent ? null
141 @token = args.token ? null
142 @flags = args.flags ? {}
146 @id = "#{++prev_node_id}"
147 acknowledge_self_closing: ->
149 @token.flag 'did_self_close', true
151 @flag 'did_self_close', true
153 flag: (key, value = null) ->
# Convenience constructors for tokens/nodes. Each takes only the one argument
# that is normally known at the point where the parser creates the node; every
# other Node field falls back to the default chosen by the Node constructor.
new_open_tag = (name) -> new Node(TYPE_START_TAG, {name: name})
new_end_tag = (name) -> new Node(TYPE_END_TAG, {name: name})
new_element = (name) -> new Node(TYPE_TAG, {name: name})
new_text_node = (txt) -> new Node(TYPE_TEXT, {text: txt})
# character tokens are built exactly like text nodes
new_character_token = new_text_node
new_comment_token = (txt) -> new Node(TYPE_COMMENT, {text: txt})
new_doctype_token = (name) -> new Node(TYPE_DOCTYPE, {name: name})
175 return new Node TYPE_EOF
177 return new Node TYPE_AFE_MARKER
# Create a fresh Node of type TYPE_AAA_BOOKMARK (no name, text or attributes
# are supplied, so the Node constructor defaults apply).
new_aaa_bookmark = -> new Node(TYPE_AAA_BOOKMARK)
# Character classes, kept as plain strings so that membership can be tested
# with indexOf.
digits = "0123456789"
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# True only when str is exactly one character long and that character is A-Z.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1

# True only when str is exactly one character long and that character is a-z.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
192 # some SVG elements have dashes in them
193 tag_name_chars = alnum + "-"
195 # http://www.w3.org/TR/html5/infrastructure.html#space-character
196 space_chars = "\u0009\u000a\u000c\u000d\u0020"
198 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token t is a TYPE_TEXT token whose text is exactly one character
# long and that character is listed in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	text = t.text
	text.length is 1 and space_chars.indexOf(text) > -1
202 is_input_hidden_tok = (t) ->
203 return false unless t.type is TYPE_START_TAG
206 if a[1].toLowerCase() is 'hidden'
211 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
212 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
215 unicode_fixes[0x00] = "\uFFFD"
216 unicode_fixes[0x80] = "\u20AC"
217 unicode_fixes[0x82] = "\u201A"
218 unicode_fixes[0x83] = "\u0192"
219 unicode_fixes[0x84] = "\u201E"
220 unicode_fixes[0x85] = "\u2026"
221 unicode_fixes[0x86] = "\u2020"
222 unicode_fixes[0x87] = "\u2021"
223 unicode_fixes[0x88] = "\u02C6"
224 unicode_fixes[0x89] = "\u2030"
225 unicode_fixes[0x8A] = "\u0160"
226 unicode_fixes[0x8B] = "\u2039"
227 unicode_fixes[0x8C] = "\u0152"
228 unicode_fixes[0x8E] = "\u017D"
229 unicode_fixes[0x91] = "\u2018"
230 unicode_fixes[0x92] = "\u2019"
231 unicode_fixes[0x93] = "\u201C"
232 unicode_fixes[0x94] = "\u201D"
233 unicode_fixes[0x95] = "\u2022"
234 unicode_fixes[0x96] = "\u2013"
235 unicode_fixes[0x97] = "\u2014"
236 unicode_fixes[0x98] = "\u02DC"
237 unicode_fixes[0x99] = "\u2122"
238 unicode_fixes[0x9A] = "\u0161"
239 unicode_fixes[0x9B] = "\u203A"
240 unicode_fixes[0x9C] = "\u0153"
241 unicode_fixes[0x9E] = "\u017E"
242 unicode_fixes[0x9F] = "\u0178"
244 quirks_yes_pi_prefixes = [
245 "+//silmaril//dtd html pro v0r11 19970101//"
246 "-//as//dtd html 3.0 aswedit + extensions//"
247 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
248 "-//ietf//dtd html 2.0 level 1//"
249 "-//ietf//dtd html 2.0 level 2//"
250 "-//ietf//dtd html 2.0 strict level 1//"
251 "-//ietf//dtd html 2.0 strict level 2//"
252 "-//ietf//dtd html 2.0 strict//"
253 "-//ietf//dtd html 2.0//"
254 "-//ietf//dtd html 2.1e//"
255 "-//ietf//dtd html 3.0//"
256 "-//ietf//dtd html 3.2 final//"
257 "-//ietf//dtd html 3.2//"
258 "-//ietf//dtd html 3//"
259 "-//ietf//dtd html level 0//"
260 "-//ietf//dtd html level 1//"
261 "-//ietf//dtd html level 2//"
262 "-//ietf//dtd html level 3//"
263 "-//ietf//dtd html strict level 0//"
264 "-//ietf//dtd html strict level 1//"
265 "-//ietf//dtd html strict level 2//"
266 "-//ietf//dtd html strict level 3//"
267 "-//ietf//dtd html strict//"
268 "-//ietf//dtd html//"
269 "-//metrius//dtd metrius presentational//"
270 "-//microsoft//dtd internet explorer 2.0 html strict//"
271 "-//microsoft//dtd internet explorer 2.0 html//"
272 "-//microsoft//dtd internet explorer 2.0 tables//"
273 "-//microsoft//dtd internet explorer 3.0 html strict//"
274 "-//microsoft//dtd internet explorer 3.0 html//"
275 "-//microsoft//dtd internet explorer 3.0 tables//"
276 "-//netscape comm. corp.//dtd html//"
277 "-//netscape comm. corp.//dtd strict html//"
278 "-//o'reilly and associates//dtd html 2.0//"
279 "-//o'reilly and associates//dtd html extended 1.0//"
280 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
281 "-//sq//dtd html 2.0 hotmetal + extensions//"
282 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
283 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
284 "-//spyglass//dtd html 2.0 extended//"
285 "-//sun microsystems corp.//dtd hotjava html//"
286 "-//sun microsystems corp.//dtd hotjava strict html//"
287 "-//w3c//dtd html 3 1995-03-24//"
288 "-//w3c//dtd html 3.2 draft//"
289 "-//w3c//dtd html 3.2 final//"
290 "-//w3c//dtd html 3.2//"
291 "-//w3c//dtd html 3.2s draft//"
292 "-//w3c//dtd html 4.0 frameset//"
293 "-//w3c//dtd html 4.0 transitional//"
294 "-//w3c//dtd html experimental 19960712//"
295 "-//w3c//dtd html experimental 970421//"
296 "-//w3c//dtd w3 html//"
297 "-//w3o//dtd w3 html 3.0//"
298 "-//webtechs//dtd mozilla html 2.0//"
299 "-//webtechs//dtd mozilla html//"
302 # These are the character references that don't need a terminating semicolon
303 # min length: 2, max: 6, none are a prefix of any other.
305 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
306 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
307 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
308 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
309 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
310 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
311 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
312 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
313 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
314 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
315 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
316 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
317 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
318 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
319 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
320 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
321 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
325 #void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
326 #raw_text_elements = ['script', 'style']
327 #escapable_raw_text_elements = ['textarea', 'title']
328 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
330 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
331 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
332 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
333 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
334 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
335 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
336 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
337 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
338 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
339 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
340 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
341 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
342 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
343 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
347 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
349 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
350 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
351 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
352 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
353 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
354 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
355 'determinant', 'diff', 'divergence', 'divide', 'domain',
356 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
357 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
358 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
359 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
360 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
361 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
362 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
363 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
364 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
365 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
366 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
367 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
368 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
369 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
370 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
371 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
372 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
373 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
374 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
375 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
376 'vectorproduct', 'xor'
378 # foreign_elements = [svg_elements..., mathml_elements...]
379 #normal_elements = All other allowed HTML elements are normal elements.
383 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
384 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
385 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
386 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
387 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
388 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
389 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
390 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
391 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
392 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
393 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
395 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
397 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
398 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
399 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
400 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
401 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
402 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
403 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
406 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
407 'annotation-xml':NS_MATHML,
410 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
413 formatting_elements = {
414 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
415 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
419 mathml_text_integration = {
420 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el's name is listed in mathml_text_integration and the namespace
# recorded there is the same as el's own namespace.
is_mathml_text_integration_point = (el) ->
	integration_ns = mathml_text_integration[el.name]
	integration_ns is el.namespace
424 is_html_integration = (el) -> # DON'T PASS A TOKEN
425 if el.namespace is NS_MATHML
426 if el.name is 'annotation-xml'
427 if el.attrs.encoding?
428 if el.attrs.encoding.toLowerCase() is 'text/html'
430 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
433 if el.namespace is NS_SVG
434 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
439 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
442 foster_parenting_targets = {
# True when e's name is listed in special_elements with the same namespace
# that e itself has.
el_is_special = (e) ->
	special_ns = special_elements[e.name]
	special_ns is e.namespace
# The three HTML elements (address, div, p) excluded by el_is_special_not_adp.
adp_els = {address: NS_HTML, div: NS_HTML, p: NS_HTML}

# True when el is listed in special_elements (name and namespace both match)
# and is not one of the address/div/p entries in adp_els.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
472 altglyphdef: 'altGlyphDef'
473 altglyphitem: 'altGlyphItem'
474 animatecolor: 'animateColor'
475 animatemotion: 'animateMotion'
476 animatetransform: 'animateTransform'
479 fecolormatrix: 'feColorMatrix'
480 fecomponenttransfer: 'feComponentTransfer'
481 fecomposite: 'feComposite'
482 feconvolvematrix: 'feConvolveMatrix'
483 fediffuselighting: 'feDiffuseLighting'
484 fedisplacementmap: 'feDisplacementMap'
485 fedistantlight: 'feDistantLight'
486 fedropshadow: 'feDropShadow'
492 fegaussianblur: 'feGaussianBlur'
495 femergenode: 'feMergeNode'
496 femorphology: 'feMorphology'
498 fepointlight: 'fePointLight'
499 fespecularlighting: 'feSpecularLighting'
500 fespotlight: 'feSpotLight'
502 feturbulence: 'feTurbulence'
503 foreignobject: 'foreignObject'
505 lineargradient: 'linearGradient'
506 radialgradient: 'radialGradient'
509 svg_attribute_fixes = {
510 attributename: 'attributeName'
511 attributetype: 'attributeType'
512 basefrequency: 'baseFrequency'
513 baseprofile: 'baseProfile'
515 clippathunits: 'clipPathUnits'
516 contentscripttype: 'contentScriptType'
517 contentstyletype: 'contentStyleType'
518 diffuseconstant: 'diffuseConstant'
520 externalresourcesrequired: 'externalResourcesRequired'
521 # WHATWG removes this: filterres: 'filterRes'
522 filterunits: 'filterUnits'
524 gradienttransform: 'gradientTransform'
525 gradientunits: 'gradientUnits'
526 kernelmatrix: 'kernelMatrix'
527 kernelunitlength: 'kernelUnitLength'
528 keypoints: 'keyPoints'
529 keysplines: 'keySplines'
531 lengthadjust: 'lengthAdjust'
532 limitingconeangle: 'limitingConeAngle'
533 markerheight: 'markerHeight'
534 markerunits: 'markerUnits'
535 markerwidth: 'markerWidth'
536 maskcontentunits: 'maskContentUnits'
537 maskunits: 'maskUnits'
538 numoctaves: 'numOctaves'
539 pathlength: 'pathLength'
540 patterncontentunits: 'patternContentUnits'
541 patterntransform: 'patternTransform'
542 patternunits: 'patternUnits'
543 pointsatx: 'pointsAtX'
544 pointsaty: 'pointsAtY'
545 pointsatz: 'pointsAtZ'
546 preservealpha: 'preserveAlpha'
547 preserveaspectratio: 'preserveAspectRatio'
548 primitiveunits: 'primitiveUnits'
551 repeatcount: 'repeatCount'
552 repeatdur: 'repeatDur'
553 requiredextensions: 'requiredExtensions'
554 requiredfeatures: 'requiredFeatures'
555 specularconstant: 'specularConstant'
556 specularexponent: 'specularExponent'
557 spreadmethod: 'spreadMethod'
558 startoffset: 'startOffset'
559 stddeviation: 'stdDeviation'
560 stitchtiles: 'stitchTiles'
561 surfacescale: 'surfaceScale'
562 systemlanguage: 'systemLanguage'
563 tablevalues: 'tableValues'
566 textlength: 'textLength'
568 viewtarget: 'viewTarget'
569 xchannelselector: 'xChannelSelector'
570 ychannelselector: 'yChannelSelector'
571 zoomandpan: 'zoomAndPan'
573 foreign_attr_fixes = {
574 'xlink:actuate': 'xlink actuate'
575 'xlink:arcrole': 'xlink arcrole'
576 'xlink:href': 'xlink href'
577 'xlink:role': 'xlink role'
578 'xlink:show': 'xlink show'
579 'xlink:title': 'xlink title'
580 'xlink:type': 'xlink type'
581 'xml:base': 'xml base'
582 'xml:lang': 'xml lang'
583 'xml:space': 'xml space'
585 'xmlns:xlink': 'xmlns xlink'
587 adjust_mathml_attributes = (t) ->
589 if a[0] is 'definitionurl'
590 a[0] = 'definitionURL'
592 adjust_svg_attributes = (t) ->
594 if svg_attribute_fixes[a[0]]?
595 a[0] = svg_attribute_fixes[a[0]]
597 adjust_foreign_attributes = (t) ->
600 if foreign_attr_fixes[a[0]]?
601 a[0] = foreign_attr_fixes[a[0]]
604 # decode_named_char_ref()
606 # The list of named character references is _huge_ so if we're running in a
607 # browser, we get the browser to decode them, rather than increasing the code
608 # size to include the table.
609 if context is 'module'
610 _decode_named_char_ref = require './parser_no_browser_helper.coffee'
612 # TODO test this in IE8
613 decode_named_char_ref_el = document.createElement('textarea')
614 _decode_named_char_ref = (txt) ->
616 decode_named_char_ref_el.innerHTML = txt
617 decoded = decode_named_char_ref_el.value
618 return null if decoded is txt
# Decode a named character reference, memoising the result.
#
# Pass the name of a named entity _that has a terminating semicolon_
# Entities without terminating semicolons should use legacy_char_refs[]
# Do not include the "&" or ";" in your argument, eg pass "alpha"
#
# txt: the entity name
# Returns whatever _decode_named_char_ref returns for txt (the browser
# implementation returns null when the name does not decode to anything).
decode_named_char_ref_cache = {}
decode_named_char_ref = (txt) ->
	# Only trust keys that this function stored itself. A plain
	# `cache[txt]` lookup on an ordinary object also finds inherited
	# Object.prototype members (eg "constructor", "toString"), which would
	# be handed back as if they were decoded text. The own-property test
	# also lets null ("not an entity") results be served from the cache
	# instead of being decoded again on every call.
	if Object::hasOwnProperty.call decode_named_char_ref_cache, txt
		return decode_named_char_ref_cache[txt]
	return decode_named_char_ref_cache[txt] = _decode_named_char_ref txt
630 parse_html = (args_html, args = {}) ->
632 cur = null # index of next char in txt to be parsed
633 # declare doc and tokenizer variables so they're in scope below
635 open_els = null # stack of open elements
636 afe = null # active formatting elements
637 template_ins_modes = null
639 original_ins_mode = null
641 tok_cur_tag = null # partially parsed tag
642 flag_scripting = null
643 flag_frameset_ok = null
645 flag_foster_parenting = null
646 form_element_pointer = null
647 temporary_buffer = null
648 pending_table_character_tokens = null
649 head_element_pointer = null
650 flag_fragment_parsing = null
651 context_element = null
662 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
663 # "Noah's Ark clause" but with three
664 afe_push = (new_el) ->
667 if el.type is TYPE_AFE_MARKER
669 if el.name is new_el.name and el.namespace is new_el.namespace
672 unless new_el.attrs[k] is v
676 for k, v of new_el.attrs
677 unless el.attrs[k] is v
689 afe.unshift new_afe_marker()
# the functions below implement the Tree Construction algorithm
693 # http://www.w3.org/TR/html5/syntax.html#tree-construction
695 # But first... the helpers
696 template_tag_is_open = ->
698 if el.name is 'template' and el.namespace is NS_HTML
701 is_in_scope_x = (tag_name, scope, namespace) ->
703 if el.name is tag_name and (namespace is null or namespace is el.namespace)
705 if scope[el.name] is el.namespace
708 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
710 if el.name is tag_name and (namespace is null or namespace is el.namespace)
712 if scope[el.name] is el.namespace
714 if scope2[el.name] is el.namespace
718 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
719 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
722 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
723 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
725 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-delimiting elements for the specialised scope checks below.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}

# Scope checks. Each takes a tag name plus an optional namespace (defaulting
# to null) and hands them to is_in_scope_x / is_in_scope_x_y together with the
# table(s) of elements that delimit that kind of scope.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)

# standard scopers plus "button"
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)

# only html, table and template delimit table scope
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)

# aka is_in_list_item_scope: standard scopers plus "ol" and "ul"
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
739 is_in_select_scope = (tag_name, namespace = null) ->
741 if t.name is tag_name and (namespace is null or namespace is t.namespace)
743 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
746 # this checks for a particular element, not by name
747 # this requires a namespace match
748 el_is_in_scope = (needle) ->
752 if standard_scopers[el.name] is el.namespace
756 clear_to_table_stopers = {
761 clear_stack_to_table_context = ->
763 if clear_to_table_stopers[open_els[0].name]?
767 clear_to_table_body_stopers = {
774 clear_stack_to_table_body_context = ->
776 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
780 clear_to_table_row_stopers = {
785 clear_stack_to_table_row_context = ->
787 if clear_to_table_row_stopers[open_els[0].name]?
791 clear_afe_to_marker = ->
793 return unless afe.length > 0 # this happens in fragment case, ?spec error
795 if el.type is TYPE_AFE_MARKER
800 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
802 # 1. Let last be false.
804 # 2. Let node be the last node in the stack of open elements.
806 node = open_els[node_i]
807 # 3. Loop: If node is the first node in the stack of open elements,
808 # then set last to true, and, if the parser was originally created as
809 # part of the HTML fragment parsing algorithm (fragment case) set node
810 # to the context element.
812 if node_i is open_els.length - 1
814 if flag_fragment_parsing
815 node = context_element
816 # 4. If node is a select element, run these substeps:
817 if node.name is 'select' and node.namespace is NS_HTML
818 # 1. If last is true, jump to the step below labeled done.
820 # 2. Let ancestor be node.
823 # 3. Loop: If ancestor is the first node in the stack of
824 # open elements, jump to the step below labeled done.
826 if ancestor_i is open_els.length - 1
828 # 4. Let ancestor be the node before ancestor in the stack
831 ancestor = open_els[ancestor_i]
832 # 5. If ancestor is a template node, jump to the step below
834 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
836 # 6. If ancestor is a table node, switch the insertion mode
837 # to "in select in table" and abort these steps.
838 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
839 ins_mode = ins_mode_in_select_in_table
841 # 7. Jump back to the step labeled loop.
842 # 8. Done: Switch the insertion mode to "in select" and abort
844 ins_mode = ins_mode_in_select
846 # 5. If node is a td or th element and last is false, then switch
847 # the insertion mode to "in cell" and abort these steps.
848 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
849 ins_mode = ins_mode_in_cell
851 # 6. If node is a tr element, then switch the insertion mode to "in
852 # row" and abort these steps.
853 if node.name is 'tr' and node.namespace is NS_HTML
854 ins_mode = ins_mode_in_row
856 # 7. If node is a tbody, thead, or tfoot element, then switch the
857 # insertion mode to "in table body" and abort these steps.
858 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
859 ins_mode = ins_mode_in_table_body
861 # 8. If node is a caption element, then switch the insertion mode
862 # to "in caption" and abort these steps.
863 if node.name is 'caption' and node.namespace is NS_HTML
864 ins_mode = ins_mode_in_caption
866 # 9. If node is a colgroup element, then switch the insertion mode
867 # to "in column group" and abort these steps.
868 if node.name is 'colgroup' and node.namespace is NS_HTML
869 ins_mode = ins_mode_in_column_group
871 # 10. If node is a table element, then switch the insertion mode to
872 # "in table" and abort these steps.
873 if node.name is 'table' and node.namespace is NS_HTML
874 ins_mode = ins_mode_in_table
876 # 11. If node is a template element, then switch the insertion mode
877 # to the current template insertion mode and abort these steps.
878 if node.name is 'template' and node.namespace is NS_HTML
879 ins_mode = template_ins_modes[0]
881 # 12. If node is a head element and last is true, then switch the
882 # insertion mode to "in body" ("in body"! not "in head"!) and abort
883 # these steps. (fragment case)
884 if node.name is 'head' and node.namespace is NS_HTML and last
885 ins_mode = ins_mode_in_body
887 # 13. If node is a head element and last is false, then switch the
888 # insertion mode to "in head" and abort these steps.
889 if node.name is 'head' and node.namespace is NS_HTML and last is false
890 ins_mode = ins_mode_in_head
892 # 14. If node is a body element, then switch the insertion mode to
893 # "in body" and abort these steps.
894 if node.name is 'body' and node.namespace is NS_HTML
895 ins_mode = ins_mode_in_body
897 # 15. If node is a frameset element, then switch the insertion mode
898 # to "in frameset" and abort these steps. (fragment case)
899 if node.name is 'frameset' and node.namespace is NS_HTML
900 ins_mode = ins_mode_in_frameset
902 # 16. If node is an html element, run these substeps:
903 if node.name is 'html' and node.namespace is NS_HTML
904 # 1. If the head element pointer is null, switch the insertion
905 # mode to "before head" and abort these steps. (fragment case)
906 if head_element_pointer is null
907 ins_mode = ins_mode_before_head
909 # 2. Otherwise, the head element pointer is not null,
910 # switch the insertion mode to "after head" and abort these
912 ins_mode = ins_mode_after_head
914 # 17. If last is true, then switch the insertion mode to "in body"
915 # and abort these steps. (fragment case)
917 ins_mode = ins_mode_in_body
919 # 18. Let node now be the node before node in the stack of open
922 node = open_els[node_i]
923 # 19. Return to the step labeled loop.
928 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
929 adjusted_current_node = ->
930 if open_els.length is 1 and flag_fragment_parsing
931 return context_element
934 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
935 # this implementation is structured (mostly) as described at the link above.
936 # capitalized comments are the "labels" described at the link above.
938 return if afe.length is 0
939 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
944 if i is afe.length - 1
947 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
952 el = insert_html_element afe[i].token
958 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
959 # adoption agency algorithm
961 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
962 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
963 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
964 adoption_agency = (subject) ->
# this block implements the W3C spec
966 # # 1. If the current node is an HTML element whose tag name is subject,
967 # # then run these substeps:
969 # # 1. Let element be the current node.
971 # # 2. Pop element off the stack of open elements.
973 # # 3. If element is also in the list of active formatting elements,
974 # # remove the element from the list.
976 # # 4. Abort the adoption agency algorithm.
977 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
978 # el = open_els.shift()
979 # # remove it from the list of active formatting elements (if found)
985 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
986 # If the current node is an HTML element whose tag name is subject, and
987 # the current node is not in the list of active formatting elements,
988 # then pop the current node off the stack of open elements, and abort
990 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
991 # remove it from the list of active formatting elements (if found)
1007 # 5. Let formatting element be the last element in the list of
1008 # active formatting elements that: is between the end of the list
1009 # and the last scope marker in the list, if any, or the start of
1010 # the list otherwise, and has the tag name subject.
1012 for t, fe_of_afe in afe
1013 if t.type is TYPE_AFE_MARKER
1015 if t.name is subject
1018 # If there is no such element, then abort these steps and instead
1019 # act as described in the "any other end tag" entry above.
1021 in_body_any_other_end_tag subject
1023 # 6. If formatting element is not in the stack of open elements,
1024 # then this is a parse error; remove the element from the list, and
1025 # abort these steps.
1027 for t, fe_of_open_els in open_els
1033 # "remove it from the list" must mean afe, since it's not in open_els
1034 afe.splice fe_of_afe, 1
1036 # 7. If formatting element is in the stack of open elements, but
1037 # the element is not in scope, then this is a parse error; abort
1039 unless el_is_in_scope fe
1042 # 8. If formatting element is not the current node, this is a parse
1043 # error. (But do not abort these steps.)
1044 unless open_els[0] is fe
1047 # 9. Let furthest block be the topmost node in the stack of open
1048 # elements that is lower in the stack than formatting element, and
1049 # is an element in the special category. There might not be one.
1051 fb_of_open_els = null
1052 for t, i in open_els
1058 # and continue, to see if there's one that's more "topmost"
1059 # 10. If there is no furthest block, then the UA must first pop all
1060 # the nodes from the bottom of the stack of open elements, from the
1061 # current node up to and including formatting element, then remove
1062 # formatting element from the list of active formatting elements,
1063 # and finally abort these steps.
1066 t = open_els.shift()
1068 afe.splice fe_of_afe, 1
1070 # 11. Let common ancestor be the element immediately above
1071 # formatting element in the stack of open elements.
1072 ca = open_els[fe_of_open_els + 1] # common ancestor
1074 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1075 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1076 bookmark = new_aaa_bookmark()
1079 afe.splice i, 0, bookmark
1081 node = last_node = fb
1085 # 3. Let node be the element immediately above node in the
1086 # stack of open elements, or if node is no longer in the stack
1087 # of open elements (e.g. because it got removed by this
1088 # algorithm), the element that was immediately above node in
1089 # the stack of open elements before node was removed.
1091 for t, i in open_els
1093 node_next = open_els[i + 1]
1095 node = node_next ? node_above
1096 # TODO make sure node_above gets re-set if/when node is removed from open_els
1098 # 4. If node is formatting element, then go to the next step in
1099 # the overall algorithm.
1102 # 5. If inner loop counter is greater than three and node is in
1103 # the list of active formatting elements, then remove node from
1104 # the list of active formatting elements.
1113 # 6. If node is not in the list of active formatting elements,
1114 # then remove node from the stack of open elements and then go
1115 # back to the step labeled inner loop.
1117 for t, i in open_els
1119 node_above = open_els[i + 1]
1120 open_els.splice i, 1
1123 # 7. create an element for the token for which the element node
1124 # was created, in the HTML namespace, with common ancestor as
1125 # the intended parent; replace the entry for node in the list
1126 # of active formatting elements with an entry for the new
1127 # element, replace the entry for node in the stack of open
1128 # elements with an entry for the new element, and let node be
1130 new_node = token_to_element node.token, NS_HTML, ca
1135 for t, i in open_els
1137 node_above = open_els[i + 1]
1138 open_els[i] = new_node
1141 # 8. If last node is furthest block, then move the
1142 # aforementioned bookmark to be immediately after the new node
1143 # in the list of active formatting elements.
1151 # "after" means lower
1152 afe.splice i, 0, bookmark # "after as <-
1154 # 9. Insert last node into node, first removing it from its
1155 # previous parent node if any.
1156 if last_node.parent?
1157 for c, i in last_node.parent.children
1159 last_node.parent.children.splice i, 1
1161 node.children.push last_node
1162 last_node.parent = node
1163 # 10. Let last node be node.
1165 # 11. Return to the step labeled inner loop.
1166 # 14. Insert whatever last node ended up being in the previous step
1167 # at the appropriate place for inserting a node, but using common
1168 # ancestor as the override target.
1170 # In the case where fe is immediately followed by fb:
1171 # * inner loop exits out early (node==fe)
1173 # * last_node is still in the tree (not a duplicate)
1174 if last_node.parent?
1175 for c, i in last_node.parent.children
1177 last_node.parent.children.splice i, 1
1179 # can't use standard insert token thing, because it's already in
1180 # open_els and must stay at its current position in open_els
1181 dest = adjusted_insertion_location ca
1182 dest[0].children.splice dest[1], 0, last_node
1183 last_node.parent = dest[0]
1184 # 15. Create an element for the token for which formatting element
1185 # was created, in the HTML namespace, with furthest block as the
1187 new_element = token_to_element fe.token, NS_HTML, fb
1188 # 16. Take all of the child nodes of furthest block and append them
1189 # to the element created in the last step.
1190 while fb.children.length
1191 t = fb.children.shift()
1192 t.parent = new_element
1193 new_element.children.push t
1194 # 17. Append that new element to furthest block.
1195 new_element.parent = fb
1196 fb.children.push new_element
1197 # 18. Remove formatting element from the list of active formatting
1198 # elements, and insert the new element into the list of active
1199 # formatting elements at the position of the aforementioned
1207 afe[i] = new_element
1209 # 19. Remove formatting element from the stack of open elements,
1210 # and insert the new element into the stack of open elements
1211 # immediately below the position of furthest block in that stack.
1212 for t, i in open_els
1214 open_els.splice i, 1
1216 for t, i in open_els
1218 open_els.splice i, 0, new_element
1220 # 20. Jump back to the step labeled outer loop.
1223 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1224 close_p_element = ->
1225 generate_implied_end_tags 'p' # arg is exception
1226 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1228 while open_els.length > 1 # just in case
1229 el = open_els.shift()
1230 if el.name is 'p' and el.namespace is NS_HTML
1233 close_p_if_in_button_scope = ->
1234 if is_in_button_scope 'p', NS_HTML
1238 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1239 # aka insert_a_character = (t) ->
1240 insert_character = (t) ->
1241 dest = adjusted_insertion_location()
1242 # fixfull check for Document node
1244 prev = dest[0].children[dest[1] - 1]
1245 if prev.type is TYPE_TEXT
1248 dest[0].children.splice dest[1], 0, t
1252 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1253 process_token = (t) ->
1254 acn = adjusted_current_node()
1258 if acn.namespace is NS_HTML
1261 if is_mathml_text_integration_point(acn)
1262 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1265 if t.type is TYPE_TEXT
1268 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1271 if is_html_integration acn
1272 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1275 if t.type is TYPE_EOF
1278 in_foreign_content t
1282 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1283 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1284 adjusted_insertion_location = (override_target = null) ->
1285 # 1. If there was an override target specified, then let target be the
1288 target = override_target
1289 else # Otherwise, let target be the current node.
1290 target = open_els[0]
1291 # 2. Determine the adjusted insertion location using the first matching
1292 # steps from the following list:
1294 # If foster parenting is enabled and target is a table, tbody, tfoot,
1295 # thead, or tr element Foster parenting happens when content is
1296 # misnested in tables.
1297 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1298 loop # once. this is here so we can ``break`` to "abort these substeps"
1299 # 1. Let last template be the last template element in the
1300 # stack of open elements, if any.
1301 last_template = null
1302 last_template_i = null
1303 for el, i in open_els
1304 if el.name is 'template' and el.namespace is NS_HTML
1308 # 2. Let last table be the last table element in the stack of
1309 # open elements, if any.
1312 for el, i in open_els
1313 if el.name is 'table' and el.namespace is NS_HTML
1317 # 3. If there is a last template and either there is no last
1318 # table, or there is one, but last template is lower (more
1319 # recently added) than last table in the stack of open
1320 # elements, then: let adjusted insertion location be inside
1321 # last template's template contents, after its last child (if
1322 # any), and abort these substeps.
1323 if last_template and (last_table is null or last_template_i < last_table_i)
1324 target = last_template # fixfull should be it's contents
1325 target_i = target.children.length
1327 # 4. If there is no last table, then let adjusted insertion
1328 # location be inside the first element in the stack of open
1329 # elements (the html element), after its last child (if any),
1330 # and abort these substeps. (fragment case)
1331 if last_table is null
1333 target = open_els[open_els.length - 1]
1334 target_i = target.children.length
1336 # 5. If last table has a parent element, then let adjusted
1337 # insertion location be inside last table's parent element,
1338 # immediately before last table, and abort these substeps.
1339 if last_table.parent?
1340 for c, i in last_table.parent.children
1342 target = last_table.parent
1346 # 6. Let previous element be the element immediately above last
1347 # table in the stack of open elements.
1349 # huh? how could it not have a parent?
1350 previous_element = open_els[last_table_i + 1]
1351 # 7. Let adjusted insertion location be inside previous
1352 # element, after its last child (if any).
1353 target = previous_element
1354 target_i = target.children.length
1355 # Note: These steps are involved in part because it's possible
1356 # for elements, the table element in this case in particular,
1357 # to have been moved by a script around in the DOM, or indeed
1358 # removed from the DOM entirely, after the element was inserted
1360 break # don't really loop
1362 # Otherwise Let adjusted insertion location be inside target, after
1363 # its last child (if any).
1364 target_i = target.children.length
1366 # 3. If the adjusted insertion location is inside a template element,
1367 # let it instead be inside the template element's template contents,
1368 # after its last child (if any).
1369 # fixfull (template)
1371 # 4. Return the adjusted insertion location.
1372 return [target, target_i]
1374 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1375 # aka create_an_element_for_token
1376 token_to_element = (t, namespace, intended_parent) ->
1377 # convert attributes into a hash
1380 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1381 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1383 # TODO 2. If the newly created element has an xmlns attribute in the
1384 # XMLNS namespace whose value is not exactly the same as the element's
1385 # namespace, that is a parse error. Similarly, if the newly created
1386 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1387 # value is not the XLink Namespace, that is a parse error.
1389 # fixfull: the spec says stuff about form pointers and ownerDocument
1393 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1394 insert_foreign_element = (token, namespace) ->
1395 ail = adjusted_insertion_location()
1398 el = token_to_element token, namespace, ail_el
1399 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1401 ail_el.children.splice ail_i, 0, el
1404 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Thin wrapper around insert_foreign_element: inserts an element for `token`
# using the HTML namespace (NS_HTML) and returns whatever
# insert_foreign_element returns.
1405 insert_html_element = (token) ->
1406 return insert_foreign_element token, NS_HTML
1408 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1409 # position should be [node, index_within_children]
# Splices the comment node `t` into position[0].children at index position[1].
# When `position` is omitted (or null) it defaults to the pair returned by
# adjusted_insertion_location().
1410 insert_comment = (t, position = null) ->
1411 position ?= adjusted_insertion_location()
1412 position[0].children.splice position[1], 0, t
1416 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing for start tag token `t`:
#   1. insert an HTML element for the token,
#   2. switch the tokenizer state to tok_state_rawtext,
#   3. save the current insertion mode in original_ins_mode
#      (presumably so the text insertion mode can switch back -- confirm
#      against ins_mode_text),
#   4. switch the insertion mode to ins_mode_text.
1417 parse_generic_raw_text = (t) ->
1418 insert_html_element t
1419 tok_state = tok_state_rawtext
1420 original_ins_mode = ins_mode
1421 ins_mode = ins_mode_text
# Generic RCDATA element parsing for start tag token `t`. Same steps as
# parse_generic_raw_text, except the tokenizer is switched to
# tok_state_rcdata: insert an HTML element for the token, save the current
# insertion mode in original_ins_mode, then switch to ins_mode_text.
1423 parse_generic_rcdata_text = (t) ->
1424 insert_html_element t
1425 tok_state = tok_state_rcdata
1426 original_ins_mode = ins_mode
1427 ins_mode = ins_mode_text
1430 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1431 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1432 generate_implied_end_tags = (except = null) ->
1433 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1437 # 8.2.5.4 The rules for parsing tokens in HTML content
1438 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1440 # 8.2.5.4.1 The "initial" insertion mode
1441 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1442 is_quirks_yes_doctype = (t) ->
1443 if t.flag 'force-quirks'
1445 if t.name isnt 'html'
1447 if t.public_identifier?
1448 pi = t.public_identifier.toLowerCase()
1449 for p in quirks_yes_pi_prefixes
1450 if pi.substr(0, p.length) is p
1452 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1454 if t.system_identifier?
1455 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1457 else if t.public_identifier?
1458 # already did this: pi = t.public_identifier.toLowerCase()
1459 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1462 is_quirks_limited_doctype = (t) ->
1463 if t.public_identifier?
1464 pi = t.public_identifier.toLowerCase()
1465 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1467 if t.system_identifier?
1468 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1471 ins_mode_initial = (t) ->
1474 if t.type is TYPE_COMMENT
1478 if t.type is TYPE_DOCTYPE
1479 # fixfull syntax error from first paragraph and following bullets
1480 # fixfull set doc.doctype
1481 # fixfull is the "not an iframe srcdoc" thing relevant?
1482 if is_quirks_yes_doctype t
1483 doc.flag 'quirks mode', QUIRKS_YES
1484 else if is_quirks_limited_doctype t
1485 doc.flag 'quirks mode', QUIRKS_LIMITED
1487 ins_mode = ins_mode_before_html
1490 # fixfull not iframe srcdoc?
1492 doc.flag 'quirks mode', QUIRKS_YES
1493 ins_mode = ins_mode_before_html
1497 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1498 ins_mode_before_html = (t) ->
1499 if t.type is TYPE_DOCTYPE
1502 if t.type is TYPE_COMMENT
1507 if t.type is TYPE_START_TAG and t.name is 'html'
1508 el = token_to_element t, NS_HTML, doc
1509 doc.children.push el
1511 open_els.unshift(el)
1512 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1513 ins_mode = ins_mode_before_head
1515 if t.type is TYPE_END_TAG
1516 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1517 # fall through to "anything else"
1522 el = token_to_element new_open_tag('html'), NS_HTML, doc
1523 doc.children.push el
1526 # ?fixfull browsing context
1527 ins_mode = ins_mode_before_head
1531 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1532 ins_mode_before_head = (t) ->
1535 if t.type is TYPE_COMMENT
1538 if t.type is TYPE_DOCTYPE
1541 if t.type is TYPE_START_TAG and t.name is 'html'
1544 if t.type is TYPE_START_TAG and t.name is 'head'
1545 el = insert_html_element t
1546 head_element_pointer = el
1547 ins_mode = ins_mode_in_head
1549 if t.type is TYPE_END_TAG
1550 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1551 # fall through to Anything else below
1556 el = insert_html_element new_open_tag 'head'
1557 head_element_pointer = el
1558 ins_mode = ins_mode_in_head
1562 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1563 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1564 open_els.shift() # spec says this will be a 'head' node
1565 ins_mode = ins_mode_after_head
1568 ins_mode_in_head = (t) ->
1569 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1572 if t.type is TYPE_COMMENT
1575 if t.type is TYPE_DOCTYPE
1578 if t.type is TYPE_START_TAG and t.name is 'html'
1581 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1582 el = insert_html_element t
1584 t.acknowledge_self_closing()
1586 if t.type is TYPE_START_TAG and t.name is 'meta'
1587 el = insert_html_element t
1589 t.acknowledge_self_closing()
1590 # fixfull encoding stuff
1592 if t.type is TYPE_START_TAG and t.name is 'title'
1593 parse_generic_rcdata_text t
1595 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1596 parse_generic_raw_text t
1598 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1599 insert_html_element t
1600 ins_mode = ins_mode_in_head_noscript
1602 if t.type is TYPE_START_TAG and t.name is 'script'
1603 ail = adjusted_insertion_location()
1604 el = token_to_element t, NS_HTML, ail
1605 el.flag 'parser-inserted', true
1606 # fixfull fragment case
1607 ail[0].children.splice ail[1], 0, el
1609 tok_state = tok_state_script_data
1610 original_ins_mode = ins_mode # make sure orig... is defined
1611 ins_mode = ins_mode_text
1613 if t.type is TYPE_END_TAG and t.name is 'head'
1614 open_els.shift() # will be a head element... spec says so
1615 ins_mode = ins_mode_after_head
1617 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1618 ins_mode_in_head_else t
1620 if t.type is TYPE_START_TAG and t.name is 'template'
1621 insert_html_element t
1623 flag_frameset_ok = false
1624 ins_mode = ins_mode_in_template
1625 template_ins_modes.unshift ins_mode_in_template
1627 if t.type is TYPE_END_TAG and t.name is 'template'
1628 if template_tag_is_open()
1629 generate_implied_end_tags() # parentheses required: a bare name is only a function reference in CoffeeScript and would not run the step
1630 if open_els[0].name isnt 'template'
1633 el = open_els.shift()
1634 if el.name is 'template' and el.namespace is NS_HTML
1636 clear_afe_to_marker()
1637 template_ins_modes.shift()
1642 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1645 ins_mode_in_head_else t
1648 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1649 ins_mode_in_head_noscript_else = (t) ->
1652 ins_mode = ins_mode_in_head
1655 ins_mode_in_head_noscript = (t) ->
1656 if t.type is TYPE_DOCTYPE
1659 if t.type is TYPE_START_TAG and t.name is 'html'
1662 if t.type is TYPE_END_TAG and t.name is 'noscript'
1664 ins_mode = ins_mode_in_head
1666 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1669 if t.type is TYPE_END_TAG and t.name is 'br'
1670 ins_mode_in_head_noscript_else t
1672 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1676 ins_mode_in_head_noscript_else t
1679 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1680 ins_mode_after_head_else = (t) ->
1681 body_tok = new_open_tag 'body'
1682 insert_html_element body_tok
1683 ins_mode = ins_mode_in_body
1686 ins_mode_after_head = (t) ->
1690 if t.type is TYPE_COMMENT
1693 if t.type is TYPE_DOCTYPE
1696 if t.type is TYPE_START_TAG and t.name is 'html'
1699 if t.type is TYPE_START_TAG and t.name is 'body'
1700 insert_html_element t
1701 flag_frameset_ok = false
1702 ins_mode = ins_mode_in_body
1704 if t.type is TYPE_START_TAG and t.name is 'frameset'
1705 insert_html_element t
1706 ins_mode = ins_mode_in_frameset
1708 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1710 open_els.unshift head_element_pointer
1712 for el, i in open_els
1713 if el is head_element_pointer
1714 open_els.splice i, 1
1717 if t.type is TYPE_END_TAG and t.name is 'template'
1720 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1721 ins_mode_after_head_else t
1723 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1727 ins_mode_after_head_else t
1730 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1731 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1734 if node.name is name and node.namespace is NS_HTML
1735 generate_implied_end_tags name # arg is exception
1736 unless node is open_els[0]
1739 el = open_els.shift()
1742 if special_elements[node.name] is node.namespace
1745 for el, i in open_els
1747 node = open_els[i + 1]
1750 ins_mode_in_body = (t) ->
1751 if t.type is TYPE_TEXT and t.text is "\u0000"
1758 if t.type is TYPE_TEXT
1761 flag_frameset_ok = false
1763 if t.type is TYPE_COMMENT
1766 if t.type is TYPE_DOCTYPE
1769 if t.type is TYPE_START_TAG and t.name is 'html'
1771 return if template_tag_is_open()
1772 root_attrs = open_els[open_els.length - 1].attrs
1774 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1777 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1780 if t.type is TYPE_START_TAG and t.name is 'body'
1782 return if open_els.length < 2
1783 second = open_els[open_els.length - 2]
1784 return unless second.namespace is NS_HTML
1785 return unless second.name is 'body'
1786 return if template_tag_is_open()
1787 flag_frameset_ok = false
1789 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1791 if t.type is TYPE_START_TAG and t.name is 'frameset'
1793 return if open_els.length < 2
1794 second_i = open_els.length - 2
1795 second = open_els[second_i]
1796 return unless second.namespace is NS_HTML
1797 return unless second.name is 'body'
1798 if flag_frameset_ok is false
1801 for el, i in second.parent.children
1803 second.parent.children.splice i, 1
1805 open_els.splice second_i, 1
1806 # pop everything except the "root html element"
1807 while open_els.length > 1
1809 insert_html_element t
1810 ins_mode = ins_mode_in_frameset
1812 if t.type is TYPE_EOF
1814 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1815 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1816 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1819 unless ok_tags[t.name] is el.namespace
1822 if template_ins_modes.length > 0
1823 ins_mode_in_template t
1827 if t.type is TYPE_END_TAG and t.name is 'body'
1828 unless is_in_scope 'body', NS_HTML
1832 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1833 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1834 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1835 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1839 unless ok_tags[t.name] is el.namespace
1842 ins_mode = ins_mode_after_body
1844 if t.type is TYPE_END_TAG and t.name is 'html'
1845 unless is_in_scope 'body', NS_HTML
1849 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1850 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1851 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1852 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1856 unless ok_tags[t.name] is el.namespace
1859 ins_mode = ins_mode_after_body
1862 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1863 close_p_if_in_button_scope()
1864 insert_html_element t
1866 if t.type is TYPE_START_TAG and h_tags[t.name]?
1867 close_p_if_in_button_scope()
1868 if h_tags[open_els[0].name] is open_els[0].namespace
1871 insert_html_element t
1873 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1874 close_p_if_in_button_scope()
1875 insert_html_element t
1876 eat_next_token_if_newline()
1877 flag_frameset_ok = false
1879 if t.type is TYPE_START_TAG and t.name is 'form'
1880 unless form_element_pointer is null or template_tag_is_open()
1883 close_p_if_in_button_scope()
1884 el = insert_html_element t
1885 unless template_tag_is_open()
1886 form_element_pointer = el
1888 if t.type is TYPE_START_TAG and t.name is 'li'
1889 flag_frameset_ok = false
1890 for node in open_els
1891 if node.name is 'li' and node.namespace is NS_HTML
1892 generate_implied_end_tags 'li' # arg is exception
1893 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1896 el = open_els.shift()
1897 if el.name is 'li' and el.namespace is NS_HTML
1900 if el_is_special_not_adp node
1902 close_p_if_in_button_scope()
1903 insert_html_element t
1905 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1906 flag_frameset_ok = false
1907 for node in open_els
1908 if node.name is 'dd' and node.namespace is NS_HTML
1909 generate_implied_end_tags 'dd' # arg is exception
1910 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1913 el = open_els.shift()
1914 if el.name is 'dd' and el.namespace is NS_HTML
1917 if node.name is 'dt' and node.namespace is NS_HTML
1918 generate_implied_end_tags 'dt' # arg is exception
1919 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1922 el = open_els.shift()
1923 if el.name is 'dt' and el.namespace is NS_HTML
1926 if el_is_special_not_adp node
1928 close_p_if_in_button_scope()
1929 insert_html_element t
1931 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1932 close_p_if_in_button_scope()
1933 insert_html_element t
1934 tok_state = tok_state_plaintext
1936 if t.type is TYPE_START_TAG and t.name is 'button'
1937 if is_in_scope 'button', NS_HTML
1939 generate_implied_end_tags()
1941 el = open_els.shift()
1942 if el.name is 'button' and el.namespace is NS_HTML
1945 insert_html_element t
1946 flag_frameset_ok = false
1948 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1949 unless is_in_scope t.name, NS_HTML
1952 generate_implied_end_tags()
1953 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1956 el = open_els.shift()
1957 if el.name is t.name and el.namespace is NS_HTML
1960 if t.type is TYPE_END_TAG and t.name is 'form'
1961 unless template_tag_is_open()
1962 node = form_element_pointer
1963 form_element_pointer = null
1964 if node is null or not el_is_in_scope node
1967 generate_implied_end_tags()
1968 if open_els[0] isnt node
1970 for el, i in open_els
1972 open_els.splice i, 1
1975 unless is_in_scope 'form', NS_HTML
1978 generate_implied_end_tags()
1979 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1982 el = open_els.shift()
1983 if el.name is 'form' and el.namespace is NS_HTML
1986 if t.type is TYPE_END_TAG and t.name is 'p'
1987 unless is_in_button_scope 'p', NS_HTML
1989 insert_html_element new_open_tag 'p'
1992 if t.type is TYPE_END_TAG and t.name is 'li'
1993 unless is_in_li_scope 'li', NS_HTML
1996 generate_implied_end_tags 'li' # arg is exception
1997 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2000 el = open_els.shift()
2001 if el.name is 'li' and el.namespace is NS_HTML
2004 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2005 unless is_in_scope t.name, NS_HTML
2008 generate_implied_end_tags t.name # arg is exception
2009 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2012 el = open_els.shift()
2013 if el.name is t.name and el.namespace is NS_HTML
2016 if t.type is TYPE_END_TAG and h_tags[t.name]?
2019 if h_tags[el.name] is el.namespace
2022 if standard_scopers[el.name] is el.namespace
2027 generate_implied_end_tags()
2028 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2031 el = open_els.shift()
2032 if h_tags[el.name] is el.namespace
2036 if t.type is TYPE_START_TAG and t.name is 'a'
2037 # If the list of active formatting elements contains an a element
2038 # between the end of the list and the last marker on the list (or
2039 # the start of the list if there is no marker on the list), then
2040 # this is a parse error; run the adoption agency algorithm for the
2041 # tag name "a", then remove that element from the list of active
2042 # formatting elements and the stack of open elements if the
2043 # adoption agency algorithm didn't already remove it (it might not
2044 # have if the element is not in table scope).
2047 if el.type is TYPE_AFE_MARKER
2049 if el.name is 'a' and el.namespace is NS_HTML
2057 for el, i in open_els
2059 open_els.splice i, 1
2061 el = insert_html_element t
2064 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2066 el = insert_html_element t
2069 if t.type is TYPE_START_TAG and t.name is 'nobr'
2071 if is_in_scope 'nobr', NS_HTML
2073 adoption_agency 'nobr'
2075 el = insert_html_element t
2078 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2079 adoption_agency t.name
2081 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2083 insert_html_element t
2085 flag_frameset_ok = false
2087 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2088 unless is_in_scope t.name, NS_HTML
2091 generate_implied_end_tags()
2092 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2095 el = open_els.shift()
2096 if el.name is t.name and el.namespace is NS_HTML
2098 clear_afe_to_marker()
2100 if t.type is TYPE_START_TAG and t.name is 'table'
2101 unless doc.flag('quirks mode') is QUIRKS_YES
2102 close_p_if_in_button_scope() # test
2103 insert_html_element t
2104 flag_frameset_ok = false
2105 ins_mode = ins_mode_in_table
2107 if t.type is TYPE_END_TAG and t.name is 'br'
2109 # W3C: t.type = TYPE_START_TAG
2110 t = new_open_tag 'br' # WHATWG
2112 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2114 insert_html_element t
2116 t.acknowledge_self_closing()
2117 flag_frameset_ok = false
2119 if t.type is TYPE_START_TAG and t.name is 'input'
2121 insert_html_element t
2123 t.acknowledge_self_closing()
2124 unless is_input_hidden_tok t
2125 flag_frameset_ok = false
2127 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2128 # WHATWG adds 'menuitem' for this block
2129 insert_html_element t
2131 t.acknowledge_self_closing()
2133 if t.type is TYPE_START_TAG and t.name is 'hr'
2134 close_p_if_in_button_scope()
2135 insert_html_element t
2137 t.acknowledge_self_closing()
2138 flag_frameset_ok = false
2140 if t.type is TYPE_START_TAG and t.name is 'image'
2145 if t.type is TYPE_START_TAG and t.name is 'isindex'
2147 if template_tag_is_open() is false and form_element_pointer isnt null
2149 t.acknowledge_self_closing()
2150 flag_frameset_ok = false
2151 close_p_if_in_button_scope()
2152 el = insert_html_element new_open_tag 'form'
2153 unless template_tag_is_open()
2154 form_element_pointer = el
2157 el.attrs['action'] = a[1]
2159 insert_html_element new_open_tag 'hr'
2162 insert_html_element new_open_tag 'label'
2163 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2164 input_el = new_open_tag 'input'
2169 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2170 input_el.attrs_a.push [a[0], a[1]]
2171 input_el.attrs_a.push ['name', 'isindex']
2172 # fixfull this next bit is in english... internationalize?
2173 prompt ?= "This is a searchable index. Enter search keywords: "
2174 insert_character new_character_token prompt # fixfull split
2175 # TODO submit typo "balue" in spec
2176 insert_html_element input_el
2178 # insert_character '' # you can put chars here if prompt attr missing
2180 insert_html_element new_open_tag 'hr'
2183 unless template_tag_is_open()
2184 form_element_pointer = null
2186 if t.type is TYPE_START_TAG and t.name is 'textarea'
2187 insert_html_element t
2188 eat_next_token_if_newline()
2189 tok_state = tok_state_rcdata
2190 original_ins_mode = ins_mode
2191 flag_frameset_ok = false
2192 ins_mode = ins_mode_text
2194 if t.type is TYPE_START_TAG and t.name is 'xmp'
2195 close_p_if_in_button_scope()
2197 flag_frameset_ok = false
2198 parse_generic_raw_text t
2200 if t.type is TYPE_START_TAG and t.name is 'iframe'
2201 flag_frameset_ok = false
2202 parse_generic_raw_text t
2204 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2205 parse_generic_raw_text t
2207 if t.type is TYPE_START_TAG and t.name is 'select'
2209 insert_html_element t
2210 flag_frameset_ok = false
2211 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2212 ins_mode = ins_mode_in_select_in_table
2214 ins_mode = ins_mode_in_select
2216 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2217 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2220 insert_html_element t
2222 # this comment block implements the W3C spec
2223 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2224 # if is_in_scope 'ruby', NS_HTML
2225 # generate_implied_end_tags()
2226 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2228 # insert_html_element t
2230 # if t.type is TYPE_START_TAG and t.name is 'rt'
2231 # if is_in_scope 'ruby', NS_HTML
2232 # generate_implied_end_tags 'rtc' # arg is exception
2233 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2235 # insert_html_element t
2237 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2238 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2239 if is_in_scope 'ruby', NS_HTML
2240 generate_implied_end_tags()
2241 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2243 insert_html_element t
2245 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2246 if is_in_scope 'ruby', NS_HTML
2247 generate_implied_end_tags 'rtc'
2248 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2250 insert_html_element t
2253 if t.type is TYPE_START_TAG and t.name is 'math'
2255 adjust_mathml_attributes t
2256 adjust_foreign_attributes t
2257 insert_foreign_element t, NS_MATHML
2258 if t.flag 'self-closing'
2260 t.acknowledge_self_closing()
2262 if t.type is TYPE_START_TAG and t.name is 'svg'
2264 adjust_svg_attributes t
2265 adjust_foreign_attributes t
2266 insert_foreign_element t, NS_SVG
2267 if t.flag 'self-closing'
2269 t.acknowledge_self_closing()
2271 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2274 if t.type is TYPE_START_TAG # any other start tag
2276 insert_html_element t
2278 if t.type is TYPE_END_TAG # any other end tag
2279 in_body_any_other_end_tag t.name
2283 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Handler for the "text" insertion mode; `t` is the token being processed.
# The visible branches cover text, EOF, </script> and any other end tag.
# The EOF and both end-tag branches restore ins_mode from original_ins_mode.
2284 ins_mode_text = (t) ->
2285 if t.type is TYPE_TEXT
2288 if t.type is TYPE_EOF
2290 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
# EOF reached while an HTML <script> is the current node: flag it
2291 open_els[0].flag 'already started', true
2293 ins_mode = original_ins_mode
2296 if t.type is TYPE_END_TAG and t.name is 'script'
2298 ins_mode = original_ins_mode
2299 # fixfull the spec seems to assume that I'm going to run the script
2300 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
# any end tag other than </script>
2302 if t.type is TYPE_END_TAG
2304 ins_mode = original_ins_mode
2308 # the functions below implement the tokenizer states described here:
2309 # http://www.w3.org/TR/html5/syntax.html#tokenization
2311 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Helper used by the table-related insertion modes for their fallback
# ("anything else") handling of token `t`: it turns flag_foster_parenting on
# and then off again.
# NOTE(review): the statements executed between the two assignments are not
# visible in this view.
2312 ins_mode_in_table_else = (t) ->
2314 flag_foster_parenting = true
2316 flag_foster_parenting = false
# Handler for the "in table" insertion mode; `t` is the token being
# processed. Depending on the token it inserts table-structure elements and
# switches ins_mode (caption, column group, table body, table text), pops the
# stack of open elements, or defers to ins_mode_in_table_else.
2318 ins_mode_in_table = (t) ->
2321 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
# start a fresh buffer of character tokens and remember the current mode
# before switching to the "in table text" mode
2322 pending_table_character_tokens = []
2323 original_ins_mode = ins_mode
2324 ins_mode = ins_mode_in_table_text
2327 ins_mode_in_table_else t
2335 clear_stack_to_table_context()
2337 insert_html_element t
2338 ins_mode = ins_mode_in_caption
2340 clear_stack_to_table_context()
2341 insert_html_element t
2342 ins_mode = ins_mode_in_column_group
2344 clear_stack_to_table_context()
# here a new 'colgroup' start tag is inserted rather than t itself
2345 insert_html_element new_open_tag 'colgroup'
2346 ins_mode = ins_mode_in_column_group
2348 when 'tbody', 'tfoot', 'thead'
2349 clear_stack_to_table_context()
2350 insert_html_element t
2351 ins_mode = ins_mode_in_table_body
2352 when 'td', 'th', 'tr'
2353 clear_stack_to_table_context()
# here a new 'tbody' start tag is inserted rather than t itself
2354 insert_html_element new_open_tag 'tbody'
2355 ins_mode = ins_mode_in_table_body
2359 if is_in_table_scope 'table', NS_HTML
2361 el = open_els.shift()
2362 if el.name is 'table' and el.namespace is NS_HTML
2366 when 'style', 'script', 'template'
# inputs that is_input_hidden_tok does not accept take the fallback path
2369 unless is_input_hidden_tok t
2370 ins_mode_in_table_else t
2373 el = insert_html_element t
2375 t.acknowledge_self_closing()
2378 if form_element_pointer?
2380 if template_tag_is_open()
# the inserted element becomes the form element pointer
2382 form_element_pointer = insert_html_element t
2385 ins_mode_in_table_else t
2389 if is_in_table_scope 'table', NS_HTML
2391 el = open_els.shift()
2392 if el.name is 'table' and el.namespace is NS_HTML
2397 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2402 ins_mode_in_table_else t
2406 ins_mode_in_table_else t
2410 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Handler for the "in table text" insertion mode; `t` is the token being
# processed. Text tokens are appended to pending_table_character_tokens.
# Later code scans the buffer with is_space_tok, passes the buffered tokens
# to insert_character in one loop and to ins_mode_in_table_else in another,
# then empties the buffer and restores ins_mode from original_ins_mode.
# NOTE(review): the condition selecting between the two flush loops is not
# visible in this view.
2411 ins_mode_in_table_text = (t) ->
# U+0000 text tokens are tested before ordinary text tokens
2412 if t.type is TYPE_TEXT and t.text is "\u0000"
2416 if t.type is TYPE_TEXT
2417 pending_table_character_tokens.push t
2421 for old in pending_table_character_tokens
2422 unless is_space_tok old
2426 for old in pending_table_character_tokens
2427 insert_character old
2429 for old in pending_table_character_tokens
2430 ins_mode_in_table_else old
2431 pending_table_character_tokens = []
2432 ins_mode = original_ins_mode
2436 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Handler for the "in caption" insertion mode; `t` is the token being
# processed. Both closing paths pop open elements, looking for the HTML
# 'caption' element, clear the active formatting elements up to the last
# marker and switch to ins_mode_in_table.
2437 ins_mode_in_caption = (t) ->
2438 if t.type is TYPE_END_TAG and t.name is 'caption'
2439 if is_in_table_scope 'caption', NS_HTML
2440 generate_implied_end_tags()
# NOTE(review): only the name is compared here, not the namespace
2441 if open_els[0].name isnt 'caption'
2444 el = open_els.shift()
2445 if el.name is 'caption' and el.namespace is NS_HTML
2447 clear_afe_to_marker()
2448 ins_mode = ins_mode_in_table
# table-structure start tags and </table> also close an open caption
2453 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2455 if is_in_table_scope 'caption', NS_HTML
2457 el = open_els.shift()
2458 if el.name is 'caption' and el.namespace is NS_HTML
2460 clear_afe_to_marker()
2461 ins_mode = ins_mode_in_table
2463 # else fragment case
2465 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2472 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Handler for the "in column group" insertion mode; `t` is the token being
# processed. Visible branches: comment, doctype, <html>, <col>, </colgroup>,
# </col>, <template>/</template> and EOF, followed by a final path that
# checks the current node and switches to ins_mode_in_table.
2473 ins_mode_in_column_group = (t) ->
2477 if t.type is TYPE_COMMENT
2480 if t.type is TYPE_DOCTYPE
2483 if t.type is TYPE_START_TAG and t.name is 'html'
2486 if t.type is TYPE_START_TAG and t.name is 'col'
2487 el = insert_html_element t
2489 t.acknowledge_self_closing()
2491 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# The namespace must be read from the current node (open_els[0]); reading it
# from the open_els array itself never matches NS_HTML, which made this
# branch unreachable.
2492 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2494 ins_mode = ins_mode_in_table
2498 if t.type is TYPE_END_TAG and t.name is 'col'
2501 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2504 if t.type is TYPE_EOF
2508 if open_els[0].name isnt 'colgroup'
2512 ins_mode = ins_mode_in_table
2516 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Handler for the "in table body" insertion mode; `t` is the token being
# processed. Row and cell start tags lead to ins_mode_in_row; section end
# tags and other table-structure tags clear the stack back to a table body
# context and lead to ins_mode_in_table.
2517 ins_mode_in_table_body = (t) ->
2518 if t.type is TYPE_START_TAG and t.name is 'tr'
2519 clear_stack_to_table_body_context()
2520 insert_html_element t
2521 ins_mode = ins_mode_in_row
2523 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2525 clear_stack_to_table_body_context()
# a new 'tr' start tag is inserted here rather than t itself
2526 insert_html_element new_open_tag 'tr'
2527 ins_mode = ins_mode_in_row
2530 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2531 unless is_in_table_scope t.name, NS_HTML
2534 clear_stack_to_table_body_context()
2536 ins_mode = ins_mode_in_table
2538 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# the two tests below inspect an element `el` for an HTML tbody/tfoot/thead
# and for a table-scope boundary (table_scopers)
# NOTE(review): the loop that binds `el` is not visible in this view
2541 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2544 if table_scopers[el.name] is el.namespace
2549 clear_stack_to_table_body_context()
2551 ins_mode = ins_mode_in_table
2554 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2561 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Handler for the "in row" insertion mode; `t` is the token being processed.
# Cell start tags lead to ins_mode_in_cell; the row-closing paths clear the
# stack back to a table row context and lead to ins_mode_in_table_body.
2562 ins_mode_in_row = (t) ->
2563 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2564 clear_stack_to_table_row_context()
2565 insert_html_element t
2566 ins_mode = ins_mode_in_cell
2569 if t.type is TYPE_END_TAG and t.name is 'tr'
2570 if is_in_table_scope 'tr', NS_HTML
2571 clear_stack_to_table_row_context()
2573 ins_mode = ins_mode_in_table_body
2577 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2578 if is_in_table_scope 'tr', NS_HTML
2579 clear_stack_to_table_row_context()
2581 ins_mode = ins_mode_in_table_body
2586 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# both the named section and a 'tr' are tested for table scope here
2587 if is_in_table_scope t.name, NS_HTML
2588 if is_in_table_scope 'tr', NS_HTML
2589 clear_stack_to_table_row_context()
2591 ins_mode = ins_mode_in_table_body
2596 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2603 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps: generate implied end tags, check that the current
# node is an HTML td or th, pop open elements (testing each for an HTML
# td/th), clear the active formatting elements up to the last marker and
# switch to ins_mode_in_row.
# NOTE(review): the line that names this helper is not visible in this view.
2605 generate_implied_end_tags()
# compare the current node's *name* with 'th'; comparing the element object
# itself with a string could never be true
2606 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2609 el = open_els.shift()
2610 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2612 clear_afe_to_marker()
2613 ins_mode = ins_mode_in_row
2616 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Handler for the "in cell" insertion mode; `t` is the token being processed.
# A matching </td> or </th> that is in table scope pops open elements,
# testing each for the cell, clears the active formatting elements up to the
# last marker and switches to ins_mode_in_row.
2617 ins_mode_in_cell = (t) ->
2618 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2619 if is_in_table_scope t.name, NS_HTML
2620 generate_implied_end_tags()
2621 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2624 el = open_els.shift()
2625 if el.name is t.name and el.namespace is NS_HTML
2627 clear_afe_to_marker()
2628 ins_mode = ins_mode_in_row
2632 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# the two tests below inspect an element `el` for an HTML td/th and for a
# table-scope boundary (table_scopers)
# NOTE(review): the loop that binds `el` is not visible in this view
2635 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2638 if table_scopers[el.name] is el.namespace
2646 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2649 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2650 if is_in_table_scope t.name, NS_HTML
2660 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Handler for the "in select" insertion mode; `t` is the token being
# processed. Visible branches: U+0000 text, other text, comment, doctype,
# <html>, <option>, <optgroup>, </optgroup>, </option>, </select>, a nested
# <select>, <input>/<keygen>/<textarea>, <script>/<template> and EOF.
2661 ins_mode_in_select = (t) ->
2662 if t.type is TYPE_TEXT and t.text is "\u0000"
2665 if t.type is TYPE_TEXT
2668 if t.type is TYPE_COMMENT
2671 if t.type is TYPE_DOCTYPE
2674 if t.type is TYPE_START_TAG and t.name is 'html'
2677 if t.type is TYPE_START_TAG and t.name is 'option'
2678 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2680 insert_html_element t
2682 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2683 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2685 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2687 insert_html_element t
2689 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2690 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# The node just below the current <option> must itself be an HTML optgroup,
# so both its name and its namespace are read from open_els[1]. Previously
# the namespace of open_els[0] was re-checked instead.
2691 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2693 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2698 if t.type is TYPE_END_TAG and t.name is 'option'
2699 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2704 if t.type is TYPE_END_TAG and t.name is 'select'
2705 if is_in_select_scope 'select', NS_HTML
2707 el = open_els.shift()
2708 if el.name is 'select' and el.namespace is NS_HTML
2714 if t.type is TYPE_START_TAG and t.name is 'select'
2717 el = open_els.shift()
2718 if el.name is 'select' and el.namespace is NS_HTML
2721 # spec says that this is the same as </select> but it doesn't say
2722 # to check scope first
2724 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2726 unless is_in_select_scope 'select', NS_HTML
2729 el = open_els.shift()
2730 if el.name is 'select' and el.namespace is NS_HTML
2735 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2738 if t.type is TYPE_EOF
2745 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Handler for the "in select in table" insertion mode; `t` is the token
# being processed. Table-structure start and end tags pop open elements,
# testing each for the HTML 'select' element; every other token is forwarded
# to ins_mode_in_select.
2746 ins_mode_in_select_in_table = (t) ->
2747 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2750 el = open_els.shift()
2751 if el.name is 'select' and el.namespace is NS_HTML
2756 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
# the end-tag path first tests whether the named element is in table scope
2758 unless is_in_table_scope t.name, NS_HTML
2761 el = open_els.shift()
2762 if el.name is 'select' and el.namespace is NS_HTML
2768 ins_mode_in_select t
2771 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Handler for the "in template" insertion mode; `t` is the token being
# processed. For start tags it replaces the first entry of
# template_ins_modes with the insertion mode chosen for that tag and makes
# that mode current. At EOF with a template still open it pops open
# elements, testing each for the HTML 'template' element, clears the active
# formatting elements to the last marker and drops the first
# template_ins_modes entry.
2772 ins_mode_in_template = (t) ->
2773 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2776 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2779 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# shift() then unshift() swaps the first template_ins_modes entry
2780 template_ins_modes.shift()
2781 template_ins_modes.unshift ins_mode_in_table
2782 ins_mode = ins_mode_in_table
2785 if t.type is TYPE_START_TAG and t.name is 'col'
2786 template_ins_modes.shift()
2787 template_ins_modes.unshift ins_mode_in_column_group
2788 ins_mode = ins_mode_in_column_group
2791 if t.type is TYPE_START_TAG and t.name is 'tr'
2792 template_ins_modes.shift()
2793 template_ins_modes.unshift ins_mode_in_table_body
2794 ins_mode = ins_mode_in_table_body
2797 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2798 template_ins_modes.shift()
2799 template_ins_modes.unshift ins_mode_in_row
2800 ins_mode = ins_mode_in_row
# any other start tag
2803 if t.type is TYPE_START_TAG
2804 template_ins_modes.shift()
2805 template_ins_modes.unshift ins_mode_in_body
2806 ins_mode = ins_mode_in_body
# any other end tag
2809 if t.type is TYPE_END_TAG
2812 if t.type is TYPE_EOF
2813 unless template_tag_is_open()
2818 el = open_els.shift()
2819 if el.name is 'template' and el.namespace is NS_HTML
2821 clear_afe_to_marker()
2822 template_ins_modes.shift()
2827 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Handler for the "after body" insertion mode; `t` is the token being
# processed. </html> leads to ins_mode_after_after_body, and the final path
# switches back to ins_mode_in_body.
2828 ins_mode_after_body = (t) ->
2832 if t.type is TYPE_COMMENT
# the comment is appended as the last child of the element at the far end
# of open_els (its last index)
2833 first = open_els[open_els.length - 1]
2834 insert_comment t, [first, first.children.length]
2836 if t.type is TYPE_DOCTYPE
2839 if t.type is TYPE_START_TAG and t.name is 'html'
2842 if t.type is TYPE_END_TAG and t.name is 'html'
2843 if flag_fragment_parsing
2846 ins_mode = ins_mode_after_after_body
2848 if t.type is TYPE_EOF
2853 ins_mode = ins_mode_in_body
2857 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Handler for the "in frameset" insertion mode; `t` is the token being
# processed. <frameset> and <frame> start tags are inserted, and </frameset>
# can lead to ins_mode_after_frameset.
2858 ins_mode_in_frameset = (t) ->
2862 if t.type is TYPE_COMMENT
2865 if t.type is TYPE_DOCTYPE
2868 if t.type is TYPE_START_TAG and t.name is 'html'
2871 if t.type is TYPE_START_TAG and t.name is 'frameset'
2872 insert_html_element t
2874 if t.type is TYPE_END_TAG and t.name is 'frameset'
# with only one open element left there is nothing to close
2875 if open_els.length is 1
2877 return # fragment case
# the mode only changes when not parsing a fragment and the current node is
# no longer a frameset
2879 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2880 ins_mode = ins_mode_after_frameset
2882 if t.type is TYPE_START_TAG and t.name is 'frame'
2883 insert_html_element t
2885 t.acknowledge_self_closing()
2887 if t.type is TYPE_START_TAG and t.name is 'noframes'
2890 if t.type is TYPE_EOF
2891 if open_els.length isnt 1
2899 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Handler for the "after frameset" insertion mode; `t` is the token being
# processed. Visible branches: comment, doctype, <html>, </html> (which
# switches to ins_mode_after_after_frameset), <noframes> and EOF.
2900 ins_mode_after_frameset = (t) ->
2904 if t.type is TYPE_COMMENT
2907 if t.type is TYPE_DOCTYPE
2910 if t.type is TYPE_START_TAG and t.name is 'html'
2913 if t.type is TYPE_END_TAG and t.name is 'html'
2914 ins_mode = ins_mode_after_after_frameset
2916 if t.type is TYPE_START_TAG and t.name is 'noframes'
2919 if t.type is TYPE_EOF
2926 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Handler for the "after after body" insertion mode; `t` is the token being
# processed. Comments are appended as the last child of doc, and the final
# path switches back to ins_mode_in_body.
2927 ins_mode_after_after_body = (t) ->
2928 if t.type is TYPE_COMMENT
2929 insert_comment t, [doc, doc.children.length]
# doctype, whitespace and <html> share one branch
2931 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2934 if t.type is TYPE_EOF
2939 ins_mode = ins_mode_in_body
2943 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Handler for the "after after frameset" insertion mode; `t` is the token
# being processed. Comments are appended as the last child of doc; the other
# visible branches cover doctype/whitespace/<html>, EOF and <noframes>.
2944 ins_mode_after_after_frameset = (t) ->
2945 if t.type is TYPE_COMMENT
2946 insert_comment t, [doc, doc.children.length]
2948 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2951 if t.type is TYPE_EOF
2954 if t.type is TYPE_START_TAG and t.name is 'noframes'
2961 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Tests the attributes of token `t` for one named 'color', 'face' or 'size';
# each attribute `a` is indexed so that a[0] is its name.
# NOTE(review): the loop header and the return statements are not visible in
# this view; in_foreign_content uses the result as a boolean.
2962 has_color_face_or_size = (t) ->
2964 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Shared steps for the end of a script element in foreign content; called
# from in_foreign_content_other_start and in_foreign_content.
# NOTE(review): the body of this function is not visible in this view.
2967 in_foreign_content_end_script = ->
# Handles a start tag `t` in foreign content: adjusts its attributes (and,
# for SVG, its tag name) according to the namespace of the adjusted current
# node, inserts it as a foreign element in that namespace, and acknowledges
# the self-closing flag when it is set.
2971 in_foreign_content_other_start = (t) ->
2972 acn = adjusted_current_node()
2973 if acn.namespace is NS_MATHML
2974 adjust_mathml_attributes t
# replace the tag name with its entry in svg_name_fixes when there is one
2975 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2976 t.name = svg_name_fixes[t.name]
2977 if acn.namespace is NS_SVG
2978 adjust_svg_attributes t
2979 adjust_foreign_attributes t
2980 insert_foreign_element t, acn.namespace
2981 if t.flag 'self-closing'
# a self-closing script additionally runs the end-of-script steps
2982 if t.name is 'script'
2983 t.acknowledge_self_closing()
2984 in_foreign_content_end_script()
2988 t.acknowledge_self_closing()
# Tree-construction handler for tokens in foreign content; `t` is the token
# being processed. Visible branches: U+0000 text (inserted as U+FFFD), other
# text, comment, doctype, a list of HTML start tags, any other start tag, an
# SVG </script>, and any other end tag.
2990 in_foreign_content = (t) ->
2991 if t.type is TYPE_TEXT and t.text is "\u0000"
2993 insert_character new_character_token "\ufffd"
2998 if t.type is TYPE_TEXT
2999 flag_frameset_ok = false
3002 if t.type is TYPE_COMMENT
3005 if t.type is TYPE_DOCTYPE
# start tags named here, plus <font> carrying a color, face or size attribute
3008 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
# when parsing a fragment the tag is handled like any other start tag
3010 if flag_fragment_parsing
3011 in_foreign_content_other_start t
3013 loop # is this safe?
# tests whether the current node is an integration point or an HTML element
3015 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3019 if t.type is TYPE_START_TAG
3020 in_foreign_content_other_start t
3022 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3023 in_foreign_content_end_script()
3025 if t.type is TYPE_END_TAG
# node names are lowercased before being compared with the token name
3028 if node.name.toLowerCase() isnt t.name
3031 if node is open_els[open_els.length - 1]
3033 if node.name.toLowerCase() is t.name
3035 el = open_els.shift()
3040 if node.namespace is NS_HTML
3042 ins_mode t # explicitly call HTML insertion mode
3046 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer function for the data state: reads the next character of txt
# (advancing cur) and, depending on it, returns a text node (possibly from
# parse_character_reference), switches to tok_state_tag_open, or returns an
# EOF token.
# NOTE(review): the line that names this function and the `when` labels are
# not visible in this view.
3048 switch c = txt.charAt(cur++)
3050 return new_text_node parse_character_reference()
3052 tok_state = tok_state_tag_open
3055 return new_text_node c
3057 return new_eof_token()
3059 return new_text_node c
3062 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3063 # not needed: tok_state_character_reference_in_data = ->
3064 # just call parse_character_reference()
3066 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer function for the RCDATA state: reads the next character of txt
# (advancing cur) and either returns a token (character reference text,
# U+FFFD, EOF or the character itself) or switches to
# tok_state_rcdata_less_than_sign.
3067 tok_state_rcdata = ->
3068 switch c = txt.charAt(cur++)
3070 return new_text_node parse_character_reference()
3072 tok_state = tok_state_rcdata_less_than_sign
3075 return new_character_token "\ufffd"
3077 return new_eof_token()
3079 return new_character_token c
3082 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3083 # not needed: tok_state_character_reference_in_rcdata = ->
3084 # just call parse_character_reference()
3086 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer function for the RAWTEXT state: reads the next character of txt
# (advancing cur) and either returns a token (U+FFFD, EOF or the character
# itself) or switches to tok_state_rawtext_less_than_sign.
3087 tok_state_rawtext = ->
3088 switch c = txt.charAt(cur++)
3090 tok_state = tok_state_rawtext_less_than_sign
3093 return new_character_token "\ufffd"
3095 return new_eof_token()
3097 return new_character_token c
3100 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer function for the script data state: reads the next character of
# txt (advancing cur) and either returns a token (U+FFFD, EOF or the
# character itself) or switches to tok_state_script_data_less_than_sign.
3101 tok_state_script_data = ->
3102 switch c = txt.charAt(cur++)
3104 tok_state = tok_state_script_data_less_than_sign
3107 return new_character_token "\ufffd"
3109 return new_eof_token()
3111 return new_character_token c
3114 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer function for the PLAINTEXT state: reads the next character of
# txt (advancing cur) and returns a U+FFFD character token, an EOF token or
# a character token for the character itself. No state change is visible.
3115 tok_state_plaintext = ->
3116 switch c = txt.charAt(cur++)
3119 return new_character_token "\ufffd"
3121 return new_eof_token()
3123 return new_character_token c
3127 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer function for the tag open state: reads the character after '<'
# and either moves to another tokenizer state (markup declaration, end tag
# open, tag name, bogus comment) or falls back to the data state, un-reads
# the character and returns '<' as text.
3128 tok_state_tag_open = ->
3129 c = txt.charAt(cur++)
3131 tok_state = tok_state_markup_declaration_open
3134 tok_state = tok_state_end_tag_open
# the start-tag token is created with the lowercased character as its name
3137 tok_cur_tag = new_open_tag c.toLowerCase()
3138 tok_state = tok_state_tag_name
3141 tok_cur_tag = new_open_tag c
3142 tok_state = tok_state_tag_name
3146 tok_cur_tag = new_comment_token '?' # FIXME right?
3147 tok_state = tok_state_bogus_comment
3151 tok_state = tok_state_data
3152 cur -= 1 # we didn't parse/handle the char after <
3153 return new_text_node '<'
3155 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer function for the end tag open state: reads the character after
# '</' and either starts an end-tag token (moving to tok_state_tag_name),
# returns to the data state (in one branch returning '</' as text), or
# starts a bogus comment from the character.
3156 tok_state_end_tag_open = ->
3157 c = txt.charAt(cur++)
# the end-tag token is created with the lowercased character as its name
3159 tok_cur_tag = new_end_tag c.toLowerCase()
3160 tok_state = tok_state_tag_name
3163 tok_cur_tag = new_end_tag c
3164 tok_state = tok_state_tag_name
3168 tok_state = tok_state_data
3172 tok_state = tok_state_data
3173 return new_text_node '</'
3176 tok_cur_tag = new_comment_token c
3177 tok_state = tok_state_bogus_comment
3180 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer function for the tag name state: reads the next character and
# either leaves the state (before attribute name, self-closing start tag,
# data) or appends to tok_cur_tag.name (U+FFFD, the lowercased character, or
# the character as-is).
3181 tok_state_tag_name = ->
3182 switch c = txt.charAt(cur++)
# tab, line feed, form feed and space end the tag name
3183 when "\t", "\n", "\u000c", ' '
3184 tok_state = tok_state_before_attribute_name
3186 tok_state = tok_state_self_closing_start_tag
3188 tok_state = tok_state_data
3194 tok_cur_tag.name += "\ufffd"
3197 tok_state = tok_state_data
3200 tok_cur_tag.name += c.toLowerCase()
3202 tok_cur_tag.name += c
3205 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer function for the RCDATA less-than sign state: reads the
# character after '<' and either resets temporary_buffer and moves to
# tok_state_rcdata_end_tag_open, or returns to tok_state_rcdata, un-reads
# the character and returns '<' as a character token.
3206 tok_state_rcdata_less_than_sign = ->
3207 c = txt.charAt(cur++)
3209 temporary_buffer = ''
3210 tok_state = tok_state_rcdata_end_tag_open
3213 tok_state = tok_state_rcdata
3214 cur -= 1 # reconsume the input character
3215 return new_character_token '<'
3217 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer function for the RCDATA end tag open state: reads the character
# after '</' and either starts an end-tag token (recording the character in
# temporary_buffer and moving to tok_state_rcdata_end_tag_name), or returns
# to tok_state_rcdata, un-reads the character and returns "</" as a
# character token.
3218 tok_state_rcdata_end_tag_open = ->
3219 c = txt.charAt(cur++)
# the tag name is lowercased; temporary_buffer keeps the character as read
3221 tok_cur_tag = new_end_tag c.toLowerCase()
3222 temporary_buffer += c
3223 tok_state = tok_state_rcdata_end_tag_name
3226 tok_cur_tag = new_end_tag c
3227 temporary_buffer += c
3228 tok_state = tok_state_rcdata_end_tag_name
3231 tok_state = tok_state_rcdata
3232 cur -= 1 # reconsume the input character
3233 return new_character_token "</" # fixfull separate these
3235 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of
# the current node (open_els[0]), false otherwise.
3236 is_appropriate_end_tag = (t) ->
3237 # fixfull: this assumes that open_els[0].name is "the tag name of the last
3238 # start tag to have been emitted from this tokenizer"
3239 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3241 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer function for the RCDATA end tag name state: reads the next
# character; when tok_cur_tag is an appropriate end tag it can leave RCDATA
# (before attribute name, self-closing start tag, data). Name characters are
# appended to tok_cur_tag.name and temporary_buffer. Otherwise it returns to
# tok_state_rcdata, un-reads the character and returns '</' plus the
# buffered characters as a character token.
3242 tok_state_rcdata_end_tag_name = ->
3243 c = txt.charAt(cur++)
3244 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3245 if is_appropriate_end_tag tok_cur_tag
3246 tok_state = tok_state_before_attribute_name
3248 # else fall through to "Anything else"
3250 if is_appropriate_end_tag tok_cur_tag
3251 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3253 # else fall through to "Anything else"
3255 if is_appropriate_end_tag tok_cur_tag
3256 tok_state = tok_state_data
3258 # else fall through to "Anything else"
# the tag name is lowercased; temporary_buffer keeps the character as read
3260 tok_cur_tag.name += c.toLowerCase()
3261 temporary_buffer += c
3264 tok_cur_tag.name += c
3265 temporary_buffer += c
3268 tok_state = tok_state_rcdata
3269 cur -= 1 # reconsume the input character
3270 return new_character_token '</' + temporary_buffer # fixfull separate these
3272 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer function for the RAWTEXT less-than sign state: reads the
# character after '<' and either resets temporary_buffer and moves to
# tok_state_rawtext_end_tag_open, or returns to tok_state_rawtext, un-reads
# the character and returns '<' as a character token.
3273 tok_state_rawtext_less_than_sign = ->
3274 c = txt.charAt(cur++)
3276 temporary_buffer = ''
3277 tok_state = tok_state_rawtext_end_tag_open
3280 tok_state = tok_state_rawtext
3281 cur -= 1 # reconsume the input character
3282 return new_character_token '<'
3284 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer function for the RAWTEXT end tag open state: reads the character
# after '</' and either starts an end-tag token (recording the character in
# temporary_buffer and moving to tok_state_rawtext_end_tag_name), or returns
# to tok_state_rawtext, un-reads the character and returns "</" as a
# character token.
3285 tok_state_rawtext_end_tag_open = ->
3286 c = txt.charAt(cur++)
# the tag name is lowercased; temporary_buffer keeps the character as read
3288 tok_cur_tag = new_end_tag c.toLowerCase()
3289 temporary_buffer += c
3290 tok_state = tok_state_rawtext_end_tag_name
3293 tok_cur_tag = new_end_tag c
3294 temporary_buffer += c
3295 tok_state = tok_state_rawtext_end_tag_name
3298 tok_state = tok_state_rawtext
3299 cur -= 1 # reconsume the input character
3300 return new_character_token "</" # fixfull separate these
3302 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer function for the RAWTEXT end tag name state: reads the next
# character; when tok_cur_tag is an appropriate end tag it can leave RAWTEXT
# (before attribute name, self-closing start tag, data). Name characters are
# appended to tok_cur_tag.name and temporary_buffer. Otherwise it returns to
# tok_state_rawtext, un-reads the character and returns '</' plus the
# buffered characters as a character token.
3303 tok_state_rawtext_end_tag_name = ->
3304 c = txt.charAt(cur++)
3305 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3306 if is_appropriate_end_tag tok_cur_tag
3307 tok_state = tok_state_before_attribute_name
3309 # else fall through to "Anything else"
3311 if is_appropriate_end_tag tok_cur_tag
3312 tok_state = tok_state_self_closing_start_tag
3314 # else fall through to "Anything else"
3316 if is_appropriate_end_tag tok_cur_tag
3317 tok_state = tok_state_data
3319 # else fall through to "Anything else"
# the tag name is lowercased; temporary_buffer keeps the character as read
3321 tok_cur_tag.name += c.toLowerCase()
3322 temporary_buffer += c
3325 tok_cur_tag.name += c
3326 temporary_buffer += c
3329 tok_state = tok_state_rawtext
3330 cur -= 1 # reconsume the input character
3331 return new_character_token '</' + temporary_buffer # fixfull separate these
3333 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer function for the script data less-than sign state: reads the
# character after '<' and either resets temporary_buffer and moves to
# tok_state_script_data_end_tag_open, moves to
# tok_state_script_data_escape_start returning '<!', or returns to
# tok_state_script_data, un-reads the character and returns '<'.
3334 tok_state_script_data_less_than_sign = ->
3335 c = txt.charAt(cur++)
3337 temporary_buffer = ''
3338 tok_state = tok_state_script_data_end_tag_open
3341 tok_state = tok_state_script_data_escape_start
3342 return new_character_token '<!' # fixfull split
3344 tok_state = tok_state_script_data
3345 cur -= 1 # Reconsume
3346 return new_character_token '<'
3348 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3349 tok_state_script_data_end_tag_open = ->
3350 c = txt.charAt(cur++)
3352 tok_cur_tag = new_end_tag c.toLowerCase()
3353 temporary_buffer += c
3354 tok_state = tok_state_script_data_end_tag_name
3357 tok_cur_tag = new_end_tag c
3358 temporary_buffer += c
3359 tok_state = tok_state_script_data_end_tag_name
3362 tok_state = tok_state_script_data
3363 cur -= 1 # Reconsume
3364 return new_character_token '</'
3366 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3367 tok_state_script_data_end_tag_name = ->
3368 c = txt.charAt(cur++)
3369 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3370 if is_appropriate_end_tag tok_cur_tag
3371 tok_state = tok_state_before_attribute_name
3375 if is_appropriate_end_tag tok_cur_tag
3376 tok_state = tok_state_self_closing_start_tag
3380 if is_appropriate_end_tag tok_cur_tag
3381 tok_state = tok_state_data
3385 tok_cur_tag.name += c.toLowerCase()
3386 temporary_buffer += c
3389 tok_cur_tag.name += c
3390 temporary_buffer += c
3393 tok_state = tok_state_script_data
3394 cur -= 1 # Reconsume
3395 return new_character_token "</#{temporary_buffer}" # fixfull split
3397 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3398 tok_state_script_data_escape_start = ->
3399 c = txt.charAt(cur++)
3401 tok_state = tok_state_script_data_escape_start_dash
3402 return new_character_token '-'
3404 tok_state = tok_state_script_data
3405 cur -= 1 # Reconsume
3408 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3409 tok_state_script_data_escape_start_dash = ->
3410 c = txt.charAt(cur++)
3412 tok_state = tok_state_script_data_escaped_dash_dash
3413 return new_character_token '-'
3415 tok_state = tok_state_script_data
3416 cur -= 1 # Reconsume
3419 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3420 tok_state_script_data_escaped = ->
3421 c = txt.charAt(cur++)
3423 tok_state = tok_state_script_data_escaped_dash
3424 return new_character_token '-'
3426 tok_state = tok_state_script_data_escaped_less_than_sign
3430 return new_character_token "\ufffd"
3432 tok_state = tok_state_data
3434 cur -= 1 # Reconsume
3437 return new_character_token c
3439 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3440 tok_state_script_data_escaped_dash = ->
3441 c = txt.charAt(cur++)
3443 tok_state = tok_state_script_data_escaped_dash_dash
3444 return new_character_token '-'
3446 tok_state = tok_state_script_data_escaped_less_than_sign
3450 tok_state = tok_state_script_data_escaped
3451 return new_character_token "\ufffd"
3453 tok_state = tok_state_data
3455 cur -= 1 # Reconsume
3458 tok_state = tok_state_script_data_escaped
3459 return new_character_token c
3461 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3462 tok_state_script_data_escaped_dash_dash = ->
3463 c = txt.charAt(cur++)
3465 return new_character_token '-'
3467 tok_state = tok_state_script_data_escaped_less_than_sign
3470 tok_state = tok_state_script_data
3471 return new_character_token '>'
3474 tok_state = tok_state_script_data_escaped
3475 return new_character_token "\ufffd"
3478 tok_state = tok_state_data
3479 cur -= 1 # Reconsume
3482 tok_state = tok_state_script_data_escaped
3483 return new_character_token c
3485 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3486 tok_state_script_data_escaped_less_than_sign = ->
3487 c = txt.charAt(cur++)
3489 temporary_buffer = ''
3490 tok_state = tok_state_script_data_escaped_end_tag_open
3493 temporary_buffer = c.toLowerCase() # yes, really
3494 tok_state = tok_state_script_data_double_escape_start
3495 return new_character_token "<#{c}" # fixfull split
3497 temporary_buffer = c
3498 tok_state = tok_state_script_data_double_escape_start
3499 return new_character_token "<#{c}" # fixfull split
3501 tok_state = tok_state_script_data_escaped
3502 cur -= 1 # Reconsume
3503 return new_character_token '<'
3505 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3506 tok_state_script_data_escaped_end_tag_open = ->
3507 c = txt.charAt(cur++)
3509 tok_cur_tag = new_end_tag c.toLowerCase()
3510 temporary_buffer += c
3511 tok_state = tok_state_script_data_escaped_end_tag_name
3514 tok_cur_tag = new_end_tag c
3515 temporary_buffer += c
3516 tok_state = tok_state_script_data_escaped_end_tag_name
3519 tok_state = tok_state_script_data_escaped
3520 cur -= 1 # Reconsume
3521 return new_character_token '</' # fixfull split
3523 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3524 tok_state_script_data_escaped_end_tag_name = -> # tokenizer state: handles one character of a possible end tag name inside escaped script data
3525 c = txt.charAt(cur++) # consume the next input character
3526 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3527 if is_appropriate_end_tag tok_cur_tag
3528 tok_state = tok_state_before_attribute_name
3532 if is_appropriate_end_tag tok_cur_tag
3533 tok_state = tok_state_self_closing_start_tag
3537 if is_appropriate_end_tag tok_cur_tag
3538 tok_state = tok_state_data
3542 tok_cur_tag.name += c.toLowerCase()
3543 temporary_buffer += c.toLowerCase() # NOTE(review): the rawtext and script-data end-tag-name states append plain `c` to temporary_buffer; lower-casing here changes the case of the text re-emitted by the final return below - likely should be `c`, confirm against the spec
3546 tok_cur_tag.name += c
3547 temporary_buffer += c.toLowerCase() # NOTE(review): same lower-casing as above; the sibling states use plain `c` here
3550 tok_state = tok_state_script_data_escaped
3551 cur -= 1 # Reconsume
3552 return new_character_token "</#{temporary_buffer}" # fixfull split (gives back "</" plus the buffered characters as text)
3554 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3555 tok_state_script_data_double_escape_start = ->
3556 c = txt.charAt(cur++)
3557 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3558 if temporary_buffer is 'script'
3559 tok_state = tok_state_script_data_double_escaped
3561 tok_state = tok_state_script_data_escaped
3562 return new_character_token c
3564 temporary_buffer += c.toLowerCase() # yes, really lowercase
3565 return new_character_token c
3567 temporary_buffer += c
3568 return new_character_token c
3570 tok_state = tok_state_script_data_escaped
3571 cur -= 1 # Reconsume
3574 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3575 tok_state_script_data_double_escaped = ->
3576 c = txt.charAt(cur++)
3578 tok_state = tok_state_script_data_double_escaped_dash
3579 return new_character_token '-'
3581 tok_state = tok_state_script_data_double_escaped_less_than_sign
3582 return new_character_token '<'
3585 return new_character_token "\ufffd"
3588 tok_state = tok_state_data
3589 cur -= 1 # Reconsume
3592 return new_character_token c
3594 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3595 tok_state_script_data_double_escaped_dash = ->
3596 c = txt.charAt(cur++)
3598 tok_state = tok_state_script_data_double_escaped_dash_dash
3599 return new_character_token '-'
3601 tok_state = tok_state_script_data_double_escaped_less_than_sign
3602 return new_character_token '<'
3605 tok_state = tok_state_script_data_double_escaped
3606 return new_character_token "\ufffd"
3609 tok_state = tok_state_data
3610 cur -= 1 # Reconsume
3613 tok_state = tok_state_script_data_double_escaped
3614 return new_character_token c
3616 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3617 tok_state_script_data_double_escaped_dash_dash = ->
3618 c = txt.charAt(cur++)
3620 return new_character_token '-'
3622 tok_state = tok_state_script_data_double_escaped_less_than_sign
3623 return new_character_token '<'
3625 tok_state = tok_state_script_data
3626 return new_character_token '>'
3629 tok_state = tok_state_script_data_double_escaped
3630 return new_character_token "\ufffd"
3633 tok_state = tok_state_data
3634 cur -= 1 # Reconsume
3637 tok_state = tok_state_script_data_double_escaped
3638 return new_character_token c
3640 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3641 tok_state_script_data_double_escaped_less_than_sign = ->
3642 c = txt.charAt(cur++)
3644 temporary_buffer = ''
3645 tok_state = tok_state_script_data_double_escape_end
3646 return new_character_token '/'
3648 tok_state = tok_state_script_data_double_escaped
3649 cur -= 1 # Reconsume
3652 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3653 tok_state_script_data_double_escape_end = ->
3654 c = txt.charAt(cur++)
3655 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3656 if temporary_buffer is 'script'
3657 tok_state = tok_state_script_data_escaped
3659 tok_state = tok_state_script_data_double_escaped
3660 return new_character_token c
3662 temporary_buffer += c.toLowerCase() # yes, really lowercase
3663 return new_character_token c
3665 temporary_buffer += c
3666 return new_character_token c
3668 tok_state = tok_state_script_data_double_escaped
3669 cur -= 1 # Reconsume
3672 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3673 tok_state_before_attribute_name = ->
3675 switch c = txt.charAt(cur++)
3676 when "\t", "\n", "\u000c", ' '
3679 tok_state = tok_state_self_closing_start_tag
3682 tok_state = tok_state_data
3688 attr_name = "\ufffd"
3689 when '"', "'", '<', '='
3694 tok_state = tok_state_data
3697 attr_name = c.toLowerCase()
3701 tok_cur_tag.attrs_a.unshift [attr_name, '']
3702 tok_state = tok_state_attribute_name
3705 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3706 tok_state_attribute_name = ->
3707 switch c = txt.charAt(cur++)
3708 when "\t", "\n", "\u000c", ' '
3709 tok_state = tok_state_after_attribute_name
3711 tok_state = tok_state_self_closing_start_tag
3713 tok_state = tok_state_before_attribute_value
3715 tok_state = tok_state_data
3721 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3724 tok_cur_tag.attrs_a[0][0] += c
3727 tok_state = tok_state_data
3730 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3732 tok_cur_tag.attrs_a[0][0] += c
3735 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3736 tok_state_after_attribute_name = ->
3737 c = txt.charAt(cur++)
3738 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3741 tok_state = tok_state_self_closing_start_tag
3744 tok_state = tok_state_before_attribute_value
3747 tok_state = tok_state_data
3750 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3751 tok_state = tok_state_attribute_name
3755 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3756 tok_state = tok_state_attribute_name
3760 tok_state = tok_state_data
3761 cur -= 1 # reconsume
3763 if c is '"' or c is "'" or c is '<'
3765 # fall through to Anything else
3767 tok_cur_tag.attrs_a.unshift [c, '']
3768 tok_state = tok_state_attribute_name
3771 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3772 tok_state_before_attribute_value = ->
3773 switch c = txt.charAt(cur++)
3774 when "\t", "\n", "\u000c", ' '
3777 tok_state = tok_state_attribute_value_double_quoted
3779 tok_state = tok_state_attribute_value_unquoted
3782 tok_state = tok_state_attribute_value_single_quoted
3785 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3786 tok_state = tok_state_attribute_value_unquoted
3789 tok_state = tok_state_data
3795 tok_state = tok_state_data
3797 tok_cur_tag.attrs_a[0][1] += c
3798 tok_state = tok_state_attribute_value_unquoted
3801 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3802 tok_state_attribute_value_double_quoted = ->
3803 switch c = txt.charAt(cur++)
3805 tok_state = tok_state_after_attribute_value_quoted
3807 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3810 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3813 tok_state = tok_state_data
3815 tok_cur_tag.attrs_a[0][1] += c
3818 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3819 tok_state_attribute_value_single_quoted = ->
3820 switch c = txt.charAt(cur++)
3822 tok_state = tok_state_after_attribute_value_quoted
3824 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3827 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3830 tok_state = tok_state_data
3832 tok_cur_tag.attrs_a[0][1] += c
3835 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3836 tok_state_attribute_value_unquoted = ->
3837 switch c = txt.charAt(cur++)
3838 when "\t", "\n", "\u000c", ' '
3839 tok_state = tok_state_before_attribute_name
3841 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3843 tok_state = tok_state_data
3848 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3851 tok_state = tok_state_data
3853 # Parse Error if ", ', <, = or ` (backtick)
3854 tok_cur_tag.attrs_a[0][1] += c
3857 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3858 tok_state_after_attribute_value_quoted = ->
3859 switch c = txt.charAt(cur++)
3860 when "\t", "\n", "\u000c", ' '
3861 tok_state = tok_state_before_attribute_name
3863 tok_state = tok_state_self_closing_start_tag
3865 tok_state = tok_state_data
3871 tok_state = tok_state_data
3874 tok_state = tok_state_before_attribute_name
3875 cur -= 1 # we didn't handle that char
3878 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3879 tok_state_self_closing_start_tag = ->
3880 c = txt.charAt(cur++)
3882 tok_cur_tag.flag 'self-closing', true
3883 tok_state = tok_state_data
3887 tok_state = tok_state_data
3888 cur -= 1 # Reconsume
3892 tok_state = tok_state_before_attribute_name
3893 cur -= 1 # Reconsume
3896 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3897 # WARNING: put a comment token in tok_cur_tag before setting this state
3898 tok_state_bogus_comment = ->
3899 next_gt = txt.indexOf '>', cur
3901 val = txt.substr cur
3904 val = txt.substr cur, (next_gt - cur)
3906 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3907 tok_cur_tag.text += val
3908 tok_state = tok_state_data
3911 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3912 tok_state_markup_declaration_open = ->
3913 if txt.substr(cur, 2) is '--'
3915 tok_cur_tag = new_comment_token ''
3916 tok_state = tok_state_comment_start
3918 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3920 tok_state = tok_state_doctype
3922 acn = adjusted_current_node()
3923 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3925 tok_state = tok_state_cdata_section
3929 tok_cur_tag = new_comment_token ''
3930 tok_state = tok_state_bogus_comment
3933 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3934 tok_state_comment_start = -> # tokenizer state: handles the first character after the opening "<!--" of a comment
3935 switch c = txt.charAt(cur++) # consume the next input character
3937 tok_state = tok_state_comment_start_dash
3940 tok_state = tok_state_comment
3941 return new_character_token "\ufffd" # NOTE(review): emits U+FFFD as a character token; the sibling comment states append "\ufffd" to tok_cur_tag.text instead, and the spec's comment start state appends it to the comment's data - likely should be `tok_cur_tag.text += "\ufffd"`, confirm
3944 tok_state = tok_state_data
3948 tok_state = tok_state_data
3949 cur -= 1 # Reconsume
3952 tok_cur_tag.text += c # ordinary character: becomes part of the comment text
3953 tok_state = tok_state_comment
3956 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3957 tok_state_comment_start_dash = ->
3958 switch c = txt.charAt(cur++)
3960 tok_state = tok_state_comment_end
3963 tok_cur_tag.text += "-\ufffd"
3964 tok_state = tok_state_comment
3967 tok_state = tok_state_data
3971 tok_state = tok_state_data
3972 cur -= 1 # Reconsume
3975 tok_cur_tag.text += "-#{c}"
3976 tok_state = tok_state_comment
3979 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3980 tok_state_comment = ->
3981 switch c = txt.charAt(cur++)
3983 tok_state = tok_state_comment_end_dash
3986 tok_cur_tag.text += "\ufffd"
3989 tok_state = tok_state_data
3990 cur -= 1 # Reconsume
3993 tok_cur_tag.text += c
3996 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3997 tok_state_comment_end_dash = ->
3998 switch c = txt.charAt(cur++)
4000 tok_state = tok_state_comment_end
4003 tok_cur_tag.text += "-\ufffd"
4004 tok_state = tok_state_comment
4007 tok_state = tok_state_data
4008 cur -= 1 # Reconsume
4011 tok_cur_tag.text += "-#{c}"
4012 tok_state = tok_state_comment
4015 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
4016 tok_state_comment_end = ->
4017 switch c = txt.charAt(cur++)
4019 tok_state = tok_state_data
4023 tok_cur_tag.text += "--\ufffd"
4024 tok_state = tok_state_comment
4027 tok_state = tok_state_comment_end_bang
4030 tok_cur_tag.text += '-'
4033 tok_state = tok_state_data
4034 cur -= 1 # Reconsume
4038 tok_cur_tag.text += "--#{c}"
4039 tok_state = tok_state_comment
4042 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4043 tok_state_comment_end_bang = -> # tokenizer state: handles the character that follows "--!" inside a comment
4044 switch c = txt.charAt(cur++) # consume the next input character
4046 tok_cur_tag.text += "--!#{c}" # NOTE(review): this branch moves to the comment-end-dash state, so it is presumably the '-' case; the spec appends only "--!" there (later states account for the dash), so appending c looks like it adds an extra '-' - confirm
4047 tok_state = tok_state_comment_end_dash
4049 tok_state = tok_state_data
4053 tok_cur_tag.text += "--!\ufffd"
4054 tok_state = tok_state_comment
4057 tok_state = tok_state_data
4058 cur -= 1 # Reconsume
4061 tok_cur_tag.text += "--!#{c}" # keep the "--!" seen so far, plus this character, as comment text
4062 tok_state = tok_state_comment
4065 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4066 tok_state_doctype = ->
4067 switch c = txt.charAt(cur++)
4068 when "\t", "\u000a", "\u000c", ' '
4069 tok_state = tok_state_before_doctype_name
4072 tok_state = tok_state_data
4073 el = new_doctype_token ''
4074 el.flag 'force-quirks', true
4075 cur -= 1 # Reconsume
4079 tok_state = tok_state_before_doctype_name
4080 cur -= 1 # Reconsume
4083 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
4084 tok_state_before_doctype_name = ->
4085 c = txt.charAt(cur++)
4086 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4089 tok_cur_tag = new_doctype_token c.toLowerCase()
4090 tok_state = tok_state_doctype_name
4094 tok_cur_tag = new_doctype_token "\ufffd"
4095 tok_state = tok_state_doctype_name
4099 el = new_doctype_token ''
4100 el.flag 'force-quirks', true
4101 tok_state = tok_state_data
4105 tok_state = tok_state_data
4106 el = new_doctype_token ''
4107 el.flag 'force-quirks', true
4108 cur -= 1 # Reconsume
4111 tok_cur_tag = new_doctype_token c
4112 tok_state = tok_state_doctype_name
4115 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4116 tok_state_doctype_name = ->
4117 c = txt.charAt(cur++)
4118 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4119 tok_state = tok_state_after_doctype_name
4122 tok_state = tok_state_data
4125 tok_cur_tag.name += c.toLowerCase()
4129 tok_cur_tag.name += "\ufffd"
4133 tok_state = tok_state_data
4134 tok_cur_tag.flag 'force-quirks', true
4135 cur -= 1 # Reconsume
4138 tok_cur_tag.name += c
4141 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4142 tok_state_after_doctype_name = ->
4143 c = txt.charAt(cur++)
4144 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4147 tok_state = tok_state_data
4151 tok_state = tok_state_data
4152 tok_cur_tag.flag 'force-quirks', true
4153 cur -= 1 # Reconsume
4156 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4158 tok_state = tok_state_after_doctype_public_keyword
4160 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4162 tok_state = tok_state_after_doctype_system_keyword
4165 tok_cur_tag.flag 'force-quirks', true
4166 tok_state = tok_state_bogus_doctype
4169 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4170 tok_state_after_doctype_public_keyword = ->
4171 c = txt.charAt(cur++)
4172 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4173 tok_state = tok_state_before_doctype_public_identifier
4177 tok_cur_tag.public_identifier = ''
4178 tok_state = tok_state_doctype_public_identifier_double_quoted
4182 tok_cur_tag.public_identifier = ''
4183 tok_state = tok_state_doctype_public_identifier_single_quoted
4187 tok_cur_tag.flag 'force-quirks', true
4188 tok_state = tok_state_data
4192 tok_state = tok_state_data
4193 tok_cur_tag.flag 'force-quirks', true
4194 cur -= 1 # Reconsume
4198 tok_cur_tag.flag 'force-quirks', true
4199 tok_state = tok_state_bogus_doctype
4202 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4203 tok_state_before_doctype_public_identifier = ->
4204 c = txt.charAt(cur++)
4205 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4209 tok_cur_tag.public_identifier = ''
4210 tok_state = tok_state_doctype_public_identifier_double_quoted
4214 tok_cur_tag.public_identifier = ''
4215 tok_state = tok_state_doctype_public_identifier_single_quoted
4219 tok_cur_tag.flag 'force-quirks', true
4220 tok_state = tok_state_data
4224 tok_state = tok_state_data
4225 tok_cur_tag.flag 'force-quirks', true
4226 cur -= 1 # Reconsume
4230 tok_cur_tag.flag 'force-quirks', true
4231 tok_state = tok_state_bogus_doctype
4235 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4236 tok_state_doctype_public_identifier_double_quoted = ->
4237 c = txt.charAt(cur++)
4239 tok_state = tok_state_after_doctype_public_identifier
4243 tok_cur_tag.public_identifier += "\ufffd"
4247 tok_cur_tag.flag 'force-quirks', true
4248 tok_state = tok_state_data
4252 tok_state = tok_state_data
4253 tok_cur_tag.flag 'force-quirks', true
4254 cur -= 1 # Reconsume
4257 tok_cur_tag.public_identifier += c
4260 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4261 tok_state_doctype_public_identifier_single_quoted = ->
4262 c = txt.charAt(cur++)
4264 tok_state = tok_state_after_doctype_public_identifier
4268 tok_cur_tag.public_identifier += "\ufffd"
4272 tok_cur_tag.flag 'force-quirks', true
4273 tok_state = tok_state_data
4277 tok_state = tok_state_data
4278 tok_cur_tag.flag 'force-quirks', true
4279 cur -= 1 # Reconsume
4282 tok_cur_tag.public_identifier += c
4285 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4286 tok_state_after_doctype_public_identifier = ->
4287 c = txt.charAt(cur++)
4288 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4289 tok_state = tok_state_between_doctype_public_and_system_identifiers
4292 tok_state = tok_state_data
4296 tok_cur_tag.system_identifier = ''
4297 tok_state = tok_state_doctype_system_identifier_double_quoted
4301 tok_cur_tag.system_identifier = ''
4302 tok_state = tok_state_doctype_system_identifier_single_quoted
4306 tok_state = tok_state_data
4307 tok_cur_tag.flag 'force-quirks', true
4308 cur -= 1 # Reconsume
4312 tok_cur_tag.flag 'force-quirks', true
4313 tok_state = tok_state_bogus_doctype
4316 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4317 tok_state_between_doctype_public_and_system_identifiers = ->
4318 c = txt.charAt(cur++)
4319 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4322 tok_state = tok_state_data
4326 tok_cur_tag.system_identifier = ''
4327 tok_state = tok_state_doctype_system_identifier_double_quoted
4331 tok_cur_tag.system_identifier = ''
4332 tok_state = tok_state_doctype_system_identifier_single_quoted
4336 tok_state = tok_state_data
4337 tok_cur_tag.flag 'force-quirks', true
4338 cur -= 1 # Reconsume
4342 tok_cur_tag.flag 'force-quirks', true
4343 tok_state = tok_state_bogus_doctype
4346 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4347 tok_state_after_doctype_system_keyword = ->
4348 c = txt.charAt(cur++)
4349 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4350 tok_state = tok_state_before_doctype_system_identifier
4354 tok_cur_tag.system_identifier = ''
4355 tok_state = tok_state_doctype_system_identifier_double_quoted
4359 tok_cur_tag.system_identifier = ''
4360 tok_state = tok_state_doctype_system_identifier_single_quoted
4364 tok_cur_tag.flag 'force-quirks', true
4365 tok_state = tok_state_data
4369 tok_state = tok_state_data
4370 tok_cur_tag.flag 'force-quirks', true
4371 cur -= 1 # Reconsume
4375 tok_cur_tag.flag 'force-quirks', true
4376 tok_state = tok_state_bogus_doctype
4379 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4380 tok_state_before_doctype_system_identifier = ->
4381 c = txt.charAt(cur++)
4382 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4385 tok_cur_tag.system_identifier = ''
4386 tok_state = tok_state_doctype_system_identifier_double_quoted
4389 tok_cur_tag.system_identifier = ''
4390 tok_state = tok_state_doctype_system_identifier_single_quoted
4394 tok_cur_tag.flag 'force-quirks', true
4395 tok_state = tok_state_data
4399 tok_state = tok_state_data
4400 tok_cur_tag.flag 'force-quirks', true
4401 cur -= 1 # Reconsume
4405 tok_cur_tag.flag 'force-quirks', true
4406 tok_state = tok_state_bogus_doctype
4409 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4410 tok_state_doctype_system_identifier_double_quoted = ->
4411 c = txt.charAt(cur++)
4413 tok_state = tok_state_after_doctype_system_identifier
4417 tok_cur_tag.system_identifier += "\ufffd"
4421 tok_cur_tag.flag 'force-quirks', true
4422 tok_state = tok_state_data
4426 tok_state = tok_state_data
4427 tok_cur_tag.flag 'force-quirks', true
4428 cur -= 1 # Reconsume
4431 tok_cur_tag.system_identifier += c
4434 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4435 tok_state_doctype_system_identifier_single_quoted = ->
4436 c = txt.charAt(cur++)
4438 tok_state = tok_state_after_doctype_system_identifier
4442 tok_cur_tag.system_identifier += "\ufffd"
4446 tok_cur_tag.flag 'force-quirks', true
4447 tok_state = tok_state_data
4451 tok_state = tok_state_data
4452 tok_cur_tag.flag 'force-quirks', true
4453 cur -= 1 # Reconsume
4456 tok_cur_tag.system_identifier += c
4459 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4460 tok_state_after_doctype_system_identifier = ->
4461 c = txt.charAt(cur++)
4462 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4465 tok_state = tok_state_data
4469 tok_state = tok_state_data
4470 tok_cur_tag.flag 'force-quirks', true
4471 cur -= 1 # Reconsume
4475 # do _not_ tok_cur_tag.flag 'force-quirks', true
4476 tok_state = tok_state_bogus_doctype
4479 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4480 tok_state_bogus_doctype = ->
4481 c = txt.charAt(cur++)
4483 tok_state = tok_state_data
4486 tok_state = tok_state_data
4487 cur -= 1 # Reconsume
4492 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4493 tok_state_cdata_section = ->
4494 tok_state = tok_state_data
4495 next_gt = txt.indexOf ']]>', cur
4497 val = txt.substr cur
4500 val = txt.substr cur, (next_gt - cur)
4502 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
4504 return new_character_token val # fixfull split
4507 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4508 # Don't set this as a state, just call it
4509 # returns a string (NOT a text node)
4510 parse_character_reference = (allowed_char = null, in_attr = false) ->
4511 if cur >= txt.length
4513 switch c = txt.charAt(cur)
4514 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4515 # explicitly not a parse error
4518 # there has to be "one or more" alnums between & and ; to be a parse error
4521 if cur + 1 >= txt.length
4523 if txt.charAt(cur + 1).toLowerCase() is 'x'
4532 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4537 if txt.charAt(start + i) is ';'
4541 code_point = txt.substr(start, i)
4542 while code_point.charAt(0) is '0' and code_point.length > 1
4543 code_point = code_point.substr 1
4544 code_point = parseInt(code_point, base)
4545 if unicode_fixes[code_point]?
4547 return unicode_fixes[code_point]
4549 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4553 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4555 return from_code_point code_point
4559 if alnum.indexOf(txt.charAt(cur + i)) is -1
4562 # exit early, because parse_error() below needs at least one alnum
4564 if txt.charAt(cur + i) is ';'
4565 decoded = decode_named_char_ref txt.substr(cur, i)
4566 i += 1 # scan past the ';' (after, so we dno't pass it to decode)
4570 # else FALL THROUGH (check for match without last char(s) or ";")
4571 # no ';' terminator (only legacy char refs)
4573 for i in [2..max] # no prefix matches, so ok to check shortest first
4574 c = legacy_char_refs[txt.substr(cur, i)]
4577 if txt.charAt(cur + i) is '='
4578 # "because some legacy user agents will
4579 # misinterpret the markup in those cases"
4582 if alnum.indexOf(txt.charAt(cur + i)) > -1
4583 # this makes attributes forgiving about url args
4585 # ok, and besides the weird exceptions for attributes...
4586 # return the matching char
4587 cur += i # consume entity chars
4588 parse_error() # because no terminating ";"
4592 return # never reached
4594 eat_next_token_if_newline = ->
4599 if t.type is TYPE_TEXT
4600 # definition of a newline depends on whether it was a character ref or not
4601 if cur - old_cur is 1
4602 # not a character reference
4603 if t.text is "\u000d" or t.text is "\u000a"
4606 if t.text is "\u000a"
4612 # tree constructor initialization
4613 # see comments on TYPE_TAG/etc for the structure of this data
4616 doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4617 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4618 fragment_root = null # fragment parsing algorithm returns children of this
4620 afe = [] # active formatting elements
4621 template_ins_modes = []
4622 ins_mode = ins_mode_initial
4623 original_ins_mode = ins_mode # TODO check spec
4624 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4625 flag_frameset_ok = true
4627 flag_foster_parenting = false
4628 form_element_pointer = null
4629 temporary_buffer = null
4630 pending_table_character_tokens = []
4631 head_element_pointer = null
4632 flag_fragment_parsing = false
4633 context_element = null
4634 prev_node_id = 0 # just for debugging
4636 # tokenizer initialization
4637 tok_state = tok_state_data
4640 # fragment parsing (text arg)
4642 # this handles the fragment from the tests in the format described here:
4643 # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
4646 if f.substr(0, 5) is 'math '
4649 else if f.substr(0, 4) is 'svg '
4653 context_element = token_to_element t, ns
4654 context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4655 context_element.document.flag 'quirks mode', QUIRKS_NO
4656 # fragment parsing (Node arg)
4658 context_element = args.context
4660 # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4661 # fragment parsing algorithm
4663 flag_fragment_parsing = true
4664 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4665 # search up the tree from context, to try to find its document,
4666 # because this file only puts a "document" property on the root
4669 el = context_element
4672 old_doc = el.document
4679 doc.flag 'quirks mode', old_doc.flag 'quirks mode'
4681 if context_element.namespace is NS_HTML
4682 switch context_element.name
4683 when 'title', 'textarea'
4684 tok_state = tok_state_rcdata
4685 when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
4686 tok_state = tok_state_rawtext
4688 tok_state = tok_state_script_data
4691 tok_state = tok_state_rawtext
4693 tok_state = tok_state_plaintext
4694 fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4695 doc.children.push fragment_root
4696 fragment_root.document = doc
4697 open_els = [fragment_root]
4698 if context_element.name is 'template' and context_element.namespace is NS_HTML
4699 template_ins_modes.unshift ins_mode_in_template
4700 # fixfull create token for context (it should have its original one already)
4702 # set form_element pointer... in the foreign doc?!
4703 el = context_element
4705 if el.name is 'form' and el.namespace is NS_HTML
4706 form_element_pointer = el
4713 # text pre-processing
4714 # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4715 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4716 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4720 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4721 parse_main_loop = ->
4726 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4731 if flag_fragment_parsing
4732 return fragment_root.children
# Public interface of this file: everything below is attached to `exports`.
# `parse` is the parser entry point (used as `peach_parser.parse(...)` in the
# usage notes at the top of this file).
4735 exports.parse = parse_html
# NOTE(review): presumably helpers for the debug log; they are defined outside
# this section, so confirm their behavior there.
4736 exports.debug_log_reset = debug_log_reset
4737 exports.debug_log_each = debug_log_each
# Node type constants. TYPE_TAG is what `new Node` receives as its first
# argument when the document/element nodes are created; the other TYPE_*
# values are presumably used the same way for their node kinds.
4738 exports.TYPE_TAG = TYPE_TAG
4739 exports.TYPE_TEXT = TYPE_TEXT
4740 exports.TYPE_COMMENT = TYPE_COMMENT
4741 exports.TYPE_DOCTYPE = TYPE_DOCTYPE
# Namespace constants, compared against / stored in a Node's `namespace`.
4742 exports.NS_HTML = NS_HTML
4743 exports.NS_MATHML = NS_MATHML
4744 exports.NS_SVG = NS_SVG
# Values for a document node's 'quirks mode' flag (the parser initializes it
# to QUIRKS_NO).
4745 exports.QUIRKS_NO = QUIRKS_NO
4746 exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4747 exports.QUIRKS_YES = QUIRKS_YES