1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a thorough parser for html5, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
24 # http://www.w3.org/TR/html5/syntax.html
26 # except for some places marked "WHATWG" that are implemented as described here:
28 # https://html.spec.whatwg.org/multipage/syntax.html
30 # This code passes all of the tests in the .dat files at:
32 # https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
35 ##################################
36 ## how to use this code
37 ##################################
39 # See README.md for how to run this file in the browser or in node.js.
41 # This file exports a single useful function: parse_html, and some constants
42 # (see the bottom of this file for those.)
46 # wheic.parse_html("<p><b>hi</p>")
48 # Or, if you don't want <html><head><body>/etc, do this:
50 # wheic.parse_html("<p><b>hi</p>", {fragment: "body"})
52 # return value is an array of Nodes, see "class Node" below.
54 # This code is a work in progress, e.g. try searching this file for "fixfull",
60 # Jason was frequently confused by the terminology used to refer to different
61 # parts of the stacks and lists in the spec, so he made this chart to help keep
64 # stacks grow downward (current element is index=0)
66 # example: open_els = [a, b, c, d, e, f, g]
68 # "grows downwards" means it's visualized like this: (index: el "names")
70 # 6: g "start of the list", "topmost", "first"
72 # 4: e "previous" (to d), "above", "before"
73 # 3: d (previous/next are relative to this element)
74 # 2: c "next", "after", "lower", "below"
76 # 0: a "end of the list", "current node", "bottommost", "last"
78 if (typeof module) isnt 'undefined' and module.exports?
80 exports = module.exports
84 exports = window.wheic
86 from_code_point = (x) ->
87 if String.fromCodePoint?
88 return String.fromCodePoint x
91 return String.fromCharCode x
93 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
95 # Each node is an object of the Node class. Here are the Node types:
96 TYPE_TAG = 0 # name, {attributes}, [children]
97 TYPE_TEXT = 1 # "text"
100 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
101 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
102 TYPE_END_TAG = 5 # name
104 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
105 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
107 # namespace constants
112 # quirks mode constants
117 # queue up debug logs, so eg they can be shown only for tests that fail
125 debug_log_each = (cb) ->
126 for str in g_debug_log
132 constructor: (type, args = {}) ->
133 @type = type # one of the TYPE_* constants above
134 @name = args.name ? '' # tag name
135 @text = args.text ? '' # contents for text/comment nodes
136 @attrs = args.attrs ? {}
137 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
138 @children = args.children ? []
139 @namespace = args.namespace ? NS_HTML
140 @parent = args.parent ? null
141 @token = args.token ? null
142 @flags = args.flags ? {}
146 @id = "#{++prev_node_id}"
147 acknowledge_self_closing: ->
149 @token.flag 'did_self_close', true
151 @flag 'did_self_close', true
153 flag: (key, value = null) ->
160 # helpers: (only take args that are normally known when parser creates nodes)
# Builds a TYPE_START_TAG token node carrying the given tag name.
new_open_tag = (name) -> new Node TYPE_START_TAG, {name}
# Builds a TYPE_END_TAG token node carrying the given tag name.
new_end_tag = (name) -> new Node TYPE_END_TAG, {name}
# Builds a TYPE_TAG (element) node with the given tag name; every other
# field is left at the Node constructor's default.
new_element = (name) -> new Node TYPE_TAG, {name}
# Builds a TYPE_TEXT node whose text is txt.
new_text_node = (txt) -> new Node TYPE_TEXT, {text: txt}
# Character tokens use the same representation, so this is the same function
# under a second name.
new_character_token = new_text_node
# Builds a TYPE_COMMENT node whose text is txt.
new_comment_token = (txt) -> new Node TYPE_COMMENT, {text: txt}
# Builds a TYPE_DOCTYPE node; the doctype's name is stored in the node's
# name field.
new_doctype_token = (name) -> new Node TYPE_DOCTYPE, {name}
175 return new Node TYPE_EOF
177 return new Node TYPE_AFE_MARKER
# Builds a fresh TYPE_AAA_BOOKMARK node (takes no arguments, so each call
# yields a distinct Node).
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
# Character classes, kept as plain strings so membership can be tested with
# indexOf() (see is_uc_alpha / is_lc_alpha).
digits = "0123456789"
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
# lowercase letters, then uppercase letters, then digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# digits followed by the hex letters in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True when str has length 1 and occurs in uc_alpha (A-Z).
is_uc_alpha = (str) ->
	return false if str.length isnt 1
	return uc_alpha.indexOf(str) >= 0
# True when str has length 1 and occurs in lc_alpha (a-z).
is_lc_alpha = (str) ->
	return false if str.length isnt 1
	return lc_alpha.indexOf(str) >= 0
192 # some SVG elements have dashes in them
193 tag_name_chars = alnum + "-"
195 # http://www.w3.org/TR/html5/infrastructure.html#space-character
196 space_chars = "\u0009\u000a\u000c\u000d\u0020"
198 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when t is a TYPE_TEXT token whose text is exactly one character and
# that character is listed in space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	return space_chars.indexOf(t.text) isnt -1
202 is_input_hidden_tok = (t) ->
203 return false unless t.type is TYPE_START_TAG
206 if a[1].toLowerCase() is 'hidden'
211 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
212 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
215 unicode_fixes[0x00] = "\uFFFD"
216 unicode_fixes[0x80] = "\u20AC"
217 unicode_fixes[0x82] = "\u201A"
218 unicode_fixes[0x83] = "\u0192"
219 unicode_fixes[0x84] = "\u201E"
220 unicode_fixes[0x85] = "\u2026"
221 unicode_fixes[0x86] = "\u2020"
222 unicode_fixes[0x87] = "\u2021"
223 unicode_fixes[0x88] = "\u02C6"
224 unicode_fixes[0x89] = "\u2030"
225 unicode_fixes[0x8A] = "\u0160"
226 unicode_fixes[0x8B] = "\u2039"
227 unicode_fixes[0x8C] = "\u0152"
228 unicode_fixes[0x8E] = "\u017D"
229 unicode_fixes[0x91] = "\u2018"
230 unicode_fixes[0x92] = "\u2019"
231 unicode_fixes[0x93] = "\u201C"
232 unicode_fixes[0x94] = "\u201D"
233 unicode_fixes[0x95] = "\u2022"
234 unicode_fixes[0x96] = "\u2013"
235 unicode_fixes[0x97] = "\u2014"
236 unicode_fixes[0x98] = "\u02DC"
237 unicode_fixes[0x99] = "\u2122"
238 unicode_fixes[0x9A] = "\u0161"
239 unicode_fixes[0x9B] = "\u203A"
240 unicode_fixes[0x9C] = "\u0153"
241 unicode_fixes[0x9E] = "\u017E"
242 unicode_fixes[0x9F] = "\u0178"
244 quirks_yes_pi_prefixes = [
245 "+//silmaril//dtd html pro v0r11 19970101//"
246 "-//as//dtd html 3.0 aswedit + extensions//"
247 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
248 "-//ietf//dtd html 2.0 level 1//"
249 "-//ietf//dtd html 2.0 level 2//"
250 "-//ietf//dtd html 2.0 strict level 1//"
251 "-//ietf//dtd html 2.0 strict level 2//"
252 "-//ietf//dtd html 2.0 strict//"
253 "-//ietf//dtd html 2.0//"
254 "-//ietf//dtd html 2.1e//"
255 "-//ietf//dtd html 3.0//"
256 "-//ietf//dtd html 3.2 final//"
257 "-//ietf//dtd html 3.2//"
258 "-//ietf//dtd html 3//"
259 "-//ietf//dtd html level 0//"
260 "-//ietf//dtd html level 1//"
261 "-//ietf//dtd html level 2//"
262 "-//ietf//dtd html level 3//"
263 "-//ietf//dtd html strict level 0//"
264 "-//ietf//dtd html strict level 1//"
265 "-//ietf//dtd html strict level 2//"
266 "-//ietf//dtd html strict level 3//"
267 "-//ietf//dtd html strict//"
268 "-//ietf//dtd html//"
269 "-//metrius//dtd metrius presentational//"
270 "-//microsoft//dtd internet explorer 2.0 html strict//"
271 "-//microsoft//dtd internet explorer 2.0 html//"
272 "-//microsoft//dtd internet explorer 2.0 tables//"
273 "-//microsoft//dtd internet explorer 3.0 html strict//"
274 "-//microsoft//dtd internet explorer 3.0 html//"
275 "-//microsoft//dtd internet explorer 3.0 tables//"
276 "-//netscape comm. corp.//dtd html//"
277 "-//netscape comm. corp.//dtd strict html//"
278 "-//o'reilly and associates//dtd html 2.0//"
279 "-//o'reilly and associates//dtd html extended 1.0//"
280 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
281 "-//sq//dtd html 2.0 hotmetal + extensions//"
282 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
283 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
284 "-//spyglass//dtd html 2.0 extended//"
285 "-//sun microsystems corp.//dtd hotjava html//"
286 "-//sun microsystems corp.//dtd hotjava strict html//"
287 "-//w3c//dtd html 3 1995-03-24//"
288 "-//w3c//dtd html 3.2 draft//"
289 "-//w3c//dtd html 3.2 final//"
290 "-//w3c//dtd html 3.2//"
291 "-//w3c//dtd html 3.2s draft//"
292 "-//w3c//dtd html 4.0 frameset//"
293 "-//w3c//dtd html 4.0 transitional//"
294 "-//w3c//dtd html experimental 19960712//"
295 "-//w3c//dtd html experimental 970421//"
296 "-//w3c//dtd w3 html//"
297 "-//w3o//dtd w3 html 3.0//"
298 "-//webtechs//dtd mozilla html 2.0//"
299 "-//webtechs//dtd mozilla html//"
302 # These are the character references that don't need a terminating semicolon
303 # min length: 2, max: 6, none are a prefix of any other.
305 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
306 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
307 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
308 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
309 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
310 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
311 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
312 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
313 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
314 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
315 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
316 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
317 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
318 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
319 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
320 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
321 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists for the void, raw text and escapable raw text element
# categories.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen',
	'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script'
	'style'
]
escapable_raw_text_elements = [
	'textarea'
	'title'
]
328 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
330 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
331 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
332 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
333 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
334 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
335 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
336 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
337 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
338 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
339 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
340 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
341 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
342 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
343 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
347 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
349 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
350 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
351 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
352 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
353 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
354 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
355 'determinant', 'diff', 'divergence', 'divide', 'domain',
356 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
357 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
358 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
359 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
360 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
361 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
362 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
363 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
364 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
365 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
366 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
367 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
368 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
369 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
370 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
371 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
372 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
373 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
374 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
375 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
376 'vectorproduct', 'xor'
378 # foreign_elements = [svg_elements..., mathml_elements...]
379 #normal_elements = All other allowed HTML elements are normal elements.
383 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
384 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
385 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
386 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
387 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
388 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
389 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
390 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
391 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
392 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
393 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
395 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
397 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
398 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
399 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
400 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
401 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
402 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
403 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
406 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
407 'annotation-xml':NS_MATHML,
410 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
413 formatting_elements = {
414 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
415 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
419 mathml_text_integration = {
420 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el's name is listed in mathml_text_integration and el's namespace
# equals the namespace recorded for that name.
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	return expected_ns is el.namespace
424 is_html_integration = (el) -> # DON'T PASS A TOKEN
425 if el.namespace is NS_MATHML
426 if el.name is 'annotation-xml'
427 if el.attrs.encoding?
428 if el.attrs.encoding.toLowerCase() is 'text/html'
430 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
433 if el.namespace is NS_SVG
434 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
439 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
442 foster_parenting_targets = {
# True when e's name is a key of special_elements and the namespace stored
# there equals e's namespace.
el_is_special = (e) ->
	{name, namespace} = e
	return special_elements[name] is namespace
# address, div and p: the entries that el_is_special_not_adp excludes
adp_els = {
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
}
# True when el is listed in special_elements with a matching namespace but is
# not one of the adp_els entries.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	return adp_els[el.name] isnt el.namespace
472 altglyphdef: 'altGlyphDef'
473 altglyphitem: 'altGlyphItem'
474 animatecolor: 'animateColor'
475 animatemotion: 'animateMotion'
476 animatetransform: 'animateTransform'
479 fecolormatrix: 'feColorMatrix'
480 fecomponenttransfer: 'feComponentTransfer'
481 fecomposite: 'feComposite'
482 feconvolvematrix: 'feConvolveMatrix'
483 fediffuselighting: 'feDiffuseLighting'
484 fedisplacementmap: 'feDisplacementMap'
485 fedistantlight: 'feDistantLight'
486 fedropshadow: 'feDropShadow'
492 fegaussianblur: 'feGaussianBlur'
495 femergenode: 'feMergeNode'
496 femorphology: 'feMorphology'
498 fepointlight: 'fePointLight'
499 fespecularlighting: 'feSpecularLighting'
500 fespotlight: 'feSpotLight'
502 feturbulence: 'feTurbulence'
503 foreignobject: 'foreignObject'
505 lineargradient: 'linearGradient'
506 radialgradient: 'radialGradient'
509 svg_attribute_fixes = {
510 attributename: 'attributeName'
511 attributetype: 'attributeType'
512 basefrequency: 'baseFrequency'
513 baseprofile: 'baseProfile'
515 clippathunits: 'clipPathUnits'
516 contentscripttype: 'contentScriptType'
517 contentstyletype: 'contentStyleType'
518 diffuseconstant: 'diffuseConstant'
520 externalresourcesrequired: 'externalResourcesRequired'
521 # WHATWG removes this: filterres: 'filterRes'
522 filterunits: 'filterUnits'
524 gradienttransform: 'gradientTransform'
525 gradientunits: 'gradientUnits'
526 kernelmatrix: 'kernelMatrix'
527 kernelunitlength: 'kernelUnitLength'
528 keypoints: 'keyPoints'
529 keysplines: 'keySplines'
531 lengthadjust: 'lengthAdjust'
532 limitingconeangle: 'limitingConeAngle'
533 markerheight: 'markerHeight'
534 markerunits: 'markerUnits'
535 markerwidth: 'markerWidth'
536 maskcontentunits: 'maskContentUnits'
537 maskunits: 'maskUnits'
538 numoctaves: 'numOctaves'
539 pathlength: 'pathLength'
540 patterncontentunits: 'patternContentUnits'
541 patterntransform: 'patternTransform'
542 patternunits: 'patternUnits'
543 pointsatx: 'pointsAtX'
544 pointsaty: 'pointsAtY'
545 pointsatz: 'pointsAtZ'
546 preservealpha: 'preserveAlpha'
547 preserveaspectratio: 'preserveAspectRatio'
548 primitiveunits: 'primitiveUnits'
551 repeatcount: 'repeatCount'
552 repeatdur: 'repeatDur'
553 requiredextensions: 'requiredExtensions'
554 requiredfeatures: 'requiredFeatures'
555 specularconstant: 'specularConstant'
556 specularexponent: 'specularExponent'
557 spreadmethod: 'spreadMethod'
558 startoffset: 'startOffset'
559 stddeviation: 'stdDeviation'
560 stitchtiles: 'stitchTiles'
561 surfacescale: 'surfaceScale'
562 systemlanguage: 'systemLanguage'
563 tablevalues: 'tableValues'
566 textlength: 'textLength'
568 viewtarget: 'viewTarget'
569 xchannelselector: 'xChannelSelector'
570 ychannelselector: 'yChannelSelector'
571 zoomandpan: 'zoomAndPan'
573 foreign_attr_fixes = {
574 'xlink:actuate': 'xlink actuate'
575 'xlink:arcrole': 'xlink arcrole'
576 'xlink:href': 'xlink href'
577 'xlink:role': 'xlink role'
578 'xlink:show': 'xlink show'
579 'xlink:title': 'xlink title'
580 'xlink:type': 'xlink type'
581 'xml:base': 'xml base'
582 'xml:lang': 'xml lang'
583 'xml:space': 'xml space'
585 'xmlns:xlink': 'xmlns xlink'
587 adjust_mathml_attributes = (t) ->
589 if a[0] is 'definitionurl'
590 a[0] = 'definitionURL'
592 adjust_svg_attributes = (t) ->
594 if svg_attribute_fixes[a[0]]?
595 a[0] = svg_attribute_fixes[a[0]]
597 adjust_foreign_attributes = (t) ->
600 if foreign_attr_fixes[a[0]]?
601 a[0] = foreign_attr_fixes[a[0]]
604 # decode_named_char_ref()
606 # The list of named character references is _huge_ so if we're running in a
607 # browser, we get the browser to decode them, rather than increasing the code
608 # size to include the table.
609 if context is 'module'
610 _decode_named_char_ref = require './html5-named-entities.coffee'
612 # TODO test this in IE8
613 decode_named_char_ref_el = document.createElement('textarea')
614 _decode_named_char_ref = (txt) ->
616 decode_named_char_ref_el.innerHTML = txt
617 decoded = decode_named_char_ref_el.value
618 return null if decoded is txt
620 # Pass the name of a named entity _that has a terminating semicolon_
621 # Entities without terminating semicolons should use legacy_char_refs[]
622 # Do not include the "&" or ";" in your argument, eg pass "alpha"
# Results of earlier lookups, keyed by entity name.
decode_named_char_ref_cache = {}
# Memoizing wrapper around _decode_named_char_ref.
#
# txt: entity name without the "&" or ";" (eg "alpha")
# returns: whatever _decode_named_char_ref returned for txt
#
# Only the cache's own properties count as hits. A plain lookup would also
# find members inherited from Object.prototype, so names such as
# "constructor" or "toString" would come back as functions instead of being
# decoded. Testing for the key (rather than for a non-null value) also means
# a null result is remembered, so unknown names are not decoded again on
# every call.
decode_named_char_ref = (txt) ->
	if Object::hasOwnProperty.call decode_named_char_ref_cache, txt
		return decode_named_char_ref_cache[txt]
	return decode_named_char_ref_cache[txt] = _decode_named_char_ref txt
630 parse_html = (args_html, args = {}) ->
632 cur = null # index of next char in txt to be parsed
633 # declare doc and tokenizer variables so they're in scope below
635 open_els = null # stack of open elements
636 afe = null # active formatting elements
637 template_ins_modes = null
639 original_ins_mode = null
641 tok_cur_tag = null # partially parsed tag
642 flag_scripting = null
643 flag_frameset_ok = null
645 flag_foster_parenting = null
646 form_element_pointer = null
647 temporary_buffer = null
648 pending_table_character_tokens = null
649 head_element_pointer = null
650 flag_fragment_parsing = null
651 context_element = null
661 console.log "Parse error at character #{cur} of #{txt.length}"
664 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
665 # "Noah's Ark clause" but with three
666 afe_push = (new_el) ->
669 if el.type is TYPE_AFE_MARKER
671 if el.name is new_el.name and el.namespace is new_el.namespace
674 unless new_el.attrs[k] is v
678 for k, v of new_el.attrs
679 unless el.attrs[k] is v
691 afe.unshift new_afe_marker()
694 # the functions below implement the Tree Construction algorithm
695 # http://www.w3.org/TR/html5/syntax.html#tree-construction
697 # But first... the helpers
698 template_tag_is_open = ->
700 if el.name is 'template' and el.namespace is NS_HTML
703 is_in_scope_x = (tag_name, scope, namespace) ->
705 if el.name is tag_name and (namespace is null or namespace is el.namespace)
707 if scope[el.name] is el.namespace
710 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
712 if el.name is tag_name and (namespace is null or namespace is el.namespace)
714 if scope[el.name] is el.namespace
716 if scope2[el.name] is el.namespace
720 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
721 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
724 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
725 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
727 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Tables handed to is_in_scope_x / is_in_scope_x_y as their scope arguments;
# keys are tag names, values are the namespace that tag must be in.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope check using standard_scopers only. namespace defaults to null, which
# is_in_scope_x accepts as "any namespace".
is_in_scope = (tag_name, namespace = null) -> is_in_scope_x tag_name, standard_scopers, namespace
# Scope check using standard_scopers plus button_scopers. namespace defaults
# to null, which is_in_scope_x_y accepts as "any namespace".
is_in_button_scope = (tag_name, namespace = null) -> is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
# Scope check using table_scopers only (standard_scopers is not consulted).
# namespace defaults to null, which is_in_scope_x accepts as "any namespace".
is_in_table_scope = (tag_name, namespace = null) -> is_in_scope_x tag_name, table_scopers, namespace
738 # aka is_in_list_item_scope
# Scope check using standard_scopers plus li_scopers. namespace defaults to
# null, which is_in_scope_x_y accepts as "any namespace".
is_in_li_scope = (tag_name, namespace = null) -> is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
741 is_in_select_scope = (tag_name, namespace = null) ->
743 if t.name is tag_name and (namespace is null or namespace is t.namespace)
745 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
748 # this checks for a particular element, not by name
749 # this requires a namespace match
750 el_is_in_scope = (needle) ->
754 if standard_scopers[el.name] is el.namespace
758 clear_to_table_stopers = {
763 clear_stack_to_table_context = ->
765 if clear_to_table_stopers[open_els[0].name]?
769 clear_to_table_body_stopers = {
776 clear_stack_to_table_body_context = ->
778 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
782 clear_to_table_row_stopers = {
787 clear_stack_to_table_row_context = ->
789 if clear_to_table_row_stopers[open_els[0].name]?
793 clear_afe_to_marker = ->
795 return unless afe.length > 0 # this happens in fragment case, ?spec error
797 if el.type is TYPE_AFE_MARKER
802 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
804 # 1. Let last be false.
806 # 2. Let node be the last node in the stack of open elements.
808 node = open_els[node_i]
809 # 3. Loop: If node is the first node in the stack of open elements,
810 # then set last to true, and, if the parser was originally created as
811 # part of the HTML fragment parsing algorithm (fragment case) set node
812 # to the context element.
814 if node_i is open_els.length - 1
816 if flag_fragment_parsing
817 node = context_element
818 # 4. If node is a select element, run these substeps:
819 if node.name is 'select' and node.namespace is NS_HTML
820 # 1. If last is true, jump to the step below labeled done.
822 # 2. Let ancestor be node.
825 # 3. Loop: If ancestor is the first node in the stack of
826 # open elements, jump to the step below labeled done.
828 if ancestor_i is open_els.length - 1
830 # 4. Let ancestor be the node before ancestor in the stack
833 ancestor = open_els[ancestor_i]
834 # 5. If ancestor is a template node, jump to the step below
836 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
838 # 6. If ancestor is a table node, switch the insertion mode
839 # to "in select in table" and abort these steps.
840 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
841 ins_mode = ins_mode_in_select_in_table
843 # 7. Jump back to the step labeled loop.
844 # 8. Done: Switch the insertion mode to "in select" and abort
846 ins_mode = ins_mode_in_select
848 # 5. If node is a td or th element and last is false, then switch
849 # the insertion mode to "in cell" and abort these steps.
850 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
851 ins_mode = ins_mode_in_cell
853 # 6. If node is a tr element, then switch the insertion mode to "in
854 # row" and abort these steps.
855 if node.name is 'tr' and node.namespace is NS_HTML
856 ins_mode = ins_mode_in_row
858 # 7. If node is a tbody, thead, or tfoot element, then switch the
859 # insertion mode to "in table body" and abort these steps.
860 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
861 ins_mode = ins_mode_in_table_body
863 # 8. If node is a caption element, then switch the insertion mode
864 # to "in caption" and abort these steps.
865 if node.name is 'caption' and node.namespace is NS_HTML
866 ins_mode = ins_mode_in_caption
868 # 9. If node is a colgroup element, then switch the insertion mode
869 # to "in column group" and abort these steps.
870 if node.name is 'colgroup' and node.namespace is NS_HTML
871 ins_mode = ins_mode_in_column_group
873 # 10. If node is a table element, then switch the insertion mode to
874 # "in table" and abort these steps.
875 if node.name is 'table' and node.namespace is NS_HTML
876 ins_mode = ins_mode_in_table
878 # 11. If node is a template element, then switch the insertion mode
879 # to the current template insertion mode and abort these steps.
880 if node.name is 'template' and node.namespace is NS_HTML
881 ins_mode = template_ins_modes[0]
883 # 12. If node is a head element and last is true, then switch the
884 # insertion mode to "in body" ("in body"! not "in head"!) and abort
885 # these steps. (fragment case)
886 if node.name is 'head' and node.namespace is NS_HTML and last
887 ins_mode = ins_mode_in_body
889 # 13. If node is a head element and last is false, then switch the
890 # insertion mode to "in head" and abort these steps.
891 if node.name is 'head' and node.namespace is NS_HTML and last is false
892 ins_mode = ins_mode_in_head
894 # 14. If node is a body element, then switch the insertion mode to
895 # "in body" and abort these steps.
896 if node.name is 'body' and node.namespace is NS_HTML
897 ins_mode = ins_mode_in_body
899 # 15. If node is a frameset element, then switch the insertion mode
900 # to "in frameset" and abort these steps. (fragment case)
901 if node.name is 'frameset' and node.namespace is NS_HTML
902 ins_mode = ins_mode_in_frameset
904 # 16. If node is an html element, run these substeps:
905 if node.name is 'html' and node.namespace is NS_HTML
906 # 1. If the head element pointer is null, switch the insertion
907 # mode to "before head" and abort these steps. (fragment case)
908 if head_element_pointer is null
909 ins_mode = ins_mode_before_head
911 # 2. Otherwise, the head element pointer is not null,
912 # switch the insertion mode to "after head" and abort these
914 ins_mode = ins_mode_after_head
916 # 17. If last is true, then switch the insertion mode to "in body"
917 # and abort these steps. (fragment case)
919 ins_mode = ins_mode_in_body
921 # 18. Let node now be the node before node in the stack of open
924 node = open_els[node_i]
925 # 19. Return to the step labeled loop.
930 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
931 adjusted_current_node = ->
932 if open_els.length is 1 and flag_fragment_parsing
933 return context_element
936 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
937 # this implementation is structured (mostly) as described at the link above.
938 # capitalized comments are the "labels" described at the link above.
940 return if afe.length is 0
941 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
946 if i is afe.length - 1
949 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
954 el = insert_html_element afe[i].token
960 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
961 # adoption agency algorithm
963 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
964 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
965 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
966 adoption_agency = (subject) ->
967 # this block implements the W3C spec
968 # # 1. If the current node is an HTML element whose tag name is subject,
969 # # then run these substeps:
971 # # 1. Let element be the current node.
973 # # 2. Pop element off the stack of open elements.
975 # # 3. If element is also in the list of active formatting elements,
976 # # remove the element from the list.
978 # # 4. Abort the adoption agency algorithm.
979 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
980 # el = open_els.shift()
981 # # remove it from the list of active formatting elements (if found)
987 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
988 # If the current node is an HTML element whose tag name is subject, and
989 # the current node is not in the list of active formatting elements,
990 # then pop the current node off the stack of open elements, and abort
992 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
993 # remove it from the list of active formatting elements (if found)
1009 # 5. Let formatting element be the last element in the list of
1010 # active formatting elements that: is between the end of the list
1011 # and the last scope marker in the list, if any, or the start of
1012 # the list otherwise, and has the tag name subject.
1014 for t, fe_of_afe in afe
1015 if t.type is TYPE_AFE_MARKER
1017 if t.name is subject
1020 # If there is no such element, then abort these steps and instead
1021 # act as described in the "any other end tag" entry above.
1023 in_body_any_other_end_tag subject
1025 # 6. If formatting element is not in the stack of open elements,
1026 # then this is a parse error; remove the element from the list, and
1027 # abort these steps.
1029 for t, fe_of_open_els in open_els
1035 # "remove it from the list" must mean afe, since it's not in open_els
1036 afe.splice fe_of_afe, 1
1038 # 7. If formatting element is in the stack of open elements, but
1039 # the element is not in scope, then this is a parse error; abort
1041 unless el_is_in_scope fe
1044 # 8. If formatting element is not the current node, this is a parse
1045 # error. (But do not abort these steps.)
1046 unless open_els[0] is fe
1049 # 9. Let furthest block be the topmost node in the stack of open
1050 # elements that is lower in the stack than formatting element, and
1051 # is an element in the special category. There might not be one.
1053 fb_of_open_els = null
1054 for t, i in open_els
1060 # and continue, to see if there's one that's more "topmost"
1061 # 10. If there is no furthest block, then the UA must first pop all
1062 # the nodes from the bottom of the stack of open elements, from the
1063 # current node up to and including formatting element, then remove
1064 # formatting element from the list of active formatting elements,
1065 # and finally abort these steps.
1068 t = open_els.shift()
1070 afe.splice fe_of_afe, 1
1072 # 11. Let common ancestor be the element immediately above
1073 # formatting element in the stack of open elements.
1074 ca = open_els[fe_of_open_els + 1] # common ancestor
1076 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1077 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1078 bookmark = new_aaa_bookmark()
1081 afe.splice i, 0, bookmark
1083 node = last_node = fb
1087 # 3. Let node be the element immediately above node in the
1088 # stack of open elements, or if node is no longer in the stack
1089 # of open elements (e.g. because it got removed by this
1090 # algorithm), the element that was immediately above node in
1091 # the stack of open elements before node was removed.
1093 for t, i in open_els
1095 node_next = open_els[i + 1]
1097 node = node_next ? node_above
1098 # TODO make sure node_above gets re-set if/when node is removed from open_els
1100 # 4. If node is formatting element, then go to the next step in
1101 # the overall algorithm.
1104 # 5. If inner loop counter is greater than three and node is in
1105 # the list of active formatting elements, then remove node from
1106 # the list of active formatting elements.
1115 # 6. If node is not in the list of active formatting elements,
1116 # then remove node from the stack of open elements and then go
1117 # back to the step labeled inner loop.
1119 for t, i in open_els
1121 node_above = open_els[i + 1]
1122 open_els.splice i, 1
1125 # 7. create an element for the token for which the element node
1126 # was created, in the HTML namespace, with common ancestor as
1127 # the intended parent; replace the entry for node in the list
1128 # of active formatting elements with an entry for the new
1129 # element, replace the entry for node in the stack of open
1130 # elements with an entry for the new element, and let node be
1132 new_node = token_to_element node.token, NS_HTML, ca
1137 for t, i in open_els
1139 node_above = open_els[i + 1]
1140 open_els[i] = new_node
1143 # 8. If last node is furthest block, then move the
1144 # aforementioned bookmark to be immediately after the new node
1145 # in the list of active formatting elements.
1153 # "after" means lower
1154 afe.splice i, 0, bookmark # "after as <-
1156 # 9. Insert last node into node, first removing it from its
1157 # previous parent node if any.
1158 if last_node.parent?
1159 for c, i in last_node.parent.children
1161 last_node.parent.children.splice i, 1
1163 node.children.push last_node
1164 last_node.parent = node
1165 # 10. Let last node be node.
1167 # 11. Return to the step labeled inner loop.
1168 # 14. Insert whatever last node ended up being in the previous step
1169 # at the appropriate place for inserting a node, but using common
1170 # ancestor as the override target.
1172 # In the case where fe is immediately followed by fb:
1173 # * inner loop exits out early (node==fe)
1175 # * last_node is still in the tree (not a duplicate)
1176 if last_node.parent?
1177 for c, i in last_node.parent.children
1179 last_node.parent.children.splice i, 1
1181 # can't use standard insert token thing, because it's already in
1182 # open_els and must stay at it's current position in open_els
1183 dest = adjusted_insertion_location ca
1184 dest[0].children.splice dest[1], 0, last_node
1185 last_node.parent = dest[0]
1186 # 15. Create an element for the token for which formatting element
1187 # was created, in the HTML namespace, with furthest block as the
1189 new_element = token_to_element fe.token, NS_HTML, fb
1190 # 16. Take all of the child nodes of furthest block and append them
1191 # to the element created in the last step.
1192 while fb.children.length
1193 t = fb.children.shift()
1194 t.parent = new_element
1195 new_element.children.push t
1196 # 17. Append that new element to furthest block.
1197 new_element.parent = fb
1198 fb.children.push new_element
1199 # 18. Remove formatting element from the list of active formatting
1200 # elements, and insert the new element into the list of active
1201 # formatting elements at the position of the aforementioned
1209 afe[i] = new_element
1211 # 19. Remove formatting element from the stack of open elements,
1212 # and insert the new element into the stack of open elements
1213 # immediately below the position of furthest block in that stack.
1214 for t, i in open_els
1216 open_els.splice i, 1
1218 for t, i in open_els
1220 open_els.splice i, 0, new_element
1222 # 20. Jump back to the step labeled outer loop.
1225 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1226 close_p_element = -> # closes an open HTML <p>: generates implied end tags (except for p), then pops open elements
1227 generate_implied_end_tags 'p' # arg is exception
1228 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML # current node is not an HTML p
1230 while open_els.length > 1 # just in case
1231 el = open_els.shift() # pop the current node (index 0 is the current element)
1232 if el.name is 'p' and el.namespace is NS_HTML # the popped element was the HTML p
1235 close_p_if_in_button_scope = -> # guard that ins_mode_in_body runs before inserting block-level elements
1236 if is_in_button_scope 'p', NS_HTML # acts only when an HTML p element is in button scope
1240 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1241 # aka insert_a_character = (t) ->
1242 insert_character = (t) -> # inserts text token t at the adjusted insertion location
1243 dest = adjusted_insertion_location() # [parent node, index within parent.children]
1244 # fixfull check for Document node
1246 prev = dest[0].children[dest[1] - 1] # the sibling immediately before the insertion point
1247 if prev.type is TYPE_TEXT # NOTE(review): presumably t is merged into the preceding text node here; confirm
1250 dest[0].children.splice dest[1], 0, t # insert t as its own child node
1253 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1254 process_token = (t) -> # tree-construction dispatcher: routes token t based on the adjusted current node
1255 acn = adjusted_current_node()
1259 if acn.namespace is NS_HTML # adjusted current node is an HTML element
1262 if is_mathml_text_integration_point(acn)
1263 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark') # any start tag other than mglyph/malignmark
1266 if t.type is TYPE_TEXT
1269 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg' # <svg> start tag directly inside MathML annotation-xml
1272 if is_html_integration acn
1273 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1276 if t.type is TYPE_EOF
1279 in_foreign_content t # presumably reached only when none of the checks above claimed the token; confirm
1283 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1284 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1285 adjusted_insertion_location = (override_target = null) -> # returns [target, target_i]: the node to insert into and the index within target.children
1286 # 1. If there was an override target specified, then let target be the
1289 target = override_target
1290 else # Otherwise, let target be the current node.
1291 target = open_els[0]
1292 # 2. Determine the adjusted insertion location using the first matching
1293 # steps from the following list:
1295 # If foster parenting is enabled and target is a table, tbody, tfoot,
1296 # thead, or tr element Foster parenting happens when content is
1297 # misnested in tables.
1298 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace # name -> namespace lookup: matches only when target's name is listed under target's own namespace
1299 loop # once. this is here so we can ``break`` to "abort these substeps"
1300 # 1. Let last template be the last template element in the
1301 # stack of open elements, if any.
1302 last_template = null
1303 last_template_i = null
1304 for el, i in open_els
1305 if el.name is 'template' and el.namespace is NS_HTML
1309 # 2. Let last table be the last table element in the stack of
1310 # open elements, if any.
1313 for el, i in open_els
1314 if el.name is 'table' and el.namespace is NS_HTML
1318 # 3. If there is a last template and either there is no last
1319 # table, or there is one, but last template is lower (more
1320 # recently added) than last table in the stack of open
1321 # elements, then: let adjusted insertion location be inside
1322 # last template's template contents, after its last child (if
1323 # any), and abort these substeps.
1324 if last_template and (last_table is null or last_template_i < last_table_i) # smaller index = lower in the stack (index 0 is the current node)
1325 target = last_template # fixfull should be its contents
1326 target_i = target.children.length
1328 # 4. If there is no last table, then let adjusted insertion
1329 # location be inside the first element in the stack of open
1330 # elements (the html element), after its last child (if any),
1331 # and abort these substeps. (fragment case)
1332 if last_table is null
1334 target = open_els[open_els.length - 1] # the last array entry is the top of the stack
1335 target_i = target.children.length
1337 # 5. If last table has a parent element, then let adjusted
1338 # insertion location be inside last table's parent element,
1339 # immediately before last table, and abort these substeps.
1340 if last_table.parent?
1341 for c, i in last_table.parent.children
1343 target = last_table.parent
1347 # 6. Let previous element be the element immediately above last
1348 # table in the stack of open elements.
1350 # huh? how could it not have a parent?
1351 previous_element = open_els[last_table_i + 1] # index + 1 is the element immediately above in the stack
1352 # 7. Let adjusted insertion location be inside previous
1353 # element, after its last child (if any).
1354 target = previous_element
1355 target_i = target.children.length
1356 # Note: These steps are involved in part because it's possible
1357 # for elements, the table element in this case in particular,
1358 # to have been moved by a script around in the DOM, or indeed
1359 # removed from the DOM entirely, after the element was inserted
1361 break # don't really loop
1363 # Otherwise Let adjusted insertion location be inside target, after
1364 # its last child (if any).
1365 target_i = target.children.length
1367 # 3. If the adjusted insertion location is inside a template element,
1368 # let it instead be inside the template element's template contents,
1369 # after its last child (if any).
1370 # fixfull (template)
1372 # 4. Return the adjusted insertion location.
1373 return [target, target_i]
1375 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1376 # aka create_an_element_for_token
1377 token_to_element = (t, namespace, intended_parent) -> # builds a TYPE_TAG Node for token t in the given namespace
1378 # convert attributes into a hash
1381 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs (as written, a later duplicate overwrites an earlier one)
1382 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t # keeps the source token on the node (the adoption agency re-creates elements from node.token)
1384 # TODO 2. If the newly created element has an xmlns attribute in the
1385 # XMLNS namespace whose value is not exactly the same as the element's
1386 # namespace, that is a parse error. Similarly, if the newly created
1387 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1388 # value is not the XLink Namespace, that is a parse error.
1390 # fixfull: the spec says stuff about form pointers and ownerDocument
1394 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1395 insert_foreign_element = (token, namespace) -> # creates an element for token in `namespace` and inserts it at the adjusted insertion location
1396 ail = adjusted_insertion_location() # [parent node, index within parent.children]
1399 el = token_to_element token, namespace, ail_el
1400 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1402 ail_el.children.splice ail_i, 0, el # insert el into the parent's children at position ail_i
1405 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1406 insert_html_element = (token) -> # convenience wrapper: insert_foreign_element with the HTML namespace
1407 return insert_foreign_element token, NS_HTML # callers use the return value as the inserted element (e.g. `el = insert_html_element t`)
1409 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1410 # position should be [node, index_within_children]
1411 insert_comment = (t, position = null) -> # inserts comment node t at `position`, a [node, index] pair
1412 position ?= adjusted_insertion_location() # default to the adjusted insertion location when no position was given
1413 position[0].children.splice position[1], 0, t # insert t into the parent's children at the given index
1417 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1418 parse_generic_raw_text = (t) -> # inserts the element for t, then switches the tokenizer to RAWTEXT and the tree builder to text mode
1419 insert_html_element t
1420 tok_state = tok_state_rawtext
1421 original_ins_mode = ins_mode # save the current insertion mode before overwriting it on the next line
1422 ins_mode = ins_mode_text
1424 parse_generic_rcdata_text = (t) -> # same as parse_generic_raw_text, but the tokenizer is switched to RCDATA
1425 insert_html_element t
1426 tok_state = tok_state_rcdata
1427 original_ins_mode = ins_mode # save the current insertion mode before overwriting it on the next line
1428 ins_mode = ins_mode_text
1431 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1432 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1433 generate_implied_end_tags = (except = null) -> # `except` is a tag name that stops the loop (callers pass e.g. 'p' or 'li')
1434 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except # loops while the current node is listed in end_tag_implied under its own namespace and is not `except`
1438 # 8.2.5.4 The rules for parsing tokens in HTML content
1439 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1441 # 8.2.5.4.1 The "initial" insertion mode
1442 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1443 is_quirks_yes_doctype = (t) -> # predicate on DOCTYPE token t; ins_mode_initial sets QUIRKS_YES when it is truthy
1444 if t.flag 'force-quirks'
1446 if t.name isnt 'html'
1448 if t.public_identifier?
1449 pi = t.public_identifier.toLowerCase() # comparisons below are against lower-case literals
1450 for p in quirks_yes_pi_prefixes
1451 if pi.substr(0, p.length) is p # prefix match
1453 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1455 if t.system_identifier?
1456 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1458 else if t.public_identifier? # no system identifier, but a public identifier is present
1459 # already did this: pi = t.public_identifier.toLowerCase()
1460 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//' # 32 and 36 are the lengths of the two prefixes
1463 is_quirks_limited_doctype = (t) -> # predicate on DOCTYPE token t; ins_mode_initial sets QUIRKS_LIMITED when it is truthy
1464 if t.public_identifier?
1465 pi = t.public_identifier.toLowerCase() # comparisons below are against lower-case literals
1466 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//' # 32 and 36 are the lengths of the two prefixes
1468 if t.system_identifier? # NOTE(review): the next line reads pi, which is only assigned when public_identifier exists; confirm this check is nested under that test
1469 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1472 ins_mode_initial = (t) -> # handler for the "initial" insertion mode: sets the document's quirks-mode flag and moves on to ins_mode_before_html
1475 if t.type is TYPE_COMMENT
1479 if t.type is TYPE_DOCTYPE
1480 # fixfull syntax error from first paragraph and following bullets
1481 # fixfull set doc.doctype
1482 # fixfull is the "not an iframe srcdoc" thing relevant?
1483 if is_quirks_yes_doctype t
1484 doc.flag 'quirks mode', QUIRKS_YES
1485 else if is_quirks_limited_doctype t
1486 doc.flag 'quirks mode', QUIRKS_LIMITED
1488 ins_mode = ins_mode_before_html
1491 # fixfull not iframe srcdoc?
1493 doc.flag 'quirks mode', QUIRKS_YES # presumably the "anything else" path (no DOCTYPE token seen); confirm
1494 ins_mode = ins_mode_before_html
1498 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1499 ins_mode_before_html = (t) -> # handler for the "before html" insertion mode: creates the root html element and moves on to ins_mode_before_head
1500 if t.type is TYPE_DOCTYPE
1503 if t.type is TYPE_COMMENT
1508 if t.type is TYPE_START_TAG and t.name is 'html'
1509 el = token_to_element t, NS_HTML, doc
1510 doc.children.push el # the html element becomes a child of the document node
1512 open_els.unshift(el) # push onto the stack of open elements (index 0 is the current node)
1513 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1514 ins_mode = ins_mode_before_head
1516 if t.type is TYPE_END_TAG
1517 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1518 # fall through to "anything else"
1523 el = token_to_element new_open_tag('html'), NS_HTML, doc # "anything else": build the html element from a synthesized start tag
1524 doc.children.push el
1527 # ?fixfull browsing context
1528 ins_mode = ins_mode_before_head
1532 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1533 ins_mode_before_head = (t) -> # handler for the "before head" insertion mode: inserts the head element and moves on to ins_mode_in_head
1536 if t.type is TYPE_COMMENT
1539 if t.type is TYPE_DOCTYPE
1542 if t.type is TYPE_START_TAG and t.name is 'html'
1545 if t.type is TYPE_START_TAG and t.name is 'head'
1546 el = insert_html_element t
1547 head_element_pointer = el # remembered so ins_mode_after_head can temporarily put the head back on the stack
1548 ins_mode = ins_mode_in_head
1550 if t.type is TYPE_END_TAG
1551 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1552 # fall through to Anything else below
1557 el = insert_html_element new_open_tag 'head' # "anything else": insert a head built from a synthesized start tag
1558 head_element_pointer = el
1559 ins_mode = ins_mode_in_head
1563 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1564 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control; the "anything else" branch of ins_mode_in_head
1565 open_els.shift() # spec says this will be a 'head' node
1566 ins_mode = ins_mode_after_head # leave "in head" mode
1569 ins_mode_in_head = (t) -> # handler for the "in head" insertion mode; t is the token being processed
1570 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ') # whitespace character token (no "\r" here, unlike the spec's list — presumably normalized earlier; confirm)
1573 if t.type is TYPE_COMMENT
1576 if t.type is TYPE_DOCTYPE
1579 if t.type is TYPE_START_TAG and t.name is 'html'
1582 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1583 el = insert_html_element t
1585 t.acknowledge_self_closing()
1587 if t.type is TYPE_START_TAG and t.name is 'meta'
1588 el = insert_html_element t
1590 t.acknowledge_self_closing()
1591 # fixfull encoding stuff
1593 if t.type is TYPE_START_TAG and t.name is 'title'
1594 parse_generic_rcdata_text t
1596 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1597 parse_generic_raw_text t
1599 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1600 insert_html_element t
1601 ins_mode = ins_mode_in_head_noscript
1603 if t.type is TYPE_START_TAG and t.name is 'script'
1604 ail = adjusted_insertion_location() # [parent node, index within parent.children]
1605 el = token_to_element t, NS_HTML, ail[0] # intended parent is the node, not the [node, index] pair (matches the other token_to_element callers)
1606 el.flag 'parser-inserted', true
1607 # fixfull fragment case
1608 ail[0].children.splice ail[1], 0, el
1610 tok_state = tok_state_script_data
1611 original_ins_mode = ins_mode # make sure orig... is defined
1612 ins_mode = ins_mode_text
1614 if t.type is TYPE_END_TAG and t.name is 'head'
1615 open_els.shift() # will be a head element... spec says so
1616 ins_mode = ins_mode_after_head
1618 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1619 ins_mode_in_head_else t
1621 if t.type is TYPE_START_TAG and t.name is 'template'
1622 insert_html_element t
1624 flag_frameset_ok = false
1625 ins_mode = ins_mode_in_template
1626 template_ins_modes.unshift ins_mode_in_template
1628 if t.type is TYPE_END_TAG and t.name is 'template'
1629 if template_tag_is_open()
1630 generate_implied_end_tags() # parentheses required: a bare name is only a reference in CoffeeScript and would never run
1631 if open_els[0].name isnt 'template'
1634 el = open_els.shift()
1635 if el.name is 'template' and el.namespace is NS_HTML
1637 clear_afe_to_marker()
1638 template_ins_modes.shift()
1643 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1646 ins_mode_in_head_else t
1649 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1650 ins_mode_in_head_noscript_else = (t) -> # shared fallback branch of ins_mode_in_head_noscript (called from two places there)
1653 ins_mode = ins_mode_in_head # return to "in head" mode
1656 ins_mode_in_head_noscript = (t) -> # handler for the "in head noscript" insertion mode (entered by ins_mode_in_head on <noscript> when flag_scripting is false)
1657 if t.type is TYPE_DOCTYPE
1660 if t.type is TYPE_START_TAG and t.name is 'html'
1663 if t.type is TYPE_END_TAG and t.name is 'noscript'
1665 ins_mode = ins_mode_in_head # </noscript> returns to "in head" mode
1667 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1670 if t.type is TYPE_END_TAG and t.name is 'br'
1671 ins_mode_in_head_noscript_else t
1673 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG # nested head/noscript start tags and any end tag not matched above
1677 ins_mode_in_head_noscript_else t
1680 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1681 ins_mode_after_head_else = (t) -> # "anything else" branch of ins_mode_after_head: inserts an implied body element
1682 body_tok = new_open_tag 'body' # synthesized start tag; not taken from the input
1683 insert_html_element body_tok
1684 ins_mode = ins_mode_in_body
1687 ins_mode_after_head = (t) -> # handler for the "after head" insertion mode
1691 if t.type is TYPE_COMMENT
1694 if t.type is TYPE_DOCTYPE
1697 if t.type is TYPE_START_TAG and t.name is 'html'
1700 if t.type is TYPE_START_TAG and t.name is 'body'
1701 insert_html_element t
1702 flag_frameset_ok = false
1703 ins_mode = ins_mode_in_body
1705 if t.type is TYPE_START_TAG and t.name is 'frameset'
1706 insert_html_element t
1707 ins_mode = ins_mode_in_frameset
1709 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1711 open_els.unshift head_element_pointer # temporarily put the head element back on the stack of open elements
1713 for el, i in open_els # find the head element again; it need not be the current node any more
1714 if el is head_element_pointer
1715 open_els.splice i, 1 # remove the head element from the stack again
1718 if t.type is TYPE_END_TAG and t.name is 'template'
1721 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1722 ins_mode_after_head_else t
1724 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG # stray <head> start tag, or any end tag not matched above
1728 ins_mode_after_head_else t
1731 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1732 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it; `name` is the end tag's tag name
1735 if node.name is name and node.namespace is NS_HTML # found an open HTML element with the requested name
1736 generate_implied_end_tags name # arg is exception
1737 unless node is open_els[0] # the match is not the current node
1740 el = open_els.shift() # pop the current node
1743 if special_elements[node.name] is node.namespace # node is in the special category
1746 for el, i in open_els
1748 node = open_els[i + 1] # move to the element immediately above in the stack
1751 ins_mode_in_body = (t) ->
1752 if t.type is TYPE_TEXT and t.text is "\u0000"
1759 if t.type is TYPE_TEXT
1762 flag_frameset_ok = false
1764 if t.type is TYPE_COMMENT
1767 if t.type is TYPE_DOCTYPE
1770 if t.type is TYPE_START_TAG and t.name is 'html'
1772 return if template_tag_is_open()
1773 root_attrs = open_els[open_els.length - 1].attrs
1775 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1778 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1781 if t.type is TYPE_START_TAG and t.name is 'body'
1783 return if open_els.length < 2
1784 second = open_els[open_els.length - 2]
1785 return unless second.namespace is NS_HTML
1786 return unless second.name is 'body'
1787 return if template_tag_is_open()
1788 flag_frameset_ok = false
1790 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1792 if t.type is TYPE_START_TAG and t.name is 'frameset'
1794 return if open_els.length < 2
1795 second_i = open_els.length - 2
1796 second = open_els[second_i]
1797 return unless second.namespace is NS_HTML
1798 return unless second.name is 'body'
1799 if flag_frameset_ok is false
1802 for el, i in second.parent.children
1804 second.parent.children.splice i, 1
1806 open_els.splice second_i, 1
1807 # pop everything except the "root html element"
1808 while open_els.length > 1
1810 insert_html_element t
1811 ins_mode = ins_mode_in_frameset
1813 if t.type is TYPE_EOF
1815 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1816 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1817 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1820 unless ok_tags[t.name] is el.namespace
1823 if template_ins_modes.length > 0
1824 ins_mode_in_template t
1828 if t.type is TYPE_END_TAG and t.name is 'body'
1829 unless is_in_scope 'body', NS_HTML
1833 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1834 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1835 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1836 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1840 unless ok_tags[t.name] is el.namespace
1843 ins_mode = ins_mode_after_body
1845 if t.type is TYPE_END_TAG and t.name is 'html'
1846 unless is_in_scope 'body', NS_HTML
1850 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1851 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1852 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1853 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1857 unless ok_tags[t.name] is el.namespace
1860 ins_mode = ins_mode_after_body
1863 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1864 close_p_if_in_button_scope()
1865 insert_html_element t
1867 if t.type is TYPE_START_TAG and h_tags[t.name]?
1868 close_p_if_in_button_scope()
1869 if h_tags[open_els[0].name] is open_els[0].namespace
1872 insert_html_element t
1874 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1875 close_p_if_in_button_scope()
1876 insert_html_element t
1877 eat_next_token_if_newline()
1878 flag_frameset_ok = false
1880 if t.type is TYPE_START_TAG and t.name is 'form'
1881 unless form_element_pointer is null or template_tag_is_open()
1884 close_p_if_in_button_scope()
1885 el = insert_html_element t
1886 unless template_tag_is_open()
1887 form_element_pointer = el
1889 if t.type is TYPE_START_TAG and t.name is 'li'
1890 flag_frameset_ok = false
1891 for node in open_els
1892 if node.name is 'li' and node.namespace is NS_HTML
1893 generate_implied_end_tags 'li' # arg is exception
1894 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1897 el = open_els.shift()
1898 if el.name is 'li' and el.namespace is NS_HTML
1901 if el_is_special_not_adp node
1903 close_p_if_in_button_scope()
1904 insert_html_element t
1906 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1907 flag_frameset_ok = false
1908 for node in open_els
1909 if node.name is 'dd' and node.namespace is NS_HTML
1910 generate_implied_end_tags 'dd' # arg is exception
1911 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1914 el = open_els.shift()
1915 if el.name is 'dd' and el.namespace is NS_HTML
1918 if node.name is 'dt' and node.namespace is NS_HTML
1919 generate_implied_end_tags 'dt' # arg is exception
1920 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1923 el = open_els.shift()
1924 if el.name is 'dt' and el.namespace is NS_HTML
1927 if el_is_special_not_adp node
1929 close_p_if_in_button_scope()
1930 insert_html_element t
1932 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1933 close_p_if_in_button_scope()
1934 insert_html_element t
1935 tok_state = tok_state_plaintext
1937 if t.type is TYPE_START_TAG and t.name is 'button'
1938 if is_in_scope 'button', NS_HTML
1940 generate_implied_end_tags()
1942 el = open_els.shift()
1943 if el.name is 'button' and el.namespace is NS_HTML
1946 insert_html_element t
1947 flag_frameset_ok = false
1949 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1950 unless is_in_scope t.name, NS_HTML
1953 generate_implied_end_tags()
1954 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1957 el = open_els.shift()
1958 if el.name is t.name and el.namespace is NS_HTML
1961 if t.type is TYPE_END_TAG and t.name is 'form'
1962 unless template_tag_is_open()
1963 node = form_element_pointer
1964 form_element_pointer = null
1965 if node is null or not el_is_in_scope node
1968 generate_implied_end_tags()
1969 if open_els[0] isnt node
1971 for el, i in open_els
1973 open_els.splice i, 1
1976 unless is_in_scope 'form', NS_HTML
1979 generate_implied_end_tags()
1980 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1983 el = open_els.shift()
1984 if el.name is 'form' and el.namespace is NS_HTML
1987 if t.type is TYPE_END_TAG and t.name is 'p'
1988 unless is_in_button_scope 'p', NS_HTML
1990 insert_html_element new_open_tag 'p'
1993 if t.type is TYPE_END_TAG and t.name is 'li'
1994 unless is_in_li_scope 'li', NS_HTML
1997 generate_implied_end_tags 'li' # arg is exception
1998 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2001 el = open_els.shift()
2002 if el.name is 'li' and el.namespace is NS_HTML
2005 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2006 unless is_in_scope t.name, NS_HTML
2009 generate_implied_end_tags t.name # arg is exception
2010 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2013 el = open_els.shift()
2014 if el.name is t.name and el.namespace is NS_HTML
2017 if t.type is TYPE_END_TAG and h_tags[t.name]?
2020 if h_tags[el.name] is el.namespace
2023 if standard_scopers[el.name] is el.namespace
2028 generate_implied_end_tags()
2029 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2032 el = open_els.shift()
2033 if h_tags[el.name] is el.namespace
2037 if t.type is TYPE_START_TAG and t.name is 'a'
2038 # If the list of active formatting elements contains an a element
2039 # between the end of the list and the last marker on the list (or
2040 # the start of the list if there is no marker on the list), then
2041 # this is a parse error; run the adoption agency algorithm for the
2042 # tag name "a", then remove that element from the list of active
2043 # formatting elements and the stack of open elements if the
2044 # adoption agency algorithm didn't already remove it (it might not
2045 # have if the element is not in table scope).
2048 if el.type is TYPE_AFE_MARKER
2050 if el.name is 'a' and el.namespace is NS_HTML
2058 for el, i in open_els
2060 open_els.splice i, 1
2062 el = insert_html_element t
2065 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2067 el = insert_html_element t
2070 if t.type is TYPE_START_TAG and t.name is 'nobr'
2072 if is_in_scope 'nobr', NS_HTML
2074 adoption_agency 'nobr'
2076 el = insert_html_element t
2079 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2080 adoption_agency t.name
2082 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2084 insert_html_element t
2086 flag_frameset_ok = false
2088 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2089 unless is_in_scope t.name, NS_HTML
2092 generate_implied_end_tags()
2093 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2096 el = open_els.shift()
2097 if el.name is t.name and el.namespace is NS_HTML
2099 clear_afe_to_marker()
2101 if t.type is TYPE_START_TAG and t.name is 'table'
2102 unless doc.flag('quirks mode') is QUIRKS_YES
2103 close_p_if_in_button_scope() # test
2104 insert_html_element t
2105 flag_frameset_ok = false
2106 ins_mode = ins_mode_in_table
2108 if t.type is TYPE_END_TAG and t.name is 'br'
2110 # W3C: t.type = TYPE_START_TAG
2111 t = new_open_tag 'br' # WHATWG
2113 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2115 insert_html_element t
2117 t.acknowledge_self_closing()
2118 flag_frameset_ok = false
2120 if t.type is TYPE_START_TAG and t.name is 'input'
2122 insert_html_element t
2124 t.acknowledge_self_closing()
2125 unless is_input_hidden_tok t
2126 flag_frameset_ok = false
2128 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2129 # WHATWG adds 'menuitem' for this block
2130 insert_html_element t
2132 t.acknowledge_self_closing()
2134 if t.type is TYPE_START_TAG and t.name is 'hr'
2135 close_p_if_in_button_scope()
2136 insert_html_element t
2138 t.acknowledge_self_closing()
2139 flag_frameset_ok = false
2141 if t.type is TYPE_START_TAG and t.name is 'image'
2146 if t.type is TYPE_START_TAG and t.name is 'isindex'
2148 if template_tag_is_open() is false and form_element_pointer isnt null
2150 t.acknowledge_self_closing()
2151 flag_frameset_ok = false
2152 close_p_if_in_button_scope()
2153 el = insert_html_element new_open_tag 'form'
2154 unless template_tag_is_open()
2155 form_element_pointer = el
2158 el.attrs['action'] = a[1]
2160 insert_html_element new_open_tag 'hr'
2163 insert_html_element new_open_tag 'label'
2164 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2165 input_el = new_open_tag 'input'
2170 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2171 input_el.attrs_a.push [a[0], a[1]]
2172 input_el.attrs_a.push ['name', 'isindex']
2173 # fixfull this next bit is in english... internationalize?
2174 prompt ?= "This is a searchable index. Enter search keywords: "
2175 insert_character new_character_token prompt # fixfull split
2176 # TODO submit typo "balue" in spec
2177 insert_html_element input_el
2179 # insert_character '' # you can put chars here if prompt attr missing
2181 insert_html_element new_open_tag 'hr'
2184 unless template_tag_is_open()
2185 form_element_pointer = null
2187 if t.type is TYPE_START_TAG and t.name is 'textarea'
2188 insert_html_element t
2189 eat_next_token_if_newline()
2190 tok_state = tok_state_rcdata
2191 original_ins_mode = ins_mode
2192 flag_frameset_ok = false
2193 ins_mode = ins_mode_text
2195 if t.type is TYPE_START_TAG and t.name is 'xmp'
2196 close_p_if_in_button_scope()
2198 flag_frameset_ok = false
2199 parse_generic_raw_text t
2201 if t.type is TYPE_START_TAG and t.name is 'iframe'
2202 flag_frameset_ok = false
2203 parse_generic_raw_text t
2205 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2206 parse_generic_raw_text t
2208 if t.type is TYPE_START_TAG and t.name is 'select'
2210 insert_html_element t
2211 flag_frameset_ok = false
2212 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2213 ins_mode = ins_mode_in_select_in_table
2215 ins_mode = ins_mode_in_select
2217 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2218 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2221 insert_html_element t
2223 # this comment block implements the W3C spec
2224 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2225 # if is_in_scope 'ruby', NS_HTML
2226 # generate_implied_end_tags()
2227 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2229 # insert_html_element t
2231 # if t.type is TYPE_START_TAG and t.name is 'rt'
2232 # if is_in_scope 'ruby', NS_HTML
2233 # generate_implied_end_tags 'rtc' # arg is exception
2234 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2236 # insert_html_element t
2238 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2239 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2240 if is_in_scope 'ruby', NS_HTML
2241 generate_implied_end_tags()
2242 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2244 insert_html_element t
2246 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2247 if is_in_scope 'ruby', NS_HTML
2248 generate_implied_end_tags 'rtc'
2249 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2251 insert_html_element t
2254 if t.type is TYPE_START_TAG and t.name is 'math'
2256 adjust_mathml_attributes t
2257 adjust_foreign_attributes t
2258 insert_foreign_element t, NS_MATHML
2259 if t.flag 'self-closing'
2261 t.acknowledge_self_closing()
2263 if t.type is TYPE_START_TAG and t.name is 'svg'
2265 adjust_svg_attributes t
2266 adjust_foreign_attributes t
2267 insert_foreign_element t, NS_SVG
2268 if t.flag 'self-closing'
2270 t.acknowledge_self_closing()
2272 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2275 if t.type is TYPE_START_TAG # any other start tag
2277 insert_html_element t
2279 if t.type is TYPE_END_TAG # any other end tag
2280 in_body_any_other_end_tag t.name
2284 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Tree-construction handler for the "text" insertion mode; `t` is the token
# being processed. The visible tests cover text tokens, EOF, a </script> end
# tag and any other end tag. After the EOF test and after each end-tag test
# ins_mode is restored from original_ins_mode.
2285 ins_mode_text = (t) ->
2286 if t.type is TYPE_TEXT
2289 if t.type is TYPE_EOF
2291 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
# at EOF an HTML <script> that is the current node gets its 'already started' flag set
2292 open_els[0].flag 'already started', true
2294 ins_mode = original_ins_mode
2297 if t.type is TYPE_END_TAG and t.name is 'script'
2299 ins_mode = original_ins_mode
2300 # fixfull the spec seems to assume that I'm going to run the script
2301 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2303 if t.type is TYPE_END_TAG
2305 ins_mode = original_ins_mode
2309 # the functions below implement the tokenizer states described here:
2310 # http://www.w3.org/TR/html5/syntax.html#tokenization
2312 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# Shared fallback used by ins_mode_in_table and ins_mode_in_table_text for
# tokens that have no dedicated rule. The visible statements turn
# flag_foster_parenting on and then off again.
# NOTE(review): presumably the token is processed between the two assignments — confirm.
2313 ins_mode_in_table_else = (t) ->
2315 flag_foster_parenting = true
2317 flag_foster_parenting = false
# Tree-construction handler for the "in table" insertion mode; `t` is the token.
# Visible behaviour:
#  - when the current node (open_els[0]) is an HTML table/tbody/tfoot/thead/tr,
#    pending_table_character_tokens is reset, the current mode is saved in
#    original_ins_mode and ins_mode becomes ins_mode_in_table_text
#  - table-structure tags clear the stack to a table context, insert the
#    element (or an implied 'colgroup' / 'tbody') and switch insertion mode
#  - tokens without a dedicated rule are handed to ins_mode_in_table_else
2319 ins_mode_in_table = (t) ->
2322 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2323 pending_table_character_tokens = []
2324 original_ins_mode = ins_mode
2325 ins_mode = ins_mode_in_table_text
2328 ins_mode_in_table_else t
2336 clear_stack_to_table_context()
2338 insert_html_element t
2339 ins_mode = ins_mode_in_caption
2341 clear_stack_to_table_context()
2342 insert_html_element t
2343 ins_mode = ins_mode_in_column_group
2345 clear_stack_to_table_context()
# an implied <colgroup> is inserted here instead of the token itself
2346 insert_html_element new_open_tag 'colgroup'
2347 ins_mode = ins_mode_in_column_group
2349 when 'tbody', 'tfoot', 'thead'
2350 clear_stack_to_table_context()
2351 insert_html_element t
2352 ins_mode = ins_mode_in_table_body
2353 when 'td', 'th', 'tr'
2354 clear_stack_to_table_context()
# an implied <tbody> is inserted here instead of the token itself
2355 insert_html_element new_open_tag 'tbody'
2356 ins_mode = ins_mode_in_table_body
2360 if is_in_table_scope 'table', NS_HTML
2362 el = open_els.shift()
2363 if el.name is 'table' and el.namespace is NS_HTML
2367 when 'style', 'script', 'template'
# only tokens accepted by is_input_hidden_tok skip the fallback
2370 unless is_input_hidden_tok t
2371 ins_mode_in_table_else t
2374 el = insert_html_element t
2376 t.acknowledge_self_closing()
2379 if form_element_pointer?
2381 if template_tag_is_open()
2383 form_element_pointer = insert_html_element t
2386 ins_mode_in_table_else t
2390 if is_in_table_scope 'table', NS_HTML
2392 el = open_els.shift()
2393 if el.name is 'table' and el.namespace is NS_HTML
2398 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2403 ins_mode_in_table_else t
2407 ins_mode_in_table_else t
2411 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Tree-construction handler for the "in table text" insertion mode.
# Text tokens are appended to pending_table_character_tokens (U+0000 has its
# own test first). The buffered tokens are then scanned with is_space_tok and
# are either inserted with insert_character or passed one by one to
# ins_mode_in_table_else. Finally the buffer is emptied and ins_mode is
# restored from original_ins_mode.
2412 ins_mode_in_table_text = (t) ->
2413 if t.type is TYPE_TEXT and t.text is "\u0000"
2417 if t.type is TYPE_TEXT
2418 pending_table_character_tokens.push t
# look for any buffered token that is not whitespace
2422 for old in pending_table_character_tokens
2423 unless is_space_tok old
2427 for old in pending_table_character_tokens
2428 insert_character old
2430 for old in pending_table_character_tokens
2431 ins_mode_in_table_else old
2432 pending_table_character_tokens = []
2433 ins_mode = original_ins_mode
2437 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Tree-construction handler for the "in caption" insertion mode; `t` is the token.
# Both the </caption> branch and the branch for table-structure start tags or
# </table> require a caption in table scope. They then pop open_els, testing
# each popped element for an HTML 'caption', clear the active formatting
# elements to the last marker and switch to ins_mode_in_table.
2438 ins_mode_in_caption = (t) ->
2439 if t.type is TYPE_END_TAG and t.name is 'caption'
2440 if is_in_table_scope 'caption', NS_HTML
2441 generate_implied_end_tags()
# NOTE(review): unlike the pop test below, this check ignores the namespace — confirm intended
2442 if open_els[0].name isnt 'caption'
2445 el = open_els.shift()
2446 if el.name is 'caption' and el.namespace is NS_HTML
2448 clear_afe_to_marker()
2449 ins_mode = ins_mode_in_table
2454 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2456 if is_in_table_scope 'caption', NS_HTML
2458 el = open_els.shift()
2459 if el.name is 'caption' and el.namespace is NS_HTML
2461 clear_afe_to_marker()
2462 ins_mode = ins_mode_in_table
2464 # else fragment case
2466 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2473 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Tree-construction handler for the "in column group" insertion mode; `t` is
# the token. Visible tests cover comments, doctypes, <html>, <col> (inserted
# and acknowledged as self-closing), </colgroup>, </col>, template tags and
# EOF. The </colgroup> branch and the final fallback switch to ins_mode_in_table.
2474 ins_mode_in_column_group = (t) ->
2478 if t.type is TYPE_COMMENT
2481 if t.type is TYPE_DOCTYPE
2484 if t.type is TYPE_START_TAG and t.name is 'html'
2487 if t.type is TYPE_START_TAG and t.name is 'col'
2488 el = insert_html_element t
2490 t.acknowledge_self_closing()
2492 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# Test the current node (open_els[0]). The stack array itself has no
# .namespace, so testing open_els.namespace made this condition always false.
2493 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2495 ins_mode = ins_mode_in_table
2499 if t.type is TYPE_END_TAG and t.name is 'col'
2502 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2505 if t.type is TYPE_EOF
2509 if open_els[0].name isnt 'colgroup'
2513 ins_mode = ins_mode_in_table
2517 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Tree-construction handler for the "in table body" insertion mode; `t` is the token.
# <tr> is inserted directly, while <th>/<td> first get an implied <tr>; both
# switch to ins_mode_in_row. End tags for tbody/tfoot/thead, and the
# table-structure start tags or </table>, clear the stack to a table body
# context and switch back to ins_mode_in_table.
2518 ins_mode_in_table_body = (t) ->
2519 if t.type is TYPE_START_TAG and t.name is 'tr'
2520 clear_stack_to_table_body_context()
2521 insert_html_element t
2522 ins_mode = ins_mode_in_row
2524 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2526 clear_stack_to_table_body_context()
# a cell directly inside a table section gets an implied <tr>
2527 insert_html_element new_open_tag 'tr'
2528 ins_mode = ins_mode_in_row
2531 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2532 unless is_in_table_scope t.name, NS_HTML
2535 clear_stack_to_table_body_context()
2537 ins_mode = ins_mode_in_table
2539 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
# these two tests examine an element `el` for an HTML tbody/tfoot/thead and
# for membership in table_scopers
2542 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2545 if table_scopers[el.name] is el.namespace
2550 clear_stack_to_table_body_context()
2552 ins_mode = ins_mode_in_table
2555 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2562 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Tree-construction handler for the "in row" insertion mode; `t` is the token.
# <th>/<td> clear the stack to a table row context, insert the cell and switch
# to ins_mode_in_cell. The remaining visible branches (</tr>, table-structure
# start tags or </table>, and </tbody>/</tfoot>/</thead>) check for a 'tr' in
# table scope, clear the stack to a table row context and switch to
# ins_mode_in_table_body.
2563 ins_mode_in_row = (t) ->
2564 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2565 clear_stack_to_table_row_context()
2566 insert_html_element t
2567 ins_mode = ins_mode_in_cell
2570 if t.type is TYPE_END_TAG and t.name is 'tr'
2571 if is_in_table_scope 'tr', NS_HTML
2572 clear_stack_to_table_row_context()
2574 ins_mode = ins_mode_in_table_body
2578 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2579 if is_in_table_scope 'tr', NS_HTML
2580 clear_stack_to_table_row_context()
2582 ins_mode = ins_mode_in_table_body
2587 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
# the named section and a 'tr' are both tested for table scope here
2588 if is_in_table_scope t.name, NS_HTML
2589 if is_in_table_scope 'tr', NS_HTML
2590 clear_stack_to_table_row_context()
2592 ins_mode = ins_mode_in_table_body
2597 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2604 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# "Close the cell" steps: generate implied end tags, check that the current
# node is an HTML td/th, pop open_els (testing each popped element for an HTML
# td/th), clear the active formatting elements to the last marker and switch
# to ins_mode_in_row.
2606 generate_implied_end_tags()
# Compare the current node's *name* with 'th'. The old test compared the node
# object itself to the string, so a <th> current node never matched.
2607 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2610 el = open_els.shift()
2611 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2613 clear_afe_to_marker()
2614 ins_mode = ins_mode_in_row
2617 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Tree-construction handler for the "in cell" insertion mode; `t` is the token.
# For </td> or </th> with a matching element in table scope it generates
# implied end tags, pops open_els (testing each popped element against
# t.name), clears the active formatting elements to the last marker and
# switches to ins_mode_in_row. Further visible tests cover table-structure
# start tags and two groups of end tags.
2618 ins_mode_in_cell = (t) ->
2619 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2620 if is_in_table_scope t.name, NS_HTML
2621 generate_implied_end_tags()
2622 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2625 el = open_els.shift()
2626 if el.name is t.name and el.namespace is NS_HTML
2628 clear_afe_to_marker()
2629 ins_mode = ins_mode_in_row
2633 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
# these two tests examine an element `el` for an HTML td/th and for membership
# in table_scopers
2636 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2639 if table_scopers[el.name] is el.namespace
2647 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2650 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2651 if is_in_table_scope t.name, NS_HTML
2661 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Tree-construction handler for the "in select" insertion mode; `t` is the token.
# Visible tests cover U+0000 and other text, comments, doctypes, <html>,
# <option>/<optgroup> start and end tags, </select>, a nested <select>,
# <input>/<keygen>/<textarea>, <script>/<template> and EOF. The branches that
# close the select pop open_els, testing each popped element for an HTML 'select'.
2662 ins_mode_in_select = (t) ->
2663 if t.type is TYPE_TEXT and t.text is "\u0000"
2666 if t.type is TYPE_TEXT
2669 if t.type is TYPE_COMMENT
2672 if t.type is TYPE_DOCTYPE
2675 if t.type is TYPE_START_TAG and t.name is 'html'
2678 if t.type is TYPE_START_TAG and t.name is 'option'
2679 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2681 insert_html_element t
2683 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2684 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2686 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2688 insert_html_element t
2690 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2691 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# Name and namespace must be checked on the same node: open_els[1], the
# element just before the current <option>. The old test checked the name of
# open_els[1] but the namespace of open_els[0].
2692 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2694 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2699 if t.type is TYPE_END_TAG and t.name is 'option'
2700 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2705 if t.type is TYPE_END_TAG and t.name is 'select'
2706 if is_in_select_scope 'select', NS_HTML
2708 el = open_els.shift()
2709 if el.name is 'select' and el.namespace is NS_HTML
2715 if t.type is TYPE_START_TAG and t.name is 'select'
2718 el = open_els.shift()
2719 if el.name is 'select' and el.namespace is NS_HTML
2722 # spec says that this is the same as </select> but it doesn't say
2723 # to check scope first
2725 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2727 unless is_in_select_scope 'select', NS_HTML
2730 el = open_els.shift()
2731 if el.name is 'select' and el.namespace is NS_HTML
2736 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2739 if t.type is TYPE_EOF
2746 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Tree-construction handler for the "in select in table" insertion mode.
# Table-structure start tags, and table-structure end tags (after a
# table-scope check on t.name), pop open_els while testing each popped element
# for an HTML 'select'. Everything else is delegated to ins_mode_in_select.
2747 ins_mode_in_select_in_table = (t) ->
2748 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2751 el = open_els.shift()
2752 if el.name is 'select' and el.namespace is NS_HTML
2757 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2759 unless is_in_table_scope t.name, NS_HTML
2762 el = open_els.shift()
2763 if el.name is 'select' and el.namespace is NS_HTML
2769 ins_mode_in_select t
2772 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Tree-construction handler for the "in template" insertion mode; `t` is the token.
# For start tags it replaces the first entry of template_ins_modes (shift,
# then unshift) with the mode that fits the tag and sets ins_mode to it:
#   caption/colgroup/tbody/tfoot/thead -> ins_mode_in_table
#   col                                -> ins_mode_in_column_group
#   tr                                 -> ins_mode_in_table_body
#   td/th                              -> ins_mode_in_row
#   any other start tag                -> ins_mode_in_body
# On EOF it checks template_tag_is_open(), pops open_els while testing each
# popped element for an HTML 'template', clears the active formatting
# elements to the last marker and shifts template_ins_modes.
2773 ins_mode_in_template = (t) ->
2774 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2777 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2780 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2781 template_ins_modes.shift()
2782 template_ins_modes.unshift ins_mode_in_table
2783 ins_mode = ins_mode_in_table
2786 if t.type is TYPE_START_TAG and t.name is 'col'
2787 template_ins_modes.shift()
2788 template_ins_modes.unshift ins_mode_in_column_group
2789 ins_mode = ins_mode_in_column_group
2792 if t.type is TYPE_START_TAG and t.name is 'tr'
2793 template_ins_modes.shift()
2794 template_ins_modes.unshift ins_mode_in_table_body
2795 ins_mode = ins_mode_in_table_body
2798 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2799 template_ins_modes.shift()
2800 template_ins_modes.unshift ins_mode_in_row
2801 ins_mode = ins_mode_in_row
2804 if t.type is TYPE_START_TAG
2805 template_ins_modes.shift()
2806 template_ins_modes.unshift ins_mode_in_body
2807 ins_mode = ins_mode_in_body
2810 if t.type is TYPE_END_TAG
2813 if t.type is TYPE_EOF
2814 unless template_tag_is_open()
2819 el = open_els.shift()
2820 if el.name is 'template' and el.namespace is NS_HTML
2822 clear_afe_to_marker()
2823 template_ins_modes.shift()
2828 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Tree-construction handler for the "after body" insertion mode; `t` is the token.
# Comments are appended as the last child of the element at the far end of
# open_els (index length - 1). </html> checks flag_fragment_parsing and
# switches to ins_mode_after_after_body. The last visible statement switches
# back to ins_mode_in_body.
2829 ins_mode_after_body = (t) ->
2833 if t.type is TYPE_COMMENT
# open_els[0] is the current node, so the last index is the oldest element on the stack
2834 first = open_els[open_els.length - 1]
2835 insert_comment t, [first, first.children.length]
2837 if t.type is TYPE_DOCTYPE
2840 if t.type is TYPE_START_TAG and t.name is 'html'
2843 if t.type is TYPE_END_TAG and t.name is 'html'
2844 if flag_fragment_parsing
2847 ins_mode = ins_mode_after_after_body
2849 if t.type is TYPE_EOF
2854 ins_mode = ins_mode_in_body
2858 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Tree-construction handler for the "in frameset" insertion mode; `t` is the token.
# <frameset> and <frame> start tags are inserted (frame is also acknowledged
# as self-closing). </frameset> has a fragment-case early return when only one
# element is open, and switches to ins_mode_after_frameset when not parsing a
# fragment and the current node is no longer a frameset.
2859 ins_mode_in_frameset = (t) ->
2863 if t.type is TYPE_COMMENT
2866 if t.type is TYPE_DOCTYPE
2869 if t.type is TYPE_START_TAG and t.name is 'html'
2872 if t.type is TYPE_START_TAG and t.name is 'frameset'
2873 insert_html_element t
2875 if t.type is TYPE_END_TAG and t.name is 'frameset'
2876 if open_els.length is 1
2878 return # fragment case
2880 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2881 ins_mode = ins_mode_after_frameset
2883 if t.type is TYPE_START_TAG and t.name is 'frame'
2884 insert_html_element t
2886 t.acknowledge_self_closing()
2888 if t.type is TYPE_START_TAG and t.name is 'noframes'
2891 if t.type is TYPE_EOF
2892 if open_els.length isnt 1
2900 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Tree-construction handler for the "after frameset" insertion mode; `t` is
# the token. Visible tests cover comments, doctypes, <html>, </html> (which
# switches to ins_mode_after_after_frameset), <noframes> and EOF.
2901 ins_mode_after_frameset = (t) ->
2905 if t.type is TYPE_COMMENT
2908 if t.type is TYPE_DOCTYPE
2911 if t.type is TYPE_START_TAG and t.name is 'html'
2914 if t.type is TYPE_END_TAG and t.name is 'html'
2915 ins_mode = ins_mode_after_after_frameset
2917 if t.type is TYPE_START_TAG and t.name is 'noframes'
2920 if t.type is TYPE_EOF
2927 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Tree-construction handler for the "after after body" insertion mode.
# Comments are appended as the last child of `doc`. Doctype, whitespace and
# <html> start-tag tokens share one branch, and EOF has its own. The last
# visible statement switches back to ins_mode_in_body.
2928 ins_mode_after_after_body = (t) ->
2929 if t.type is TYPE_COMMENT
2930 insert_comment t, [doc, doc.children.length]
2932 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2935 if t.type is TYPE_EOF
2940 ins_mode = ins_mode_in_body
2944 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Tree-construction handler for the "after after frameset" insertion mode.
# Comments are appended as the last child of `doc`. Doctype, whitespace and
# <html> start-tag tokens share one branch; EOF and <noframes> have their own.
2945 ins_mode_after_after_frameset = (t) ->
2946 if t.type is TYPE_COMMENT
2947 insert_comment t, [doc, doc.children.length]
2949 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2952 if t.type is TYPE_EOF
2955 if t.type is TYPE_START_TAG and t.name is 'noframes'
2962 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Tests token `t` for an attribute named 'color', 'face' or 'size'.
# in_foreign_content uses it to decide whether a <font> start tag counts as
# one of the HTML start tags that break out of foreign content.
# NOTE(review): presumably returns true on the first match and false otherwise — confirm.
2963 has_color_face_or_size = (t) ->
# `a` is an attribute entry whose element 0 is the attribute name
2965 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Helper shared by the foreign-content rules. It is called for a self-closing
# <script> start tag (in_foreign_content_other_start) and for a </script> end
# tag whose current node is an SVG script element (in_foreign_content).
2968 in_foreign_content_end_script = ->
# Handles a start tag `t` that stays in foreign content. It adjusts the
# token's attributes (and, for SVG, its tag name via svg_name_fixes) to match
# the namespace of the adjusted current node, then inserts a foreign element
# in that namespace. Self-closing tokens are acknowledged; a self-closing
# 'script' also runs in_foreign_content_end_script.
2972 in_foreign_content_other_start = (t) ->
2973 acn = adjusted_current_node()
2974 if acn.namespace is NS_MATHML
2975 adjust_mathml_attributes t
2976 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
# replace the token's name with its entry in svg_name_fixes
2977 t.name = svg_name_fixes[t.name]
2978 if acn.namespace is NS_SVG
2979 adjust_svg_attributes t
2980 adjust_foreign_attributes t
2981 insert_foreign_element t, acn.namespace
2982 if t.flag 'self-closing'
2983 if t.name is 'script'
2984 t.acknowledge_self_closing()
2985 in_foreign_content_end_script()
2989 t.acknowledge_self_closing()
# Tree-construction rules for tokens in foreign content (MathML / SVG); `t` is
# the token. Visible behaviour:
#  - U+0000 text is replaced by a U+FFFD character token
#  - other text clears flag_frameset_ok
#  - a fixed list of HTML start tags (plus <font> with a color/face/size
#    attribute) goes to in_foreign_content_other_start when parsing a
#    fragment; otherwise a loop examines open_els[0] for a MathML text
#    integration point, an HTML integration point or an HTML element
#  - any other start tag goes to in_foreign_content_other_start
#  - </script> on an SVG script element runs in_foreign_content_end_script
#  - other end tags are matched against open elements by comparing the
#    lowercased node name; on reaching an HTML-namespace node the token is
#    handed to the current HTML insertion mode
2991 in_foreign_content = (t) ->
2992 if t.type is TYPE_TEXT and t.text is "\u0000"
2994 insert_character new_character_token "\ufffd"
2999 if t.type is TYPE_TEXT
3000 flag_frameset_ok = false
3003 if t.type is TYPE_COMMENT
3006 if t.type is TYPE_DOCTYPE
3009 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3011 if flag_fragment_parsing
3012 in_foreign_content_other_start t
3014 loop # is this safe?
3016 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3020 if t.type is TYPE_START_TAG
3021 in_foreign_content_other_start t
3023 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3024 in_foreign_content_end_script()
3026 if t.type is TYPE_END_TAG
# node names are lowercased before being compared with the end tag's name
3029 if node.name.toLowerCase() isnt t.name
3032 if node is open_els[open_els.length - 1]
3034 if node.name.toLowerCase() is t.name
3036 el = open_els.shift()
3041 if node.namespace is NS_HTML
3043 ins_mode t # explicitly call HTML insertion mode
3047 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer data state: reads one character from txt (advancing cur) and
# switches on it. Visible outcomes: a text node built from
# parse_character_reference(), a switch to tok_state_tag_open, a text node
# holding the character itself, or an EOF token.
3049 switch c = txt.charAt(cur++)
3051 return new_text_node parse_character_reference()
3053 tok_state = tok_state_tag_open
3056 return new_text_node c
3058 return new_eof_token()
3060 return new_text_node c
3063 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3064 # not needed: tok_state_character_reference_in_data = ->
3065 # just call parse_character_reference()
3067 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer RCDATA state: reads one character from txt (advancing cur) and
# switches on it. Visible outcomes: a text node built from
# parse_character_reference(), a switch to tok_state_rcdata_less_than_sign,
# a U+FFFD character token, an EOF token, or a character token for `c`.
3068 tok_state_rcdata = ->
3069 switch c = txt.charAt(cur++)
3071 return new_text_node parse_character_reference()
3073 tok_state = tok_state_rcdata_less_than_sign
3076 return new_character_token "\ufffd"
3078 return new_eof_token()
3080 return new_character_token c
3083 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3084 # not needed: tok_state_character_reference_in_rcdata = ->
3085 # just call parse_character_reference()
3087 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer RAWTEXT state: reads one character from txt (advancing cur) and
# switches on it. Visible outcomes: a switch to
# tok_state_rawtext_less_than_sign, a U+FFFD character token, an EOF token,
# or a character token for `c`.
3088 tok_state_rawtext = ->
3089 switch c = txt.charAt(cur++)
3091 tok_state = tok_state_rawtext_less_than_sign
3094 return new_character_token "\ufffd"
3096 return new_eof_token()
3098 return new_character_token c
3101 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer script data state: reads one character from txt (advancing cur)
# and switches on it. Visible outcomes: a switch to
# tok_state_script_data_less_than_sign, a U+FFFD character token, an EOF
# token, or a character token for `c`.
3102 tok_state_script_data = ->
3103 switch c = txt.charAt(cur++)
3105 tok_state = tok_state_script_data_less_than_sign
3108 return new_character_token "\ufffd"
3110 return new_eof_token()
3112 return new_character_token c
3115 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer PLAINTEXT state: reads one character from txt (advancing cur) and
# switches on it. Visible outcomes: a U+FFFD character token, an EOF token,
# or a character token for `c`. No visible branch changes tok_state.
3116 tok_state_plaintext = ->
3117 switch c = txt.charAt(cur++)
3120 return new_character_token "\ufffd"
3122 return new_eof_token()
3124 return new_character_token c
3128 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer tag open state (the character after '<'): reads one character
# from txt (advancing cur). Visible outcomes: a switch to
# tok_state_markup_declaration_open or tok_state_end_tag_open; a new start-tag
# token in tok_cur_tag (name seeded with `c`, lowercased in one branch) and a
# switch to tok_state_tag_name; a comment token seeded with '?' and a switch
# to tok_state_bogus_comment; or a return to tok_state_data that un-reads the
# character and emits '<' as text.
3129 tok_state_tag_open = ->
3130 c = txt.charAt(cur++)
3132 tok_state = tok_state_markup_declaration_open
3135 tok_state = tok_state_end_tag_open
3138 tok_cur_tag = new_open_tag c.toLowerCase()
3139 tok_state = tok_state_tag_name
3142 tok_cur_tag = new_open_tag c
3143 tok_state = tok_state_tag_name
3147 tok_cur_tag = new_comment_token '?' # FIXME right?
3148 tok_state = tok_state_bogus_comment
3152 tok_state = tok_state_data
3153 cur -= 1 # we didn't parse/handle the char after <
3154 return new_text_node '<'
3156 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer end tag open state (after '</'): reads one character from txt
# (advancing cur). Visible outcomes: a new end-tag token in tok_cur_tag (name
# seeded with `c`, lowercased in one branch) and a switch to
# tok_state_tag_name; a return to tok_state_data, in one case emitting '</'
# as text; or a comment token seeded with `c` and a switch to
# tok_state_bogus_comment.
3157 tok_state_end_tag_open = ->
3158 c = txt.charAt(cur++)
3160 tok_cur_tag = new_end_tag c.toLowerCase()
3161 tok_state = tok_state_tag_name
3164 tok_cur_tag = new_end_tag c
3165 tok_state = tok_state_tag_name
3169 tok_state = tok_state_data
3173 tok_state = tok_state_data
3174 return new_text_node '</'
3177 tok_cur_tag = new_comment_token c
3178 tok_state = tok_state_bogus_comment
3181 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer tag name state: reads one character from txt (advancing cur) and
# switches on it. Tab, LF, FF and space move to
# tok_state_before_attribute_name. Other visible outcomes: a switch to
# tok_state_self_closing_start_tag or tok_state_data, or appending to
# tok_cur_tag.name (U+FFFD, the lowercased character, or the character as is).
3182 tok_state_tag_name = ->
3183 switch c = txt.charAt(cur++)
3184 when "\t", "\n", "\u000c", ' '
3185 tok_state = tok_state_before_attribute_name
3187 tok_state = tok_state_self_closing_start_tag
3189 tok_state = tok_state_data
3195 tok_cur_tag.name += "\ufffd"
3198 tok_state = tok_state_data
3201 tok_cur_tag.name += c.toLowerCase()
3203 tok_cur_tag.name += c
3206 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer RCDATA less-than sign state: reads one character from txt
# (advancing cur). One branch empties temporary_buffer and moves to
# tok_state_rcdata_end_tag_open. The other returns to tok_state_rcdata,
# un-reads the character and emits '<' as a character token.
3207 tok_state_rcdata_less_than_sign = ->
3208 c = txt.charAt(cur++)
3210 temporary_buffer = ''
3211 tok_state = tok_state_rcdata_end_tag_open
3214 tok_state = tok_state_rcdata
3215 cur -= 1 # reconsume the input character
3216 return new_character_token '<'
3218 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer RCDATA end tag open state: reads one character from txt
# (advancing cur). Two branches start an end-tag token in tok_cur_tag (name
# lowercased in one of them), append the raw character to temporary_buffer
# and move to tok_state_rcdata_end_tag_name. The other returns to
# tok_state_rcdata, un-reads the character and emits "</" as one character token.
3219 tok_state_rcdata_end_tag_open = ->
3220 c = txt.charAt(cur++)
3222 tok_cur_tag = new_end_tag c.toLowerCase()
3223 temporary_buffer += c
3224 tok_state = tok_state_rcdata_end_tag_name
3227 tok_cur_tag = new_end_tag c
3228 temporary_buffer += c
3229 tok_state = tok_state_rcdata_end_tag_name
3232 tok_state = tok_state_rcdata
3233 cur -= 1 # reconsume the input character
3234 return new_character_token "</" # fixfull separate these
3236 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of the
# current node (open_els[0]); false otherwise.
3237 is_appropriate_end_tag = (t) ->
3238 # fixfull: this assumes that open_els[0].name is "the tag name of the last
3239 # start tag to have been emitted from this tokenizer"
3240 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3242 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer RCDATA end tag name state: reads one character from txt
# (advancing cur). When tok_cur_tag is an appropriate end tag, three branches
# move to tok_state_before_attribute_name (after tab/LF/FF/space),
# tok_state_self_closing_start_tag or tok_state_data. Name characters are
# appended to tok_cur_tag.name (lowercased in one branch) and, unmodified, to
# temporary_buffer. The fallback returns to tok_state_rcdata, un-reads the
# character and emits '</' plus temporary_buffer as one character token.
3243 tok_state_rcdata_end_tag_name = ->
3244 c = txt.charAt(cur++)
3245 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3246 if is_appropriate_end_tag tok_cur_tag
3247 tok_state = tok_state_before_attribute_name
3249 # else fall through to "Anything else"
3251 if is_appropriate_end_tag tok_cur_tag
3252 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3254 # else fall through to "Anything else"
3256 if is_appropriate_end_tag tok_cur_tag
3257 tok_state = tok_state_data
3259 # else fall through to "Anything else"
3261 tok_cur_tag.name += c.toLowerCase()
3262 temporary_buffer += c
3265 tok_cur_tag.name += c
3266 temporary_buffer += c
3269 tok_state = tok_state_rcdata
3270 cur -= 1 # reconsume the input character
3271 return new_character_token '</' + temporary_buffer # fixfull separate these
3273 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer RAWTEXT less-than sign state: reads one character from txt
# (advancing cur). One branch empties temporary_buffer and moves to
# tok_state_rawtext_end_tag_open. The other returns to tok_state_rawtext,
# un-reads the character and emits '<' as a character token.
3274 tok_state_rawtext_less_than_sign = ->
3275 c = txt.charAt(cur++)
3277 temporary_buffer = ''
3278 tok_state = tok_state_rawtext_end_tag_open
3281 tok_state = tok_state_rawtext
3282 cur -= 1 # reconsume the input character
3283 return new_character_token '<'
3285 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer RAWTEXT end tag open state: reads one character from txt
# (advancing cur). Two branches start an end-tag token in tok_cur_tag (name
# lowercased in one of them), append the raw character to temporary_buffer
# and move to tok_state_rawtext_end_tag_name. The other returns to
# tok_state_rawtext, un-reads the character and emits "</" as one character token.
3286 tok_state_rawtext_end_tag_open = ->
3287 c = txt.charAt(cur++)
3289 tok_cur_tag = new_end_tag c.toLowerCase()
3290 temporary_buffer += c
3291 tok_state = tok_state_rawtext_end_tag_name
3294 tok_cur_tag = new_end_tag c
3295 temporary_buffer += c
3296 tok_state = tok_state_rawtext_end_tag_name
3299 tok_state = tok_state_rawtext
3300 cur -= 1 # reconsume the input character
3301 return new_character_token "</" # fixfull separate these
3303 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer RAWTEXT end tag name state: reads one character from txt
# (advancing cur). When tok_cur_tag is an appropriate end tag, three branches
# move to tok_state_before_attribute_name (after tab/LF/FF/space),
# tok_state_self_closing_start_tag or tok_state_data. Name characters are
# appended to tok_cur_tag.name (lowercased in one branch) and, unmodified, to
# temporary_buffer. The fallback returns to tok_state_rawtext, un-reads the
# character and emits '</' plus temporary_buffer as one character token.
3304 tok_state_rawtext_end_tag_name = ->
3305 c = txt.charAt(cur++)
3306 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3307 if is_appropriate_end_tag tok_cur_tag
3308 tok_state = tok_state_before_attribute_name
3310 # else fall through to "Anything else"
3312 if is_appropriate_end_tag tok_cur_tag
3313 tok_state = tok_state_self_closing_start_tag
3315 # else fall through to "Anything else"
3317 if is_appropriate_end_tag tok_cur_tag
3318 tok_state = tok_state_data
3320 # else fall through to "Anything else"
3322 tok_cur_tag.name += c.toLowerCase()
3323 temporary_buffer += c
3326 tok_cur_tag.name += c
3327 temporary_buffer += c
3330 tok_state = tok_state_rawtext
3331 cur -= 1 # reconsume the input character
3332 return new_character_token '</' + temporary_buffer # fixfull separate these
3334 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer script data less-than sign state: reads one character from txt
# (advancing cur). Visible outcomes: emptying temporary_buffer and moving to
# tok_state_script_data_end_tag_open; moving to
# tok_state_script_data_escape_start while emitting '<!' as one character
# token; or returning to tok_state_script_data, un-reading the character and
# emitting '<'.
3335 tok_state_script_data_less_than_sign = ->
3336 c = txt.charAt(cur++)
3338 temporary_buffer = ''
3339 tok_state = tok_state_script_data_end_tag_open
3342 tok_state = tok_state_script_data_escape_start
3343 return new_character_token '<!' # fixfull split
3345 tok_state = tok_state_script_data
3346 cur -= 1 # Reconsume
3347 return new_character_token '<'
3349 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Tokenizer state handler: script data end tag open.
# Two visible branches start a new end tag token in tok_cur_tag (lowercased
# character vs. character as-is), append the raw character to
# temporary_buffer and move to the script data end tag name state. The
# remaining branch returns to script data, reconsumes, and returns '</' as a
# character token.
# NOTE(review): the branch conditions are not visible in this extract.
3350 tok_state_script_data_end_tag_open = ->
3351 c = txt.charAt(cur++)
3353 tok_cur_tag = new_end_tag c.toLowerCase()
3354 temporary_buffer += c
3355 tok_state = tok_state_script_data_end_tag_name
3358 tok_cur_tag = new_end_tag c
3359 temporary_buffer += c
3360 tok_state = tok_state_script_data_end_tag_name
3363 tok_state = tok_state_script_data
3364 cur -= 1 # Reconsume
3365 return new_character_token '</'
3367 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Tokenizer state handler: script data end tag name.
# Three visible branches test is_appropriate_end_tag(tok_cur_tag) and, when it
# holds, move to the before-attribute-name, self-closing-start-tag or data
# state (only the whitespace condition is visible here). Two branches extend
# tok_cur_tag.name (one lowercasing the character) and append the raw
# character to temporary_buffer. The final branch returns to script data,
# reconsumes, and returns "</" plus temporary_buffer as one character token.
# NOTE(review): most branch conditions are not visible in this extract.
3368 tok_state_script_data_end_tag_name = ->
3369 c = txt.charAt(cur++)
3370 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3371 if is_appropriate_end_tag tok_cur_tag
3372 tok_state = tok_state_before_attribute_name
3376 if is_appropriate_end_tag tok_cur_tag
3377 tok_state = tok_state_self_closing_start_tag
3381 if is_appropriate_end_tag tok_cur_tag
3382 tok_state = tok_state_data
3386 tok_cur_tag.name += c.toLowerCase()
3387 temporary_buffer += c
3390 tok_cur_tag.name += c
3391 temporary_buffer += c
3394 tok_state = tok_state_script_data
3395 cur -= 1 # Reconsume
3396 return new_character_token "</#{temporary_buffer}" # fixfull split
3398 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Tokenizer state handler: script data escape start.
# One visible branch moves to the escape start dash state and returns a '-'
# character token; the other returns to script data and reconsumes the
# character.
# NOTE(review): the branch conditions are not visible in this extract.
3399 tok_state_script_data_escape_start = ->
3400 c = txt.charAt(cur++)
3402 tok_state = tok_state_script_data_escape_start_dash
3403 return new_character_token '-'
3405 tok_state = tok_state_script_data
3406 cur -= 1 # Reconsume
3409 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Tokenizer state handler: script data escape start dash.
# One visible branch moves to the escaped dash dash state and returns a '-'
# character token; the other returns to script data and reconsumes the
# character.
# NOTE(review): the branch conditions are not visible in this extract.
3410 tok_state_script_data_escape_start_dash = ->
3411 c = txt.charAt(cur++)
3413 tok_state = tok_state_script_data_escaped_dash_dash
3414 return new_character_token '-'
3416 tok_state = tok_state_script_data
3417 cur -= 1 # Reconsume
3420 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Tokenizer state handler: script data escaped.
# Visible outcomes: move to the escaped dash state and return '-'; move to
# the escaped less-than sign state; return a U+FFFD character token; switch
# to the data state and reconsume; or return the character itself as a
# character token.
# NOTE(review): the branch conditions are not visible in this extract.
3421 tok_state_script_data_escaped = ->
3422 c = txt.charAt(cur++)
3424 tok_state = tok_state_script_data_escaped_dash
3425 return new_character_token '-'
3427 tok_state = tok_state_script_data_escaped_less_than_sign
3431 return new_character_token "\ufffd"
3433 tok_state = tok_state_data
3435 cur -= 1 # Reconsume
3438 return new_character_token c
3440 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Tokenizer state handler: script data escaped dash.
# Visible outcomes: move to the escaped dash dash state and return '-'; move
# to the escaped less-than sign state; go back to the escaped state and
# return U+FFFD; switch to the data state and reconsume; or go back to the
# escaped state and return the character itself.
# NOTE(review): the branch conditions are not visible in this extract.
3441 tok_state_script_data_escaped_dash = ->
3442 c = txt.charAt(cur++)
3444 tok_state = tok_state_script_data_escaped_dash_dash
3445 return new_character_token '-'
3447 tok_state = tok_state_script_data_escaped_less_than_sign
3451 tok_state = tok_state_script_data_escaped
3452 return new_character_token "\ufffd"
3454 tok_state = tok_state_data
3456 cur -= 1 # Reconsume
3459 tok_state = tok_state_script_data_escaped
3460 return new_character_token c
3462 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Tokenizer state handler: script data escaped dash dash.
# Visible outcomes: return '-' without changing state; move to the escaped
# less-than sign state; return to script data and emit '>'; go back to the
# escaped state and return U+FFFD; switch to the data state and reconsume;
# or go back to the escaped state and return the character itself.
# NOTE(review): the branch conditions are not visible in this extract.
3463 tok_state_script_data_escaped_dash_dash = ->
3464 c = txt.charAt(cur++)
3466 return new_character_token '-'
3468 tok_state = tok_state_script_data_escaped_less_than_sign
3471 tok_state = tok_state_script_data
3472 return new_character_token '>'
3475 tok_state = tok_state_script_data_escaped
3476 return new_character_token "\ufffd"
3479 tok_state = tok_state_data
3480 cur -= 1 # Reconsume
3483 tok_state = tok_state_script_data_escaped
3484 return new_character_token c
3486 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer state handler: script data escaped less-than sign.
# Visible outcomes: clear temporary_buffer and move to the escaped end tag
# open state; seed temporary_buffer with the character (lowercased in one
# branch, as-is in the other), move to the double escape start state and
# return "<" plus the original character as one token; or go back to the
# escaped state, reconsume, and return '<'.
# NOTE(review): the branch conditions are not visible in this extract.
3487 tok_state_script_data_escaped_less_than_sign = ->
3488 c = txt.charAt(cur++)
3490 temporary_buffer = ''
3491 tok_state = tok_state_script_data_escaped_end_tag_open
3494 temporary_buffer = c.toLowerCase() # yes, really
3495 tok_state = tok_state_script_data_double_escape_start
3496 return new_character_token "<#{c}" # fixfull split
3498 temporary_buffer = c
3499 tok_state = tok_state_script_data_double_escape_start
3500 return new_character_token "<#{c}" # fixfull split
3502 tok_state = tok_state_script_data_escaped
3503 cur -= 1 # Reconsume
3504 return new_character_token '<'
3506 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Tokenizer state handler: script data escaped end tag open.
# Two visible branches start a new end tag token in tok_cur_tag (lowercased
# character vs. character as-is), append the raw character to
# temporary_buffer and move to the escaped end tag name state. The remaining
# branch goes back to the escaped state, reconsumes, and returns '</' as one
# character token.
# NOTE(review): the branch conditions are not visible in this extract.
3507 tok_state_script_data_escaped_end_tag_open = ->
3508 c = txt.charAt(cur++)
3510 tok_cur_tag = new_end_tag c.toLowerCase()
3511 temporary_buffer += c
3512 tok_state = tok_state_script_data_escaped_end_tag_name
3515 tok_cur_tag = new_end_tag c
3516 temporary_buffer += c
3517 tok_state = tok_state_script_data_escaped_end_tag_name
3520 tok_state = tok_state_script_data_escaped
3521 cur -= 1 # Reconsume
3522 return new_character_token '</' # fixfull split
3524 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer state handler: script data escaped end tag name.
# Three visible branches test is_appropriate_end_tag(tok_cur_tag) and, when it
# holds, move to the before-attribute-name, self-closing-start-tag or data
# state (only the whitespace condition is visible here). Two branches extend
# tok_cur_tag.name; both append the LOWERCASED character to temporary_buffer,
# unlike the RAWTEXT and script data end-tag-name handlers, which append the
# raw character. The final branch goes back to the escaped state,
# reconsumes, and returns "</" plus temporary_buffer as one character token.
# NOTE(review): most branch conditions are not visible in this extract; the
# lowercasing of temporary_buffer changes the text re-emitted on fallback -
# confirm it is intended.
3525 tok_state_script_data_escaped_end_tag_name = ->
3526 c = txt.charAt(cur++)
3527 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3528 if is_appropriate_end_tag tok_cur_tag
3529 tok_state = tok_state_before_attribute_name
3533 if is_appropriate_end_tag tok_cur_tag
3534 tok_state = tok_state_self_closing_start_tag
3538 if is_appropriate_end_tag tok_cur_tag
3539 tok_state = tok_state_data
3543 tok_cur_tag.name += c.toLowerCase()
3544 temporary_buffer += c.toLowerCase()
3547 tok_cur_tag.name += c
3548 temporary_buffer += c.toLowerCase()
3551 tok_state = tok_state_script_data_escaped
3552 cur -= 1 # Reconsume
3553 return new_character_token "</#{temporary_buffer}" # fixfull split
3555 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Tokenizer state handler: script data double escape start.
# On tab, LF, FF, space, '/' or '>': the next state depends on whether
# temporary_buffer equals 'script' (double escaped vs. escaped are the two
# visible targets), and the character is returned as a character token.
# Two further branches append the character to temporary_buffer (lowercased
# in one) and return it. The final branch goes back to the escaped state and
# reconsumes.
# NOTE(review): the else/letter conditions are not visible in this extract.
3556 tok_state_script_data_double_escape_start = ->
3557 c = txt.charAt(cur++)
3558 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3559 if temporary_buffer is 'script'
3560 tok_state = tok_state_script_data_double_escaped
3562 tok_state = tok_state_script_data_escaped
3563 return new_character_token c
3565 temporary_buffer += c.toLowerCase() # yes, really lowercase
3566 return new_character_token c
3568 temporary_buffer += c
3569 return new_character_token c
3571 tok_state = tok_state_script_data_escaped
3572 cur -= 1 # Reconsume
3575 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer state handler: script data double escaped.
# Visible outcomes: move to the double escaped dash state and return '-';
# move to the double escaped less-than sign state and return '<'; return
# U+FFFD; switch to the data state and reconsume; or return the character
# itself as a character token.
# NOTE(review): the branch conditions are not visible in this extract.
3576 tok_state_script_data_double_escaped = ->
3577 c = txt.charAt(cur++)
3579 tok_state = tok_state_script_data_double_escaped_dash
3580 return new_character_token '-'
3582 tok_state = tok_state_script_data_double_escaped_less_than_sign
3583 return new_character_token '<'
3586 return new_character_token "\ufffd"
3589 tok_state = tok_state_data
3590 cur -= 1 # Reconsume
3593 return new_character_token c
3595 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer state handler: script data double escaped dash.
# Visible outcomes: move to the double escaped dash dash state and return
# '-'; move to the double escaped less-than sign state and return '<'; go
# back to the double escaped state and return U+FFFD; switch to the data
# state and reconsume; or go back to the double escaped state and return the
# character itself.
# NOTE(review): the branch conditions are not visible in this extract.
3596 tok_state_script_data_double_escaped_dash = ->
3597 c = txt.charAt(cur++)
3599 tok_state = tok_state_script_data_double_escaped_dash_dash
3600 return new_character_token '-'
3602 tok_state = tok_state_script_data_double_escaped_less_than_sign
3603 return new_character_token '<'
3606 tok_state = tok_state_script_data_double_escaped
3607 return new_character_token "\ufffd"
3610 tok_state = tok_state_data
3611 cur -= 1 # Reconsume
3614 tok_state = tok_state_script_data_double_escaped
3615 return new_character_token c
3617 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer state handler: script data double escaped dash dash.
# Visible outcomes: return '-' without changing state; move to the double
# escaped less-than sign state and return '<'; return to script data and
# return '>'; go back to the double escaped state and return U+FFFD; switch
# to the data state and reconsume; or go back to the double escaped state and
# return the character itself.
# NOTE(review): the branch conditions are not visible in this extract.
3618 tok_state_script_data_double_escaped_dash_dash = ->
3619 c = txt.charAt(cur++)
3621 return new_character_token '-'
3623 tok_state = tok_state_script_data_double_escaped_less_than_sign
3624 return new_character_token '<'
3626 tok_state = tok_state_script_data
3627 return new_character_token '>'
3630 tok_state = tok_state_script_data_double_escaped
3631 return new_character_token "\ufffd"
3634 tok_state = tok_state_data
3635 cur -= 1 # Reconsume
3638 tok_state = tok_state_script_data_double_escaped
3639 return new_character_token c
3641 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer state handler: script data double escaped less-than sign.
# One visible branch clears temporary_buffer, moves to the double escape end
# state and returns a '/' character token; the other goes back to the double
# escaped state and reconsumes the character.
# NOTE(review): the branch conditions are not visible in this extract.
3642 tok_state_script_data_double_escaped_less_than_sign = ->
3643 c = txt.charAt(cur++)
3645 temporary_buffer = ''
3646 tok_state = tok_state_script_data_double_escape_end
3647 return new_character_token '/'
3649 tok_state = tok_state_script_data_double_escaped
3650 cur -= 1 # Reconsume
3653 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer state handler: script data double escape end.
# On tab, LF, FF, space, '/' or '>': the next state depends on whether
# temporary_buffer equals 'script' (escaped vs. double escaped are the two
# visible targets), and the character is returned as a character token.
# Two further branches append the character to temporary_buffer (lowercased
# in one) and return it. The final branch goes back to the double escaped
# state and reconsumes.
# NOTE(review): the else/letter conditions are not visible in this extract.
3654 tok_state_script_data_double_escape_end = ->
3655 c = txt.charAt(cur++)
3656 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3657 if temporary_buffer is 'script'
3658 tok_state = tok_state_script_data_escaped
3660 tok_state = tok_state_script_data_double_escaped
3661 return new_character_token c
3663 temporary_buffer += c.toLowerCase() # yes, really lowercase
3664 return new_character_token c
3666 temporary_buffer += c
3667 return new_character_token c
3669 tok_state = tok_state_script_data_double_escaped
3670 cur -= 1 # Reconsume
3673 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer state handler: before attribute name.
# Switches on the next character. Visible outcomes include moving to the
# self-closing start tag state or back to the data state, and choosing an
# initial attr_name (U+FFFD in one branch, the lowercased character in
# another). The new [name, ''] pair is inserted at the FRONT of
# tok_cur_tag.attrs_a (unshift) before moving to the attribute name state.
# NOTE(review): several `when` clauses are not visible in this extract.
3674 tok_state_before_attribute_name = ->
3676 switch c = txt.charAt(cur++)
3677 when "\t", "\n", "\u000c", ' '
3680 tok_state = tok_state_self_closing_start_tag
3683 tok_state = tok_state_data
3689 attr_name = "\ufffd"
3690 when '"', "'", '<', '='
3695 tok_state = tok_state_data
3698 attr_name = c.toLowerCase()
3702 tok_cur_tag.attrs_a.unshift [attr_name, '']
3703 tok_state = tok_state_attribute_name
3706 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state handler: attribute name.
# Switches on the next character. Whitespace moves to the after attribute
# name state; other visible branches move to the self-closing start tag,
# before attribute value or data states. The remaining branches append to
# tok_cur_tag.attrs_a[0][0] (the name slot of the first attrs_a entry):
# U+FFFD, the character as-is, or the lowercased character.
# NOTE(review): most `when` clauses are not visible in this extract.
3707 tok_state_attribute_name = ->
3708 switch c = txt.charAt(cur++)
3709 when "\t", "\n", "\u000c", ' '
3710 tok_state = tok_state_after_attribute_name
3712 tok_state = tok_state_self_closing_start_tag
3714 tok_state = tok_state_before_attribute_value
3716 tok_state = tok_state_data
3722 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3725 tok_cur_tag.attrs_a[0][0] += c
3728 tok_state = tok_state_data
3731 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3733 tok_cur_tag.attrs_a[0][0] += c
3736 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer state handler: after attribute name.
# Visible outcomes: move to the self-closing start tag, before attribute
# value or data states; or start a new attribute by inserting a [name, '']
# pair at the front of tok_cur_tag.attrs_a (name is the lowercased character,
# U+FFFD, or the character as-is) and moving to the attribute name state.
# One branch returns to the data state and reconsumes.
# NOTE(review): most branch conditions are not visible in this extract.
3737 tok_state_after_attribute_name = ->
3738 c = txt.charAt(cur++)
3739 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3742 tok_state = tok_state_self_closing_start_tag
3745 tok_state = tok_state_before_attribute_value
3748 tok_state = tok_state_data
3751 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3752 tok_state = tok_state_attribute_name
3756 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3757 tok_state = tok_state_attribute_name
3761 tok_state = tok_state_data
3762 cur -= 1 # reconsume
3764 if c is '"' or c is "'" or c is '<'
3766 # fall through to Anything else
3768 tok_cur_tag.attrs_a.unshift [c, '']
3769 tok_state = tok_state_attribute_name
3772 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer state handler: before attribute value.
# Switches on the next character and selects the double-quoted,
# single-quoted or unquoted attribute value state, or returns to the data
# state. Two branches first append to tok_cur_tag.attrs_a[0][1] (the value
# slot of the first attrs_a entry): U+FFFD in one, the character in another.
# NOTE(review): most `when` clauses are not visible in this extract.
3773 tok_state_before_attribute_value = ->
3774 switch c = txt.charAt(cur++)
3775 when "\t", "\n", "\u000c", ' '
3778 tok_state = tok_state_attribute_value_double_quoted
3780 tok_state = tok_state_attribute_value_unquoted
3783 tok_state = tok_state_attribute_value_single_quoted
3786 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3787 tok_state = tok_state_attribute_value_unquoted
3790 tok_state = tok_state_data
3796 tok_state = tok_state_data
3798 tok_cur_tag.attrs_a[0][1] += c
3799 tok_state = tok_state_attribute_value_unquoted
3802 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer state handler: attribute value (double-quoted).
# Visible outcomes: move to the after attribute value (quoted) state; append
# the result of parse_character_reference('"', true) to the current value;
# append U+FFFD; return to the data state; or append the character itself.
# NOTE(review): the `when` clauses are not visible in this extract.
3803 tok_state_attribute_value_double_quoted = ->
3804 switch c = txt.charAt(cur++)
3806 tok_state = tok_state_after_attribute_value_quoted
3808 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3811 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3814 tok_state = tok_state_data
3816 tok_cur_tag.attrs_a[0][1] += c
3819 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer state handler: attribute value (single-quoted).
# Visible outcomes: move to the after attribute value (quoted) state; append
# the result of parse_character_reference("'", true) to the current value;
# append U+FFFD; return to the data state; or append the character itself.
# NOTE(review): the `when` clauses are not visible in this extract.
3820 tok_state_attribute_value_single_quoted = ->
3821 switch c = txt.charAt(cur++)
3823 tok_state = tok_state_after_attribute_value_quoted
3825 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3828 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3831 tok_state = tok_state_data
3833 tok_cur_tag.attrs_a[0][1] += c
3836 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer state handler: attribute value (unquoted).
# Whitespace moves to the before attribute name state. Other visible
# outcomes: append the result of parse_character_reference('>', true) to the
# current value; return to the data state; append U+FFFD; or append the
# character itself.
# NOTE(review): most `when` clauses are not visible in this extract.
3837 tok_state_attribute_value_unquoted = ->
3838 switch c = txt.charAt(cur++)
3839 when "\t", "\n", "\u000c", ' '
3840 tok_state = tok_state_before_attribute_name
3842 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3844 tok_state = tok_state_data
3849 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3852 tok_state = tok_state_data
3854 # Parse Error if ', <, = or ` (backtick)
3855 tok_cur_tag.attrs_a[0][1] += c
3858 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state handler: after attribute value (quoted).
# Whitespace moves to the before attribute name state. Other visible
# outcomes: move to the self-closing start tag state; return to the data
# state; or move to the before attribute name state and step cur back so the
# character is processed there.
# NOTE(review): most `when` clauses are not visible in this extract.
3859 tok_state_after_attribute_value_quoted = ->
3860 switch c = txt.charAt(cur++)
3861 when "\t", "\n", "\u000c", ' '
3862 tok_state = tok_state_before_attribute_name
3864 tok_state = tok_state_self_closing_start_tag
3866 tok_state = tok_state_data
3872 tok_state = tok_state_data
3875 tok_state = tok_state_before_attribute_name
3876 cur -= 1 # we didn't handle that char
3879 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer state handler: self-closing start tag.
# Visible outcomes: set the 'self-closing' flag on tok_cur_tag and return to
# the data state; return to the data state and reconsume; or move to the
# before attribute name state and reconsume.
# NOTE(review): the branch conditions are not visible in this extract.
3880 tok_state_self_closing_start_tag = ->
3881 c = txt.charAt(cur++)
3883 tok_cur_tag.flag 'self-closing', true
3884 tok_state = tok_state_data
3888 tok_state = tok_state_data
3889 cur -= 1 # Reconsume
3893 tok_state = tok_state_before_attribute_name
3894 cur -= 1 # Reconsume
3897 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3898 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer state handler: bogus comment.
# Looks for the next '>' at or after cur. val is either the rest of txt or
# the text up to (not including) that '>'. NUL characters in val are
# replaced with U+FFFD, val is appended to tok_cur_tag.text, and the
# tokenizer returns to the data state.
# NOTE(review): the condition choosing between the two val assignments, the
# cur updates and the return value are not visible in this extract.
3899 tok_state_bogus_comment = ->
3900 next_gt = txt.indexOf '>', cur
3902 val = txt.substr cur
3905 val = txt.substr cur, (next_gt - cur)
3907 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3908 tok_cur_tag.text += val
3909 tok_state = tok_state_data
3912 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer state handler: markup declaration open.
# Peeks at txt from cur without consuming a single character first:
#  - '--' starts an empty comment token and moves to the comment start state;
#  - 'doctype' (compared case-insensitively) moves to the DOCTYPE state;
#  - '[CDATA[' (case-sensitive) moves to the CDATA section state, but only
#    when adjusted_current_node() exists and is not in the HTML namespace;
#  - the last visible branch starts an empty comment token and moves to the
#    bogus comment state.
# NOTE(review): the cur advances for the matched keywords are not visible in
# this extract.
3913 tok_state_markup_declaration_open = ->
3914 if txt.substr(cur, 2) is '--'
3916 tok_cur_tag = new_comment_token ''
3917 tok_state = tok_state_comment_start
3919 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3921 tok_state = tok_state_doctype
3923 acn = adjusted_current_node()
3924 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3926 tok_state = tok_state_cdata_section
3930 tok_cur_tag = new_comment_token ''
3931 tok_state = tok_state_bogus_comment
3934 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state handler: comment start.
# Visible outcomes: move to the comment start dash state; move to the comment
# state and return a U+FFFD character token; return to the data state (twice,
# once with a reconsume); or append the character to tok_cur_tag.text and
# move to the comment state.
# NOTE(review): the U+FFFD branch returns a character token, whereas the
# other comment-state handlers in this file append "\ufffd" to
# tok_cur_tag.text. Check against the spec section above whether this branch
# should append to the comment instead.
# NOTE(review): the `when` clauses are not visible in this extract.
3935 tok_state_comment_start = ->
3936 switch c = txt.charAt(cur++)
3938 tok_state = tok_state_comment_start_dash
3941 tok_state = tok_state_comment
3942 return new_character_token "\ufffd"
3945 tok_state = tok_state_data
3949 tok_state = tok_state_data
3950 cur -= 1 # Reconsume
3953 tok_cur_tag.text += c
3954 tok_state = tok_state_comment
3957 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer state handler: comment start dash.
# Visible outcomes: move to the comment end state; append "-" plus U+FFFD to
# tok_cur_tag.text and move to the comment state; return to the data state
# (twice, once with a reconsume); or append "-" plus the character and move
# to the comment state.
# NOTE(review): the `when` clauses are not visible in this extract.
3958 tok_state_comment_start_dash = ->
3959 switch c = txt.charAt(cur++)
3961 tok_state = tok_state_comment_end
3964 tok_cur_tag.text += "-\ufffd"
3965 tok_state = tok_state_comment
3968 tok_state = tok_state_data
3972 tok_state = tok_state_data
3973 cur -= 1 # Reconsume
3976 tok_cur_tag.text += "-#{c}"
3977 tok_state = tok_state_comment
3980 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer state handler: comment.
# Visible outcomes: move to the comment end dash state; append U+FFFD to
# tok_cur_tag.text; return to the data state and reconsume; or append the
# character to tok_cur_tag.text.
# NOTE(review): the `when` clauses are not visible in this extract.
3981 tok_state_comment = ->
3982 switch c = txt.charAt(cur++)
3984 tok_state = tok_state_comment_end_dash
3987 tok_cur_tag.text += "\ufffd"
3990 tok_state = tok_state_data
3991 cur -= 1 # Reconsume
3994 tok_cur_tag.text += c
3997 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer state handler: comment end dash.
# Visible outcomes: move to the comment end state; append "-" plus U+FFFD to
# tok_cur_tag.text and go back to the comment state; return to the data state
# and reconsume; or append "-" plus the character and go back to the comment
# state.
# NOTE(review): the `when` clauses are not visible in this extract.
3998 tok_state_comment_end_dash = ->
3999 switch c = txt.charAt(cur++)
4001 tok_state = tok_state_comment_end
4004 tok_cur_tag.text += "-\ufffd"
4005 tok_state = tok_state_comment
4008 tok_state = tok_state_data
4009 cur -= 1 # Reconsume
4012 tok_cur_tag.text += "-#{c}"
4013 tok_state = tok_state_comment
4016 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer state handler: comment end.
# Visible outcomes: return to the data state; append "--" plus U+FFFD to
# tok_cur_tag.text and go back to the comment state; move to the comment end
# bang state; append a single '-' (state unchanged); return to the data state
# and reconsume; or append "--" plus the character and go back to the comment
# state.
# NOTE(review): the `when` clauses are not visible in this extract.
4017 tok_state_comment_end = ->
4018 switch c = txt.charAt(cur++)
4020 tok_state = tok_state_data
4024 tok_cur_tag.text += "--\ufffd"
4025 tok_state = tok_state_comment
4028 tok_state = tok_state_comment_end_bang
4031 tok_cur_tag.text += '-'
4034 tok_state = tok_state_data
4035 cur -= 1 # Reconsume
4039 tok_cur_tag.text += "--#{c}"
4040 tok_state = tok_state_comment
4043 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer state handler: comment end bang.
# Visible outcomes: append "--!" plus the character to tok_cur_tag.text and
# move to the comment end dash state; return to the data state; append "--!"
# plus U+FFFD and go back to the comment state; return to the data state and
# reconsume; or append "--!" plus the character and go back to the comment
# state.
# NOTE(review): the `when` clauses are not visible in this extract.
4044 tok_state_comment_end_bang = ->
4045 switch c = txt.charAt(cur++)
4047 tok_cur_tag.text += "--!#{c}"
4048 tok_state = tok_state_comment_end_dash
4050 tok_state = tok_state_data
4054 tok_cur_tag.text += "--!\ufffd"
4055 tok_state = tok_state_comment
4058 tok_state = tok_state_data
4059 cur -= 1 # Reconsume
4062 tok_cur_tag.text += "--!#{c}"
4063 tok_state = tok_state_comment
4066 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer state handler: DOCTYPE.
# Whitespace moves to the before DOCTYPE name state. One visible branch
# returns to the data state, builds an empty-named doctype token in the local
# `el`, sets its 'force-quirks' flag and reconsumes. The last branch moves to
# the before DOCTYPE name state and reconsumes.
# NOTE(review): the remaining `when` clauses and what happens to `el` are not
# visible in this extract.
4067 tok_state_doctype = ->
4068 switch c = txt.charAt(cur++)
4069 when "\t", "\u000a", "\u000c", ' '
4070 tok_state = tok_state_before_doctype_name
4073 tok_state = tok_state_data
4074 el = new_doctype_token ''
4075 el.flag 'force-quirks', true
4076 cur -= 1 # Reconsume
4080 tok_state = tok_state_before_doctype_name
4081 cur -= 1 # Reconsume
4084 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer state handler: before DOCTYPE name.
# Visible outcomes: start a doctype token in tok_cur_tag (named with the
# lowercased character, U+FFFD, or the character as-is) and move to the
# DOCTYPE name state; or build an empty-named doctype token in the local
# `el`, set its 'force-quirks' flag and return to the data state (one of
# those branches also reconsumes).
# NOTE(review): most branch conditions and what happens to `el` are not
# visible in this extract.
4085 tok_state_before_doctype_name = ->
4086 c = txt.charAt(cur++)
4087 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4090 tok_cur_tag = new_doctype_token c.toLowerCase()
4091 tok_state = tok_state_doctype_name
4095 tok_cur_tag = new_doctype_token "\ufffd"
4096 tok_state = tok_state_doctype_name
4100 el = new_doctype_token ''
4101 el.flag 'force-quirks', true
4102 tok_state = tok_state_data
4106 tok_state = tok_state_data
4107 el = new_doctype_token ''
4108 el.flag 'force-quirks', true
4109 cur -= 1 # Reconsume
4112 tok_cur_tag = new_doctype_token c
4113 tok_state = tok_state_doctype_name
4116 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer state handler: DOCTYPE name.
# Whitespace moves to the after DOCTYPE name state. Other visible outcomes:
# return to the data state; append the lowercased character, U+FFFD, or the
# character as-is to tok_cur_tag.name; or return to the data state with
# 'force-quirks' set and a reconsume.
# NOTE(review): most branch conditions are not visible in this extract.
4117 tok_state_doctype_name = ->
4118 c = txt.charAt(cur++)
4119 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4120 tok_state = tok_state_after_doctype_name
4123 tok_state = tok_state_data
4126 tok_cur_tag.name += c.toLowerCase()
4130 tok_cur_tag.name += "\ufffd"
4134 tok_state = tok_state_data
4135 tok_cur_tag.flag 'force-quirks', true
4136 cur -= 1 # Reconsume
4139 tok_cur_tag.name += c
4142 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer state handler: after DOCTYPE name.
# Visible outcomes: return to the data state (once plainly, once with
# 'force-quirks' set and a reconsume); detect the keywords 'public' or
# 'system' case-insensitively and move to the matching after-keyword state;
# or set 'force-quirks' and move to the bogus DOCTYPE state.
# The keyword checks read 6 characters starting at cur - 1 because the first
# character was already consumed by the charAt(cur++) above.
# NOTE(review): the cur advances past the keyword are not visible in this
# extract.
4143 tok_state_after_doctype_name = ->
4144 c = txt.charAt(cur++)
4145 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4148 tok_state = tok_state_data
4152 tok_state = tok_state_data
4153 tok_cur_tag.flag 'force-quirks', true
4154 cur -= 1 # Reconsume
4157 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4159 tok_state = tok_state_after_doctype_public_keyword
4161 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4163 tok_state = tok_state_after_doctype_system_keyword
4166 tok_cur_tag.flag 'force-quirks', true
4167 tok_state = tok_state_bogus_doctype
4170 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer state handler: after DOCTYPE public keyword.
# Whitespace moves to the before DOCTYPE public identifier state. Two
# branches reset tok_cur_tag.public_identifier to '' and move to the double-
# or single-quoted public identifier state. The remaining branches set
# 'force-quirks' and either return to the data state (one with a reconsume)
# or move to the bogus DOCTYPE state.
# NOTE(review): most branch conditions are not visible in this extract.
4171 tok_state_after_doctype_public_keyword = ->
4172 c = txt.charAt(cur++)
4173 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4174 tok_state = tok_state_before_doctype_public_identifier
4178 tok_cur_tag.public_identifier = ''
4179 tok_state = tok_state_doctype_public_identifier_double_quoted
4183 tok_cur_tag.public_identifier = ''
4184 tok_state = tok_state_doctype_public_identifier_single_quoted
4188 tok_cur_tag.flag 'force-quirks', true
4189 tok_state = tok_state_data
4193 tok_state = tok_state_data
4194 tok_cur_tag.flag 'force-quirks', true
4195 cur -= 1 # Reconsume
4199 tok_cur_tag.flag 'force-quirks', true
4200 tok_state = tok_state_bogus_doctype
4203 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer state handler: before DOCTYPE public identifier.
# Two branches reset tok_cur_tag.public_identifier to '' and move to the
# double- or single-quoted public identifier state. The remaining branches
# set 'force-quirks' and either return to the data state (one with a
# reconsume) or move to the bogus DOCTYPE state.
# NOTE(review): most branch conditions (and the whitespace branch body) are
# not visible in this extract.
4204 tok_state_before_doctype_public_identifier = ->
4205 c = txt.charAt(cur++)
4206 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4210 tok_cur_tag.public_identifier = ''
4211 tok_state = tok_state_doctype_public_identifier_double_quoted
4215 tok_cur_tag.public_identifier = ''
4216 tok_state = tok_state_doctype_public_identifier_single_quoted
4220 tok_cur_tag.flag 'force-quirks', true
4221 tok_state = tok_state_data
4225 tok_state = tok_state_data
4226 tok_cur_tag.flag 'force-quirks', true
4227 cur -= 1 # Reconsume
4231 tok_cur_tag.flag 'force-quirks', true
4232 tok_state = tok_state_bogus_doctype
4236 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer state handler: DOCTYPE public identifier (double-quoted).
# Visible outcomes: move to the after DOCTYPE public identifier state; append
# U+FFFD to tok_cur_tag.public_identifier; set 'force-quirks' and return to
# the data state (one branch also reconsumes); or append the character to
# tok_cur_tag.public_identifier.
# NOTE(review): the branch conditions are not visible in this extract.
4237 tok_state_doctype_public_identifier_double_quoted = ->
4238 c = txt.charAt(cur++)
4240 tok_state = tok_state_after_doctype_public_identifier
4244 tok_cur_tag.public_identifier += "\ufffd"
4248 tok_cur_tag.flag 'force-quirks', true
4249 tok_state = tok_state_data
4253 tok_state = tok_state_data
4254 tok_cur_tag.flag 'force-quirks', true
4255 cur -= 1 # Reconsume
4258 tok_cur_tag.public_identifier += c
4261 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer state handler: DOCTYPE public identifier (single-quoted).
# Visible outcomes: move to the after DOCTYPE public identifier state; append
# U+FFFD to tok_cur_tag.public_identifier; set 'force-quirks' and return to
# the data state (one branch also reconsumes); or append the character to
# tok_cur_tag.public_identifier.
# NOTE(review): the branch conditions are not visible in this extract.
4262 tok_state_doctype_public_identifier_single_quoted = ->
4263 c = txt.charAt(cur++)
4265 tok_state = tok_state_after_doctype_public_identifier
4269 tok_cur_tag.public_identifier += "\ufffd"
4273 tok_cur_tag.flag 'force-quirks', true
4274 tok_state = tok_state_data
4278 tok_state = tok_state_data
4279 tok_cur_tag.flag 'force-quirks', true
4280 cur -= 1 # Reconsume
4283 tok_cur_tag.public_identifier += c
4286 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer state handler: after DOCTYPE public identifier.
# Whitespace moves to the between-public-and-system-identifiers state. Other
# visible outcomes: return to the data state; reset
# tok_cur_tag.system_identifier to '' and move to the double- or
# single-quoted system identifier state; return to the data state with
# 'force-quirks' set and a reconsume; or set 'force-quirks' and move to the
# bogus DOCTYPE state.
# NOTE(review): most branch conditions are not visible in this extract.
4287 tok_state_after_doctype_public_identifier = ->
4288 c = txt.charAt(cur++)
4289 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4290 tok_state = tok_state_between_doctype_public_and_system_identifiers
4293 tok_state = tok_state_data
4297 tok_cur_tag.system_identifier = ''
4298 tok_state = tok_state_doctype_system_identifier_double_quoted
4302 tok_cur_tag.system_identifier = ''
4303 tok_state = tok_state_doctype_system_identifier_single_quoted
4307 tok_state = tok_state_data
4308 tok_cur_tag.flag 'force-quirks', true
4309 cur -= 1 # Reconsume
4313 tok_cur_tag.flag 'force-quirks', true
4314 tok_state = tok_state_bogus_doctype
4317 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer state handler: between DOCTYPE public and system identifiers.
# Visible outcomes: return to the data state; reset
# tok_cur_tag.system_identifier to '' and move to the double- or
# single-quoted system identifier state; return to the data state with
# 'force-quirks' set and a reconsume; or set 'force-quirks' and move to the
# bogus DOCTYPE state.
# NOTE(review): most branch conditions (and the whitespace branch body) are
# not visible in this extract.
4318 tok_state_between_doctype_public_and_system_identifiers = ->
4319 c = txt.charAt(cur++)
4320 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4323 tok_state = tok_state_data
4327 tok_cur_tag.system_identifier = ''
4328 tok_state = tok_state_doctype_system_identifier_double_quoted
4332 tok_cur_tag.system_identifier = ''
4333 tok_state = tok_state_doctype_system_identifier_single_quoted
4337 tok_state = tok_state_data
4338 tok_cur_tag.flag 'force-quirks', true
4339 cur -= 1 # Reconsume
4343 tok_cur_tag.flag 'force-quirks', true
4344 tok_state = tok_state_bogus_doctype
4347 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer state handler: after DOCTYPE system keyword.
# Whitespace moves to the before DOCTYPE system identifier state. Two
# branches reset tok_cur_tag.system_identifier to '' and move to the double-
# or single-quoted system identifier state. The remaining branches set
# 'force-quirks' and either return to the data state (one with a reconsume)
# or move to the bogus DOCTYPE state.
# NOTE(review): most branch conditions are not visible in this extract.
4348 tok_state_after_doctype_system_keyword = ->
4349 c = txt.charAt(cur++)
4350 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4351 tok_state = tok_state_before_doctype_system_identifier
4355 tok_cur_tag.system_identifier = ''
4356 tok_state = tok_state_doctype_system_identifier_double_quoted
4360 tok_cur_tag.system_identifier = ''
4361 tok_state = tok_state_doctype_system_identifier_single_quoted
4365 tok_cur_tag.flag 'force-quirks', true
4366 tok_state = tok_state_data
4370 tok_state = tok_state_data
4371 tok_cur_tag.flag 'force-quirks', true
4372 cur -= 1 # Reconsume
4376 tok_cur_tag.flag 'force-quirks', true
4377 tok_state = tok_state_bogus_doctype
4380 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer state handler: before DOCTYPE system identifier.
# Two branches reset tok_cur_tag.system_identifier to '' and move to the
# double- or single-quoted system identifier state. The remaining branches
# set 'force-quirks' and either return to the data state (one with a
# reconsume) or move to the bogus DOCTYPE state.
# NOTE(review): most branch conditions (and the whitespace branch body) are
# not visible in this extract.
4381 tok_state_before_doctype_system_identifier = ->
4382 c = txt.charAt(cur++)
4383 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4386 tok_cur_tag.system_identifier = ''
4387 tok_state = tok_state_doctype_system_identifier_double_quoted
4390 tok_cur_tag.system_identifier = ''
4391 tok_state = tok_state_doctype_system_identifier_single_quoted
4395 tok_cur_tag.flag 'force-quirks', true
4396 tok_state = tok_state_data
4400 tok_state = tok_state_data
4401 tok_cur_tag.flag 'force-quirks', true
4402 cur -= 1 # Reconsume
4406 tok_cur_tag.flag 'force-quirks', true
4407 tok_state = tok_state_bogus_doctype
4410 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state handler: DOCTYPE system identifier (double-quoted).
# Visible outcomes: move to the after DOCTYPE system identifier state; append
# U+FFFD to tok_cur_tag.system_identifier; set 'force-quirks' and return to
# the data state (one branch also reconsumes); or append the character to
# tok_cur_tag.system_identifier.
# NOTE(review): the branch conditions are not visible in this extract.
4411 tok_state_doctype_system_identifier_double_quoted = ->
4412 c = txt.charAt(cur++)
4414 tok_state = tok_state_after_doctype_system_identifier
4418 tok_cur_tag.system_identifier += "\ufffd"
4422 tok_cur_tag.flag 'force-quirks', true
4423 tok_state = tok_state_data
4427 tok_state = tok_state_data
4428 tok_cur_tag.flag 'force-quirks', true
4429 cur -= 1 # Reconsume
4432 tok_cur_tag.system_identifier += c
4435 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state handler: DOCTYPE system identifier (single-quoted).
# Visible outcomes: move to the after DOCTYPE system identifier state; append
# U+FFFD to tok_cur_tag.system_identifier; set 'force-quirks' and return to
# the data state (one branch also reconsumes); or append the character to
# tok_cur_tag.system_identifier.
# NOTE(review): the branch conditions are not visible in this extract.
4436 tok_state_doctype_system_identifier_single_quoted = ->
4437 c = txt.charAt(cur++)
4439 tok_state = tok_state_after_doctype_system_identifier
4443 tok_cur_tag.system_identifier += "\ufffd"
4447 tok_cur_tag.flag 'force-quirks', true
4448 tok_state = tok_state_data
4452 tok_state = tok_state_data
4453 tok_cur_tag.flag 'force-quirks', true
4454 cur -= 1 # Reconsume
4457 tok_cur_tag.system_identifier += c
4460 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state handler: after DOCTYPE system identifier.
# Visible outcomes: return to the data state; return to the data state with
# 'force-quirks' set and a reconsume; or move to the bogus DOCTYPE state
# WITHOUT setting 'force-quirks' (the existing comment calls this out as
# deliberate).
# NOTE(review): most branch conditions (and the whitespace branch body) are
# not visible in this extract.
4461 tok_state_after_doctype_system_identifier = ->
4462 c = txt.charAt(cur++)
4463 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4466 tok_state = tok_state_data
4470 tok_state = tok_state_data
4471 tok_cur_tag.flag 'force-quirks', true
4472 cur -= 1 # Reconsume
4476 # do _not_ tok_cur_tag.flag 'force-quirks', true
4477 tok_state = tok_state_bogus_doctype
4480 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state handler: bogus DOCTYPE.
# Both visible branches return to the data state; the second also steps cur
# back so the character is reconsumed.
# NOTE(review): the branch conditions and return values are not visible in
# this extract.
4481 tok_state_bogus_doctype = ->
4482 c = txt.charAt(cur++)
4484 tok_state = tok_state_data
4487 tok_state = tok_state_data
4488 cur -= 1 # Reconsume
4493 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer state handler: CDATA section.
# Switches back to the data state up front, then looks for the next ']]>' at
# or after cur. val is either the rest of txt or the text before the
# terminator. NUL characters in val are replaced with U+FFFD and the whole
# run is returned as ONE character token.
# NOTE: next_gt here holds the index of ']]>', not of a lone '>'.
# NOTE(review): the condition choosing between the two val assignments and
# the cur updates are not visible in this extract.
4494 tok_state_cdata_section = ->
4495 tok_state = tok_state_data
4496 next_gt = txt.indexOf ']]>', cur
4498 val = txt.substr cur
4501 val = txt.substr cur, (next_gt - cur)
4503 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
4505 return new_character_token val # fixfull split
4508 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4509 # Don't set this as a state, just call it
4510 # returns a string (NOT a text node)
# Parameters:
#   allowed_char - extra character that, like whitespace, '<' and '&', makes
#                  the text at cur "not a character reference" (callers in
#                  this file pass the attribute's closing delimiter)
#   in_attr      - true when called from an attribute value state
# Visible structure:
#   1. Bail-out cases: end of input, or the character at cur is whitespace,
#      '<', '&', '' or allowed_char.
#   2. Numeric references: an 'x' after the marker selects a different
#      charset/base; digits are scanned, leading zeros stripped, and the value
#      parsed with parseInt(code_point, base). unicode_fixes supplies
#      replacements for specific code points; surrogates and values above
#      0x10ffff get separate handling; a long list of control/noncharacter
#      code points is checked before from_code_point produces the result.
#   3. Named references: a ';'-terminated name is resolved with
#      decode_named_char_ref; otherwise prefixes of length 2..max are looked
#      up in legacy_char_refs, with extra checks on the following character
#      ('=' or an alphanumeric).
# NOTE(review): many lines (conditions, returns, parse_error calls, the
# definitions of start/i/base/charset/max) are not visible in this extract;
# confirm details against the spec section above.
4511 parse_character_reference = (allowed_char = null, in_attr = false) ->
4512 if cur >= txt.length
4514 switch c = txt.charAt(cur)
4515 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4516 # explicitly not a parse error
4519 # there has to be "one or more" alnums between & and ; to be a parse error
4522 if cur + 1 >= txt.length
4524 if txt.charAt(cur + 1).toLowerCase() is 'x'
4533 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4538 if txt.charAt(start + i) is ';'
4542 code_point = txt.substr(start, i)
4543 while code_point.charAt(0) is '0' and code_point.length > 1
4544 code_point = code_point.substr 1
4545 code_point = parseInt(code_point, base)
4546 if unicode_fixes[code_point]?
4548 return unicode_fixes[code_point]
4550 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4554 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4556 return from_code_point code_point
4560 if alnum.indexOf(txt.charAt(cur + i)) is -1
4563 # exit early, because parse_error() below needs at least one alnum
4565 if txt.charAt(cur + i) is ';'
4566 decoded = decode_named_char_ref txt.substr(cur, i)
4567 i += 1 # scan past the ';' (after, so we don't pass it to decode)
4571 # else FALL THROUGH (check for match without last char(s) or ";")
4572 # no ';' terminator (only legacy char refs)
4574 for i in [2..max] # no prefix matches, so ok to check shortest first
4575 c = legacy_char_refs[txt.substr(cur, i)]
4578 if txt.charAt(cur + i) is '='
4579 # "because some legacy user agents will
4580 # misinterpret the markup in those cases"
4583 if alnum.indexOf(txt.charAt(cur + i)) > -1
4584 # this makes attributes forgiving about url args
4586 # ok, and besides the weird exceptions for attributes...
4587 # return the matching char
4588 cur += i # consume entity chars
4589 parse_error() # because no terminating ";"
4593 return # never reached
# Inspects a token `t` and, when it is a text token, decides whether it
# counts as a newline. The test depends on how many input characters were
# consumed (cur - old_cur): exactly one means a literal character, where
# either CR or LF counts; otherwise the text came from a character reference
# and only LF counts.
# NOTE(review): how t and old_cur are obtained and what happens once a
# newline is detected are not visible in this extract; presumably the token
# is dropped and otherwise cur is restored - confirm against the full file.
4595 eat_next_token_if_newline = ->
4600 if t.type is TYPE_TEXT
4601 # definition of a newline depends on whether it was a character ref or not
4602 if cur - old_cur is 1
4603 # not a character reference
4604 if t.text is "\u000d" or t.text is "\u000a"
4607 if t.text is "\u000a"
4613 # tree constructor initialization
4614 # see comments on TYPE_TAG/etc for the structure of this data
4617 doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4618 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4619 fragment_root = null # fragment parsing algorithm returns children of this
4621 afe = [] # active formatting elements
4622 template_ins_modes = []
4623 ins_mode = ins_mode_initial
4624 original_ins_mode = ins_mode # TODO check spec
4625 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4626 flag_frameset_ok = true
4628 flag_foster_parenting = false
4629 form_element_pointer = null
4630 temporary_buffer = null
4631 pending_table_character_tokens = []
4632 head_element_pointer = null
4633 flag_fragment_parsing = false
4634 context_element = null
4635 prev_node_id = 0 # just for debugging
4637 # tokenizer initialization
4638 tok_state = tok_state_data
4641 # fragment parsing (text arg)
4643 # this handles the fragment from the tests in the format described here:
4644 # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
4647 if f.substr(0, 5) is 'math '
4650 else if f.substr(0, 4) is 'svg '
4654 context_element = token_to_element t, ns
4655 context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4656 context_element.document.flag 'quirks mode', QUIRKS_NO
4657 # fragment parsing (Node arg)
4659 context_element = args.context
4661 # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4662 # fragment parsing algorithm
4664 flag_fragment_parsing = true
4665 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4666 # search up the tree from context, to try to find it's document,
4667 # because this file only puts a "document" property on the root
4670 el = context_element
4673 old_doc = el.document
4680 doc.flag 'quirks mode', old_doc.flag 'quirks mode'
4682 if context_element.namespace is NS_HTML
4683 switch context_element.name
4684 when 'title', 'textarea'
4685 tok_state = tok_state_rcdata
4686 when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
4687 tok_state = tok_state_rawtext
4689 tok_state = tok_state_script_data
4692 tok_state = tok_state_rawtext
4694 tok_state = tok_state_plaintext
4695 fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4696 doc.children.push fragment_root
4697 fragment_root.document = doc
4698 open_els = [fragment_root]
4699 if context_element.name is 'template' and context_element.namespace is NS_HTML
4700 template_ins_modes.unshift ins_mode_in_template
4701 # fixfull create token for context (it should have its original one already)
4703 # set form_element pointer... in the foreign doc?!
4704 el = context_element
4706 if el.name is 'form' and el.namespace is NS_HTML
4707 form_element_pointer = el
4714 # text pre-processing
4715 # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4716 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4717 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4721 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4722 parse_main_loop = ->
4727 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4732 if flag_fragment_parsing
4733 return fragment_root.children
4736 exports.parse_html = parse_html # public API: the parser entry point
4737 exports.debug_log_reset = debug_log_reset # debugging helpers (defined outside this excerpt)
4738 exports.debug_log_each = debug_log_each
4739 exports.TYPE_TAG = TYPE_TAG # node type constants (first argument to `new Node`)
4740 exports.TYPE_TEXT = TYPE_TEXT
4741 exports.TYPE_COMMENT = TYPE_COMMENT
4742 exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4743 exports.NS_HTML = NS_HTML # namespace constants (values of a Node's `namespace`)
4744 exports.NS_MATHML = NS_MATHML
4745 exports.NS_SVG = NS_SVG
4746 exports.QUIRKS_NO = QUIRKS_NO # values stored in a document's 'quirks mode' flag
4747 exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4748 exports.QUIRKS_YES = QUIRKS_YES