1 # Copyright 2015 Jason Woofenden
2 # This file implements an HTML5 parser
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a thorough parser for html5, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
24 # http://www.w3.org/TR/html5/syntax.html
26 # except for some places marked "WHATWG" that are implemented as described here:
28 # https://html.spec.whatwg.org/multipage/syntax.html
30 # This code passes all of the tests in the .dat files at:
32 # https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
35 ##################################
36 ## how to use this code
37 ##################################
39 # See README.md for how to run this file in the browser or in node.js.
41 # This file exports a single useful function: parse_html, and some constants
42 # (see the bottom of this file for those.)
46 # wheic_parser.parse("<p><b>hi</p>")
48 # Or, if you don't want <html><head><body>/etc, do this:
50 # wheic_parser.parse("<p><b>hi</p>", {fragment: "body"})
52 # return value is an array of Nodes, see "class Node" below.
54 # This code is a work in progress, e.g. try searching this file for "fixfull",
60 # Jason was frequently confused by the terminology used to refer to different
61 # parts of the stacks and lists in the spec, so he made this chart to help keep
64 # stacks grow downward (current element is index=0)
66 # example: open_els = [a, b, c, d, e, f, g]
68 # "grows downwards" means it's visualized like this: (index: el "names")
70 # 6: g "start of the list", "topmost", "first"
72 # 4: e "previous" (to d), "above", "before"
73 # 3: d (previous/next are relative to this element)
74 # 2: c "next", "after", "lower", "below"
76 # 0: a "end of the list", "current node", "bottommost", "last"
78 if (typeof module) isnt 'undefined' and module.exports?
80 exports = module.exports
83 window.wheic_parser = {}
84 exports = window.wheic_parser
86 from_code_point = (x) ->
87 if String.fromCodePoint?
88 return String.fromCodePoint x
91 return String.fromCharCode x
93 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
95 # Each node is an object of the Node class. Here are the Node types:
96 TYPE_TAG = 0 # name, {attributes}, [children]
97 TYPE_TEXT = 1 # "text"
100 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
101 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
102 TYPE_END_TAG = 5 # name
104 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
105 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
107 # namespace constants
112 # quirks mode constants
117 # queue up debug logs, so eg they can be shown only for tests that fail
125 debug_log_each = (cb) ->
126 for str in g_debug_log
132 constructor: (type, args = {}) ->
133 @type = type # one of the TYPE_* constants above
134 @name = args.name ? '' # tag name
135 @text = args.text ? '' # contents for text/comment nodes
136 @attrs = args.attrs ? {}
137 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
138 @children = args.children ? []
139 @namespace = args.namespace ? NS_HTML
140 @parent = args.parent ? null
141 @token = args.token ? null
142 @flags = args.flags ? {}
146 @id = "#{++prev_node_id}"
147 acknowledge_self_closing: ->
149 @token.flag 'did_self_close', true
151 @flag 'did_self_close', true
153 flag: (key, value = null) ->
160 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token: a Node of type TYPE_START_TAG that carries only
# the tag name (every other Node field keeps its constructor default).
new_open_tag = (name) ->
	new Node(TYPE_START_TAG, {name: name})
# Build an end-tag token: a Node of type TYPE_END_TAG with the given tag name.
new_end_tag = (name) ->
	new Node(TYPE_END_TAG, {name: name})
# Build an element node: a Node of type TYPE_TAG with the given tag name.
new_element = (name) ->
	new Node(TYPE_TAG, {name: name})
# Build a text node: a Node of type TYPE_TEXT whose text is txt.
new_text_node = (txt) ->
	new Node(TYPE_TEXT, {text: txt})
# Character tokens are built exactly like text nodes, so this is an alias.
new_character_token = new_text_node
# Build a comment token: a Node of type TYPE_COMMENT whose text is txt.
new_comment_token = (txt) ->
	new Node(TYPE_COMMENT, {text: txt})
# Build a doctype token: a Node of type TYPE_DOCTYPE with the given name.
new_doctype_token = (name) ->
	new Node(TYPE_DOCTYPE, {name: name})
175 return new Node TYPE_EOF
177 return new Node TYPE_AFE_MARKER
# Build a bookmark node (type TYPE_AAA_BOOKMARK, the type reserved for the
# adoption agency algorithm). Takes no arguments.
new_aaa_bookmark = ->
	new Node(TYPE_AAA_BOOKMARK)
# Character-class strings shared by the predicates and tables that follow.
digits = "0123456789"
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
alnum = [lc_alpha, uc_alpha, digits].join ''
hex_chars = "#{digits}abcdefABCDEF"

# True only when str is exactly one character long and that character is an
# ASCII uppercase letter (a member of uc_alpha).
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1

# True only when str is exactly one character long and that character is an
# ASCII lowercase letter (a member of lc_alpha).
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1
192 # some SVG elements have dashes in them
193 tag_name_chars = alnum + "-"
195 # http://www.w3.org/TR/html5/infrastructure.html#space-character
196 space_chars = "\u0009\u000a\u000c\u000d\u0020"
198 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token t is a TYPE_TEXT token whose text is exactly one character
# and that character is listed in space_chars.
is_space_tok = (t) ->
	# Check the type first so t.text is never touched on non-text tokens.
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	space_chars.indexOf(t.text) isnt -1
202 is_input_hidden_tok = (t) ->
203 return false unless t.type is TYPE_START_TAG
206 if a[1].toLowerCase() is 'hidden'
211 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
212 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
215 unicode_fixes[0x00] = "\uFFFD"
216 unicode_fixes[0x80] = "\u20AC"
217 unicode_fixes[0x82] = "\u201A"
218 unicode_fixes[0x83] = "\u0192"
219 unicode_fixes[0x84] = "\u201E"
220 unicode_fixes[0x85] = "\u2026"
221 unicode_fixes[0x86] = "\u2020"
222 unicode_fixes[0x87] = "\u2021"
223 unicode_fixes[0x88] = "\u02C6"
224 unicode_fixes[0x89] = "\u2030"
225 unicode_fixes[0x8A] = "\u0160"
226 unicode_fixes[0x8B] = "\u2039"
227 unicode_fixes[0x8C] = "\u0152"
228 unicode_fixes[0x8E] = "\u017D"
229 unicode_fixes[0x91] = "\u2018"
230 unicode_fixes[0x92] = "\u2019"
231 unicode_fixes[0x93] = "\u201C"
232 unicode_fixes[0x94] = "\u201D"
233 unicode_fixes[0x95] = "\u2022"
234 unicode_fixes[0x96] = "\u2013"
235 unicode_fixes[0x97] = "\u2014"
236 unicode_fixes[0x98] = "\u02DC"
237 unicode_fixes[0x99] = "\u2122"
238 unicode_fixes[0x9A] = "\u0161"
239 unicode_fixes[0x9B] = "\u203A"
240 unicode_fixes[0x9C] = "\u0153"
241 unicode_fixes[0x9E] = "\u017E"
242 unicode_fixes[0x9F] = "\u0178"
244 quirks_yes_pi_prefixes = [
245 "+//silmaril//dtd html pro v0r11 19970101//"
246 "-//as//dtd html 3.0 aswedit + extensions//"
247 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
248 "-//ietf//dtd html 2.0 level 1//"
249 "-//ietf//dtd html 2.0 level 2//"
250 "-//ietf//dtd html 2.0 strict level 1//"
251 "-//ietf//dtd html 2.0 strict level 2//"
252 "-//ietf//dtd html 2.0 strict//"
253 "-//ietf//dtd html 2.0//"
254 "-//ietf//dtd html 2.1e//"
255 "-//ietf//dtd html 3.0//"
256 "-//ietf//dtd html 3.2 final//"
257 "-//ietf//dtd html 3.2//"
258 "-//ietf//dtd html 3//"
259 "-//ietf//dtd html level 0//"
260 "-//ietf//dtd html level 1//"
261 "-//ietf//dtd html level 2//"
262 "-//ietf//dtd html level 3//"
263 "-//ietf//dtd html strict level 0//"
264 "-//ietf//dtd html strict level 1//"
265 "-//ietf//dtd html strict level 2//"
266 "-//ietf//dtd html strict level 3//"
267 "-//ietf//dtd html strict//"
268 "-//ietf//dtd html//"
269 "-//metrius//dtd metrius presentational//"
270 "-//microsoft//dtd internet explorer 2.0 html strict//"
271 "-//microsoft//dtd internet explorer 2.0 html//"
272 "-//microsoft//dtd internet explorer 2.0 tables//"
273 "-//microsoft//dtd internet explorer 3.0 html strict//"
274 "-//microsoft//dtd internet explorer 3.0 html//"
275 "-//microsoft//dtd internet explorer 3.0 tables//"
276 "-//netscape comm. corp.//dtd html//"
277 "-//netscape comm. corp.//dtd strict html//"
278 "-//o'reilly and associates//dtd html 2.0//"
279 "-//o'reilly and associates//dtd html extended 1.0//"
280 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
281 "-//sq//dtd html 2.0 hotmetal + extensions//"
282 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
283 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
284 "-//spyglass//dtd html 2.0 extended//"
285 "-//sun microsystems corp.//dtd hotjava html//"
286 "-//sun microsystems corp.//dtd hotjava strict html//"
287 "-//w3c//dtd html 3 1995-03-24//"
288 "-//w3c//dtd html 3.2 draft//"
289 "-//w3c//dtd html 3.2 final//"
290 "-//w3c//dtd html 3.2//"
291 "-//w3c//dtd html 3.2s draft//"
292 "-//w3c//dtd html 4.0 frameset//"
293 "-//w3c//dtd html 4.0 transitional//"
294 "-//w3c//dtd html experimental 19960712//"
295 "-//w3c//dtd html experimental 970421//"
296 "-//w3c//dtd w3 html//"
297 "-//w3o//dtd w3 html 3.0//"
298 "-//webtechs//dtd mozilla html 2.0//"
299 "-//webtechs//dtd mozilla html//"
302 # These are the character references that don't need a terminating semicolon
303 # min length: 2, max: 6, none are a prefix of any other.
305 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
306 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
307 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
308 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
309 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
310 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
311 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
312 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
313 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
314 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
315 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
316 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
317 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
318 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
319 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
320 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
321 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
325 #void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
326 #raw_text_elements = ['script', 'style']
327 #escapable_raw_text_elements = ['textarea', 'title']
328 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
330 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
331 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
332 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
333 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
334 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
335 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
336 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
337 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
338 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
339 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
340 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
341 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
342 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
343 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
347 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
349 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
350 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
351 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
352 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
353 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
354 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
355 'determinant', 'diff', 'divergence', 'divide', 'domain',
356 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
357 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
358 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
359 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
360 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
361 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
362 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
363 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
364 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
365 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
366 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
367 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
368 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
369 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
370 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
371 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
372 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
373 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
374 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
375 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
376 'vectorproduct', 'xor'
378 # foreign_elements = [svg_elements..., mathml_elements...]
379 #normal_elements = All other allowed HTML elements are normal elements.
383 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
384 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
385 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
386 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
387 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
388 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
389 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
390 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
391 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
392 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
393 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
395 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
397 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
398 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
399 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
400 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
401 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
402 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
403 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
406 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
407 'annotation-xml':NS_MATHML,
410 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
413 formatting_elements = {
414 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
415 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
419 mathml_text_integration = {
420 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when el's name is a key of mathml_text_integration and el's namespace
# equals the namespace recorded there for that name.
is_mathml_text_integration_point = (el) ->
	listed_namespace = mathml_text_integration[el.name]
	listed_namespace is el.namespace
424 is_html_integration = (el) -> # DON'T PASS A TOKEN
425 if el.namespace is NS_MATHML
426 if el.name is 'annotation-xml'
427 if el.attrs.encoding?
428 if el.attrs.encoding.toLowerCase() is 'text/html'
430 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
433 if el.namespace is NS_SVG
434 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
439 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
442 foster_parenting_targets = {
# True when e's name is a key of special_elements and e's namespace equals
# the namespace recorded there for that name.
el_is_special = (e) ->
	listed_namespace = special_elements[e.name]
	listed_namespace is e.namespace
# The three HTML elements (address, div, p) excluded by el_is_special_not_adp.
adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }

# True when el matches an entry of special_elements (same name and namespace)
# and does not match an entry of adp_els.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
472 altglyphdef: 'altGlyphDef'
473 altglyphitem: 'altGlyphItem'
474 animatecolor: 'animateColor'
475 animatemotion: 'animateMotion'
476 animatetransform: 'animateTransform'
479 fecolormatrix: 'feColorMatrix'
480 fecomponenttransfer: 'feComponentTransfer'
481 fecomposite: 'feComposite'
482 feconvolvematrix: 'feConvolveMatrix'
483 fediffuselighting: 'feDiffuseLighting'
484 fedisplacementmap: 'feDisplacementMap'
485 fedistantlight: 'feDistantLight'
486 fedropshadow: 'feDropShadow'
492 fegaussianblur: 'feGaussianBlur'
495 femergenode: 'feMergeNode'
496 femorphology: 'feMorphology'
498 fepointlight: 'fePointLight'
499 fespecularlighting: 'feSpecularLighting'
500 fespotlight: 'feSpotLight'
502 feturbulence: 'feTurbulence'
503 foreignobject: 'foreignObject'
505 lineargradient: 'linearGradient'
506 radialgradient: 'radialGradient'
509 svg_attribute_fixes = {
510 attributename: 'attributeName'
511 attributetype: 'attributeType'
512 basefrequency: 'baseFrequency'
513 baseprofile: 'baseProfile'
515 clippathunits: 'clipPathUnits'
516 contentscripttype: 'contentScriptType'
517 contentstyletype: 'contentStyleType'
518 diffuseconstant: 'diffuseConstant'
520 externalresourcesrequired: 'externalResourcesRequired'
521 # WHATWG removes this: filterres: 'filterRes'
522 filterunits: 'filterUnits'
524 gradienttransform: 'gradientTransform'
525 gradientunits: 'gradientUnits'
526 kernelmatrix: 'kernelMatrix'
527 kernelunitlength: 'kernelUnitLength'
528 keypoints: 'keyPoints'
529 keysplines: 'keySplines'
531 lengthadjust: 'lengthAdjust'
532 limitingconeangle: 'limitingConeAngle'
533 markerheight: 'markerHeight'
534 markerunits: 'markerUnits'
535 markerwidth: 'markerWidth'
536 maskcontentunits: 'maskContentUnits'
537 maskunits: 'maskUnits'
538 numoctaves: 'numOctaves'
539 pathlength: 'pathLength'
540 patterncontentunits: 'patternContentUnits'
541 patterntransform: 'patternTransform'
542 patternunits: 'patternUnits'
543 pointsatx: 'pointsAtX'
544 pointsaty: 'pointsAtY'
545 pointsatz: 'pointsAtZ'
546 preservealpha: 'preserveAlpha'
547 preserveaspectratio: 'preserveAspectRatio'
548 primitiveunits: 'primitiveUnits'
551 repeatcount: 'repeatCount'
552 repeatdur: 'repeatDur'
553 requiredextensions: 'requiredExtensions'
554 requiredfeatures: 'requiredFeatures'
555 specularconstant: 'specularConstant'
556 specularexponent: 'specularExponent'
557 spreadmethod: 'spreadMethod'
558 startoffset: 'startOffset'
559 stddeviation: 'stdDeviation'
560 stitchtiles: 'stitchTiles'
561 surfacescale: 'surfaceScale'
562 systemlanguage: 'systemLanguage'
563 tablevalues: 'tableValues'
566 textlength: 'textLength'
568 viewtarget: 'viewTarget'
569 xchannelselector: 'xChannelSelector'
570 ychannelselector: 'yChannelSelector'
571 zoomandpan: 'zoomAndPan'
573 foreign_attr_fixes = {
574 'xlink:actuate': 'xlink actuate'
575 'xlink:arcrole': 'xlink arcrole'
576 'xlink:href': 'xlink href'
577 'xlink:role': 'xlink role'
578 'xlink:show': 'xlink show'
579 'xlink:title': 'xlink title'
580 'xlink:type': 'xlink type'
581 'xml:base': 'xml base'
582 'xml:lang': 'xml lang'
583 'xml:space': 'xml space'
585 'xmlns:xlink': 'xmlns xlink'
587 adjust_mathml_attributes = (t) ->
589 if a[0] is 'definitionurl'
590 a[0] = 'definitionURL'
592 adjust_svg_attributes = (t) ->
594 if svg_attribute_fixes[a[0]]?
595 a[0] = svg_attribute_fixes[a[0]]
597 adjust_foreign_attributes = (t) ->
600 if foreign_attr_fixes[a[0]]?
601 a[0] = foreign_attr_fixes[a[0]]
604 # decode_named_char_ref()
606 # The list of named character references is _huge_ so if we're running in a
607 # browser, we get the browser to decode them, rather than increasing the code
608 # size to include the table.
609 if context is 'module'
610 _decode_named_char_ref = require './parser_no_browser_helper.coffee'
612 # TODO test this in IE8
613 decode_named_char_ref_el = document.createElement('textarea')
614 _decode_named_char_ref = (txt) ->
616 decode_named_char_ref_el.innerHTML = txt
617 decoded = decode_named_char_ref_el.value
618 return null if decoded is txt
# Pass the name of a named entity _that has a terminating semicolon_
# Entities without terminating semicolons should use legacy_char_refs[]
# Do not include the "&" or ";" in your argument, eg pass "alpha"
#
# Returns whatever _decode_named_char_ref returns for txt. Non-null results
# are memoized in decode_named_char_ref_cache, so each name is decoded by the
# underlying helper at most once.
decode_named_char_ref_cache = {}
decode_named_char_ref = (txt) ->
	# Only trust the cache's OWN keys. A plain lookup (cache[txt]) would also
	# find members inherited from Object.prototype, so a name such as
	# "constructor" or "toString" would be returned as if it were a decoded
	# entity. NOTE(review): txt presumably comes from the document being
	# parsed, which makes such names reachable -- confirm against the caller.
	if Object::hasOwnProperty.call decode_named_char_ref_cache, txt
		return decode_named_char_ref_cache[txt]
	decoded = _decode_named_char_ref txt
	# Misses (null/undefined) are not stored, so unknown names cannot grow
	# the cache; they are simply looked up again next time.
	decode_named_char_ref_cache[txt] = decoded if decoded?
	return decoded
630 parse_html = (args_html, args = {}) ->
632 cur = null # index of next char in txt to be parsed
633 # declare doc and tokenizer variables so they're in scope below
635 open_els = null # stack of open elements
636 afe = null # active formatting elements
637 template_ins_modes = null
639 original_ins_mode = null
641 tok_cur_tag = null # partially parsed tag
642 flag_scripting = null
643 flag_frameset_ok = null
645 flag_foster_parenting = null
646 form_element_pointer = null
647 temporary_buffer = null
648 pending_table_character_tokens = null
649 head_element_pointer = null
650 flag_fragment_parsing = null
651 context_element = null
662 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
663 # "Noah's Ark clause" but with three
664 afe_push = (new_el) ->
667 if el.type is TYPE_AFE_MARKER
669 if el.name is new_el.name and el.namespace is new_el.namespace
672 unless new_el.attrs[k] is v
676 for k, v of new_el.attrs
677 unless el.attrs[k] is v
689 afe.unshift new_afe_marker()
692 # the functions below implement the Tree Construction algorithm
693 # http://www.w3.org/TR/html5/syntax.html#tree-construction
695 # But first... the helpers
696 template_tag_is_open = ->
698 if el.name is 'template' and el.namespace is NS_HTML
701 is_in_scope_x = (tag_name, scope, namespace) ->
703 if el.name is tag_name and (namespace is null or namespace is el.namespace)
705 if scope[el.name] is el.namespace
708 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
710 if el.name is tag_name and (namespace is null or namespace is el.namespace)
712 if scope[el.name] is el.namespace
714 if scope2[el.name] is el.namespace
718 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
719 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
722 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
723 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
725 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
727 button_scopers = button: NS_HTML
728 li_scopers = ol: NS_HTML, ul: NS_HTML
729 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Scope test for tag_name using standard_scopers as the scope table.
# namespace is optional; null (the default) is passed through unchanged to
# is_in_scope_x, which does the actual search.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope test for tag_name using two scope tables: standard_scopers plus
# button_scopers. namespace is optional (default null); the search itself is
# done by is_in_scope_x_y.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope test for tag_name using table_scopers (html, table, template) as the
# scope table. namespace is optional (default null); the search itself is
# done by is_in_scope_x.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope test for tag_name using two scope tables: standard_scopers plus
# li_scopers (ol, ul). namespace is optional (default null); the search
# itself is done by is_in_scope_x_y.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
739 is_in_select_scope = (tag_name, namespace = null) ->
741 if t.name is tag_name and (namespace is null or namespace is t.namespace)
743 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
746 # this checks for a particular element, not by name
747 # this requires a namespace match
748 el_is_in_scope = (needle) ->
752 if standard_scopers[el.name] is el.namespace
756 clear_to_table_stopers = {
761 clear_stack_to_table_context = ->
763 if clear_to_table_stopers[open_els[0].name]?
767 clear_to_table_body_stopers = {
774 clear_stack_to_table_body_context = ->
776 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
780 clear_to_table_row_stopers = {
785 clear_stack_to_table_row_context = ->
787 if clear_to_table_row_stopers[open_els[0].name]?
791 clear_afe_to_marker = ->
793 return unless afe.length > 0 # this happens in fragment case, ?spec error
795 if el.type is TYPE_AFE_MARKER
800 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
802 # 1. Let last be false.
804 # 2. Let node be the last node in the stack of open elements.
806 node = open_els[node_i]
807 # 3. Loop: If node is the first node in the stack of open elements,
808 # then set last to true, and, if the parser was originally created as
809 # part of the HTML fragment parsing algorithm (fragment case) set node
810 # to the context element.
812 if node_i is open_els.length - 1
814 if flag_fragment_parsing
815 node = context_element
816 # 4. If node is a select element, run these substeps:
817 if node.name is 'select' and node.namespace is NS_HTML
818 # 1. If last is true, jump to the step below labeled done.
820 # 2. Let ancestor be node.
823 # 3. Loop: If ancestor is the first node in the stack of
824 # open elements, jump to the step below labeled done.
826 if ancestor_i is open_els.length - 1
828 # 4. Let ancestor be the node before ancestor in the stack
831 ancestor = open_els[ancestor_i]
832 # 5. If ancestor is a template node, jump to the step below
834 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
836 # 6. If ancestor is a table node, switch the insertion mode
837 # to "in select in table" and abort these steps.
838 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
839 ins_mode = ins_mode_in_select_in_table
841 # 7. Jump back to the step labeled loop.
842 # 8. Done: Switch the insertion mode to "in select" and abort
844 ins_mode = ins_mode_in_select
846 # 5. If node is a td or th element and last is false, then switch
847 # the insertion mode to "in cell" and abort these steps.
848 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
849 ins_mode = ins_mode_in_cell
851 # 6. If node is a tr element, then switch the insertion mode to "in
852 # row" and abort these steps.
853 if node.name is 'tr' and node.namespace is NS_HTML
854 ins_mode = ins_mode_in_row
856 # 7. If node is a tbody, thead, or tfoot element, then switch the
857 # insertion mode to "in table body" and abort these steps.
858 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
859 ins_mode = ins_mode_in_table_body
861 # 8. If node is a caption element, then switch the insertion mode
862 # to "in caption" and abort these steps.
863 if node.name is 'caption' and node.namespace is NS_HTML
864 ins_mode = ins_mode_in_caption
866 # 9. If node is a colgroup element, then switch the insertion mode
867 # to "in column group" and abort these steps.
868 if node.name is 'colgroup' and node.namespace is NS_HTML
869 ins_mode = ins_mode_in_column_group
871 # 10. If node is a table element, then switch the insertion mode to
872 # "in table" and abort these steps.
873 if node.name is 'table' and node.namespace is NS_HTML
874 ins_mode = ins_mode_in_table
876 # 11. If node is a template element, then switch the insertion mode
877 # to the current template insertion mode and abort these steps.
878 if node.name is 'template' and node.namespace is NS_HTML
879 ins_mode = template_ins_modes[0]
881 # 12. If node is a head element and last is true, then switch the
882 # insertion mode to "in body" ("in body"! not "in head"!) and abort
883 # these steps. (fragment case)
884 if node.name is 'head' and node.namespace is NS_HTML and last
885 ins_mode = ins_mode_in_body
887 # 13. If node is a head element and last is false, then switch the
888 # insertion mode to "in head" and abort these steps.
889 if node.name is 'head' and node.namespace is NS_HTML and last is false
890 ins_mode = ins_mode_in_head
892 # 14. If node is a body element, then switch the insertion mode to
893 # "in body" and abort these steps.
894 if node.name is 'body' and node.namespace is NS_HTML
895 ins_mode = ins_mode_in_body
897 # 15. If node is a frameset element, then switch the insertion mode
898 # to "in frameset" and abort these steps. (fragment case)
899 if node.name is 'frameset' and node.namespace is NS_HTML
900 ins_mode = ins_mode_in_frameset
902 # 16. If node is an html element, run these substeps:
903 if node.name is 'html' and node.namespace is NS_HTML
904 # 1. If the head element pointer is null, switch the insertion
905 # mode to "before head" and abort these steps. (fragment case)
906 if head_element_pointer is null
907 ins_mode = ins_mode_before_head
909 # 2. Otherwise, the head element pointer is not null,
910 # switch the insertion mode to "after head" and abort these
912 ins_mode = ins_mode_after_head
914 # 17. If last is true, then switch the insertion mode to "in body"
915 # and abort these steps. (fragment case)
917 ins_mode = ins_mode_in_body
919 # 18. Let node now be the node before node in the stack of open
922 node = open_els[node_i]
923 # 19. Return to the step labeled loop.
928 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
929 adjusted_current_node = ->
930 if open_els.length is 1 and flag_fragment_parsing
931 return context_element
934 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
935 # this implementation is structured (mostly) as described at the link above.
936 # capitalized comments are the "labels" described at the link above.
938 return if afe.length is 0
939 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
944 if i is afe.length - 1
947 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
952 el = insert_html_element afe[i].token
958 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
959 # adoption agency algorithm
961 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
962 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
963 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
964 adoption_agency = (subject) ->
965 # this block implements the W3C spec
966 # # 1. If the current node is an HTML element whose tag name is subject,
967 # # then run these substeps:
969 # # 1. Let element be the current node.
971 # # 2. Pop element off the stack of open elements.
973 # # 3. If element is also in the list of active formatting elements,
974 # # remove the element from the list.
976 # # 4. Abort the adoption agency algorithm.
977 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
978 # el = open_els.shift()
979 # # remove it from the list of active formatting elements (if found)
985 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
986 # If the current node is an HTML element whose tag name is subject, and
987 # the current node is not in the list of active formatting elements,
988 # then pop the current node off the stack of open elements, and abort
990 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
991 # remove it from the list of active formatting elements (if found)
1007 # 5. Let formatting element be the last element in the list of
1008 # active formatting elements that: is between the end of the list
1009 # and the last scope marker in the list, if any, or the start of
1010 # the list otherwise, and has the tag name subject.
1012 for t, fe_of_afe in afe
1013 if t.type is TYPE_AFE_MARKER
1015 if t.name is subject
1018 # If there is no such element, then abort these steps and instead
1019 # act as described in the "any other end tag" entry above.
1021 in_body_any_other_end_tag subject
1023 # 6. If formatting element is not in the stack of open elements,
1024 # then this is a parse error; remove the element from the list, and
1025 # abort these steps.
1027 for t, fe_of_open_els in open_els
1033 # "remove it from the list" must mean afe, since it's not in open_els
1034 afe.splice fe_of_afe, 1
1036 # 7. If formatting element is in the stack of open elements, but
1037 # the element is not in scope, then this is a parse error; abort
1039 unless el_is_in_scope fe
1042 # 8. If formatting element is not the current node, this is a parse
1043 # error. (But do not abort these steps.)
1044 unless open_els[0] is fe
1047 # 9. Let furthest block be the topmost node in the stack of open
1048 # elements that is lower in the stack than formatting element, and
1049 # is an element in the special category. There might not be one.
1051 fb_of_open_els = null
1052 for t, i in open_els
1058 # and continue, to see if there's one that's more "topmost"
1059 # 10. If there is no furthest block, then the UA must first pop all
1060 # the nodes from the bottom of the stack of open elements, from the
1061 # current node up to and including formatting element, then remove
1062 # formatting element from the list of active formatting elements,
1063 # and finally abort these steps.
1066 t = open_els.shift()
1068 afe.splice fe_of_afe, 1
1070 # 11. Let common ancestor be the element immediately above
1071 # formatting element in the stack of open elements.
1072 ca = open_els[fe_of_open_els + 1] # common ancestor
1074 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1075 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1076 bookmark = new_aaa_bookmark()
1079 afe.splice i, 0, bookmark
1081 node = last_node = fb
1085 # 3. Let node be the element immediately above node in the
1086 # stack of open elements, or if node is no longer in the stack
1087 # of open elements (e.g. because it got removed by this
1088 # algorithm), the element that was immediately above node in
1089 # the stack of open elements before node was removed.
1091 for t, i in open_els
1093 node_next = open_els[i + 1]
1095 node = node_next ? node_above
1096 # TODO make sure node_above gets re-set if/when node is removed from open_els
1098 # 4. If node is formatting element, then go to the next step in
1099 # the overall algorithm.
1102 # 5. If inner loop counter is greater than three and node is in
1103 # the list of active formatting elements, then remove node from
1104 # the list of active formatting elements.
1113 # 6. If node is not in the list of active formatting elements,
1114 # then remove node from the stack of open elements and then go
1115 # back to the step labeled inner loop.
1117 for t, i in open_els
1119 node_above = open_els[i + 1]
1120 open_els.splice i, 1
1123 # 7. create an element for the token for which the element node
1124 # was created, in the HTML namespace, with common ancestor as
1125 # the intended parent; replace the entry for node in the list
1126 # of active formatting elements with an entry for the new
1127 # element, replace the entry for node in the stack of open
1128 # elements with an entry for the new element, and let node be
1130 new_node = token_to_element node.token, NS_HTML, ca
1135 for t, i in open_els
1137 node_above = open_els[i + 1]
1138 open_els[i] = new_node
1141 # 8. If last node is furthest block, then move the
1142 # aforementioned bookmark to be immediately after the new node
1143 # in the list of active formatting elements.
1151 # "after" means lower
1152 afe.splice i, 0, bookmark # "after as <-
1154 # 9. Insert last node into node, first removing it from its
1155 # previous parent node if any.
1156 if last_node.parent?
1157 for c, i in last_node.parent.children
1159 last_node.parent.children.splice i, 1
1161 node.children.push last_node
1162 last_node.parent = node
1163 # 10. Let last node be node.
1165 # 11. Return to the step labeled inner loop.
1166 # 14. Insert whatever last node ended up being in the previous step
1167 # at the appropriate place for inserting a node, but using common
1168 # ancestor as the override target.
1170 # In the case where fe is immediately followed by fb:
1171 # * inner loop exits out early (node==fe)
1173 # * last_node is still in the tree (not a duplicate)
1174 if last_node.parent?
1175 for c, i in last_node.parent.children
1177 last_node.parent.children.splice i, 1
1179 # can't use standard insert token thing, because it's already in
1180 # open_els and must stay at it's current position in open_els
1181 dest = adjusted_insertion_location ca
1182 dest[0].children.splice dest[1], 0, last_node
1183 last_node.parent = dest[0]
1184 # 15. Create an element for the token for which formatting element
1185 # was created, in the HTML namespace, with furthest block as the
1187 new_element = token_to_element fe.token, NS_HTML, fb
1188 # 16. Take all of the child nodes of furthest block and append them
1189 # to the element created in the last step.
1190 while fb.children.length
1191 t = fb.children.shift()
1192 t.parent = new_element
1193 new_element.children.push t
1194 # 17. Append that new element to furthest block.
1195 new_element.parent = fb
1196 fb.children.push new_element
1197 # 18. Remove formatting element from the list of active formatting
1198 # elements, and insert the new element into the list of active
1199 # formatting elements at the position of the aforementioned
1207 afe[i] = new_element
1209 # 19. Remove formatting element from the stack of open elements,
1210 # and insert the new element into the stack of open elements
1211 # immediately below the position of furthest block in that stack.
1212 for t, i in open_els
1214 open_els.splice i, 1
1216 for t, i in open_els
1218 open_els.splice i, 0, new_element
1220 # 20. Jump back to the step labeled outer loop.
1223 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes a <p> element: first generates implied end tags (leaving 'p'
# itself alone), then pops entries off open_els, testing each popped
# element for being an HTML 'p'. Takes no arguments.
1224 close_p_element = ->
1225 generate_implied_end_tags 'p' # arg is exception
# the current node not being an HTML 'p' is the spec's parse-error case
1226 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
# the length guard keeps the last remaining entry on the stack
1228 while open_els.length > 1 # just in case
1229 el = open_els.shift()
# presumably the loop ends once the popped element is the 'p' -- TODO confirm
1230 if el.name is 'p' and el.namespace is NS_HTML
# Convenience wrapper for the spec phrase "if the stack of open elements
# has a p element in button scope, then close a p element".
# Tests is_in_button_scope for an HTML 'p'; presumably runs
# close_p_element when that is true -- TODO confirm.
1233 close_p_if_in_button_scope = ->
1234 if is_in_button_scope 'p', NS_HTML
1238 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1239 # aka insert_a_character = (t) ->
# Inserts the text token `t` at the adjusted insertion location.
# dest is the [parent, index] pair returned by adjusted_insertion_location.
1240 insert_character = (t) ->
1241 dest = adjusted_insertion_location()
1242 # fixfull check for Document node
# look at the sibling just before the insertion point
1244 prev = dest[0].children[dest[1] - 1]
# a preceding text node is treated specially (presumably the new text is
# merged into it rather than added as a new node -- TODO confirm)
1245 if prev.type is TYPE_TEXT
# insert t as a new child at the insertion index
1248 dest[0].children.splice dest[1], 0, t
1251 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for a single token `t`.
# Inspects the adjusted current node (acn) and the token type to decide how
# the token is processed; the visible fall-through case hands the token to
# in_foreign_content.
1252 process_token = (t) ->
1253 acn = adjusted_current_node()
# adjusted current node is an element in the HTML namespace
1257 if acn.namespace is NS_HTML
# MathML text integration point: start tags other than mglyph/malignmark,
# and text tokens, are singled out
1260 if is_mathml_text_integration_point(acn)
1261 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1264 if t.type is TYPE_TEXT
# <svg> start tag directly inside a MathML annotation-xml element
1267 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
# HTML integration point: start tags and text are singled out
1270 if is_html_integration acn
1271 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1274 if t.type is TYPE_EOF
# otherwise: process the token according to the rules for foreign content
1277 in_foreign_content t
1281 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1282 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the "appropriate place for inserting a node".
#   override_target: optional node to insert relative to; when not given,
#                    the current node (open_els[0]) is used.
# Returns a two-element array [target, target_i]: the parent node to insert
# into and the index within target.children at which to insert.
1283 adjusted_insertion_location = (override_target = null) ->
1284 # 1. If there was an override target specified, then let target be the
1287 target = override_target
1288 else # Otherwise, let target be the current node.
1289 target = open_els[0]
1290 # 2. Determine the adjusted insertion location using the first matching
1291 # steps from the following list:
1293 # If foster parenting is enabled and target is a table, tbody, tfoot,
1294 # thead, or tr element Foster parenting happens when content is
1295 # misnested in tables.
# foster_parenting_targets maps a tag name to the namespace it must be in
1296 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1297 loop # once. this is here so we can ``break`` to "abort these substeps"
1298 # 1. Let last template be the last template element in the
1299 # stack of open elements, if any.
1300 last_template = null
1301 last_template_i = null
1302 for el, i in open_els
1303 if el.name is 'template' and el.namespace is NS_HTML
1307 # 2. Let last table be the last table element in the stack of
1308 # open elements, if any.
1311 for el, i in open_els
1312 if el.name is 'table' and el.namespace is NS_HTML
1316 # 3. If there is a last template and either there is no last
1317 # table, or there is one, but last template is lower (more
1318 # recently added) than last table in the stack of open
1319 # elements, then: let adjusted insertion location be inside
1320 # last template's template contents, after its last child (if
1321 # any), and abort these substeps.
# a smaller index in open_els means "lower"/more recently added
1322 if last_template and (last_table is null or last_template_i < last_table_i)
1323 target = last_template # fixfull should be its contents
1324 target_i = target.children.length
1326 # 4. If there is no last table, then let adjusted insertion
1327 # location be inside the first element in the stack of open
1328 # elements (the html element), after its last child (if any),
1329 # and abort these substeps. (fragment case)
1330 if last_table is null
# the last array entry is the first element that was put on the stack
1332 target = open_els[open_els.length - 1]
1333 target_i = target.children.length
1335 # 5. If last table has a parent element, then let adjusted
1336 # insertion location be inside last table's parent element,
1337 # immediately before last table, and abort these substeps.
1338 if last_table.parent?
1339 for c, i in last_table.parent.children
1341 target = last_table.parent
1345 # 6. Let previous element be the element immediately above last
1346 # table in the stack of open elements.
1348 # huh? how could it not have a parent?
1349 previous_element = open_els[last_table_i + 1]
1350 # 7. Let adjusted insertion location be inside previous
1351 # element, after its last child (if any).
1352 target = previous_element
1353 target_i = target.children.length
1354 # Note: These steps are involved in part because it's possible
1355 # for elements, the table element in this case in particular,
1356 # to have been moved by a script around in the DOM, or indeed
1357 # removed from the DOM entirely, after the element was inserted
1359 break # don't really loop
1361 # Otherwise Let adjusted insertion location be inside target, after
1362 # its last child (if any).
1363 target_i = target.children.length
1365 # 3. If the adjusted insertion location is inside a template element,
1366 # let it instead be inside the template element's template contents,
1367 # after its last child (if any).
1368 # fixfull (template)
1370 # 4. Return the adjusted insertion location.
1371 return [target, target_i]
1373 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1374 # aka create_an_element_for_token
# Creates a new element Node (TYPE_TAG) for the tag token `t`.
#   t:               tag token; its name, and a reference to the token
#                    itself, are stored on the new node
#   namespace:       namespace to give the new element
#   intended_parent: the spec's "intended parent" for the element
# Attributes are collected into the `attrs` hash, keyed by attribute name
# (a[0]) with the attribute value (a[1]).
1375 token_to_element = (t, namespace, intended_parent) ->
1376 # convert attributes into a hash
1379 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1380 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1382 # TODO 2. If the newly created element has an xmlns attribute in the
1383 # XMLNS namespace whose value is not exactly the same as the element's
1384 # namespace, that is a parse error. Similarly, if the newly created
1385 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1386 # value is not the XLink Namespace, that is a parse error.
1388 # fixfull: the spec says stuff about form pointers and ownerDocument
1392 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and inserts it into the
# tree at the adjusted insertion location.
#   token:     tag token to build the element from
#   namespace: namespace for the new element
# ail_el / ail_i are presumably the node and child index taken from `ail`
# -- TODO confirm.
1393 insert_foreign_element = (token, namespace) ->
1394 ail = adjusted_insertion_location()
1397 el = token_to_element token, namespace, ail_el
1398 # TODO skip this next step if it's broken (eg ail_el is document with child already)
# place the new element among the parent's children at the computed index
1400 ail_el.children.splice ail_i, 0, el
1403 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Inserts an element for `token` in the HTML namespace.
# Thin wrapper: delegates to insert_foreign_element with NS_HTML and
# returns its result.
1404 insert_html_element = (token) ->
1405 return insert_foreign_element token, NS_HTML
1407 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1408 # position should be [node, index_within_children]
# Inserts the comment `t` into the tree.
#   t:        the comment to insert
#   position: optional [node, index_within_children]; when omitted (null)
#             the adjusted insertion location is used.
1409 insert_comment = (t, position = null) ->
1410 position ?= adjusted_insertion_location()
# splice t into the parent's children at the requested index
1411 position[0].children.splice position[1], 0, t
1415 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing for the start tag token `t`:
# inserts an HTML element for t, switches the tokenizer to the RAWTEXT
# state, remembers the current insertion mode in original_ins_mode, and
# switches the insertion mode to "text".
1416 parse_generic_raw_text = (t) ->
1417 insert_html_element t
1418 tok_state = tok_state_rawtext
# saved before ins_mode is overwritten on the next line
1419 original_ins_mode = ins_mode
1420 ins_mode = ins_mode_text
# Generic RCDATA element parsing for the start tag token `t`:
# same steps as parse_generic_raw_text, except the tokenizer is switched
# to the RCDATA state.
1422 parse_generic_rcdata_text = (t) ->
1423 insert_html_element t
1424 tok_state = tok_state_rcdata
# saved before ins_mode is overwritten on the next line
1425 original_ins_mode = ins_mode
1426 ins_mode = ins_mode_text
1429 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1430 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Generates implied end tags.
#   except: optional tag name that must NOT be closed implicitly
# Loops while the current node (open_els[0]) is listed in end_tag_implied
# for its namespace and is not named `except`; the loop body presumably
# pops the current node -- TODO confirm.
1431 generate_implied_end_tags = (except = null) ->
1432 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1436 # 8.2.5.4 The rules for parsing tokens in HTML content
1437 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1439 # 8.2.5.4.1 The "initial" insertion mode
1440 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Tests whether the DOCTYPE token `t` should put the document into
# quirks mode. Conditions examined: the force-quirks flag, a name other
# than 'html', known public identifier prefixes and exact values, one
# specific IBM system identifier, and the HTML 4.01 frameset/transitional
# public identifiers.
# Identifier comparisons are case-insensitive (values are lowercased first).
1441 is_quirks_yes_doctype = (t) ->
1442 if t.flag 'force-quirks'
1444 if t.name isnt 'html'
1446 if t.public_identifier?
1447 pi = t.public_identifier.toLowerCase()
# prefix match against each entry of quirks_yes_pi_prefixes
1448 for p in quirks_yes_pi_prefixes
1449 if pi.substr(0, p.length) is p
# exact (whole-string) public identifier matches
1451 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1453 if t.system_identifier?
1454 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
# presumably the "system identifier is missing" case -- TODO confirm nesting
1456 else if t.public_identifier?
1457 # already did this: pi = t.public_identifier.toLowerCase()
1458 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Tests whether the DOCTYPE token `t` should put the document into
# limited-quirks mode. Checks the lowercased public identifier against the
# XHTML 1.0 frameset/transitional prefixes, and, when a system identifier
# is present, against the HTML 4.01 frameset/transitional prefixes.
1461 is_quirks_limited_doctype = (t) ->
1462 if t.public_identifier?
1463 pi = t.public_identifier.toLowerCase()
1464 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
# NOTE(review): pi is only assigned when public_identifier exists; confirm
# this check is nested under that condition
1466 if t.system_identifier?
1467 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# The "initial" insertion mode: handles token `t` before anything else
# has been seen. Comment and DOCTYPE tokens get dedicated handling; a
# DOCTYPE may set the document's 'quirks mode' flag. The insertion mode
# then moves on to ins_mode_before_html.
1470 ins_mode_initial = (t) ->
1473 if t.type is TYPE_COMMENT
1477 if t.type is TYPE_DOCTYPE
1478 # fixfull syntax error from first paragraph and following bullets
1479 # fixfull set doc.doctype
1480 # fixfull is the "not an iframe srcdoc" thing relevant?
# full quirks is tested first, limited quirks second
1481 if is_quirks_yes_doctype t
1482 doc.flag 'quirks mode', QUIRKS_YES
1483 else if is_quirks_limited_doctype t
1484 doc.flag 'quirks mode', QUIRKS_LIMITED
1486 ins_mode = ins_mode_before_html
1489 # fixfull not iframe srcdoc?
# fallback path: set full quirks mode and move to "before html"
1491 doc.flag 'quirks mode', QUIRKS_YES
1492 ins_mode = ins_mode_before_html
1496 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# The "before html" insertion mode for token `t`.
# An <html> start tag creates the root element from the token; the
# fall-through path creates the root from a synthesized 'html' start tag.
# Both paths append the root to doc and switch to ins_mode_before_head.
1497 ins_mode_before_html = (t) ->
1498 if t.type is TYPE_DOCTYPE
1501 if t.type is TYPE_COMMENT
1506 if t.type is TYPE_START_TAG and t.name is 'html'
# build the root element from the token and attach it to the document
1507 el = token_to_element t, NS_HTML, doc
1508 doc.children.push el
# the root becomes the current node (index 0 of the stack)
1510 open_els.unshift(el)
1511 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1512 ins_mode = ins_mode_before_head
1514 if t.type is TYPE_END_TAG
1515 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1516 # fall through to "anything else"
# "anything else": create the html element from a synthesized start tag
1521 el = token_to_element new_open_tag('html'), NS_HTML, doc
1522 doc.children.push el
1525 # ?fixfull browsing context
1526 ins_mode = ins_mode_before_head
1530 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# The "before head" insertion mode for token `t`.
# A <head> start tag inserts the head element; the fall-through path
# inserts one from a synthesized 'head' start tag. Both record the element
# in head_element_pointer and switch to ins_mode_in_head.
1531 ins_mode_before_head = (t) ->
1534 if t.type is TYPE_COMMENT
1537 if t.type is TYPE_DOCTYPE
1540 if t.type is TYPE_START_TAG and t.name is 'html'
1543 if t.type is TYPE_START_TAG and t.name is 'head'
1544 el = insert_html_element t
1545 head_element_pointer = el
1546 ins_mode = ins_mode_in_head
1548 if t.type is TYPE_END_TAG
1549 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1550 # fall through to Anything else below
# "anything else": insert an implied head element
1555 el = insert_html_element new_open_tag 'head'
1556 head_element_pointer = el
1557 ins_mode = ins_mode_in_head
1561 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode for token `t`:
# pops the current node off open_els and switches the insertion mode to
# ins_mode_after_head.
1562 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1563 open_els.shift() # spec says this will be a 'head' node
1564 ins_mode = ins_mode_after_head
# The "in head" insertion mode: handles token `t` while inside <head>.
# Branches are tested in spec order: whitespace text, comment, DOCTYPE,
# then start tags (html; base/basefont/bgsound/link; meta; title;
# noscript/noframes/style; script; template), then end tags (head;
# body/html/br; template), and finally the "anything else" fallback
# ins_mode_in_head_else.
#   t: the token to process
1567 ins_mode_in_head = (t) ->
1568 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1571 if t.type is TYPE_COMMENT
1574 if t.type is TYPE_DOCTYPE
1577 if t.type is TYPE_START_TAG and t.name is 'html'
1580 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1581 el = insert_html_element t
1583 t.acknowledge_self_closing()
1585 if t.type is TYPE_START_TAG and t.name is 'meta'
1586 el = insert_html_element t
1588 t.acknowledge_self_closing()
1589 # fixfull encoding stuff
1591 if t.type is TYPE_START_TAG and t.name is 'title'
1592 parse_generic_rcdata_text t
# noscript is parsed as raw text only when scripting is enabled
1594 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1595 parse_generic_raw_text t
1597 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1598 insert_html_element t
1599 ins_mode = ins_mode_in_head_noscript
1601 if t.type is TYPE_START_TAG and t.name is 'script'
1602 ail = adjusted_insertion_location()
1603 el = token_to_element t, NS_HTML, ail
1604 el.flag 'parser-inserted', true
1605 # fixfull fragment case
1606 ail[0].children.splice ail[1], 0, el
1608 tok_state = tok_state_script_data
1609 original_ins_mode = ins_mode # make sure orig... is defined
1610 ins_mode = ins_mode_text
1612 if t.type is TYPE_END_TAG and t.name is 'head'
1613 open_els.shift() # will be a head element... spec says so
1614 ins_mode = ins_mode_after_head
1616 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1617 ins_mode_in_head_else t
1619 if t.type is TYPE_START_TAG and t.name is 'template'
1620 insert_html_element t
1622 flag_frameset_ok = false
1623 ins_mode = ins_mode_in_template
1624 template_ins_modes.unshift ins_mode_in_template
1626 if t.type is TYPE_END_TAG and t.name is 'template'
1627 if template_tag_is_open()
# must be an explicit call: a bare identifier is a no-op expression in
# CoffeeScript, so without the parentheses no end tags were generated
1628 generate_implied_end_tags()
1629 if open_els[0].name isnt 'template'
1632 el = open_els.shift()
1633 if el.name is 'template' and el.namespace is NS_HTML
1635 clear_afe_to_marker()
1636 template_ins_modes.shift()
1641 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1644 ins_mode_in_head_else t
1647 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" insertion mode for
# token `t`; switches the insertion mode back to ins_mode_in_head.
1648 ins_mode_in_head_noscript_else = (t) ->
1651 ins_mode = ins_mode_in_head
# The "in head noscript" insertion mode for token `t` (inside a <noscript>
# within <head>). A </noscript> end tag returns to ins_mode_in_head;
# a </br> end tag and the final fallback use ins_mode_in_head_noscript_else.
1654 ins_mode_in_head_noscript = (t) ->
1655 if t.type is TYPE_DOCTYPE
1658 if t.type is TYPE_START_TAG and t.name is 'html'
1661 if t.type is TYPE_END_TAG and t.name is 'noscript'
1663 ins_mode = ins_mode_in_head
# whitespace, comments and these head-level start tags share one branch
1665 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1668 if t.type is TYPE_END_TAG and t.name is 'br'
1669 ins_mode_in_head_noscript_else t
# nested <head>/<noscript> start tags and any remaining end tag
1671 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1675 ins_mode_in_head_noscript_else t
1678 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode for token `t`:
# inserts a body element from a synthesized 'body' start tag and switches
# the insertion mode to ins_mode_in_body.
1679 ins_mode_after_head_else = (t) ->
1680 body_tok = new_open_tag 'body'
1681 insert_html_element body_tok
1682 ins_mode = ins_mode_in_body
# The "after head" insertion mode for token `t`.
# <body> and <frameset> start tags insert their element and switch to
# ins_mode_in_body / ins_mode_in_frameset; head-only start tags are handled
# with the head element temporarily back on the stack; the fallback is
# ins_mode_after_head_else.
1685 ins_mode_after_head = (t) ->
1689 if t.type is TYPE_COMMENT
1692 if t.type is TYPE_DOCTYPE
1695 if t.type is TYPE_START_TAG and t.name is 'html'
1698 if t.type is TYPE_START_TAG and t.name is 'body'
1699 insert_html_element t
1700 flag_frameset_ok = false
1701 ins_mode = ins_mode_in_body
1703 if t.type is TYPE_START_TAG and t.name is 'frameset'
1704 insert_html_element t
1705 ins_mode = ins_mode_in_frameset
1707 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# temporarily make the head element the current node again
1709 open_els.unshift head_element_pointer
# then find the head element by identity and take it back off the stack
# (it may no longer be at index 0)
1711 for el, i in open_els
1712 if el is head_element_pointer
1713 open_els.splice i, 1
1716 if t.type is TYPE_END_TAG and t.name is 'template'
1719 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1720 ins_mode_after_head_else t
1722 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1726 ins_mode_after_head_else t
1729 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling of the "in body" insertion mode.
#   name: tag name of the end tag being processed
# Examines `node`, an entry of open_els: when it is an HTML element called
# `name`, implied end tags are generated (except for `name`) and elements
# are popped; a node in the special category is singled out; otherwise
# `node` advances to the entry above it in the stack.
1730 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1733 if node.name is name and node.namespace is NS_HTML
1734 generate_implied_end_tags name # arg is exception
# node not being the current node is the spec's parse-error case
1735 unless node is open_els[0]
1738 el = open_els.shift()
# special_elements maps a tag name to the namespace it must be in
1741 if special_elements[node.name] is node.namespace
# locate node in the stack and step to the element immediately above it
1744 for el, i in open_els
1746 node = open_els[i + 1]
1749 ins_mode_in_body = (t) ->
1750 if t.type is TYPE_TEXT and t.text is "\u0000"
1757 if t.type is TYPE_TEXT
1760 flag_frameset_ok = false
1762 if t.type is TYPE_COMMENT
1765 if t.type is TYPE_DOCTYPE
1768 if t.type is TYPE_START_TAG and t.name is 'html'
1770 return if template_tag_is_open()
1771 root_attrs = open_els[open_els.length - 1].attrs
1773 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1776 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1779 if t.type is TYPE_START_TAG and t.name is 'body'
1781 return if open_els.length < 2
1782 second = open_els[open_els.length - 2]
1783 return unless second.namespace is NS_HTML
1784 return unless second.name is 'body'
1785 return if template_tag_is_open()
1786 flag_frameset_ok = false
1788 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1790 if t.type is TYPE_START_TAG and t.name is 'frameset'
1792 return if open_els.length < 2
1793 second_i = open_els.length - 2
1794 second = open_els[second_i]
1795 return unless second.namespace is NS_HTML
1796 return unless second.name is 'body'
1797 if flag_frameset_ok is false
1800 for el, i in second.parent.children
1802 second.parent.children.splice i, 1
1804 open_els.splice second_i, 1
1805 # pop everything except the "root html element"
1806 while open_els.length > 1
1808 insert_html_element t
1809 ins_mode = ins_mode_in_frameset
1811 if t.type is TYPE_EOF
1813 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1814 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1815 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1818 unless ok_tags[t.name] is el.namespace
1821 if template_ins_modes.length > 0
1822 ins_mode_in_template t
1826 if t.type is TYPE_END_TAG and t.name is 'body'
1827 unless is_in_scope 'body', NS_HTML
1831 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1832 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1833 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1834 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1838 unless ok_tags[t.name] is el.namespace
1841 ins_mode = ins_mode_after_body
1843 if t.type is TYPE_END_TAG and t.name is 'html'
1844 unless is_in_scope 'body', NS_HTML
1848 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1849 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1850 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1851 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1855 unless ok_tags[t.name] is el.namespace
1858 ins_mode = ins_mode_after_body
1861 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1862 close_p_if_in_button_scope()
1863 insert_html_element t
1865 if t.type is TYPE_START_TAG and h_tags[t.name]?
1866 close_p_if_in_button_scope()
1867 if h_tags[open_els[0].name] is open_els[0].namespace
1870 insert_html_element t
1872 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1873 close_p_if_in_button_scope()
1874 insert_html_element t
1875 eat_next_token_if_newline()
1876 flag_frameset_ok = false
1878 if t.type is TYPE_START_TAG and t.name is 'form'
1879 unless form_element_pointer is null or template_tag_is_open()
1882 close_p_if_in_button_scope()
1883 el = insert_html_element t
1884 unless template_tag_is_open()
1885 form_element_pointer = el
1887 if t.type is TYPE_START_TAG and t.name is 'li'
1888 flag_frameset_ok = false
1889 for node in open_els
1890 if node.name is 'li' and node.namespace is NS_HTML
1891 generate_implied_end_tags 'li' # arg is exception
1892 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1895 el = open_els.shift()
1896 if el.name is 'li' and el.namespace is NS_HTML
1899 if el_is_special_not_adp node
1901 close_p_if_in_button_scope()
1902 insert_html_element t
1904 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1905 flag_frameset_ok = false
1906 for node in open_els
1907 if node.name is 'dd' and node.namespace is NS_HTML
1908 generate_implied_end_tags 'dd' # arg is exception
1909 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1912 el = open_els.shift()
1913 if el.name is 'dd' and el.namespace is NS_HTML
1916 if node.name is 'dt' and node.namespace is NS_HTML
1917 generate_implied_end_tags 'dt' # arg is exception
1918 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1921 el = open_els.shift()
1922 if el.name is 'dt' and el.namespace is NS_HTML
1925 if el_is_special_not_adp node
1927 close_p_if_in_button_scope()
1928 insert_html_element t
1930 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1931 close_p_if_in_button_scope()
1932 insert_html_element t
1933 tok_state = tok_state_plaintext
1935 if t.type is TYPE_START_TAG and t.name is 'button'
1936 if is_in_scope 'button', NS_HTML
1938 generate_implied_end_tags()
1940 el = open_els.shift()
1941 if el.name is 'button' and el.namespace is NS_HTML
1944 insert_html_element t
1945 flag_frameset_ok = false
1947 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1948 unless is_in_scope t.name, NS_HTML
1951 generate_implied_end_tags()
1952 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1955 el = open_els.shift()
1956 if el.name is t.name and el.namespace is NS_HTML
1959 if t.type is TYPE_END_TAG and t.name is 'form'
1960 unless template_tag_is_open()
1961 node = form_element_pointer
1962 form_element_pointer = null
1963 if node is null or not el_is_in_scope node
1966 generate_implied_end_tags()
1967 if open_els[0] isnt node
1969 for el, i in open_els
1971 open_els.splice i, 1
1974 unless is_in_scope 'form', NS_HTML
1977 generate_implied_end_tags()
1978 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1981 el = open_els.shift()
1982 if el.name is 'form' and el.namespace is NS_HTML
1985 if t.type is TYPE_END_TAG and t.name is 'p'
1986 unless is_in_button_scope 'p', NS_HTML
1988 insert_html_element new_open_tag 'p'
1991 if t.type is TYPE_END_TAG and t.name is 'li'
1992 unless is_in_li_scope 'li', NS_HTML
1995 generate_implied_end_tags 'li' # arg is exception
1996 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1999 el = open_els.shift()
2000 if el.name is 'li' and el.namespace is NS_HTML
2003 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2004 unless is_in_scope t.name, NS_HTML
2007 generate_implied_end_tags t.name # arg is exception
2008 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2011 el = open_els.shift()
2012 if el.name is t.name and el.namespace is NS_HTML
2015 if t.type is TYPE_END_TAG and h_tags[t.name]?
2018 if h_tags[el.name] is el.namespace
2021 if standard_scopers[el.name] is el.namespace
2026 generate_implied_end_tags()
2027 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2030 el = open_els.shift()
2031 if h_tags[el.name] is el.namespace
2035 if t.type is TYPE_START_TAG and t.name is 'a'
2036 # If the list of active formatting elements contains an a element
2037 # between the end of the list and the last marker on the list (or
2038 # the start of the list if there is no marker on the list), then
2039 # this is a parse error; run the adoption agency algorithm for the
2040 # tag name "a", then remove that element from the list of active
2041 # formatting elements and the stack of open elements if the
2042 # adoption agency algorithm didn't already remove it (it might not
2043 # have if the element is not in table scope).
2046 if el.type is TYPE_AFE_MARKER
2048 if el.name is 'a' and el.namespace is NS_HTML
2056 for el, i in open_els
2058 open_els.splice i, 1
2060 el = insert_html_element t
2063 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2065 el = insert_html_element t
2068 if t.type is TYPE_START_TAG and t.name is 'nobr'
2070 if is_in_scope 'nobr', NS_HTML
2072 adoption_agency 'nobr'
2074 el = insert_html_element t
2077 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2078 adoption_agency t.name
2080 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2082 insert_html_element t
2084 flag_frameset_ok = false
2086 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2087 unless is_in_scope t.name, NS_HTML
2090 generate_implied_end_tags()
2091 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2094 el = open_els.shift()
2095 if el.name is t.name and el.namespace is NS_HTML
2097 clear_afe_to_marker()
2099 if t.type is TYPE_START_TAG and t.name is 'table'
2100 unless doc.flag('quirks mode') is QUIRKS_YES
2101 close_p_if_in_button_scope() # test
2102 insert_html_element t
2103 flag_frameset_ok = false
2104 ins_mode = ins_mode_in_table
2106 if t.type is TYPE_END_TAG and t.name is 'br'
2108 # W3C: t.type = TYPE_START_TAG
2109 t = new_open_tag 'br' # WHATWG
2111 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2113 insert_html_element t
2115 t.acknowledge_self_closing()
2116 flag_frameset_ok = false
2118 if t.type is TYPE_START_TAG and t.name is 'input'
2120 insert_html_element t
2122 t.acknowledge_self_closing()
2123 unless is_input_hidden_tok t
2124 flag_frameset_ok = false
2126 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2127 # WHATWG adds 'menuitem' for this block
2128 insert_html_element t
2130 t.acknowledge_self_closing()
2132 if t.type is TYPE_START_TAG and t.name is 'hr'
2133 close_p_if_in_button_scope()
2134 insert_html_element t
2136 t.acknowledge_self_closing()
2137 flag_frameset_ok = false
2139 if t.type is TYPE_START_TAG and t.name is 'image'
2144 if t.type is TYPE_START_TAG and t.name is 'isindex'
2146 if template_tag_is_open() is false and form_element_pointer isnt null
2148 t.acknowledge_self_closing()
2149 flag_frameset_ok = false
2150 close_p_if_in_button_scope()
2151 el = insert_html_element new_open_tag 'form'
2152 unless template_tag_is_open()
2153 form_element_pointer = el
2156 el.attrs['action'] = a[1]
2158 insert_html_element new_open_tag 'hr'
2161 insert_html_element new_open_tag 'label'
2162 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2163 input_el = new_open_tag 'input'
2168 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2169 input_el.attrs_a.push [a[0], a[1]]
2170 input_el.attrs_a.push ['name', 'isindex']
2171 # fixfull this next bit is in english... internationalize?
2172 prompt ?= "This is a searchable index. Enter search keywords: "
2173 insert_character new_character_token prompt # fixfull split
2174 # TODO submit typo "balue" in spec
2175 insert_html_element input_el
2177 # insert_character '' # you can put chars here if prompt attr missing
2179 insert_html_element new_open_tag 'hr'
2182 unless template_tag_is_open()
2183 form_element_pointer = null
2185 if t.type is TYPE_START_TAG and t.name is 'textarea'
2186 insert_html_element t
2187 eat_next_token_if_newline()
2188 tok_state = tok_state_rcdata
2189 original_ins_mode = ins_mode
2190 flag_frameset_ok = false
2191 ins_mode = ins_mode_text
2193 if t.type is TYPE_START_TAG and t.name is 'xmp'
2194 close_p_if_in_button_scope()
2196 flag_frameset_ok = false
2197 parse_generic_raw_text t
2199 if t.type is TYPE_START_TAG and t.name is 'iframe'
2200 flag_frameset_ok = false
2201 parse_generic_raw_text t
2203 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2204 parse_generic_raw_text t
2206 if t.type is TYPE_START_TAG and t.name is 'select'
2208 insert_html_element t
2209 flag_frameset_ok = false
2210 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2211 ins_mode = ins_mode_in_select_in_table
2213 ins_mode = ins_mode_in_select
2215 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2216 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2219 insert_html_element t
2221 # this comment block implements the W3C spec
2222 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2223 # if is_in_scope 'ruby', NS_HTML
2224 # generate_implied_end_tags()
2225 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2227 # insert_html_element t
2229 # if t.type is TYPE_START_TAG and t.name is 'rt'
2230 # if is_in_scope 'ruby', NS_HTML
2231 # generate_implied_end_tags 'rtc' # arg is exception
2232 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2234 # insert_html_element t
2236 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2237 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2238 if is_in_scope 'ruby', NS_HTML
2239 generate_implied_end_tags()
2240 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2242 insert_html_element t
2244 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2245 if is_in_scope 'ruby', NS_HTML
2246 generate_implied_end_tags 'rtc'
2247 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2249 insert_html_element t
2252 if t.type is TYPE_START_TAG and t.name is 'math'
2254 adjust_mathml_attributes t
2255 adjust_foreign_attributes t
2256 insert_foreign_element t, NS_MATHML
2257 if t.flag 'self-closing'
2259 t.acknowledge_self_closing()
2261 if t.type is TYPE_START_TAG and t.name is 'svg'
2263 adjust_svg_attributes t
2264 adjust_foreign_attributes t
2265 insert_foreign_element t, NS_SVG
2266 if t.flag 'self-closing'
2268 t.acknowledge_self_closing()
2270 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2273 if t.type is TYPE_START_TAG # any other start tag
2275 insert_html_element t
2277 if t.type is TYPE_END_TAG # any other end tag
2278 in_body_any_other_end_tag t.name
2282 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Token handler for the "text" insertion mode; `t` is the token to process.
# The EOF, `</script>` and generic end-tag cases each set ins_mode back to
# original_ins_mode. In the EOF case, an HTML script element that is the
# current node is flagged 'already started'.
2283 ins_mode_text = (t) ->
2284 if t.type is TYPE_TEXT
2287 if t.type is TYPE_EOF
2289 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2290 open_els[0].flag 'already started', true
2292 ins_mode = original_ins_mode
2295 if t.type is TYPE_END_TAG and t.name is 'script'
2297 ins_mode = original_ins_mode
2298 # fixfull the spec seems to assume that I'm going to run the script
2299 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2301 if t.type is TYPE_END_TAG
2303 ins_mode = original_ins_mode
2307 # the functions below implement the tokenizer states described here:
2308 # http://www.w3.org/TR/html5/syntax.html#tokenization
2310 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# "Anything else" fallback shared by the table insertion modes; `t` is the
# token. Turns flag_foster_parenting on and then off again; the step performed
# in between is presumably processing `t` with foster parenting enabled
# (TODO confirm against the full source).
2311 ins_mode_in_table_else = (t) ->
2313 flag_foster_parenting = true
2315 flag_foster_parenting = false
# Token handler for the "in table" insertion mode; `t` is the token to process.
# When the current node is an HTML table/tbody/tfoot/thead/tr element, the
# pending character list is reset and ins_mode switches to
# ins_mode_in_table_text (remembering the current mode in original_ins_mode).
# Table-structure start tags clear the stack back to a table context, insert
# the element (or an implied 'colgroup' / 'tbody') and switch ins_mode to the
# matching table sub-mode. Tokens without dedicated handling are passed to
# ins_mode_in_table_else.
2317 ins_mode_in_table = (t) ->
2320 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2321 pending_table_character_tokens = []
2322 original_ins_mode = ins_mode
2323 ins_mode = ins_mode_in_table_text
2326 ins_mode_in_table_else t
2334 clear_stack_to_table_context()
2336 insert_html_element t
2337 ins_mode = ins_mode_in_caption
2339 clear_stack_to_table_context()
2340 insert_html_element t
2341 ins_mode = ins_mode_in_column_group
2343 clear_stack_to_table_context()
2344 insert_html_element new_open_tag 'colgroup'
2345 ins_mode = ins_mode_in_column_group
2347 when 'tbody', 'tfoot', 'thead'
2348 clear_stack_to_table_context()
2349 insert_html_element t
2350 ins_mode = ins_mode_in_table_body
2351 when 'td', 'th', 'tr'
2352 clear_stack_to_table_context()
2353 insert_html_element new_open_tag 'tbody'
2354 ins_mode = ins_mode_in_table_body
2358 if is_in_table_scope 'table', NS_HTML
2360 el = open_els.shift()
2361 if el.name is 'table' and el.namespace is NS_HTML
2365 when 'style', 'script', 'template'
2368 unless is_input_hidden_tok t
2369 ins_mode_in_table_else t
2372 el = insert_html_element t
2374 t.acknowledge_self_closing()
2377 if form_element_pointer?
2379 if template_tag_is_open()
2381 form_element_pointer = insert_html_element t
2384 ins_mode_in_table_else t
2388 if is_in_table_scope 'table', NS_HTML
2390 el = open_els.shift()
2391 if el.name is 'table' and el.namespace is NS_HTML
2396 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2401 ins_mode_in_table_else t
2405 ins_mode_in_table_else t
2409 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Token handler for the "in table text" insertion mode; `t` is the token.
# Text tokens are appended to pending_table_character_tokens. Otherwise the
# pending tokens are scanned with is_space_tok and then either inserted with
# insert_character or passed one by one to ins_mode_in_table_else (presumably
# the latter when any token is not whitespace — TODO confirm), the pending
# list is emptied and ins_mode is restored from original_ins_mode.
2410 ins_mode_in_table_text = (t) ->
2411 if t.type is TYPE_TEXT and t.text is "\u0000"
2415 if t.type is TYPE_TEXT
2416 pending_table_character_tokens.push t
2420 for old in pending_table_character_tokens
2421 unless is_space_tok old
2425 for old in pending_table_character_tokens
2426 insert_character old
2428 for old in pending_table_character_tokens
2429 ins_mode_in_table_else old
2430 pending_table_character_tokens = []
2431 ins_mode = original_ins_mode
2435 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Token handler for the "in caption" insertion mode; `t` is the token.
# Both the `</caption>` case and the table-structure start tag / `</table>`
# case first require a caption in table scope, then shift elements off
# open_els (testing each for the HTML caption element), call
# clear_afe_to_marker() and set ins_mode to ins_mode_in_table.
# Only the `</caption>` case calls generate_implied_end_tags() first.
2436 ins_mode_in_caption = (t) ->
2437 if t.type is TYPE_END_TAG and t.name is 'caption'
2438 if is_in_table_scope 'caption', NS_HTML
2439 generate_implied_end_tags()
2440 if open_els[0].name isnt 'caption'
2443 el = open_els.shift()
2444 if el.name is 'caption' and el.namespace is NS_HTML
2446 clear_afe_to_marker()
2447 ins_mode = ins_mode_in_table
2452 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2454 if is_in_table_scope 'caption', NS_HTML
2456 el = open_els.shift()
2457 if el.name is 'caption' and el.namespace is NS_HTML
2459 clear_afe_to_marker()
2460 ins_mode = ins_mode_in_table
2462 # else fragment case
2464 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2471 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Token handler for the "in column group" insertion mode; `t` is the token.
# <col> start tags are inserted and their self-closing flag acknowledged.
# `</colgroup>` switches back to ins_mode_in_table when the current node is an
# HTML colgroup element. The final visible case also switches to
# ins_mode_in_table after testing that the current node is a colgroup.
2472 ins_mode_in_column_group = (t) ->
2476 if t.type is TYPE_COMMENT
2479 if t.type is TYPE_DOCTYPE
2482 if t.type is TYPE_START_TAG and t.name is 'html'
2485 if t.type is TYPE_START_TAG and t.name is 'col'
2486 el = insert_html_element t
2488 t.acknowledge_self_closing()
2490 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# The namespace belongs to the current node (open_els[0]), not to the
# open_els array itself; reading it from the array is always undefined, which
# made this test impossible to satisfy.
2491 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2493 ins_mode = ins_mode_in_table
2497 if t.type is TYPE_END_TAG and t.name is 'col'
2500 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2503 if t.type is TYPE_EOF
2507 if open_els[0].name isnt 'colgroup'
2511 ins_mode = ins_mode_in_table
2515 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Token handler for the "in table body" insertion mode; `t` is the token.
# A <tr> start tag is inserted directly and <th>/<td> start tags get an implied
# <tr>; both clear the stack to a table body context first and switch to
# ins_mode_in_row. The section end tags (tbody/tfoot/thead) and the tokens that
# end the section clear the stack to a table body context and set ins_mode to
# ins_mode_in_table.
2516 ins_mode_in_table_body = (t) ->
2517 if t.type is TYPE_START_TAG and t.name is 'tr'
2518 clear_stack_to_table_body_context()
2519 insert_html_element t
2520 ins_mode = ins_mode_in_row
2522 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2524 clear_stack_to_table_body_context()
2525 insert_html_element new_open_tag 'tr'
2526 ins_mode = ins_mode_in_row
2529 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2530 unless is_in_table_scope t.name, NS_HTML
2533 clear_stack_to_table_body_context()
2535 ins_mode = ins_mode_in_table
2537 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2540 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2543 if table_scopers[el.name] is el.namespace
2548 clear_stack_to_table_body_context()
2550 ins_mode = ins_mode_in_table
2553 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2560 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Token handler for the "in row" insertion mode; `t` is the token to process.
# <th>/<td> start tags clear the stack to a table row context, are inserted,
# and switch to ins_mode_in_cell. The cases that end the row (`</tr>`, the
# table-structure start tags / `</table>`, and the section end tags) require a
# 'tr' in table scope, clear the stack to a table row context and set ins_mode
# to ins_mode_in_table_body.
2561 ins_mode_in_row = (t) ->
2562 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2563 clear_stack_to_table_row_context()
2564 insert_html_element t
2565 ins_mode = ins_mode_in_cell
2568 if t.type is TYPE_END_TAG and t.name is 'tr'
2569 if is_in_table_scope 'tr', NS_HTML
2570 clear_stack_to_table_row_context()
2572 ins_mode = ins_mode_in_table_body
2576 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2577 if is_in_table_scope 'tr', NS_HTML
2578 clear_stack_to_table_row_context()
2580 ins_mode = ins_mode_in_table_body
2585 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2586 if is_in_table_scope t.name, NS_HTML
2587 if is_in_table_scope 'tr', NS_HTML
2588 clear_stack_to_table_row_context()
2590 ins_mode = ins_mode_in_table_body
2595 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2602 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Steps for "close the cell" (spec link on the line above): generate implied
# end tags, check that the current node is an HTML td/th element, shift
# elements off open_els (testing each for the td/th element), call
# clear_afe_to_marker() and set ins_mode to ins_mode_in_row.
# NOTE(review): the definition line for these steps is not visible here —
# confirm its name against the full source.
2604 generate_implied_end_tags()
# Compare the current node's *name* with 'th'; comparing the node object itself
# with a string is always false, so <th> cells were never recognised here.
2605 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2608 el = open_els.shift()
2609 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2611 clear_afe_to_marker()
2612 ins_mode = ins_mode_in_row
2615 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Token handler for the "in cell" insertion mode; `t` is the token to process.
# A `</td>` / `</th>` whose element is in table scope generates implied end
# tags, shifts elements off open_els (testing each for the matching HTML cell
# element), calls clear_afe_to_marker() and sets ins_mode to ins_mode_in_row.
# Table-structure start tags look for a td/th on the stack, stopping at
# table_scopers entries; table-related end tags check is_in_table_scope first.
2616 ins_mode_in_cell = (t) ->
2617 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2618 if is_in_table_scope t.name, NS_HTML
2619 generate_implied_end_tags()
2620 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2623 el = open_els.shift()
2624 if el.name is t.name and el.namespace is NS_HTML
2626 clear_afe_to_marker()
2627 ins_mode = ins_mode_in_row
2631 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2634 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2637 if table_scopers[el.name] is el.namespace
2645 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2648 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2649 if is_in_table_scope t.name, NS_HTML
2659 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Token handler for the "in select" insertion mode; `t` is the token.
# <option> / <optgroup> start tags first test whether the current node is an
# HTML option (and, for <optgroup>, an HTML optgroup) before inserting the new
# element. `</optgroup>` and `</option>` test the current node (and the node
# before it) before closing. `</select>`, a nested <select> start tag, and
# <input>/<keygen>/<textarea> start tags shift elements off open_els, testing
# each for the HTML select element.
2660 ins_mode_in_select = (t) ->
2661 if t.type is TYPE_TEXT and t.text is "\u0000"
2664 if t.type is TYPE_TEXT
2667 if t.type is TYPE_COMMENT
2670 if t.type is TYPE_DOCTYPE
2673 if t.type is TYPE_START_TAG and t.name is 'html'
2676 if t.type is TYPE_START_TAG and t.name is 'option'
2677 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2679 insert_html_element t
2681 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2682 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2684 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2686 insert_html_element t
2688 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2689 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# Check the namespace of the same node whose name is tested (the node just
# before the current one); open_els[0]'s namespace is already known here.
2690 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2692 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2697 if t.type is TYPE_END_TAG and t.name is 'option'
2698 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2703 if t.type is TYPE_END_TAG and t.name is 'select'
2704 if is_in_select_scope 'select', NS_HTML
2706 el = open_els.shift()
2707 if el.name is 'select' and el.namespace is NS_HTML
2713 if t.type is TYPE_START_TAG and t.name is 'select'
2716 el = open_els.shift()
2717 if el.name is 'select' and el.namespace is NS_HTML
2720 # spec says that this is the same as </select> but it doesn't say
2721 # to check scope first
2723 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2725 unless is_in_select_scope 'select', NS_HTML
2728 el = open_els.shift()
2729 if el.name is 'select' and el.namespace is NS_HTML
2734 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2737 if t.type is TYPE_EOF
2744 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Token handler for the "in select in table" insertion mode; `t` is the token.
# Table-related start and end tags shift elements off open_els, testing each
# for the HTML select element; the end-tag case first checks
# is_in_table_scope for t.name. ins_mode_in_select is called with `t`
# (presumably for every token not handled above — TODO confirm).
2745 ins_mode_in_select_in_table = (t) ->
2746 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2749 el = open_els.shift()
2750 if el.name is 'select' and el.namespace is NS_HTML
2755 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2757 unless is_in_table_scope t.name, NS_HTML
2760 el = open_els.shift()
2761 if el.name is 'select' and el.namespace is NS_HTML
2767 ins_mode_in_select t
2770 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Token handler for the "in template" insertion mode; `t` is the token.
# For start tags it replaces the first entry of template_ins_modes with the
# mode selected by the tag name (in table, in column group, in table body,
# in row, or in body for any other start tag) and sets ins_mode to that mode.
# At EOF with a template still open, elements are shifted off open_els
# (testing each for the HTML template element), clear_afe_to_marker() is called
# and the first template_ins_modes entry is removed.
2771 ins_mode_in_template = (t) ->
2772 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2775 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2778 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2779 template_ins_modes.shift()
2780 template_ins_modes.unshift ins_mode_in_table
2781 ins_mode = ins_mode_in_table
2784 if t.type is TYPE_START_TAG and t.name is 'col'
2785 template_ins_modes.shift()
2786 template_ins_modes.unshift ins_mode_in_column_group
2787 ins_mode = ins_mode_in_column_group
2790 if t.type is TYPE_START_TAG and t.name is 'tr'
2791 template_ins_modes.shift()
2792 template_ins_modes.unshift ins_mode_in_table_body
2793 ins_mode = ins_mode_in_table_body
2796 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2797 template_ins_modes.shift()
2798 template_ins_modes.unshift ins_mode_in_row
2799 ins_mode = ins_mode_in_row
2802 if t.type is TYPE_START_TAG
2803 template_ins_modes.shift()
2804 template_ins_modes.unshift ins_mode_in_body
2805 ins_mode = ins_mode_in_body
2808 if t.type is TYPE_END_TAG
2811 if t.type is TYPE_EOF
2812 unless template_tag_is_open()
2817 el = open_els.shift()
2818 if el.name is 'template' and el.namespace is NS_HTML
2820 clear_afe_to_marker()
2821 template_ins_modes.shift()
2826 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Token handler for the "after body" insertion mode; `t` is the token.
# Comments are inserted after the last child of the last entry in open_els.
# The `</html>` case checks flag_fragment_parsing and sets ins_mode to
# ins_mode_after_after_body. The final visible statement sets ins_mode back to
# ins_mode_in_body (presumably the "anything else" case — TODO confirm).
2827 ins_mode_after_body = (t) ->
2831 if t.type is TYPE_COMMENT
2832 first = open_els[open_els.length - 1]
2833 insert_comment t, [first, first.children.length]
2835 if t.type is TYPE_DOCTYPE
2838 if t.type is TYPE_START_TAG and t.name is 'html'
2841 if t.type is TYPE_END_TAG and t.name is 'html'
2842 if flag_fragment_parsing
2845 ins_mode = ins_mode_after_after_body
2847 if t.type is TYPE_EOF
2852 ins_mode = ins_mode_in_body
2856 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Token handler for the "in frameset" insertion mode; `t` is the token.
# <frameset> and <frame> start tags are inserted (<frame> also acknowledges its
# self-closing flag). The `</frameset>` case returns early when open_els holds
# a single entry (fragment case); otherwise, when not fragment parsing and the
# current node is no longer a frameset, ins_mode becomes
# ins_mode_after_frameset.
2857 ins_mode_in_frameset = (t) ->
2861 if t.type is TYPE_COMMENT
2864 if t.type is TYPE_DOCTYPE
2867 if t.type is TYPE_START_TAG and t.name is 'html'
2870 if t.type is TYPE_START_TAG and t.name is 'frameset'
2871 insert_html_element t
2873 if t.type is TYPE_END_TAG and t.name is 'frameset'
2874 if open_els.length is 1
2876 return # fragment case
2878 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2879 ins_mode = ins_mode_after_frameset
2881 if t.type is TYPE_START_TAG and t.name is 'frame'
2882 insert_html_element t
2884 t.acknowledge_self_closing()
2886 if t.type is TYPE_START_TAG and t.name is 'noframes'
2889 if t.type is TYPE_EOF
2890 if open_els.length isnt 1
2898 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Token handler for the "after frameset" insertion mode; `t` is the token.
# Branches on comment, doctype, <html>, `</html>`, <noframes> and EOF tokens;
# `</html>` sets ins_mode to ins_mode_after_after_frameset.
2899 ins_mode_after_frameset = (t) ->
2903 if t.type is TYPE_COMMENT
2906 if t.type is TYPE_DOCTYPE
2909 if t.type is TYPE_START_TAG and t.name is 'html'
2912 if t.type is TYPE_END_TAG and t.name is 'html'
2913 ins_mode = ins_mode_after_after_frameset
2915 if t.type is TYPE_START_TAG and t.name is 'noframes'
2918 if t.type is TYPE_EOF
2925 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Token handler for the "after after body" insertion mode; `t` is the token.
# Comments are inserted after the last child of doc. Doctype, whitespace and
# <html> start-tag tokens share one branch. The final visible statement sets
# ins_mode to ins_mode_in_body (presumably the "anything else" case — TODO
# confirm).
2926 ins_mode_after_after_body = (t) ->
2927 if t.type is TYPE_COMMENT
2928 insert_comment t, [doc, doc.children.length]
2930 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2933 if t.type is TYPE_EOF
2938 ins_mode = ins_mode_in_body
2942 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Token handler for the "after after frameset" insertion mode; `t` is the
# token. Comments are inserted after the last child of doc. Doctype,
# whitespace and <html> start-tag tokens share one branch; EOF and <noframes>
# start tags have their own branches.
2943 ins_mode_after_after_frameset = (t) ->
2944 if t.type is TYPE_COMMENT
2945 insert_comment t, [doc, doc.children.length]
2947 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2950 if t.type is TYPE_EOF
2953 if t.type is TYPE_START_TAG and t.name is 'noframes'
2960 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Tests attribute names of token `t` for 'color', 'face' or 'size'.
# in_foreign_content uses the result to decide whether a <font> start tag is
# treated like the other HTML start tags. Presumably returns true on the first
# match and false otherwise — TODO confirm against the full source.
2961 has_color_face_or_size = (t) ->
2963 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Handler invoked by in_foreign_content for a `</script>` end tag whose current
# node is an SVG script element, and by in_foreign_content_other_start for a
# self-closing 'script' start tag. Takes no arguments.
2966 in_foreign_content_end_script = ->
# Handles a start tag `t` in foreign content that is not treated as HTML.
# Adjusts the token for the namespace of the adjusted current node (MathML
# attribute fixes; SVG tag-name fixes from svg_name_fixes and SVG attribute
# fixes), applies adjust_foreign_attributes, and inserts the element in that
# namespace. For self-closing tokens the flag is acknowledged; a self-closing
# 'script' additionally calls in_foreign_content_end_script().
2970 in_foreign_content_other_start = (t) ->
2971 acn = adjusted_current_node()
2972 if acn.namespace is NS_MATHML
2973 adjust_mathml_attributes t
2974 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2975 t.name = svg_name_fixes[t.name]
2976 if acn.namespace is NS_SVG
2977 adjust_svg_attributes t
2978 adjust_foreign_attributes t
2979 insert_foreign_element t, acn.namespace
2980 if t.flag 'self-closing'
2981 if t.name is 'script'
2982 t.acknowledge_self_closing()
2983 in_foreign_content_end_script()
2987 t.acknowledge_self_closing()
# Token handler for tokens in foreign (MathML/SVG) content; `t` is the token.
# A U+0000 text token is replaced by a U+FFFD character; text tokens reaching
# the second text check set flag_frameset_ok to false. Start tags named in the
# long HTML-element list (or <font> with a color, face or size attribute) get
# dedicated handling; in the fragment-parsing case they go to
# in_foreign_content_other_start, as do all other start tags. End tags are
# matched against the stack by lower-cased node name, and when an HTML-namespace
# node is reached the token is handed to the current ins_mode.
2989 in_foreign_content = (t) ->
2990 if t.type is TYPE_TEXT and t.text is "\u0000"
2992 insert_character new_character_token "\ufffd"
2997 if t.type is TYPE_TEXT
2998 flag_frameset_ok = false
3001 if t.type is TYPE_COMMENT
3004 if t.type is TYPE_DOCTYPE
3007 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3009 if flag_fragment_parsing
3010 in_foreign_content_other_start t
3012 loop # is this safe?
3014 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3018 if t.type is TYPE_START_TAG
3019 in_foreign_content_other_start t
3021 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3022 in_foreign_content_end_script()
3024 if t.type is TYPE_END_TAG
3027 if node.name.toLowerCase() isnt t.name
3030 if node is open_els[open_els.length - 1]
3032 if node.name.toLowerCase() is t.name
3034 el = open_els.shift()
3039 if node.namespace is NS_HTML
3041 ins_mode t # explicitly call HTML insertion mode
3045 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Tokenizer data state: reads the next character of txt (advancing cur) and
# returns a text node or EOF token, or switches tok_state to
# tok_state_tag_open.
# NOTE(review): no `tok_state_data = ->` definition line is visible above this
# switch, although tok_state_data is assigned to tok_state elsewhere (e.g. in
# tok_state_tag_open) — confirm the header was not lost.
3047 switch c = txt.charAt(cur++)
3049 return new_text_node parse_character_reference()
3051 tok_state = tok_state_tag_open
3054 return new_text_node c
3056 return new_eof_token()
3058 return new_text_node c
3061 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3062 # not needed: tok_state_character_reference_in_data = ->
3063 # just call parse_character_reference()
3065 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer RCDATA state: reads the next character of txt (advancing cur).
# Depending on the character it returns a text node built from
# parse_character_reference(), a U+FFFD character token, an EOF token or a
# character token for `c`, or it switches tok_state to
# tok_state_rcdata_less_than_sign.
3066 tok_state_rcdata = ->
3067 switch c = txt.charAt(cur++)
3069 return new_text_node parse_character_reference()
3071 tok_state = tok_state_rcdata_less_than_sign
3074 return new_character_token "\ufffd"
3076 return new_eof_token()
3078 return new_character_token c
3081 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3082 # not needed: tok_state_character_reference_in_rcdata = ->
3083 # just call parse_character_reference()
3085 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer RAWTEXT state: reads the next character of txt (advancing cur).
# Depending on the character it returns a U+FFFD character token, an EOF token
# or a character token for `c`, or it switches tok_state to
# tok_state_rawtext_less_than_sign.
3086 tok_state_rawtext = ->
3087 switch c = txt.charAt(cur++)
3089 tok_state = tok_state_rawtext_less_than_sign
3092 return new_character_token "\ufffd"
3094 return new_eof_token()
3096 return new_character_token c
3099 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer script data state: reads the next character of txt (advancing cur).
# Depending on the character it returns a U+FFFD character token, an EOF token
# or a character token for `c`, or it switches tok_state to
# tok_state_script_data_less_than_sign.
3100 tok_state_script_data = ->
3101 switch c = txt.charAt(cur++)
3103 tok_state = tok_state_script_data_less_than_sign
3106 return new_character_token "\ufffd"
3108 return new_eof_token()
3110 return new_character_token c
3113 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer PLAINTEXT state: reads the next character of txt (advancing cur)
# and returns a U+FFFD character token, an EOF token or a character token for
# `c`. No visible branch changes tok_state.
3114 tok_state_plaintext = ->
3115 switch c = txt.charAt(cur++)
3118 return new_character_token "\ufffd"
3120 return new_eof_token()
3122 return new_character_token c
3126 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer tag open state: reads the character after '<' (advancing cur).
# Switches to the markup declaration open or end tag open state, or starts a
# new open tag in tok_cur_tag (lower-casing `c` in one branch) and moves to
# tok_state_tag_name, or starts a bogus comment. In the fallback branch it
# returns to tok_state_data, backs cur up by one and returns '<' as text.
3127 tok_state_tag_open = ->
3128 c = txt.charAt(cur++)
3130 tok_state = tok_state_markup_declaration_open
3133 tok_state = tok_state_end_tag_open
3136 tok_cur_tag = new_open_tag c.toLowerCase()
3137 tok_state = tok_state_tag_name
3140 tok_cur_tag = new_open_tag c
3141 tok_state = tok_state_tag_name
3145 tok_cur_tag = new_comment_token '?' # FIXME right?
3146 tok_state = tok_state_bogus_comment
3150 tok_state = tok_state_data
3151 cur -= 1 # we didn't parse/handle the char after <
3152 return new_text_node '<'
3154 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer end tag open state: reads the character after '</' (advancing
# cur). Starts a new end tag in tok_cur_tag (lower-casing `c` in one branch)
# and moves to tok_state_tag_name, returns to tok_state_data (one branch
# returning '</' as text), or starts a bogus comment seeded with `c`.
3155 tok_state_end_tag_open = ->
3156 c = txt.charAt(cur++)
3158 tok_cur_tag = new_end_tag c.toLowerCase()
3159 tok_state = tok_state_tag_name
3162 tok_cur_tag = new_end_tag c
3163 tok_state = tok_state_tag_name
3167 tok_state = tok_state_data
3171 tok_state = tok_state_data
3172 return new_text_node '</'
3175 tok_cur_tag = new_comment_token c
3176 tok_state = tok_state_bogus_comment
3179 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer tag name state: reads the next character of txt (advancing cur).
# Tab, newline, form feed and space switch to
# tok_state_before_attribute_name; other branches switch to the self-closing
# start tag state or back to tok_state_data, or append to tok_cur_tag.name
# (U+FFFD, the lower-cased character, or the character as-is).
3180 tok_state_tag_name = ->
3181 switch c = txt.charAt(cur++)
3182 when "\t", "\n", "\u000c", ' '
3183 tok_state = tok_state_before_attribute_name
3185 tok_state = tok_state_self_closing_start_tag
3187 tok_state = tok_state_data
3193 tok_cur_tag.name += "\ufffd"
3196 tok_state = tok_state_data
3199 tok_cur_tag.name += c.toLowerCase()
3201 tok_cur_tag.name += c
3204 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer RCDATA less-than sign state: reads the character after '<'.
# One branch empties temporary_buffer and moves to
# tok_state_rcdata_end_tag_open; the other returns to tok_state_rcdata,
# reconsumes the character and returns '<' as a character token.
3205 tok_state_rcdata_less_than_sign = ->
3206 c = txt.charAt(cur++)
3208 temporary_buffer = ''
3209 tok_state = tok_state_rcdata_end_tag_open
3212 tok_state = tok_state_rcdata
3213 cur -= 1 # reconsume the input character
3214 return new_character_token '<'
3216 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer RCDATA end tag open state: reads the character after '</'.
# The two tag-starting branches create an end tag in tok_cur_tag (one
# lower-casing `c`), append `c` to temporary_buffer and move to
# tok_state_rcdata_end_tag_name. The fallback returns to tok_state_rcdata,
# reconsumes the character and returns "</" as a character token.
3217 tok_state_rcdata_end_tag_open = ->
3218 c = txt.charAt(cur++)
3220 tok_cur_tag = new_end_tag c.toLowerCase()
3221 temporary_buffer += c
3222 tok_state = tok_state_rcdata_end_tag_name
3225 tok_cur_tag = new_end_tag c
3226 temporary_buffer += c
3227 tok_state = tok_state_rcdata_end_tag_name
3230 tok_state = tok_state_rcdata
3231 cur -= 1 # reconsume the input character
3232 return new_character_token "</" # fixfull separate these
3234 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of the
# current node (open_els[0]); used by the RCDATA/RAWTEXT/script-data end tag
# name states to decide whether an end tag closes the current element.
3235 is_appropriate_end_tag = (t) ->
3236 # fixfull: this assumes that open_els[0].name is "the tag name of the last
3237 # start tag to have been emitted from this tokenizer"
3238 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3240 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer RCDATA end tag name state: reads the next character of txt.
# When tok_cur_tag is an appropriate end tag, whitespace moves to
# tok_state_before_attribute_name and two further cases move to the
# self-closing start tag state or back to tok_state_data. Name characters are
# appended to both tok_cur_tag.name (lower-cased in one branch) and
# temporary_buffer. The fallback returns to tok_state_rcdata, reconsumes the
# character and returns '</' plus temporary_buffer as a character token.
3241 tok_state_rcdata_end_tag_name = ->
3242 c = txt.charAt(cur++)
3243 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3244 if is_appropriate_end_tag tok_cur_tag
3245 tok_state = tok_state_before_attribute_name
3247 # else fall through to "Anything else"
3249 if is_appropriate_end_tag tok_cur_tag
3250 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3252 # else fall through to "Anything else"
3254 if is_appropriate_end_tag tok_cur_tag
3255 tok_state = tok_state_data
3257 # else fall through to "Anything else"
3259 tok_cur_tag.name += c.toLowerCase()
3260 temporary_buffer += c
3263 tok_cur_tag.name += c
3264 temporary_buffer += c
3267 tok_state = tok_state_rcdata
3268 cur -= 1 # reconsume the input character
3269 return new_character_token '</' + temporary_buffer # fixfull separate these
3271 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer RAWTEXT less-than sign state: reads the character after '<'.
# One branch empties temporary_buffer and moves to
# tok_state_rawtext_end_tag_open; the other returns to tok_state_rawtext,
# reconsumes the character and returns '<' as a character token.
3272 tok_state_rawtext_less_than_sign = ->
3273 c = txt.charAt(cur++)
3275 temporary_buffer = ''
3276 tok_state = tok_state_rawtext_end_tag_open
3279 tok_state = tok_state_rawtext
3280 cur -= 1 # reconsume the input character
3281 return new_character_token '<'
3283 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer RAWTEXT end tag open state: reads the character after '</'.
# The two tag-starting branches create an end tag in tok_cur_tag (one
# lower-casing `c`), append `c` to temporary_buffer and move to
# tok_state_rawtext_end_tag_name. The fallback returns to tok_state_rawtext,
# reconsumes the character and returns "</" as a character token.
3284 tok_state_rawtext_end_tag_open = ->
3285 c = txt.charAt(cur++)
3287 tok_cur_tag = new_end_tag c.toLowerCase()
3288 temporary_buffer += c
3289 tok_state = tok_state_rawtext_end_tag_name
3292 tok_cur_tag = new_end_tag c
3293 temporary_buffer += c
3294 tok_state = tok_state_rawtext_end_tag_name
3297 tok_state = tok_state_rawtext
3298 cur -= 1 # reconsume the input character
3299 return new_character_token "</" # fixfull separate these
3301 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer RAWTEXT end tag name state: reads the next character of txt.
# When tok_cur_tag is an appropriate end tag, whitespace moves to
# tok_state_before_attribute_name and two further cases move to the
# self-closing start tag state or back to tok_state_data. Name characters are
# appended to both tok_cur_tag.name (lower-cased in one branch) and
# temporary_buffer. The fallback returns to tok_state_rawtext, reconsumes the
# character and returns '</' plus temporary_buffer as a character token.
3302 tok_state_rawtext_end_tag_name = ->
3303 c = txt.charAt(cur++)
3304 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3305 if is_appropriate_end_tag tok_cur_tag
3306 tok_state = tok_state_before_attribute_name
3308 # else fall through to "Anything else"
3310 if is_appropriate_end_tag tok_cur_tag
3311 tok_state = tok_state_self_closing_start_tag
3313 # else fall through to "Anything else"
3315 if is_appropriate_end_tag tok_cur_tag
3316 tok_state = tok_state_data
3318 # else fall through to "Anything else"
3320 tok_cur_tag.name += c.toLowerCase()
3321 temporary_buffer += c
3324 tok_cur_tag.name += c
3325 temporary_buffer += c
3328 tok_state = tok_state_rawtext
3329 cur -= 1 # reconsume the input character
3330 return new_character_token '</' + temporary_buffer # fixfull separate these
3332 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer script data less-than sign state: reads the character after '<'.
# One branch empties temporary_buffer and moves to
# tok_state_script_data_end_tag_open; another moves to
# tok_state_script_data_escape_start and returns '<!' as a character token;
# the fallback returns to tok_state_script_data, reconsumes the character and
# returns '<'.
3333 tok_state_script_data_less_than_sign = ->
3334 c = txt.charAt(cur++)
3336 temporary_buffer = ''
3337 tok_state = tok_state_script_data_end_tag_open
3340 tok_state = tok_state_script_data_escape_start
3341 return new_character_token '<!' # fixfull split
3343 tok_state = tok_state_script_data
3344 cur -= 1 # Reconsume
3345 return new_character_token '<'
3347 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Script data end-tag-open state. Two visible paths create an end tag token
# from the consumed character (lower-cased on one path), record the raw
# character in temporary_buffer and move to the script-data end-tag-name state.
# The fallback returns to script data, reconsumes, and returns '</' as text.
# NOTE(review): the letter-case guards are not visible in this view.
3348 tok_state_script_data_end_tag_open = ->
3349 c = txt.charAt(cur++)
3351 tok_cur_tag = new_end_tag c.toLowerCase()
3352 temporary_buffer += c
3353 tok_state = tok_state_script_data_end_tag_name
3356 tok_cur_tag = new_end_tag c
3357 temporary_buffer += c
3358 tok_state = tok_state_script_data_end_tag_name
3361 tok_state = tok_state_script_data
3362 cur -= 1 # Reconsume
3363 return new_character_token '</'
3365 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Script data end-tag-name state. When tok_cur_tag is an appropriate end tag,
# terminators switch to before-attribute-name, self-closing-start-tag or data.
# Letters extend tok_cur_tag.name (lower-cased on one path) and append the raw
# character to temporary_buffer. The fallback returns to script data,
# reconsumes, and returns "</" followed by the buffered characters as text.
# NOTE(review): only the whitespace guard is visible; the other guards are not.
3366 tok_state_script_data_end_tag_name = ->
3367 c = txt.charAt(cur++)
3368 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3369 if is_appropriate_end_tag tok_cur_tag
3370 tok_state = tok_state_before_attribute_name
3374 if is_appropriate_end_tag tok_cur_tag
3375 tok_state = tok_state_self_closing_start_tag
3379 if is_appropriate_end_tag tok_cur_tag
3380 tok_state = tok_state_data
3384 tok_cur_tag.name += c.toLowerCase()
3385 temporary_buffer += c
3388 tok_cur_tag.name += c
3389 temporary_buffer += c
3392 tok_state = tok_state_script_data
3393 cur -= 1 # Reconsume
3394 return new_character_token "</#{temporary_buffer}" # fixfull split
3396 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Script data escape-start state. One path moves to the escape-start-dash
# state and returns '-' as text; the other returns to script data and
# reconsumes the character.
# NOTE(review): the guard for the '-' path is not visible in this view.
3397 tok_state_script_data_escape_start = ->
3398 c = txt.charAt(cur++)
3400 tok_state = tok_state_script_data_escape_start_dash
3401 return new_character_token '-'
3403 tok_state = tok_state_script_data
3404 cur -= 1 # Reconsume
3407 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Script data escape-start-dash state. One path moves to the escaped-dash-dash
# state and returns '-' as text; the other returns to script data and
# reconsumes the character.
# NOTE(review): the guard for the '-' path is not visible in this view.
3408 tok_state_script_data_escape_start_dash = ->
3409 c = txt.charAt(cur++)
3411 tok_state = tok_state_script_data_escaped_dash_dash
3412 return new_character_token '-'
3414 tok_state = tok_state_script_data
3415 cur -= 1 # Reconsume
3418 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Script data escaped state. Visible paths: switch to escaped-dash and return
# '-'; switch to the escaped less-than-sign state; return U+FFFD as text;
# switch to the data state and reconsume; or return the character unchanged.
# NOTE(review): the guards for each path are not visible in this view.
3419 tok_state_script_data_escaped = ->
3420 c = txt.charAt(cur++)
3422 tok_state = tok_state_script_data_escaped_dash
3423 return new_character_token '-'
3425 tok_state = tok_state_script_data_escaped_less_than_sign
3429 return new_character_token "\ufffd"
3431 tok_state = tok_state_data
3433 cur -= 1 # Reconsume
3436 return new_character_token c
3438 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Script data escaped-dash state. Visible paths: move to escaped-dash-dash and
# return '-'; move to the escaped less-than-sign state; return to the escaped
# state with U+FFFD as text; switch to data and reconsume; or return to the
# escaped state and return the character unchanged.
# NOTE(review): the guards for each path are not visible in this view.
3439 tok_state_script_data_escaped_dash = ->
3440 c = txt.charAt(cur++)
3442 tok_state = tok_state_script_data_escaped_dash_dash
3443 return new_character_token '-'
3445 tok_state = tok_state_script_data_escaped_less_than_sign
3449 tok_state = tok_state_script_data_escaped
3450 return new_character_token "\ufffd"
3452 tok_state = tok_state_data
3454 cur -= 1 # Reconsume
3457 tok_state = tok_state_script_data_escaped
3458 return new_character_token c
3460 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Script data escaped-dash-dash state. Visible paths: return '-' while staying
# in this state; move to the escaped less-than-sign state; return to script
# data with '>' as text; return to the escaped state with U+FFFD; switch to
# data and reconsume; or return to the escaped state with the character as-is.
# NOTE(review): the guards for each path are not visible in this view.
3461 tok_state_script_data_escaped_dash_dash = ->
3462 c = txt.charAt(cur++)
3464 return new_character_token '-'
3466 tok_state = tok_state_script_data_escaped_less_than_sign
3469 tok_state = tok_state_script_data
3470 return new_character_token '>'
3473 tok_state = tok_state_script_data_escaped
3474 return new_character_token "\ufffd"
3477 tok_state = tok_state_data
3478 cur -= 1 # Reconsume
3481 tok_state = tok_state_script_data_escaped
3482 return new_character_token c
3484 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Script data escaped less-than-sign state. Visible paths: reset
# temporary_buffer and enter the escaped end-tag-open state; seed
# temporary_buffer with the character (lower-cased on one path), enter the
# double-escape-start state and return "<" plus the character as text; or
# return to the escaped state, reconsume, and return '<'.
# NOTE(review): the guards for each path are not visible in this view.
3485 tok_state_script_data_escaped_less_than_sign = ->
3486 c = txt.charAt(cur++)
3488 temporary_buffer = ''
3489 tok_state = tok_state_script_data_escaped_end_tag_open
3492 temporary_buffer = c.toLowerCase() # yes, really
3493 tok_state = tok_state_script_data_double_escape_start
3494 return new_character_token "<#{c}" # fixfull split
3496 temporary_buffer = c
3497 tok_state = tok_state_script_data_double_escape_start
3498 return new_character_token "<#{c}" # fixfull split
3500 tok_state = tok_state_script_data_escaped
3501 cur -= 1 # Reconsume
3502 return new_character_token '<'
3504 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Script data escaped end-tag-open state. Two visible paths create an end tag
# token from the character (lower-cased on one path), append the raw character
# to temporary_buffer and enter the escaped end-tag-name state. The fallback
# returns to the escaped state, reconsumes, and returns '</' as text.
# NOTE(review): the letter-case guards are not visible in this view.
3505 tok_state_script_data_escaped_end_tag_open = ->
3506 c = txt.charAt(cur++)
3508 tok_cur_tag = new_end_tag c.toLowerCase()
3509 temporary_buffer += c
3510 tok_state = tok_state_script_data_escaped_end_tag_name
3513 tok_cur_tag = new_end_tag c
3514 temporary_buffer += c
3515 tok_state = tok_state_script_data_escaped_end_tag_name
3518 tok_state = tok_state_script_data_escaped
3519 cur -= 1 # Reconsume
3520 return new_character_token '</' # fixfull split
3522 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Script data escaped end-tag-name state. When tok_cur_tag is an appropriate
# end tag, terminators switch to before-attribute-name, self-closing-start-tag
# or data. Letters extend tok_cur_tag.name (lower-cased on one path) and are
# appended to temporary_buffer. The fallback returns to the escaped state,
# reconsumes, and returns "</" followed by the buffered characters as text.
# NOTE(review): only the whitespace guard is visible; the other guards are not.
3523 tok_state_script_data_escaped_end_tag_name = ->
3524 c = txt.charAt(cur++)
3525 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3526 if is_appropriate_end_tag tok_cur_tag
3527 tok_state = tok_state_before_attribute_name
3531 if is_appropriate_end_tag tok_cur_tag
3532 tok_state = tok_state_self_closing_start_tag
3536 if is_appropriate_end_tag tok_cur_tag
3537 tok_state = tok_state_data
3541 tok_cur_tag.name += c.toLowerCase()
# Only the tag name is case-folded. temporary_buffer must keep the character
# exactly as written, because it is replayed verbatim in the "</..." text
# token below when this turns out not to be an end tag (this also matches the
# RAWTEXT and script-data end-tag-name states).
3542 temporary_buffer += c
3545 tok_cur_tag.name += c
3546 temporary_buffer += c
3549 tok_state = tok_state_script_data_escaped
3550 cur -= 1 # Reconsume
3551 return new_character_token "</#{temporary_buffer}" # fixfull split
3553 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Script data double-escape-start state. On whitespace, '/' or '>' the state
# becomes double-escaped when temporary_buffer equals 'script' and escaped
# otherwise, and the character is returned as text. Two further paths append
# the character to temporary_buffer (lower-cased on one path) and return it as
# text. The fallback returns to the escaped state and reconsumes.
# NOTE(review): the `else` and letter guards are not visible in this view.
3554 tok_state_script_data_double_escape_start = ->
3555 c = txt.charAt(cur++)
3556 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3557 if temporary_buffer is 'script'
3558 tok_state = tok_state_script_data_double_escaped
3560 tok_state = tok_state_script_data_escaped
3561 return new_character_token c
3563 temporary_buffer += c.toLowerCase() # yes, really lowercase
3564 return new_character_token c
3566 temporary_buffer += c
3567 return new_character_token c
3569 tok_state = tok_state_script_data_escaped
3570 cur -= 1 # Reconsume
3573 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Script data double-escaped state. Visible paths: move to double-escaped-dash
# and return '-'; move to the double-escaped less-than-sign state and return
# '<'; return U+FFFD as text; switch to data and reconsume; or return the
# character unchanged.
# NOTE(review): the guards for each path are not visible in this view.
3574 tok_state_script_data_double_escaped = ->
3575 c = txt.charAt(cur++)
3577 tok_state = tok_state_script_data_double_escaped_dash
3578 return new_character_token '-'
3580 tok_state = tok_state_script_data_double_escaped_less_than_sign
3581 return new_character_token '<'
3584 return new_character_token "\ufffd"
3587 tok_state = tok_state_data
3588 cur -= 1 # Reconsume
3591 return new_character_token c
3593 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Script data double-escaped-dash state. Visible paths: move to
# double-escaped-dash-dash and return '-'; move to the double-escaped
# less-than-sign state and return '<'; return to double-escaped with U+FFFD;
# switch to data and reconsume; or return to double-escaped with the character.
# NOTE(review): the guards for each path are not visible in this view.
3594 tok_state_script_data_double_escaped_dash = ->
3595 c = txt.charAt(cur++)
3597 tok_state = tok_state_script_data_double_escaped_dash_dash
3598 return new_character_token '-'
3600 tok_state = tok_state_script_data_double_escaped_less_than_sign
3601 return new_character_token '<'
3604 tok_state = tok_state_script_data_double_escaped
3605 return new_character_token "\ufffd"
3608 tok_state = tok_state_data
3609 cur -= 1 # Reconsume
3612 tok_state = tok_state_script_data_double_escaped
3613 return new_character_token c
3615 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Script data double-escaped-dash-dash state. Visible paths: return '-' while
# staying in this state; move to the double-escaped less-than-sign state and
# return '<'; return to script data with '>'; return to double-escaped with
# U+FFFD; switch to data and reconsume; or return to double-escaped with the
# character unchanged.
# NOTE(review): the guards for each path are not visible in this view.
3616 tok_state_script_data_double_escaped_dash_dash = ->
3617 c = txt.charAt(cur++)
3619 return new_character_token '-'
3621 tok_state = tok_state_script_data_double_escaped_less_than_sign
3622 return new_character_token '<'
3624 tok_state = tok_state_script_data
3625 return new_character_token '>'
3628 tok_state = tok_state_script_data_double_escaped
3629 return new_character_token "\ufffd"
3632 tok_state = tok_state_data
3633 cur -= 1 # Reconsume
3636 tok_state = tok_state_script_data_double_escaped
3637 return new_character_token c
3639 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Script data double-escaped less-than-sign state. One path resets
# temporary_buffer, enters the double-escape-end state and returns '/' as
# text; the other returns to double-escaped and reconsumes the character.
# NOTE(review): the guard for the '/' path is not visible in this view.
3640 tok_state_script_data_double_escaped_less_than_sign = ->
3641 c = txt.charAt(cur++)
3643 temporary_buffer = ''
3644 tok_state = tok_state_script_data_double_escape_end
3645 return new_character_token '/'
3647 tok_state = tok_state_script_data_double_escaped
3648 cur -= 1 # Reconsume
3651 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Script data double-escape-end state. On whitespace, '/' or '>' the state
# becomes escaped when temporary_buffer equals 'script' and double-escaped
# otherwise, and the character is returned as text. Two further paths append
# the character to temporary_buffer (lower-cased on one path) and return it as
# text. The fallback returns to double-escaped and reconsumes.
# NOTE(review): the `else` and letter guards are not visible in this view.
3652 tok_state_script_data_double_escape_end = ->
3653 c = txt.charAt(cur++)
3654 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3655 if temporary_buffer is 'script'
3656 tok_state = tok_state_script_data_escaped
3658 tok_state = tok_state_script_data_double_escaped
3659 return new_character_token c
3661 temporary_buffer += c.toLowerCase() # yes, really lowercase
3662 return new_character_token c
3664 temporary_buffer += c
3665 return new_character_token c
3667 tok_state = tok_state_script_data_double_escaped
3668 cur -= 1 # Reconsume
3671 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Before-attribute-name state. Dispatches on the consumed character: visible
# paths switch to self-closing-start-tag or data, or pick the first character
# of a new attribute name (U+FFFD on one path, the lower-cased character on
# another). A new [name, ''] pair is then pushed onto the FRONT of
# tok_cur_tag.attrs_a, so index 0 is always the attribute being built, and the
# state moves to attribute-name.
# NOTE(review): most `when` guards and returns are not visible in this view.
3672 tok_state_before_attribute_name = ->
3674 switch c = txt.charAt(cur++)
3675 when "\t", "\n", "\u000c", ' '
3678 tok_state = tok_state_self_closing_start_tag
3681 tok_state = tok_state_data
3687 attr_name = "\ufffd"
3688 when '"', "'", '<', '='
3693 tok_state = tok_state_data
3696 attr_name = c.toLowerCase()
3700 tok_cur_tag.attrs_a.unshift [attr_name, '']
3701 tok_state = tok_state_attribute_name
3704 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Attribute-name state. Whitespace moves to after-attribute-name; other visible
# paths move to self-closing-start-tag, before-attribute-value or data, or
# extend the name of the attribute at tok_cur_tag.attrs_a[0] (with U+FFFD, the
# raw character, or its lower-cased form).
# NOTE(review): most `when` guards are not visible in this view.
3705 tok_state_attribute_name = ->
3706 switch c = txt.charAt(cur++)
3707 when "\t", "\n", "\u000c", ' '
3708 tok_state = tok_state_after_attribute_name
3710 tok_state = tok_state_self_closing_start_tag
3712 tok_state = tok_state_before_attribute_value
3714 tok_state = tok_state_data
3720 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3723 tok_cur_tag.attrs_a[0][0] += c
3726 tok_state = tok_state_data
3729 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3731 tok_cur_tag.attrs_a[0][0] += c
3734 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# After-attribute-name state. Visible paths switch to self-closing-start-tag,
# before-attribute-value or data (one of the data paths reconsumes), or start a
# new attribute at the front of tok_cur_tag.attrs_a (named with the lower-cased
# character, U+FFFD, or the raw character) and move to attribute-name.
# NOTE(review): apart from the whitespace and quote/'<' checks, the guards are
# not visible in this view.
3735 tok_state_after_attribute_name = ->
3736 c = txt.charAt(cur++)
3737 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3740 tok_state = tok_state_self_closing_start_tag
3743 tok_state = tok_state_before_attribute_value
3746 tok_state = tok_state_data
3749 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3750 tok_state = tok_state_attribute_name
3754 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3755 tok_state = tok_state_attribute_name
3759 tok_state = tok_state_data
3760 cur -= 1 # reconsume
3762 if c is '"' or c is "'" or c is '<'
3764 # fall through to Anything else
3766 tok_cur_tag.attrs_a.unshift [c, '']
3767 tok_state = tok_state_attribute_name
3770 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Before-attribute-value state. Visible paths choose the double-quoted,
# single-quoted or unquoted attribute-value state, switch to data, or begin an
# unquoted value by appending U+FFFD or the raw character to the value slot of
# the attribute at tok_cur_tag.attrs_a[0].
# NOTE(review): most `when` guards are not visible in this view.
3771 tok_state_before_attribute_value = ->
3772 switch c = txt.charAt(cur++)
3773 when "\t", "\n", "\u000c", ' '
3776 tok_state = tok_state_attribute_value_double_quoted
3778 tok_state = tok_state_attribute_value_unquoted
3781 tok_state = tok_state_attribute_value_single_quoted
3784 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3785 tok_state = tok_state_attribute_value_unquoted
3788 tok_state = tok_state_data
3794 tok_state = tok_state_data
3796 tok_cur_tag.attrs_a[0][1] += c
3797 tok_state = tok_state_attribute_value_unquoted
3800 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Attribute-value (double-quoted) state. Visible paths: move to
# after-attribute-value-quoted; append the result of parse_character_reference
# (called with '"' as the allowed character and in_attr = true) to the current
# attribute value; append U+FFFD; switch to data; or append the raw character.
# NOTE(review): the `when` guards are not visible in this view.
3801 tok_state_attribute_value_double_quoted = ->
3802 switch c = txt.charAt(cur++)
3804 tok_state = tok_state_after_attribute_value_quoted
3806 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3809 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3812 tok_state = tok_state_data
3814 tok_cur_tag.attrs_a[0][1] += c
3817 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Attribute-value (single-quoted) state. Visible paths: move to
# after-attribute-value-quoted; append the result of parse_character_reference
# (called with "'" as the allowed character and in_attr = true) to the current
# attribute value; append U+FFFD; switch to data; or append the raw character.
# NOTE(review): the `when` guards are not visible in this view.
3818 tok_state_attribute_value_single_quoted = ->
3819 switch c = txt.charAt(cur++)
3821 tok_state = tok_state_after_attribute_value_quoted
3823 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3826 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3829 tok_state = tok_state_data
3831 tok_cur_tag.attrs_a[0][1] += c
3834 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Attribute-value (unquoted) state. Whitespace ends the value and moves to
# before-attribute-name. Other visible paths append the result of
# parse_character_reference (allowed character '>', in_attr = true), switch to
# data, append U+FFFD, or append the raw character to the current value.
# NOTE(review): apart from whitespace, the `when` guards are not visible here.
3835 tok_state_attribute_value_unquoted = ->
3836 switch c = txt.charAt(cur++)
3837 when "\t", "\n", "\u000c", ' '
3838 tok_state = tok_state_before_attribute_name
3840 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3842 tok_state = tok_state_data
3847 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3850 tok_state = tok_state_data
3852 # Parse Error if ', <, = or ` (backtick)
3853 tok_cur_tag.attrs_a[0][1] += c
3856 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# After-attribute-value (quoted) state. Whitespace moves to
# before-attribute-name; other visible paths move to self-closing-start-tag or
# data. The last path moves to before-attribute-name and steps cur back so the
# character is processed again there.
# NOTE(review): apart from whitespace, the `when` guards are not visible here.
3857 tok_state_after_attribute_value_quoted = ->
3858 switch c = txt.charAt(cur++)
3859 when "\t", "\n", "\u000c", ' '
3860 tok_state = tok_state_before_attribute_name
3862 tok_state = tok_state_self_closing_start_tag
3864 tok_state = tok_state_data
3870 tok_state = tok_state_data
3873 tok_state = tok_state_before_attribute_name
3874 cur -= 1 # we didn't handle that char
3877 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Self-closing start-tag state. One path sets the 'self-closing' flag on
# tok_cur_tag and switches to data; another switches to data and reconsumes;
# the last returns to before-attribute-name and reconsumes.
# NOTE(review): the guards for each path are not visible in this view.
3878 tok_state_self_closing_start_tag = ->
3879 c = txt.charAt(cur++)
3881 tok_cur_tag.flag 'self-closing', true
3882 tok_state = tok_state_data
3886 tok_state = tok_state_data
3887 cur -= 1 # Reconsume
3891 tok_state = tok_state_before_attribute_name
3892 cur -= 1 # Reconsume
3895 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3896 # WARNING: put a comment token in tok_cur_tag before setting this state
# Bogus-comment state. Looks for the next '>' at or after cur and takes either
# the rest of txt or the text up to that '>' as the comment body. NUL
# characters in the body are replaced with U+FFFD, the body is appended to
# tok_cur_tag.text, and the tokenizer returns to the data state.
# NOTE(review): the not-found test and the updates to cur are not visible here.
3897 tok_state_bogus_comment = ->
3898 next_gt = txt.indexOf '>', cur
3900 val = txt.substr cur
3903 val = txt.substr cur, (next_gt - cur)
3905 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3906 tok_cur_tag.text += val
3907 tok_state = tok_state_data
3910 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Markup-declaration-open state. Peeks at txt from cur without a per-character
# switch: '--' starts an empty comment token and enters comment-start; a
# case-insensitive 'doctype' enters the doctype state; '[CDATA[' enters the
# CDATA section state, but only when the adjusted current node exists and is
# not in the HTML namespace. Otherwise an empty comment token is created and
# the bogus-comment state takes over.
# NOTE(review): the lines advancing cur past each keyword are not visible here.
3911 tok_state_markup_declaration_open = ->
3912 if txt.substr(cur, 2) is '--'
3914 tok_cur_tag = new_comment_token ''
3915 tok_state = tok_state_comment_start
3917 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3919 tok_state = tok_state_doctype
3921 acn = adjusted_current_node()
3922 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3924 tok_state = tok_state_cdata_section
3928 tok_cur_tag = new_comment_token ''
3929 tok_state = tok_state_bogus_comment
3932 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Comment-start state. Visible paths: move to comment-start-dash; append
# U+FFFD to the comment text and move to the comment state; switch to data
# (one of the two data paths reconsumes); or append the raw character to the
# comment text and move to the comment state.
# NOTE(review): the `when` guards are not visible in this view; the U+FFFD path
# is presumably the NUL case — confirm against the full file.
3933 tok_state_comment_start = ->
3934 switch c = txt.charAt(cur++)
3936 tok_state = tok_state_comment_start_dash
3939 tok_state = tok_state_comment
# The replacement character belongs to the comment's data, not to the
# surrounding text: append it to the comment token (as the comment-start-dash
# and comment states do) instead of emitting a separate character token.
3940 tok_cur_tag.text += "\ufffd"
3943 tok_state = tok_state_data
3947 tok_state = tok_state_data
3948 cur -= 1 # Reconsume
3951 tok_cur_tag.text += c
3952 tok_state = tok_state_comment
3955 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Comment-start-dash state. Visible paths: move to comment-end; append '-' plus
# U+FFFD to the comment text and move to comment; switch to data (one of the
# two data paths reconsumes); or append '-' plus the raw character and move to
# the comment state.
# NOTE(review): the `when` guards are not visible in this view.
3956 tok_state_comment_start_dash = ->
3957 switch c = txt.charAt(cur++)
3959 tok_state = tok_state_comment_end
3962 tok_cur_tag.text += "-\ufffd"
3963 tok_state = tok_state_comment
3966 tok_state = tok_state_data
3970 tok_state = tok_state_data
3971 cur -= 1 # Reconsume
3974 tok_cur_tag.text += "-#{c}"
3975 tok_state = tok_state_comment
3978 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Comment state. Visible paths: move to comment-end-dash; append U+FFFD to the
# comment text; switch to data and reconsume; or append the raw character.
# NOTE(review): the `when` guards are not visible in this view.
3979 tok_state_comment = ->
3980 switch c = txt.charAt(cur++)
3982 tok_state = tok_state_comment_end_dash
3985 tok_cur_tag.text += "\ufffd"
3988 tok_state = tok_state_data
3989 cur -= 1 # Reconsume
3992 tok_cur_tag.text += c
3995 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Comment-end-dash state (one '-' seen inside a comment). Visible paths: move
# to comment-end; append '-' plus U+FFFD and return to comment; switch to data
# and reconsume; or append the pending '-' plus the raw character and return
# to the comment state.
# NOTE(review): the `when` guards are not visible in this view.
3996 tok_state_comment_end_dash = ->
3997 switch c = txt.charAt(cur++)
3999 tok_state = tok_state_comment_end
4002 tok_cur_tag.text += "-\ufffd"
4003 tok_state = tok_state_comment
4006 tok_state = tok_state_data
4007 cur -= 1 # Reconsume
4010 tok_cur_tag.text += "-#{c}"
4011 tok_state = tok_state_comment
4014 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Comment-end state ('--' seen inside a comment). Visible paths: switch to
# data; append '--' plus U+FFFD and return to comment; move to
# comment-end-bang; append a single '-' while staying here; switch to data and
# reconsume; or append '--' plus the raw character and return to comment.
# NOTE(review): the `when` guards are not visible in this view.
4015 tok_state_comment_end = ->
4016 switch c = txt.charAt(cur++)
4018 tok_state = tok_state_data
4022 tok_cur_tag.text += "--\ufffd"
4023 tok_state = tok_state_comment
4026 tok_state = tok_state_comment_end_bang
4029 tok_cur_tag.text += '-'
4032 tok_state = tok_state_data
4033 cur -= 1 # Reconsume
4037 tok_cur_tag.text += "--#{c}"
4038 tok_state = tok_state_comment
4041 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Comment-end-bang state ('--!' seen inside a comment). Visible paths: append
# '--!' and move to comment-end-dash; switch to data; append '--!' plus U+FFFD
# and return to comment; switch to data and reconsume; or append '--!' plus the
# raw character and return to the comment state.
# NOTE(review): the `when` guards are not visible in this view; the path into
# comment-end-dash is presumably the '-' case — confirm against the full file.
4042 tok_state_comment_end_bang = ->
4043 switch c = txt.charAt(cur++)
# Append only "--!" here. The comment-end-dash state adds the pending '-'
# itself when the comment continues, so appending the consumed character as
# well would put the dash into the comment text twice.
4045 tok_cur_tag.text += "--!"
4046 tok_state = tok_state_comment_end_dash
4048 tok_state = tok_state_data
4052 tok_cur_tag.text += "--!\ufffd"
4053 tok_state = tok_state_comment
4056 tok_state = tok_state_data
4057 cur -= 1 # Reconsume
4060 tok_cur_tag.text += "--!#{c}"
4061 tok_state = tok_state_comment
4064 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# DOCTYPE state. Whitespace moves to before-doctype-name. Another path switches
# to data, builds an empty doctype token flagged 'force-quirks' and reconsumes.
# The last path moves to before-doctype-name and reconsumes the character.
# NOTE(review): the remaining `when` guards and the returns are not visible.
4065 tok_state_doctype = ->
4066 switch c = txt.charAt(cur++)
4067 when "\t", "\u000a", "\u000c", ' '
4068 tok_state = tok_state_before_doctype_name
4071 tok_state = tok_state_data
4072 el = new_doctype_token ''
4073 el.flag 'force-quirks', true
4074 cur -= 1 # Reconsume
4078 tok_state = tok_state_before_doctype_name
4079 cur -= 1 # Reconsume
4082 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Before-doctype-name state. Visible paths start a doctype token in tok_cur_tag
# named with the lower-cased character, U+FFFD, or the raw character, and move
# to doctype-name. Two other paths build an empty doctype token flagged
# 'force-quirks' and switch to data (one of them reconsumes).
# NOTE(review): apart from whitespace, the guards and returns are not visible.
4083 tok_state_before_doctype_name = ->
4084 c = txt.charAt(cur++)
4085 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4088 tok_cur_tag = new_doctype_token c.toLowerCase()
4089 tok_state = tok_state_doctype_name
4093 tok_cur_tag = new_doctype_token "\ufffd"
4094 tok_state = tok_state_doctype_name
4098 el = new_doctype_token ''
4099 el.flag 'force-quirks', true
4100 tok_state = tok_state_data
4104 tok_state = tok_state_data
4105 el = new_doctype_token ''
4106 el.flag 'force-quirks', true
4107 cur -= 1 # Reconsume
4110 tok_cur_tag = new_doctype_token c
4111 tok_state = tok_state_doctype_name
4114 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# DOCTYPE-name state. Whitespace moves to after-doctype-name. Other visible
# paths switch to data, extend tok_cur_tag.name (lower-cased character, U+FFFD,
# or raw character), or set 'force-quirks', switch to data and reconsume.
# NOTE(review): apart from whitespace, the guards are not visible in this view.
4115 tok_state_doctype_name = ->
4116 c = txt.charAt(cur++)
4117 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4118 tok_state = tok_state_after_doctype_name
4121 tok_state = tok_state_data
4124 tok_cur_tag.name += c.toLowerCase()
4128 tok_cur_tag.name += "\ufffd"
4132 tok_state = tok_state_data
4133 tok_cur_tag.flag 'force-quirks', true
4134 cur -= 1 # Reconsume
4137 tok_cur_tag.name += c
4140 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# After-doctype-name state. Visible paths switch to data (one sets
# 'force-quirks' and reconsumes). Otherwise the six characters starting at the
# one just consumed (cur - 1) are compared case-insensitively with 'public' and
# 'system' to pick the matching keyword state; anything else sets
# 'force-quirks' and enters bogus-doctype.
# NOTE(review): the guards and the lines skipping past the keyword are not
# visible in this view.
4141 tok_state_after_doctype_name = ->
4142 c = txt.charAt(cur++)
4143 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4146 tok_state = tok_state_data
4150 tok_state = tok_state_data
4151 tok_cur_tag.flag 'force-quirks', true
4152 cur -= 1 # Reconsume
4155 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4157 tok_state = tok_state_after_doctype_public_keyword
4159 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4161 tok_state = tok_state_after_doctype_system_keyword
4164 tok_cur_tag.flag 'force-quirks', true
4165 tok_state = tok_state_bogus_doctype
4168 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# After-doctype-public-keyword state. Whitespace moves to
# before-doctype-public-identifier. Two paths initialise
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# public-identifier state. The remaining paths set 'force-quirks' and switch to
# data (one reconsumes) or to bogus-doctype.
# NOTE(review): apart from whitespace, the guards are not visible in this view.
4169 tok_state_after_doctype_public_keyword = ->
4170 c = txt.charAt(cur++)
4171 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4172 tok_state = tok_state_before_doctype_public_identifier
4176 tok_cur_tag.public_identifier = ''
4177 tok_state = tok_state_doctype_public_identifier_double_quoted
4181 tok_cur_tag.public_identifier = ''
4182 tok_state = tok_state_doctype_public_identifier_single_quoted
4186 tok_cur_tag.flag 'force-quirks', true
4187 tok_state = tok_state_data
4191 tok_state = tok_state_data
4192 tok_cur_tag.flag 'force-quirks', true
4193 cur -= 1 # Reconsume
4197 tok_cur_tag.flag 'force-quirks', true
4198 tok_state = tok_state_bogus_doctype
4201 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Before-doctype-public-identifier state. Two paths initialise
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# public-identifier state. The remaining paths set 'force-quirks' and switch to
# data (one reconsumes) or to bogus-doctype.
# NOTE(review): apart from whitespace, the guards are not visible in this view.
4202 tok_state_before_doctype_public_identifier = ->
4203 c = txt.charAt(cur++)
4204 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4208 tok_cur_tag.public_identifier = ''
4209 tok_state = tok_state_doctype_public_identifier_double_quoted
4213 tok_cur_tag.public_identifier = ''
4214 tok_state = tok_state_doctype_public_identifier_single_quoted
4218 tok_cur_tag.flag 'force-quirks', true
4219 tok_state = tok_state_data
4223 tok_state = tok_state_data
4224 tok_cur_tag.flag 'force-quirks', true
4225 cur -= 1 # Reconsume
4229 tok_cur_tag.flag 'force-quirks', true
4230 tok_state = tok_state_bogus_doctype
4234 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# DOCTYPE public identifier (double-quoted) state. Visible paths: move to
# after-doctype-public-identifier; append U+FFFD to the public identifier; set
# 'force-quirks' and switch to data (one such path reconsumes); or append the
# raw character to the public identifier.
# NOTE(review): the guards for each path are not visible in this view.
4235 tok_state_doctype_public_identifier_double_quoted = ->
4236 c = txt.charAt(cur++)
4238 tok_state = tok_state_after_doctype_public_identifier
4242 tok_cur_tag.public_identifier += "\ufffd"
4246 tok_cur_tag.flag 'force-quirks', true
4247 tok_state = tok_state_data
4251 tok_state = tok_state_data
4252 tok_cur_tag.flag 'force-quirks', true
4253 cur -= 1 # Reconsume
4256 tok_cur_tag.public_identifier += c
4259 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# DOCTYPE public identifier (single-quoted) state. Visible paths: move to
# after-doctype-public-identifier; append U+FFFD to the public identifier; set
# 'force-quirks' and switch to data (one such path reconsumes); or append the
# raw character to the public identifier.
# NOTE(review): the guards for each path are not visible in this view.
4260 tok_state_doctype_public_identifier_single_quoted = ->
4261 c = txt.charAt(cur++)
4263 tok_state = tok_state_after_doctype_public_identifier
4267 tok_cur_tag.public_identifier += "\ufffd"
4271 tok_cur_tag.flag 'force-quirks', true
4272 tok_state = tok_state_data
4276 tok_state = tok_state_data
4277 tok_cur_tag.flag 'force-quirks', true
4278 cur -= 1 # Reconsume
4281 tok_cur_tag.public_identifier += c
4284 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# After-doctype-public-identifier state. Whitespace moves to the
# between-public-and-system-identifiers state. Other visible paths switch to
# data, initialise tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system-identifier state, or set 'force-quirks' and either
# switch to data with a reconsume or enter bogus-doctype.
# NOTE(review): apart from whitespace, the guards are not visible in this view.
4285 tok_state_after_doctype_public_identifier = ->
4286 c = txt.charAt(cur++)
4287 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4288 tok_state = tok_state_between_doctype_public_and_system_identifiers
4291 tok_state = tok_state_data
4295 tok_cur_tag.system_identifier = ''
4296 tok_state = tok_state_doctype_system_identifier_double_quoted
4300 tok_cur_tag.system_identifier = ''
4301 tok_state = tok_state_doctype_system_identifier_single_quoted
4305 tok_state = tok_state_data
4306 tok_cur_tag.flag 'force-quirks', true
4307 cur -= 1 # Reconsume
4311 tok_cur_tag.flag 'force-quirks', true
4312 tok_state = tok_state_bogus_doctype
4315 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Between-doctype-public-and-system-identifiers state. Visible paths switch to
# data, initialise tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system-identifier state, or set 'force-quirks' and either
# switch to data with a reconsume or enter bogus-doctype.
# NOTE(review): apart from whitespace, the guards are not visible in this view.
4316 tok_state_between_doctype_public_and_system_identifiers = ->
4317 c = txt.charAt(cur++)
4318 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4321 tok_state = tok_state_data
4325 tok_cur_tag.system_identifier = ''
4326 tok_state = tok_state_doctype_system_identifier_double_quoted
4330 tok_cur_tag.system_identifier = ''
4331 tok_state = tok_state_doctype_system_identifier_single_quoted
4335 tok_state = tok_state_data
4336 tok_cur_tag.flag 'force-quirks', true
4337 cur -= 1 # Reconsume
4341 tok_cur_tag.flag 'force-quirks', true
4342 tok_state = tok_state_bogus_doctype
4345 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# After-doctype-system-keyword state. Whitespace moves to
# before-doctype-system-identifier. Two paths initialise
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system-identifier state. The remaining paths set 'force-quirks' and switch to
# data (one reconsumes) or to bogus-doctype.
# NOTE(review): apart from whitespace, the guards are not visible in this view.
4346 tok_state_after_doctype_system_keyword = ->
4347 c = txt.charAt(cur++)
4348 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4349 tok_state = tok_state_before_doctype_system_identifier
4353 tok_cur_tag.system_identifier = ''
4354 tok_state = tok_state_doctype_system_identifier_double_quoted
4358 tok_cur_tag.system_identifier = ''
4359 tok_state = tok_state_doctype_system_identifier_single_quoted
4363 tok_cur_tag.flag 'force-quirks', true
4364 tok_state = tok_state_data
4368 tok_state = tok_state_data
4369 tok_cur_tag.flag 'force-quirks', true
4370 cur -= 1 # Reconsume
4374 tok_cur_tag.flag 'force-quirks', true
4375 tok_state = tok_state_bogus_doctype
4378 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Before-doctype-system-identifier state. Two paths initialise
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system-identifier state. The remaining paths set 'force-quirks' and switch to
# data (one reconsumes) or to bogus-doctype.
# NOTE(review): apart from whitespace, the guards are not visible in this view.
4379 tok_state_before_doctype_system_identifier = ->
4380 c = txt.charAt(cur++)
4381 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4384 tok_cur_tag.system_identifier = ''
4385 tok_state = tok_state_doctype_system_identifier_double_quoted
4388 tok_cur_tag.system_identifier = ''
4389 tok_state = tok_state_doctype_system_identifier_single_quoted
4393 tok_cur_tag.flag 'force-quirks', true
4394 tok_state = tok_state_data
4398 tok_state = tok_state_data
4399 tok_cur_tag.flag 'force-quirks', true
4400 cur -= 1 # Reconsume
4404 tok_cur_tag.flag 'force-quirks', true
4405 tok_state = tok_state_bogus_doctype
4408 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# DOCTYPE system identifier (double-quoted) state. Visible paths: move to
# after-doctype-system-identifier; append U+FFFD to the system identifier; set
# 'force-quirks' and switch to data (one such path reconsumes); or append the
# raw character to the system identifier.
# NOTE(review): the guards for each path are not visible in this view.
4409 tok_state_doctype_system_identifier_double_quoted = ->
4410 c = txt.charAt(cur++)
4412 tok_state = tok_state_after_doctype_system_identifier
4416 tok_cur_tag.system_identifier += "\ufffd"
4420 tok_cur_tag.flag 'force-quirks', true
4421 tok_state = tok_state_data
4425 tok_state = tok_state_data
4426 tok_cur_tag.flag 'force-quirks', true
4427 cur -= 1 # Reconsume
4430 tok_cur_tag.system_identifier += c
4433 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# DOCTYPE system identifier (single-quoted) state. Visible paths: move to
# after-doctype-system-identifier; append U+FFFD to the system identifier; set
# 'force-quirks' and switch to data (one such path reconsumes); or append the
# raw character to the system identifier.
# NOTE(review): the guards for each path are not visible in this view.
4434 tok_state_doctype_system_identifier_single_quoted = ->
4435 c = txt.charAt(cur++)
4437 tok_state = tok_state_after_doctype_system_identifier
4441 tok_cur_tag.system_identifier += "\ufffd"
4445 tok_cur_tag.flag 'force-quirks', true
4446 tok_state = tok_state_data
4450 tok_state = tok_state_data
4451 tok_cur_tag.flag 'force-quirks', true
4452 cur -= 1 # Reconsume
4455 tok_cur_tag.system_identifier += c
4458 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# After-doctype-system-identifier state. Visible paths switch to data (one sets
# 'force-quirks' and reconsumes). The last path enters bogus-doctype and, as
# the existing comment stresses, deliberately leaves 'force-quirks' unset.
# NOTE(review): apart from whitespace, the guards are not visible in this view.
4459 tok_state_after_doctype_system_identifier = ->
4460 c = txt.charAt(cur++)
4461 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4464 tok_state = tok_state_data
4468 tok_state = tok_state_data
4469 tok_cur_tag.flag 'force-quirks', true
4470 cur -= 1 # Reconsume
4474 # do _not_ tok_cur_tag.flag 'force-quirks', true
4475 tok_state = tok_state_bogus_doctype
4478 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Bogus-doctype state. Consumes one character per call; the two visible paths
# both switch to the data state, and one of them also reconsumes the character.
# NOTE(review): the guards and the returned token are not visible in this view.
4479 tok_state_bogus_doctype = ->
4480 c = txt.charAt(cur++)
4482 tok_state = tok_state_data
4485 tok_state = tok_state_data
4486 cur -= 1 # Reconsume
4491 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# CDATA section state. Switches straight back to the data state, then takes
# either the rest of txt or everything up to the next ']]>' as the section
# text. NUL characters are replaced with U+FFFD and the whole section is
# returned as a single character token.
# NOTE(review): the not-found test and the updates to cur are not visible here;
# the variable is named next_gt although it holds the index of ']]>'.
4492 tok_state_cdata_section = ->
4493 tok_state = tok_state_data
4494 next_gt = txt.indexOf ']]>', cur
4496 val = txt.substr cur
4499 val = txt.substr cur, (next_gt - cur)
4501 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
4503 return new_character_token val # fixfull split
4506 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4507 # Don't set this as a state, just call it
4508 # returns a string (NOT a text node)
# Consumes a character reference starting at cur (the caller has already
# consumed the '&') and returns the decoded text as a string.
#   allowed_char - extra character that, like whitespace, '<', '&' or end of
#                  input, means "not a character reference" (default null)
#   in_attr      - true when called from an attribute value; judging by the
#                  comments below this relaxes legacy (no ';') matching
# Visible steps: numeric references are scanned against a digit charset,
# stripped of leading zeros and parsed with parseInt in the chosen base;
# unicode_fixes overrides specific code points; a surrogate or out-of-range
# code point is detected; otherwise from_code_point builds the result. Named
# references are decoded with decode_named_char_ref when ';'-terminated, else
# looked up in legacy_char_refs by increasing prefix length.
# NOTE(review): many guards, returns and the definitions of start/i/base/
# charset/max are not visible in this view — confirm details in the full file.
4509 parse_character_reference = (allowed_char = null, in_attr = false) ->
4510 if cur >= txt.length
4512 switch c = txt.charAt(cur)
4513 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4514 # explicitly not a parse error
4517 # there has to be "one or more" alnums between & and ; to be a parse error
4520 if cur + 1 >= txt.length
4522 if txt.charAt(cur + 1).toLowerCase() is 'x'
4531 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4536 if txt.charAt(start + i) is ';'
4540 code_point = txt.substr(start, i)
4541 while code_point.charAt(0) is '0' and code_point.length > 1
4542 code_point = code_point.substr 1
4543 code_point = parseInt(code_point, base)
4544 if unicode_fixes[code_point]?
4546 return unicode_fixes[code_point]
4548 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4552 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4554 return from_code_point code_point
4558 if alnum.indexOf(txt.charAt(cur + i)) is -1
4561 # exit early, because parse_error() below needs at least one alnum
4563 if txt.charAt(cur + i) is ';'
4564 decoded = decode_named_char_ref txt.substr(cur, i)
4565 i += 1 # scan past the ';' (after, so we don't pass it to decode)
4569 # else FALL THROUGH (check for match without last char(s) or ";")
4570 # no ';' terminator (only legacy char refs)
4572 for i in [2..max] # no prefix matches, so ok to check shortest first
4573 c = legacy_char_refs[txt.substr(cur, i)]
4576 if txt.charAt(cur + i) is '='
4577 # "because some legacy user agents will
4578 # misinterpret the markup in those cases"
4581 if alnum.indexOf(txt.charAt(cur + i)) > -1
4582 # this makes attributes forgiving about url args
4584 # ok, and besides the weird exceptions for attributes...
4585 # return the matching char
4586 cur += i # consume entity chars
4587 parse_error() # because no terminating ";"
4591 return # never reached
# Checks whether a token t is a text token holding a newline. When the cursor
# advanced by exactly one character since old_cur (a literal character, not a
# character reference) both CR and LF count; otherwise only LF counts.
# NOTE(review): how t and old_cur are obtained, and what is done when a newline
# is or is not found, are not visible in this view — confirm in the full file.
4593 eat_next_token_if_newline = ->
4598 if t.type is TYPE_TEXT
4599 # definition of a newline depends on whether it was a character ref or not
4600 if cur - old_cur is 1
4601 # not a character reference
4602 if t.text is "\u000d" or t.text is "\u000a"
4605 if t.text is "\u000a"
# Parser set-up: initialises tree-constructor state, the tokenizer start state,
# optional fragment-parsing context, and normalises newlines in txt.
# NOTE(review): the enclosing function header and several guards/loops are not
# visible in this view.
4611 # tree constructor initialization
4612 # see comments on TYPE_TAG/etc for the structure of this data
4615 doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4616 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4617 fragment_root = null # fragment parsing algorithm returns children of this
4619 afe = [] # active formatting elements
4620 template_ins_modes = []
4621 ins_mode = ins_mode_initial
4622 original_ins_mode = ins_mode # TODO check spec
4623 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4624 flag_frameset_ok = true
4626 flag_foster_parenting = false
4627 form_element_pointer = null
4628 temporary_buffer = null
4629 pending_table_character_tokens = []
4630 head_element_pointer = null
4631 flag_fragment_parsing = false
4632 context_element = null
4633 prev_node_id = 0 # just for debugging
4635 # tokenizer initialization
4636 tok_state = tok_state_data
4639 # fragment parsing (text arg)
4641 # this handles the fragment from the tests in the format described here:
4642 # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
4645 if f.substr(0, 5) is 'math '
4648 else if f.substr(0, 4) is 'svg '
4652 context_element = token_to_element t, ns
4653 context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4654 context_element.document.flag 'quirks mode', QUIRKS_NO
4655 # fragment parsing (Node arg)
4657 context_element = args.context
4659 # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4660 # fragment parsing algorithm
4662 flag_fragment_parsing = true
4663 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4664 # search up the tree from context, to try to find its document,
4665 # because this file only puts a "document" property on the root
4668 el = context_element
4671 old_doc = el.document
4678 doc.flag 'quirks mode', old_doc.flag 'quirks mode'
# The context element decides which tokenizer state fragment parsing starts in.
4680 if context_element.namespace is NS_HTML
4681 switch context_element.name
4682 when 'title', 'textarea'
4683 tok_state = tok_state_rcdata
4684 when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
4685 tok_state = tok_state_rawtext
4687 tok_state = tok_state_script_data
4690 tok_state = tok_state_rawtext
4692 tok_state = tok_state_plaintext
4693 fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4694 doc.children.push fragment_root
4695 fragment_root.document = doc
4696 open_els = [fragment_root]
4697 if context_element.name is 'template' and context_element.namespace is NS_HTML
4698 template_ins_modes.unshift ins_mode_in_template
4699 # fixfull create token for context (it should have its original one already)
4701 # set form_element pointer... in the foreign doc?!
4702 el = context_element
4704 if el.name is 'form' and el.namespace is NS_HTML
4705 form_element_pointer = el
4712 # text pre-processing
4713 # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4714 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4715 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4719 # http://www.w3.org/TR/html5/syntax.html#tree-construction
# Main tokenize/tree-construction loop; its body is not visible in this view.
# NOTE(review): the trailing `if flag_fragment_parsing` presumably belongs to
# the enclosing parse function rather than to this loop; when fragment parsing
# is on it returns the children of fragment_root. Confirm the nesting in the
# full file.
4720 parse_main_loop = ->
4725 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4730 if flag_fragment_parsing
4731 return fragment_root.children
# Public API: the parser entry point (exported as `parse`), the two debug-log
# helpers, and the node-type, namespace and quirks-mode constants.
4734 exports.parse = parse_html
4735 exports.debug_log_reset = debug_log_reset
4736 exports.debug_log_each = debug_log_each
4737 exports.TYPE_TAG = TYPE_TAG
4738 exports.TYPE_TEXT = TYPE_TEXT
4739 exports.TYPE_COMMENT = TYPE_COMMENT
4740 exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4741 exports.NS_HTML = NS_HTML
4742 exports.NS_MATHML = NS_MATHML
4743 exports.NS_SVG = NS_SVG
4744 exports.QUIRKS_NO = QUIRKS_NO
4745 exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4746 exports.QUIRKS_YES = QUIRKS_YES