1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a thorough parser for html5, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
24 # http://www.w3.org/TR/html5/syntax.html
26 # except for some places marked "WHATWG" that are implemented as described here:
28 # https://html.spec.whatwg.org/multipage/syntax.html
30 # This code passes all of the tests in the .dat files at:
32 # https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
35 ##################################
36 ## how to use this code
37 ##################################
39 # See README.md for how to pre-compile this file, or compile it in the browser.
41 # This file exports a single useful function: parse_html
43 # Once you include this file in a page (see index.html for an example) you'll
48 # wheic.parse_html({html: "<p><b>hi</p>"})
50 # Or, if you don't want <html><head><body>/etc, do this:
52 # wheic.parse_html({fragment: "body", html: "<p><b>hi</p>"})
54 # This code can _almost_ run outside the browser (eg under node.js). To get it
55 # to run without the browser would require native implementation of
56 # decode_named_char_ref(). The current implementation of that function uses the
57 # browser's DOM api, to save space (the list of valid named characters is
60 # This code is a work in progress, e.g. try searching this file for "fixfull",
66 # Jason was frequently confused by the terminology used to refer to different
67 # parts of the stacks and lists in the spec, so he made this chart to help keep
70 # stacks grow downward (current element is index=0)
72 # example: open_els = [a, b, c, d, e, f, g]
74 # "grows downwards" means it's visualized like this: (index: el, names)
76 # 6: g "start of the list", "topmost", "first"
78 # 4: e "previous" (to d), "above", "before"
79 # 3: d (previous/next are relative to this element)
80 # 2: c "next", "after", "lower", "below"
82 # 0: a "end of the list", "current node", "bottommost", "last"
84 if (typeof module) isnt 'undefined' and module.exports?
86 exports = module.exports
90 exports = window.wheic
# Convert a Unicode code point (an integer) into a JavaScript string.
#
# x: code point to convert
# returns: a string of one UTF-16 code unit for code points below 0x10000,
#          or a surrogate pair (two code units) for code points above that.
from_code_point = (x) ->
  # Use the native implementation when the environment has one.
  return String.fromCodePoint x if String.fromCodePoint?
  # Basic Multilingual Plane: a single UTF-16 code unit is enough.
  return String.fromCharCode x if x < 0x10000
  # Astral planes: the surrogates encode the offset from 0x10000, so the
  # offset must be taken before splitting into high and low 10-bit halves.
  x -= 0x10000
  return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
101 # Each node is an object of the Node class. Here are the Node types:
102 TYPE_TAG = 0 # name, {attributes}, [children]
103 TYPE_TEXT = 1 # "text"
106 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
107 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
108 TYPE_END_TAG = 5 # name
110 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
111 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
113 # namespace constants
118 # quirks mode constants
123 # queue up debug logs, so eg they can be shown only for tests that fail
131 debug_log_each = (cb) ->
132 for str in g_debug_log
138 constructor: (type, args = {}) ->
139 @type = type # one of the TYPE_* constants above
140 @name = args.name ? '' # tag name
141 @text = args.text ? '' # contents for text/comment nodes
142 @attrs = args.attrs ? {}
143 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
144 @children = args.children ? []
145 @namespace = args.namespace ? NS_HTML
146 @parent = args.parent ? null
147 @token = args.token ? null
148 @flags = args.flags ? {}
152 @id = "#{++prev_node_id}"
153 acknowledge_self_closing: ->
155 @token.flag 'did_self_close', true
157 @flag 'did_self_close', true
159 flag: (key, value = null) ->
166 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token (TYPE_START_TAG) carrying only its tag name.
new_open_tag = (name) ->
  new Node(TYPE_START_TAG, name: name)
# Build an end-tag token (TYPE_END_TAG) carrying only its tag name.
new_end_tag = (name) ->
  new Node(TYPE_END_TAG, name: name)
# Build an element node (TYPE_TAG) with the given tag name.
new_element = (name) ->
  new Node(TYPE_TAG, name: name)
# Build a text node (TYPE_TEXT) whose contents are `txt`.
new_text_node = (txt) ->
  new Node(TYPE_TEXT, text: txt)
# Character tokens are represented exactly like text nodes, so share the helper.
new_character_token = new_text_node
# Build a comment token (TYPE_COMMENT) whose contents are `txt`.
new_comment_token = (txt) ->
  new Node(TYPE_COMMENT, text: txt)
# Build a doctype token (TYPE_DOCTYPE) with the given name.
new_doctype_token = (name) ->
  new Node(TYPE_DOCTYPE, name: name)
181 return new Node TYPE_EOF
183 return new Node TYPE_AFE_MARKER
# Build a fresh TYPE_AAA_BOOKMARK node (no name, text or attributes supplied).
new_aaa_bookmark = -> new Node(TYPE_AAA_BOOKMARK)
# ASCII character classes, stored as strings that are searched with indexOf.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# True only when `str` is exactly one character and that character is an
# ASCII uppercase letter.
is_uc_alpha = (str) ->
  str.length is 1 and uc_alpha.indexOf(str) isnt -1

# True only when `str` is exactly one character and that character is an
# ASCII lowercase letter.
is_lc_alpha = (str) ->
  str.length is 1 and lc_alpha.indexOf(str) isnt -1
198 # some SVG elements have dashes in them
199 tag_name_chars = alnum + "-"
201 # http://www.w3.org/TR/html5/infrastructure.html#space-character
202 space_chars = "\u0009\u000a\u000c\u000d\u0020"
204 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a text token (TYPE_TEXT) whose text is exactly one
# character found in space_chars.
is_space_tok = (t) ->
  return false unless t.type is TYPE_TEXT
  chr = t.text
  chr.length is 1 and space_chars.indexOf(chr) isnt -1
208 is_input_hidden_tok = (t) ->
209 return false unless t.type is TYPE_START_TAG
212 if a[1].toLowerCase() is 'hidden'
217 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
218 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
221 unicode_fixes[0x00] = "\uFFFD"
222 unicode_fixes[0x80] = "\u20AC"
223 unicode_fixes[0x82] = "\u201A"
224 unicode_fixes[0x83] = "\u0192"
225 unicode_fixes[0x84] = "\u201E"
226 unicode_fixes[0x85] = "\u2026"
227 unicode_fixes[0x86] = "\u2020"
228 unicode_fixes[0x87] = "\u2021"
229 unicode_fixes[0x88] = "\u02C6"
230 unicode_fixes[0x89] = "\u2030"
231 unicode_fixes[0x8A] = "\u0160"
232 unicode_fixes[0x8B] = "\u2039"
233 unicode_fixes[0x8C] = "\u0152"
234 unicode_fixes[0x8E] = "\u017D"
235 unicode_fixes[0x91] = "\u2018"
236 unicode_fixes[0x92] = "\u2019"
237 unicode_fixes[0x93] = "\u201C"
238 unicode_fixes[0x94] = "\u201D"
239 unicode_fixes[0x95] = "\u2022"
240 unicode_fixes[0x96] = "\u2013"
241 unicode_fixes[0x97] = "\u2014"
242 unicode_fixes[0x98] = "\u02DC"
243 unicode_fixes[0x99] = "\u2122"
244 unicode_fixes[0x9A] = "\u0161"
245 unicode_fixes[0x9B] = "\u203A"
246 unicode_fixes[0x9C] = "\u0153"
247 unicode_fixes[0x9E] = "\u017E"
248 unicode_fixes[0x9F] = "\u0178"
250 quirks_yes_pi_prefixes = [
251 "+//silmaril//dtd html pro v0r11 19970101//"
252 "-//as//dtd html 3.0 aswedit + extensions//"
253 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
254 "-//ietf//dtd html 2.0 level 1//"
255 "-//ietf//dtd html 2.0 level 2//"
256 "-//ietf//dtd html 2.0 strict level 1//"
257 "-//ietf//dtd html 2.0 strict level 2//"
258 "-//ietf//dtd html 2.0 strict//"
259 "-//ietf//dtd html 2.0//"
260 "-//ietf//dtd html 2.1e//"
261 "-//ietf//dtd html 3.0//"
262 "-//ietf//dtd html 3.2 final//"
263 "-//ietf//dtd html 3.2//"
264 "-//ietf//dtd html 3//"
265 "-//ietf//dtd html level 0//"
266 "-//ietf//dtd html level 1//"
267 "-//ietf//dtd html level 2//"
268 "-//ietf//dtd html level 3//"
269 "-//ietf//dtd html strict level 0//"
270 "-//ietf//dtd html strict level 1//"
271 "-//ietf//dtd html strict level 2//"
272 "-//ietf//dtd html strict level 3//"
273 "-//ietf//dtd html strict//"
274 "-//ietf//dtd html//"
275 "-//metrius//dtd metrius presentational//"
276 "-//microsoft//dtd internet explorer 2.0 html strict//"
277 "-//microsoft//dtd internet explorer 2.0 html//"
278 "-//microsoft//dtd internet explorer 2.0 tables//"
279 "-//microsoft//dtd internet explorer 3.0 html strict//"
280 "-//microsoft//dtd internet explorer 3.0 html//"
281 "-//microsoft//dtd internet explorer 3.0 tables//"
282 "-//netscape comm. corp.//dtd html//"
283 "-//netscape comm. corp.//dtd strict html//"
284 "-//o'reilly and associates//dtd html 2.0//"
285 "-//o'reilly and associates//dtd html extended 1.0//"
286 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
287 "-//sq//dtd html 2.0 hotmetal + extensions//"
288 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
289 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
290 "-//spyglass//dtd html 2.0 extended//"
291 "-//sun microsystems corp.//dtd hotjava html//"
292 "-//sun microsystems corp.//dtd hotjava strict html//"
293 "-//w3c//dtd html 3 1995-03-24//"
294 "-//w3c//dtd html 3.2 draft//"
295 "-//w3c//dtd html 3.2 final//"
296 "-//w3c//dtd html 3.2//"
297 "-//w3c//dtd html 3.2s draft//"
298 "-//w3c//dtd html 4.0 frameset//"
299 "-//w3c//dtd html 4.0 transitional//"
300 "-//w3c//dtd html experimental 19960712//"
301 "-//w3c//dtd html experimental 970421//"
302 "-//w3c//dtd w3 html//"
303 "-//w3o//dtd w3 html 3.0//"
304 "-//webtechs//dtd mozilla html 2.0//"
305 "-//webtechs//dtd mozilla html//"
308 # These are the character references that don't need a terminating semicolon
309 # min length: 2, max: 6, none are a prefix of any other.
311 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
312 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
313 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
314 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
315 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
316 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
317 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
318 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
319 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
320 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
321 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
322 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
323 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
324 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
325 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
326 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
327 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
331 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
332 raw_text_elements = ['script', 'style']
333 escapable_raw_text_elements = ['textarea', 'title']
334 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
336 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
337 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
338 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
339 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
340 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
341 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
342 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
343 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
344 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
345 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
346 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
347 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
348 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
349 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
353 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
355 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
356 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
357 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
358 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
359 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
360 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
361 'determinant', 'diff', 'divergence', 'divide', 'domain',
362 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
363 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
364 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
365 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
366 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
367 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
368 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
369 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
370 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
371 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
372 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
373 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
374 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
375 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
376 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
377 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
378 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
379 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
380 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
381 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
382 'vectorproduct', 'xor'
384 # foreign_elements = [svg_elements..., mathml_elements...]
385 #normal_elements = All other allowed HTML elements are normal elements.
389 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
390 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
391 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
392 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
393 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
394 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
395 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
396 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
397 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
398 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
399 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
401 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
403 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
404 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
405 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
406 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
407 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
408 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
409 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
412 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
413 'annotation-xml':NS_MATHML,
416 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
419 formatting_elements = {
420 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
421 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
425 mathml_text_integration = {
426 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when `el`'s name is listed in mathml_text_integration and the namespace
# recorded there matches the element's own namespace.
is_mathml_text_integration_point = (el) ->
  listed_ns = mathml_text_integration[el.name]
  listed_ns is el.namespace
430 is_html_integration = (el) -> # DON'T PASS A TOKEN
431 if el.namespace is NS_MATHML
432 if el.name is 'annotation-xml'
433 if el.attrs.encoding?
434 if el.attrs.encoding.toLowerCase() is 'text/html'
436 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
439 if el.namespace is NS_SVG
440 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
445 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
448 foster_parenting_targets = {
# True when `e`'s name is listed in special_elements under the same namespace
# that the element itself has.
el_is_special = (e) ->
  listed_ns = special_elements[e.name]
  listed_ns is e.namespace
# The HTML elements that el_is_special_not_adp excludes: address, div and p.
adp_els =
  address: NS_HTML
  div: NS_HTML
  p: NS_HTML

# True when `el` is listed in special_elements (name and namespace both match)
# and is not one of the address/div/p entries in adp_els.
el_is_special_not_adp = (el) ->
  ns = el.namespace
  return false unless special_elements[el.name] is ns
  adp_els[el.name] isnt ns
478 altglyphdef: 'altGlyphDef'
479 altglyphitem: 'altGlyphItem'
480 animatecolor: 'animateColor'
481 animatemotion: 'animateMotion'
482 animatetransform: 'animateTransform'
485 fecolormatrix: 'feColorMatrix'
486 fecomponenttransfer: 'feComponentTransfer'
487 fecomposite: 'feComposite'
488 feconvolvematrix: 'feConvolveMatrix'
489 fediffuselighting: 'feDiffuseLighting'
490 fedisplacementmap: 'feDisplacementMap'
491 fedistantlight: 'feDistantLight'
492 fedropshadow: 'feDropShadow'
498 fegaussianblur: 'feGaussianBlur'
501 femergenode: 'feMergeNode'
502 femorphology: 'feMorphology'
504 fepointlight: 'fePointLight'
505 fespecularlighting: 'feSpecularLighting'
506 fespotlight: 'feSpotLight'
508 feturbulence: 'feTurbulence'
509 foreignobject: 'foreignObject'
511 lineargradient: 'linearGradient'
512 radialgradient: 'radialGradient'
515 svg_attribute_fixes = {
516 attributename: 'attributeName'
517 attributetype: 'attributeType'
518 basefrequency: 'baseFrequency'
519 baseprofile: 'baseProfile'
521 clippathunits: 'clipPathUnits'
522 contentscripttype: 'contentScriptType'
523 contentstyletype: 'contentStyleType'
524 diffuseconstant: 'diffuseConstant'
526 externalresourcesrequired: 'externalResourcesRequired'
527 # WHATWG removes this: filterres: 'filterRes'
528 filterunits: 'filterUnits'
530 gradienttransform: 'gradientTransform'
531 gradientunits: 'gradientUnits'
532 kernelmatrix: 'kernelMatrix'
533 kernelunitlength: 'kernelUnitLength'
534 keypoints: 'keyPoints'
535 keysplines: 'keySplines'
537 lengthadjust: 'lengthAdjust'
538 limitingconeangle: 'limitingConeAngle'
539 markerheight: 'markerHeight'
540 markerunits: 'markerUnits'
541 markerwidth: 'markerWidth'
542 maskcontentunits: 'maskContentUnits'
543 maskunits: 'maskUnits'
544 numoctaves: 'numOctaves'
545 pathlength: 'pathLength'
546 patterncontentunits: 'patternContentUnits'
547 patterntransform: 'patternTransform'
548 patternunits: 'patternUnits'
549 pointsatx: 'pointsAtX'
550 pointsaty: 'pointsAtY'
551 pointsatz: 'pointsAtZ'
552 preservealpha: 'preserveAlpha'
553 preserveaspectratio: 'preserveAspectRatio'
554 primitiveunits: 'primitiveUnits'
557 repeatcount: 'repeatCount'
558 repeatdur: 'repeatDur'
559 requiredextensions: 'requiredExtensions'
560 requiredfeatures: 'requiredFeatures'
561 specularconstant: 'specularConstant'
562 specularexponent: 'specularExponent'
563 spreadmethod: 'spreadMethod'
564 startoffset: 'startOffset'
565 stddeviation: 'stdDeviation'
566 stitchtiles: 'stitchTiles'
567 surfacescale: 'surfaceScale'
568 systemlanguage: 'systemLanguage'
569 tablevalues: 'tableValues'
572 textlength: 'textLength'
574 viewtarget: 'viewTarget'
575 xchannelselector: 'xChannelSelector'
576 ychannelselector: 'yChannelSelector'
577 zoomandpan: 'zoomAndPan'
579 foreign_attr_fixes = {
580 'xlink:actuate': 'xlink actuate'
581 'xlink:arcrole': 'xlink arcrole'
582 'xlink:href': 'xlink href'
583 'xlink:role': 'xlink role'
584 'xlink:show': 'xlink show'
585 'xlink:title': 'xlink title'
586 'xlink:type': 'xlink type'
587 'xml:base': 'xml base'
588 'xml:lang': 'xml lang'
589 'xml:space': 'xml space'
591 'xmlns:xlink': 'xmlns xlink'
593 adjust_mathml_attributes = (t) ->
595 if a[0] is 'definitionurl'
596 a[0] = 'definitionURL'
598 adjust_svg_attributes = (t) ->
600 if svg_attribute_fixes[a[0]]?
601 a[0] = svg_attribute_fixes[a[0]]
603 adjust_foreign_attributes = (t) ->
606 if foreign_attr_fixes[a[0]]?
607 a[0] = foreign_attr_fixes[a[0]]
610 # decode_named_char_ref()
612 # The list of named character references is _huge_ so if we're running in a
613 # browser, we get the browser to decode them, rather than increasing the code
614 # size to include the table.
615 if context is 'module'
616 _decode_named_char_ref = require './html5-named-entities.coffee'
618 # TODO test this in IE8
619 decode_named_char_ref_el = document.createElement('textarea')
620 _decode_named_char_ref = (txt) ->
622 decode_named_char_ref_el.innerHTML = txt
623 decoded = decode_named_char_ref_el.value
624 return null if decoded is txt
626 # Pass the name of a named entity _that has a terminating semicolon_
627 # Entities without terminating semicolons should use legacy_char_refs[]
628 # Do not include the "&" or ";" in your argument, eg pass "alpha"
# Memoizing wrapper around _decode_named_char_ref.
#
# txt: entity name without the leading "&" or trailing ";" (eg "alpha")
# returns: whatever _decode_named_char_ref returned for that name, remembered
#          so that repeated lookups of the same name skip the decoder.
decode_named_char_ref_cache = {}
decode_named_char_ref = (txt) ->
  cache = decode_named_char_ref_cache
  # Test for an *own* property rather than reading cache[txt] directly:
  #  * a plain object inherits members such as "constructor" and "toString"
  #    from Object.prototype, and those must not be mistaken for cached
  #    results
  #  * a cached null result (remembered from an earlier lookup) is served
  #    from the cache too, instead of re-running the decoder every time
  if Object::hasOwnProperty.call cache, txt
    return cache[txt]
  return cache[txt] = _decode_named_char_ref txt
636 parse_html = (args) ->
638 cur = null # index of next char in txt to be parsed
639 # declare doc and tokenizer variables so they're in scope below
641 open_els = null # stack of open elements
642 afe = null # active formatting elements
643 template_ins_modes = null
645 original_ins_mode = null
647 tok_cur_tag = null # partially parsed tag
648 flag_scripting = null
649 flag_frameset_ok = null
651 flag_foster_parenting = null
652 form_element_pointer = null
653 temporary_buffer = null
654 pending_table_character_tokens = null
655 head_element_pointer = null
656 flag_fragment_parsing = null
657 context_element = null
667 console.log "Parse error at character #{cur} of #{txt.length}"
670 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
671 # "Noah's Ark clause" but with three
672 afe_push = (new_el) ->
675 if el.type is TYPE_AFE_MARKER
677 if el.name is new_el.name and el.namespace is new_el.namespace
680 unless new_el.attrs[k] is v
684 for k, v of new_el.attrs
685 unless el.attrs[k] is v
697 afe.unshift new_afe_marker()
700 # the functions below implement the Tree Construction algorithm
701 # http://www.w3.org/TR/html5/syntax.html#tree-construction
703 # But first... the helpers
704 template_tag_is_open = ->
706 if el.name is 'template' and el.namespace is NS_HTML
709 is_in_scope_x = (tag_name, scope, namespace) ->
711 if el.name is tag_name and (namespace is null or namespace is el.namespace)
713 if scope[el.name] is el.namespace
716 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
718 if el.name is tag_name and (namespace is null or namespace is el.namespace)
720 if scope[el.name] is el.namespace
722 if scope2[el.name] is el.namespace
726 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
727 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
730 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
731 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
733 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
735 button_scopers = button: NS_HTML
736 li_scopers = ol: NS_HTML, ul: NS_HTML
737 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
# Scope check against standard_scopers; a null `namespace` is passed through
# unchanged to is_in_scope_x.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check against standard_scopers plus button_scopers.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check against table_scopers only (html, table and template).
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope check against standard_scopers plus li_scopers (ol and ul).
is_in_li_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
747 is_in_select_scope = (tag_name, namespace = null) ->
749 if t.name is tag_name and (namespace is null or namespace is t.namespace)
751 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
754 # this checks for a particular element, not by name
755 # this requires a namespace match
756 el_is_in_scope = (needle) ->
760 if standard_scopers[el.name] is el.namespace
764 clear_to_table_stopers = {
769 clear_stack_to_table_context = ->
771 if clear_to_table_stopers[open_els[0].name]?
775 clear_to_table_body_stopers = {
782 clear_stack_to_table_body_context = ->
784 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
788 clear_to_table_row_stopers = {
793 clear_stack_to_table_row_context = ->
795 if clear_to_table_row_stopers[open_els[0].name]?
799 clear_afe_to_marker = ->
801 return unless afe.length > 0 # this happens in fragment case, ?spec error
803 if el.type is TYPE_AFE_MARKER
808 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
810 # 1. Let last be false.
812 # 2. Let node be the last node in the stack of open elements.
814 node = open_els[node_i]
815 # 3. Loop: If node is the first node in the stack of open elements,
816 # then set last to true, and, if the parser was originally created as
817 # part of the HTML fragment parsing algorithm (fragment case) set node
818 # to the context element.
820 if node_i is open_els.length - 1
822 if flag_fragment_parsing
823 node = context_element
824 # 4. If node is a select element, run these substeps:
825 if node.name is 'select' and node.namespace is NS_HTML
826 # 1. If last is true, jump to the step below labeled done.
828 # 2. Let ancestor be node.
831 # 3. Loop: If ancestor is the first node in the stack of
832 # open elements, jump to the step below labeled done.
834 if ancestor_i is open_els.length - 1
836 # 4. Let ancestor be the node before ancestor in the stack
839 ancestor = open_els[ancestor_i]
840 # 5. If ancestor is a template node, jump to the step below
842 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
844 # 6. If ancestor is a table node, switch the insertion mode
845 # to "in select in table" and abort these steps.
846 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
847 ins_mode = ins_mode_in_select_in_table
849 # 7. Jump back to the step labeled loop.
850 # 8. Done: Switch the insertion mode to "in select" and abort
852 ins_mode = ins_mode_in_select
854 # 5. If node is a td or th element and last is false, then switch
855 # the insertion mode to "in cell" and abort these steps.
856 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
857 ins_mode = ins_mode_in_cell
859 # 6. If node is a tr element, then switch the insertion mode to "in
860 # row" and abort these steps.
861 if node.name is 'tr' and node.namespace is NS_HTML
862 ins_mode = ins_mode_in_row
864 # 7. If node is a tbody, thead, or tfoot element, then switch the
865 # insertion mode to "in table body" and abort these steps.
866 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
867 ins_mode = ins_mode_in_table_body
869 # 8. If node is a caption element, then switch the insertion mode
870 # to "in caption" and abort these steps.
871 if node.name is 'caption' and node.namespace is NS_HTML
872 ins_mode = ins_mode_in_caption
874 # 9. If node is a colgroup element, then switch the insertion mode
875 # to "in column group" and abort these steps.
876 if node.name is 'colgroup' and node.namespace is NS_HTML
877 ins_mode = ins_mode_in_column_group
879 # 10. If node is a table element, then switch the insertion mode to
880 # "in table" and abort these steps.
881 if node.name is 'table' and node.namespace is NS_HTML
882 ins_mode = ins_mode_in_table
884 # 11. If node is a template element, then switch the insertion mode
885 # to the current template insertion mode and abort these steps.
886 if node.name is 'template' and node.namespace is NS_HTML
887 ins_mode = template_ins_modes[0]
889 # 12. If node is a head element and last is true, then switch the
890 # insertion mode to "in body" ("in body"! not "in head"!) and abort
891 # these steps. (fragment case)
892 if node.name is 'head' and node.namespace is NS_HTML and last
893 ins_mode = ins_mode_in_body
895 # 13. If node is a head element and last is false, then switch the
896 # insertion mode to "in head" and abort these steps.
897 if node.name is 'head' and node.namespace is NS_HTML and last is false
898 ins_mode = ins_mode_in_head
900 # 14. If node is a body element, then switch the insertion mode to
901 # "in body" and abort these steps.
902 if node.name is 'body' and node.namespace is NS_HTML
903 ins_mode = ins_mode_in_body
905 # 15. If node is a frameset element, then switch the insertion mode
906 # to "in frameset" and abort these steps. (fragment case)
907 if node.name is 'frameset' and node.namespace is NS_HTML
908 ins_mode = ins_mode_in_frameset
910 # 16. If node is an html element, run these substeps:
911 if node.name is 'html' and node.namespace is NS_HTML
912 # 1. If the head element pointer is null, switch the insertion
913 # mode to "before head" and abort these steps. (fragment case)
914 if head_element_pointer is null
915 ins_mode = ins_mode_before_head
917 # 2. Otherwise, the head element pointer is not null,
918 # switch the insertion mode to "after head" and abort these
920 ins_mode = ins_mode_after_head
922 # 17. If last is true, then switch the insertion mode to "in body"
923 # and abort these steps. (fragment case)
925 ins_mode = ins_mode_in_body
927 # 18. Let node now be the node before node in the stack of open
930 node = open_els[node_i]
931 # 19. Return to the step labeled loop.
936 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
937 adjusted_current_node = ->
938 if open_els.length is 1 and flag_fragment_parsing
939 return context_element
942 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
943 # this implementation is structured (mostly) as described at the link above.
944 # capitalized comments are the "labels" described at the link above.
946 return if afe.length is 0
947 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
952 if i is afe.length - 1
955 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
960 el = insert_html_element afe[i].token
966 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
967 # adoption agency algorithm
969 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
970 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
971 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
972 adoption_agency = (subject) ->
973 # this block implements the W3C spec
974 # # 1. If the current node is an HTML element whose tag name is subject,
975 # # then run these substeps:
977 # # 1. Let element be the current node.
979 # # 2. Pop element off the stack of open elements.
981 # # 3. If element is also in the list of active formatting elements,
982 # # remove the element from the list.
984 # # 4. Abort the adoption agency algorithm.
985 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
986 # el = open_els.shift()
987 # # remove it from the list of active formatting elements (if found)
993 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
994 # If the current node is an HTML element whose tag name is subject, and
995 # the current node is not in the list of active formatting elements,
996 # then pop the current node off the stack of open elements, and abort
998 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
999 # remove it from the list of active formatting elements (if found)
1002 if el is open_els[0]
1015 # 5. Let formatting element be the last element in the list of
1016 # active formatting elements that: is between the end of the list
1017 # and the last scope marker in the list, if any, or the start of
1018 # the list otherwise, and has the tag name subject.
1020 for t, fe_of_afe in afe
1021 if t.type is TYPE_AFE_MARKER
1023 if t.name is subject
1026 # If there is no such element, then abort these steps and instead
1027 # act as described in the "any other end tag" entry above.
1029 in_body_any_other_end_tag subject
1031 # 6. If formatting element is not in the stack of open elements,
1032 # then this is a parse error; remove the element from the list, and
1033 # abort these steps.
1035 for t, fe_of_open_els in open_els
1041 # "remove it from the list" must mean afe, since it's not in open_els
1042 afe.splice fe_of_afe, 1
1044 # 7. If formatting element is in the stack of open elements, but
1045 # the element is not in scope, then this is a parse error; abort
1047 unless el_is_in_scope fe
1050 # 8. If formatting element is not the current node, this is a parse
1051 # error. (But do not abort these steps.)
1052 unless open_els[0] is fe
1055 # 9. Let furthest block be the topmost node in the stack of open
1056 # elements that is lower in the stack than formatting element, and
1057 # is an element in the special category. There might not be one.
1059 fb_of_open_els = null
1060 for t, i in open_els
1066 # and continue, to see if there's one that's more "topmost"
1067 # 10. If there is no furthest block, then the UA must first pop all
1068 # the nodes from the bottom of the stack of open elements, from the
1069 # current node up to and including formatting element, then remove
1070 # formatting element from the list of active formatting elements,
1071 # and finally abort these steps.
1074 t = open_els.shift()
1076 afe.splice fe_of_afe, 1
1078 # 11. Let common ancestor be the element immediately above
1079 # formatting element in the stack of open elements.
1080 ca = open_els[fe_of_open_els + 1] # common ancestor
1082 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1083 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1084 bookmark = new_aaa_bookmark()
1087 afe.splice i, 0, bookmark
1089 node = last_node = fb
1093 # 3. Let node be the element immediately above node in the
1094 # stack of open elements, or if node is no longer in the stack
1095 # of open elements (e.g. because it got removed by this
1096 # algorithm), the element that was immediately above node in
1097 # the stack of open elements before node was removed.
1099 for t, i in open_els
1101 node_next = open_els[i + 1]
1103 node = node_next ? node_above
1104 # TODO make sure node_above gets re-set if/when node is removed from open_els
1106 # 4. If node is formatting element, then go to the next step in
1107 # the overall algorithm.
1110 # 5. If inner loop counter is greater than three and node is in
1111 # the list of active formatting elements, then remove node from
1112 # the list of active formatting elements.
1121 # 6. If node is not in the list of active formatting elements,
1122 # then remove node from the stack of open elements and then go
1123 # back to the step labeled inner loop.
1125 for t, i in open_els
1127 node_above = open_els[i + 1]
1128 open_els.splice i, 1
1131 # 7. create an element for the token for which the element node
1132 # was created, in the HTML namespace, with common ancestor as
1133 # the intended parent; replace the entry for node in the list
1134 # of active formatting elements with an entry for the new
1135 # element, replace the entry for node in the stack of open
1136 # elements with an entry for the new element, and let node be
1138 new_node = token_to_element node.token, NS_HTML, ca
1143 for t, i in open_els
1145 node_above = open_els[i + 1]
1146 open_els[i] = new_node
1149 # 8. If last node is furthest block, then move the
1150 # aforementioned bookmark to be immediately after the new node
1151 # in the list of active formatting elements.
1159 # "after" means lower
1160 afe.splice i, 0, bookmark # "after as <-
1162 # 9. Insert last node into node, first removing it from its
1163 # previous parent node if any.
1164 if last_node.parent?
1165 for c, i in last_node.parent.children
1167 last_node.parent.children.splice i, 1
1169 node.children.push last_node
1170 last_node.parent = node
1171 # 10. Let last node be node.
1173 # 11. Return to the step labeled inner loop.
1174 # 14. Insert whatever last node ended up being in the previous step
1175 # at the appropriate place for inserting a node, but using common
1176 # ancestor as the override target.
1178 # In the case where fe is immediately followed by fb:
1179 # * inner loop exits out early (node==fe)
1181 # * last_node is still in the tree (not a duplicate)
1182 if last_node.parent?
1183 for c, i in last_node.parent.children
1185 last_node.parent.children.splice i, 1
1187 # can't use standard insert token thing, because it's already in
1188 # open_els and must stay at it's current position in open_els
1189 dest = adjusted_insertion_location ca
1190 dest[0].children.splice dest[1], 0, last_node
1191 last_node.parent = dest[0]
1192 # 15. Create an element for the token for which formatting element
1193 # was created, in the HTML namespace, with furthest block as the
1195 new_element = token_to_element fe.token, NS_HTML, fb
1196 # 16. Take all of the child nodes of furthest block and append them
1197 # to the element created in the last step.
1198 while fb.children.length
1199 t = fb.children.shift()
1200 t.parent = new_element
1201 new_element.children.push t
1202 # 17. Append that new element to furthest block.
1203 new_element.parent = fb
1204 fb.children.push new_element
1205 # 18. Remove formatting element from the list of active formatting
1206 # elements, and insert the new element into the list of active
1207 # formatting elements at the position of the aforementioned
1215 afe[i] = new_element
1217 # 19. Remove formatting element from the stack of open elements,
1218 # and insert the new element into the stack of open elements
1219 # immediately below the position of furthest block in that stack.
1220 for t, i in open_els
1222 open_els.splice i, 1
1224 for t, i in open_els
1226 open_els.splice i, 0, new_element
1228 # 20. Jump back to the step labeled outer loop.
1231 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes an open p element: generates implied end tags (except for p), checks
# whether the current node is an HTML p, then pops elements off open_els.
# NOTE(review): the statements guarded by the `unless` and by the final `if`
# (presumably a parse error and the loop exit) are not visible in this listing.
1232 close_p_element = ->
1233 generate_implied_end_tags 'p' # arg is exception
1234 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
# the length test keeps at least one element on the stack
1236 while open_els.length > 1 # just in case
1237 el = open_els.shift()
1238 if el.name is 'p' and el.namespace is NS_HTML
# Tests whether an HTML p element is in button scope.
# NOTE(review): the guarded statement is not visible in this listing
# (presumably a call to close_p_element, judging by the name) - confirm.
1241 close_p_if_in_button_scope = ->
1242 if is_in_button_scope 'p', NS_HTML
1246 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1247 # aka insert_a_character = (t) ->
# Inserts the text token t at the adjusted insertion location, which is a
# [parent, index_within_children] pair.
1248 insert_character = (t) ->
1249 dest = adjusted_insertion_location()
1250 # fixfull check for Document node
# prev is the sibling immediately before the insertion index
# NOTE(review): prev is undefined when dest[1] is 0; confirm that a guard
# exists on the line(s) not shown in this listing.
1252 prev = dest[0].children[dest[1] - 1]
1253 if prev.type is TYPE_TEXT
# t itself is spliced into the parent's children at the insertion index
1256 dest[0].children.splice dest[1], 0, t
1259 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction entry point for one token t.
# The visible tests inspect the adjusted current node (acn) and the token
# type/name; `in_foreign_content t` is the last visible statement.
# NOTE(review): the statements guarded by each test are not visible in this
# listing, so the exact dispatch targets cannot be confirmed from here.
1260 process_token = (t) ->
1261 acn = adjusted_current_node()
# adjusted current node is in the HTML namespace
1265 if acn.namespace is NS_HTML
# MathML text integration point: start tags other than mglyph/malignmark, and text
1268 if is_mathml_text_integration_point(acn)
1269 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1272 if t.type is TYPE_TEXT
# <svg> start tag directly inside MathML annotation-xml
1275 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
# HTML integration point: start tags and text
1278 if is_html_integration acn
1279 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1282 if t.type is TYPE_EOF
1285 in_foreign_content t
1289 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1290 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes where the next node should be inserted.
#   override_target: optional node to use instead of the current node
#                    (open_els[0]) as the starting target.
# Returns a two-element array [target, target_i]: the parent node and the
# index within target.children at which to insert.
# NOTE(review): some short statements (assignments, breaks) are not visible
# in this listing.
1291 adjusted_insertion_location = (override_target = null) ->
1292 # 1. If there was an override target specified, then let target be the
1295 target = override_target
1296 else # Otherwise, let target be the current node.
1297 target = open_els[0]
1298 # 2. Determine the adjusted insertion location using the first matching
1299 # steps from the following list:
1301 # If foster parenting is enabled and target is a table, tbody, tfoot,
1302 # thead, or tr element Foster parenting happens when content is
1303 # misnested in tables.
# foster_parenting_targets maps tag name -> namespace, so this test matches
# only when both the name and the namespace of target agree
1304 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1305 loop # once. this is here so we can ``break`` to "abort these substeps"
1306 # 1. Let last template be the last template element in the
1307 # stack of open elements, if any.
1308 last_template = null
1309 last_template_i = null
1310 for el, i in open_els
1311 if el.name is 'template' and el.namespace is NS_HTML
1315 # 2. Let last table be the last table element in the stack of
1316 # open elements, if any.
1319 for el, i in open_els
1320 if el.name is 'table' and el.namespace is NS_HTML
1324 # 3. If there is a last template and either there is no last
1325 # table, or there is one, but last template is lower (more
1326 # recently added) than last table in the stack of open
1327 # elements, then: let adjusted insertion location be inside
1328 # last template's template contents, after its last child (if
1329 # any), and abort these substeps.
# a smaller index in open_els means more recently added
1330 if last_template and (last_table is null or last_template_i < last_table_i)
1331 target = last_template # fixfull should be its contents
1332 target_i = target.children.length
1334 # 4. If there is no last table, then let adjusted insertion
1335 # location be inside the first element in the stack of open
1336 # elements (the html element), after its last child (if any),
1337 # and abort these substeps. (fragment case)
1338 if last_table is null
# the last array slot of open_els holds the first element pushed (the html element)
1340 target = open_els[open_els.length - 1]
1341 target_i = target.children.length
1343 # 5. If last table has a parent element, then let adjusted
1344 # insertion location be inside last table's parent element,
1345 # immediately before last table, and abort these substeps.
1346 if last_table.parent?
1347 for c, i in last_table.parent.children
1349 target = last_table.parent
1353 # 6. Let previous element be the element immediately above last
1354 # table in the stack of open elements.
1356 # huh? how could it not have a parent?
1357 previous_element = open_els[last_table_i + 1]
1358 # 7. Let adjusted insertion location be inside previous
1359 # element, after its last child (if any).
1360 target = previous_element
1361 target_i = target.children.length
1362 # Note: These steps are involved in part because it's possible
1363 # for elements, the table element in this case in particular,
1364 # to have been moved by a script around in the DOM, or indeed
1365 # removed from the DOM entirely, after the element was inserted
1367 break # don't really loop
1369 # Otherwise Let adjusted insertion location be inside target, after
1370 # its last child (if any).
1371 target_i = target.children.length
1373 # 3. If the adjusted insertion location is inside a template element,
1374 # let it instead be inside the template element's template contents,
1375 # after its last child (if any).
1376 # fixfull (template)
1378 # 4. Return the adjusted insertion location.
1379 return [target, target_i]
1381 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1382 # aka create_an_element_for_token
# Builds a TYPE_TAG Node for token t in the given namespace. The node carries
# the token's name, the namespace, an attribute hash and a reference back to
# the token itself.
# NOTE(review): intended_parent is not referenced in any line visible in this
# listing; the attribute loop header and the return are also not shown.
1383 token_to_element = (t, namespace, intended_parent) ->
1384 # convert attributes into a hash
# each attribute a is indexed as a[0] (key) and a[1] (value)
1387 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1388 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1390 # TODO 2. If the newly created element has an xmlns attribute in the
1391 # XMLNS namespace whose value is not exactly the same as the element's
1392 # namespace, that is a parse error. Similarly, if the newly created
1393 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1394 # value is not the XLink Namespace, that is a parse error.
1396 # fixfull: the spec says stuff about form pointers and ownerDocument
1400 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the
# children of the node at the adjusted insertion location.
# NOTE(review): the lines that derive ail_el / ail_i from ail, and whatever
# follows the splice (stack push, return value), are not visible in this
# listing.
1401 insert_foreign_element = (token, namespace) ->
1402 ail = adjusted_insertion_location()
1405 el = token_to_element token, namespace, ail_el
1406 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1408 ail_el.children.splice ail_i, 0, el
# http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Convenience wrapper: inserts an element for `token` in the HTML namespace
# and hands back whatever insert_foreign_element returns.
insert_html_element = (token) ->
  insert_foreign_element(token, NS_HTML)
# http://www.w3.org/TR/html5/syntax.html#insert-a-comment
# Splices comment token t into the tree.
#   position: optional [node, index_within_children] pair; when it is absent
#             the adjusted insertion location is used instead.
insert_comment = (t, position = null) ->
  position = adjusted_insertion_location() unless position?
  position[0].children.splice(position[1], 0, t)
# http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: insert an HTML element for t, point the
# tokenizer at the RAWTEXT state, remember the insertion mode to come back
# to, and continue in the "text" insertion mode.
parse_generic_raw_text = (t) ->
  insert_html_element(t)
  # save the current mode before it is overwritten below
  original_ins_mode = ins_mode
  tok_state = tok_state_rawtext
  ins_mode = ins_mode_text
# Generic RCDATA element parsing: insert an HTML element for t, point the
# tokenizer at the RCDATA state, remember the insertion mode to come back to,
# and continue in the "text" insertion mode.
parse_generic_rcdata_text = (t) ->
  insert_html_element(t)
  # save the current mode before it is overwritten below
  original_ins_mode = ins_mode
  tok_state = tok_state_rcdata
  ins_mode = ins_mode_text
1437 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1438 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) is listed in end_tag_implied
# (a tag name -> namespace map) and its name differs from `except`.
#   except: optional tag name that must not be closed implicitly.
# NOTE(review): the loop body (presumably popping the current node) is not
# visible in this listing.
1439 generate_implied_end_tags = (except = null) ->
1440 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1444 # 8.2.5.4 The rules for parsing tokens in HTML content
1445 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1447 # 8.2.5.4.1 The "initial" insertion mode
1448 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Inspects DOCTYPE token t for the quirks-mode triggers: the force-quirks
# flag, a name other than 'html', and known public/system identifiers
# (compared lower-cased).
# NOTE(review): the return statements are not visible in this listing.
1449 is_quirks_yes_doctype = (t) ->
1450 if t.flag 'force-quirks'
1452 if t.name isnt 'html'
1454 if t.public_identifier?
1455 pi = t.public_identifier.toLowerCase()
# prefix match against the list of quirks-triggering public identifiers
1456 for p in quirks_yes_pi_prefixes
1457 if pi.substr(0, p.length) is p
# exact-match public identifiers
1459 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1461 if t.system_identifier?
1462 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
# no system identifier: HTML 4.01 frameset/transitional public identifiers
1464 else if t.public_identifier?
1465 # already did this: pi = t.public_identifier.toLowerCase()
1466 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Inspects DOCTYPE token t for the limited-quirks triggers: XHTML 1.0
# frameset/transitional public identifiers, and HTML 4.01
# frameset/transitional public identifiers when a system identifier is
# present.
# NOTE(review): the return statements are not visible in this listing.
1469 is_quirks_limited_doctype = (t) ->
1470 if t.public_identifier?
1471 pi = t.public_identifier.toLowerCase()
1472 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
# NOTE(review): `pi` is only assigned under the public_identifier check above.
# Indentation is not visible here - confirm this test is nested inside that
# check, otherwise pi.substr would be called on undefined for a doctype with
# a system identifier but no public identifier.
1474 if t.system_identifier?
1475 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# "initial" insertion mode handler for token t.
# Visible behaviour: for a DOCTYPE token the document's 'quirks mode' flag is
# set to QUIRKS_YES or QUIRKS_LIMITED according to the two doctype helpers,
# and the insertion mode moves on to ins_mode_before_html. The last visible
# statements set QUIRKS_YES and switch to ins_mode_before_html as well.
# NOTE(review): the statements guarded by the first tests, and the returns,
# are not visible in this listing.
1478 ins_mode_initial = (t) ->
1481 if t.type is TYPE_COMMENT
1485 if t.type is TYPE_DOCTYPE
1486 # fixfull syntax error from first paragraph and following bullets
1487 # fixfull set doc.doctype
1488 # fixfull is the "not an iframe srcdoc" thing relevant?
1489 if is_quirks_yes_doctype t
1490 doc.flag 'quirks mode', QUIRKS_YES
1491 else if is_quirks_limited_doctype t
1492 doc.flag 'quirks mode', QUIRKS_LIMITED
1494 ins_mode = ins_mode_before_html
1497 # fixfull not iframe srcdoc?
1499 doc.flag 'quirks mode', QUIRKS_YES
1500 ins_mode = ins_mode_before_html
1504 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# "before html" insertion mode handler for token t.
# An <html> start tag becomes the root element: it is created from the token,
# appended to doc.children and put on open_els. The last visible statements
# create the root from a synthesized 'html' open tag instead. Both paths
# switch to ins_mode_before_head.
# NOTE(review): the statements guarded by the DOCTYPE/comment/end-tag tests,
# and the returns, are not visible in this listing.
1505 ins_mode_before_html = (t) ->
1506 if t.type is TYPE_DOCTYPE
1509 if t.type is TYPE_COMMENT
1514 if t.type is TYPE_START_TAG and t.name is 'html'
1515 el = token_to_element t, NS_HTML, doc
1516 doc.children.push el
# the new root becomes the current node (index 0 of open_els)
1518 open_els.unshift(el)
1519 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1520 ins_mode = ins_mode_before_head
1522 if t.type is TYPE_END_TAG
1523 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1524 # fall through to "anything else"
# "anything else": synthesize the html element
1529 el = token_to_element new_open_tag('html'), NS_HTML, doc
1530 doc.children.push el
1533 # ?fixfull browsing context
1534 ins_mode = ins_mode_before_head
1538 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# "before head" insertion mode handler for token t.
# A <head> start tag is inserted, recorded in head_element_pointer, and the
# mode switches to ins_mode_in_head. The last visible statements do the same
# with a synthesized 'head' open tag.
# NOTE(review): the statements guarded by the other tests, and the returns,
# are not visible in this listing.
1539 ins_mode_before_head = (t) ->
1542 if t.type is TYPE_COMMENT
1545 if t.type is TYPE_DOCTYPE
1548 if t.type is TYPE_START_TAG and t.name is 'html'
1551 if t.type is TYPE_START_TAG and t.name is 'head'
1552 el = insert_html_element t
1553 head_element_pointer = el
1554 ins_mode = ins_mode_in_head
1556 if t.type is TYPE_END_TAG
1557 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1558 # fall through to Anything else below
# "anything else": synthesize the head element
1563 el = insert_html_element new_open_tag 'head'
1564 head_element_pointer = el
1565 ins_mode = ins_mode_in_head
1569 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" mode: pops the current node and
# switches to ins_mode_after_head.
# NOTE(review): whatever follows the mode switch (presumably reprocessing t)
# is not visible in this listing.
1570 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1571 open_els.shift() # spec says this will be a 'head' node
1572 ins_mode = ins_mode_after_head
# "in head" insertion mode handler for token t.
# Handles, in order: whitespace text, comments, DOCTYPE, <html>, void head
# elements (base/basefont/bgsound/link/meta), <title> (RCDATA), raw-text
# elements (noscript with scripting, noframes, style), <noscript> without
# scripting, <script>, </head>, </body> </html> </br>, <template>,
# </template>, stray <head>/end tags, and finally "anything else".
# NOTE(review): this listing omits short statements (returns, parse errors,
# pops); only the visible lines are reproduced here. Two fixes are applied:
#   * the <script> branch passes the insertion-location node (ail[0]), not
#     the whole [node, index] pair, as the intended parent;
#   * the </template> branch actually calls generate_implied_end_tags().
1575 ins_mode_in_head = (t) ->
1576 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1579 if t.type is TYPE_COMMENT
1582 if t.type is TYPE_DOCTYPE
1585 if t.type is TYPE_START_TAG and t.name is 'html'
1588 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1589 el = insert_html_element t
1591 t.acknowledge_self_closing()
1593 if t.type is TYPE_START_TAG and t.name is 'meta'
1594 el = insert_html_element t
1596 t.acknowledge_self_closing()
1597 # fixfull encoding stuff
1599 if t.type is TYPE_START_TAG and t.name is 'title'
1600 parse_generic_rcdata_text t
1602 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1603 parse_generic_raw_text t
1605 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1606 insert_html_element t
1607 ins_mode = ins_mode_in_head_noscript
1609 if t.type is TYPE_START_TAG and t.name is 'script'
# ail is a [node, index_within_children] pair
1610 ail = adjusted_insertion_location()
# the intended parent is the node half of the pair (same as insert_foreign_element)
1611 el = token_to_element t, NS_HTML, ail[0]
1612 el.flag 'parser-inserted', true
1613 # fixfull fragment case
1614 ail[0].children.splice ail[1], 0, el
1616 tok_state = tok_state_script_data
1617 original_ins_mode = ins_mode # make sure orig... is defined
1618 ins_mode = ins_mode_text
1620 if t.type is TYPE_END_TAG and t.name is 'head'
1621 open_els.shift() # will be a head element... spec says so
1622 ins_mode = ins_mode_after_head
1624 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1625 ins_mode_in_head_else t
1627 if t.type is TYPE_START_TAG and t.name is 'template'
1628 insert_html_element t
1630 flag_frameset_ok = false
1631 ins_mode = ins_mode_in_template
1632 template_ins_modes.unshift ins_mode_in_template
1634 if t.type is TYPE_END_TAG and t.name is 'template'
1635 if template_tag_is_open()
# parentheses are required: a bare identifier is only a reference in
# CoffeeScript and would not run the function
1636 generate_implied_end_tags()
1637 if open_els[0].name isnt 'template'
1640 el = open_els.shift()
1641 if el.name is 'template' and el.namespace is NS_HTML
1643 clear_afe_to_marker()
1644 template_ins_modes.shift()
1649 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1652 ins_mode_in_head_else t
1655 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" mode: the only visible
# effect is switching back to ins_mode_in_head.
# NOTE(review): the statements before and after the mode switch are not
# visible in this listing.
1656 ins_mode_in_head_noscript_else = (t) ->
1659 ins_mode = ins_mode_in_head
# "in head noscript" insertion mode handler for token t.
# </noscript> switches back to ins_mode_in_head; </br> and the final line
# delegate to ins_mode_in_head_noscript_else.
# NOTE(review): the statements guarded by the remaining tests, and the
# returns, are not visible in this listing.
1662 ins_mode_in_head_noscript = (t) ->
1663 if t.type is TYPE_DOCTYPE
1666 if t.type is TYPE_START_TAG and t.name is 'html'
1669 if t.type is TYPE_END_TAG and t.name is 'noscript'
1671 ins_mode = ins_mode_in_head
# whitespace, comments and the head-level start tags listed here share one branch
1673 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1676 if t.type is TYPE_END_TAG and t.name is 'br'
1677 ins_mode_in_head_noscript_else t
# nested <head>/<noscript> start tags and any other end tag
1679 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1683 ins_mode_in_head_noscript_else t
1686 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" mode: inserts an element for a
# synthesized 'body' open tag and switches to ins_mode_in_body.
# NOTE(review): whatever follows the mode switch (presumably reprocessing t)
# is not visible in this listing.
1687 ins_mode_after_head_else = (t) ->
1688 body_tok = new_open_tag 'body'
1689 insert_html_element body_tok
1690 ins_mode = ins_mode_in_body
# "after head" insertion mode handler for token t.
# <body> and <frameset> start tags are inserted and switch the mode to
# ins_mode_in_body / ins_mode_in_frameset. Head-level start tags temporarily
# put head_element_pointer back on open_els and remove it again afterwards.
# </body>, </html>, </br> and the final line delegate to
# ins_mode_after_head_else.
# NOTE(review): the statements guarded by the remaining tests, the call made
# while the head element is back on the stack, and the returns are not
# visible in this listing.
1693 ins_mode_after_head = (t) ->
1697 if t.type is TYPE_COMMENT
1700 if t.type is TYPE_DOCTYPE
1703 if t.type is TYPE_START_TAG and t.name is 'html'
1706 if t.type is TYPE_START_TAG and t.name is 'body'
1707 insert_html_element t
1708 flag_frameset_ok = false
1709 ins_mode = ins_mode_in_body
1711 if t.type is TYPE_START_TAG and t.name is 'frameset'
1712 insert_html_element t
1713 ins_mode = ins_mode_in_frameset
1715 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# make the head element the current node again
1717 open_els.unshift head_element_pointer
# search by identity, since the head element may no longer be at index 0
1719 for el, i in open_els
1720 if el is head_element_pointer
1721 open_els.splice i, 1
1724 if t.type is TYPE_END_TAG and t.name is 'template'
1727 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1728 ins_mode_after_head_else t
1730 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1734 ins_mode_after_head_else t
1737 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling for the "in body" mode, for the end tag called
# `name`. It examines a node from open_els; on an HTML-namespace name match
# it generates implied end tags (except for `name`) and pops from the stack,
# and it tests whether the node is in special_elements (a tag name ->
# namespace map) before moving to the next entry up the stack.
# NOTE(review): the initialisation of `node`, the loop headers and exits, and
# the statements guarded by the `unless` and the special_elements test are
# not visible in this listing.
1738 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1741 if node.name is name and node.namespace is NS_HTML
1742 generate_implied_end_tags name # arg is exception
1743 unless node is open_els[0]
1746 el = open_els.shift()
1749 if special_elements[node.name] is node.namespace
# advance node to the entry one position higher in open_els
1752 for el, i in open_els
1754 node = open_els[i + 1]
1757 ins_mode_in_body = (t) ->
1758 if t.type is TYPE_TEXT and t.text is "\u0000"
1765 if t.type is TYPE_TEXT
1768 flag_frameset_ok = false
1770 if t.type is TYPE_COMMENT
1773 if t.type is TYPE_DOCTYPE
1776 if t.type is TYPE_START_TAG and t.name is 'html'
1778 return if template_tag_is_open()
1779 root_attrs = open_els[open_els.length - 1].attrs
1781 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1784 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1787 if t.type is TYPE_START_TAG and t.name is 'body'
1789 return if open_els.length < 2
1790 second = open_els[open_els.length - 2]
1791 return unless second.namespace is NS_HTML
1792 return unless second.name is 'body'
1793 return if template_tag_is_open()
1794 flag_frameset_ok = false
1796 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1798 if t.type is TYPE_START_TAG and t.name is 'frameset'
1800 return if open_els.length < 2
1801 second_i = open_els.length - 2
1802 second = open_els[second_i]
1803 return unless second.namespace is NS_HTML
1804 return unless second.name is 'body'
1805 if flag_frameset_ok is false
1808 for el, i in second.parent.children
1810 second.parent.children.splice i, 1
1812 open_els.splice second_i, 1
1813 # pop everything except the "root html element"
1814 while open_els.length > 1
1816 insert_html_element t
1817 ins_mode = ins_mode_in_frameset
1819 if t.type is TYPE_EOF
1821 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1822 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1823 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1826 unless ok_tags[t.name] is el.namespace
1829 if template_ins_modes.length > 0
1830 ins_mode_in_template t
1834 if t.type is TYPE_END_TAG and t.name is 'body'
1835 unless is_in_scope 'body', NS_HTML
1839 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1840 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1841 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1842 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1846 unless ok_tags[t.name] is el.namespace
1849 ins_mode = ins_mode_after_body
1851 if t.type is TYPE_END_TAG and t.name is 'html'
1852 unless is_in_scope 'body', NS_HTML
1856 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1857 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1858 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1859 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1863 unless ok_tags[t.name] is el.namespace
1866 ins_mode = ins_mode_after_body
1869 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1870 close_p_if_in_button_scope()
1871 insert_html_element t
1873 if t.type is TYPE_START_TAG and h_tags[t.name]?
1874 close_p_if_in_button_scope()
1875 if h_tags[open_els[0].name] is open_els[0].namespace
1878 insert_html_element t
1880 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1881 close_p_if_in_button_scope()
1882 insert_html_element t
1883 eat_next_token_if_newline()
1884 flag_frameset_ok = false
1886 if t.type is TYPE_START_TAG and t.name is 'form'
1887 unless form_element_pointer is null or template_tag_is_open()
1890 close_p_if_in_button_scope()
1891 el = insert_html_element t
1892 unless template_tag_is_open()
1893 form_element_pointer = el
1895 if t.type is TYPE_START_TAG and t.name is 'li'
1896 flag_frameset_ok = false
1897 for node in open_els
1898 if node.name is 'li' and node.namespace is NS_HTML
1899 generate_implied_end_tags 'li' # arg is exception
1900 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1903 el = open_els.shift()
1904 if el.name is 'li' and el.namespace is NS_HTML
1907 if el_is_special_not_adp node
1909 close_p_if_in_button_scope()
1910 insert_html_element t
1912 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1913 flag_frameset_ok = false
1914 for node in open_els
1915 if node.name is 'dd' and node.namespace is NS_HTML
1916 generate_implied_end_tags 'dd' # arg is exception
1917 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1920 el = open_els.shift()
1921 if el.name is 'dd' and el.namespace is NS_HTML
1924 if node.name is 'dt' and node.namespace is NS_HTML
1925 generate_implied_end_tags 'dt' # arg is exception
1926 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1929 el = open_els.shift()
1930 if el.name is 'dt' and el.namespace is NS_HTML
1933 if el_is_special_not_adp node
1935 close_p_if_in_button_scope()
1936 insert_html_element t
1938 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1939 close_p_if_in_button_scope()
1940 insert_html_element t
1941 tok_state = tok_state_plaintext
1943 if t.type is TYPE_START_TAG and t.name is 'button'
1944 if is_in_scope 'button', NS_HTML
1946 generate_implied_end_tags()
1948 el = open_els.shift()
1949 if el.name is 'button' and el.namespace is NS_HTML
1952 insert_html_element t
1953 flag_frameset_ok = false
1955 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1956 unless is_in_scope t.name, NS_HTML
1959 generate_implied_end_tags()
1960 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1963 el = open_els.shift()
1964 if el.name is t.name and el.namespace is NS_HTML
1967 if t.type is TYPE_END_TAG and t.name is 'form'
1968 unless template_tag_is_open()
1969 node = form_element_pointer
1970 form_element_pointer = null
1971 if node is null or not el_is_in_scope node
1974 generate_implied_end_tags()
1975 if open_els[0] isnt node
1977 for el, i in open_els
1979 open_els.splice i, 1
1982 unless is_in_scope 'form', NS_HTML
1985 generate_implied_end_tags()
1986 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1989 el = open_els.shift()
1990 if el.name is 'form' and el.namespace is NS_HTML
1993 if t.type is TYPE_END_TAG and t.name is 'p'
1994 unless is_in_button_scope 'p', NS_HTML
1996 insert_html_element new_open_tag 'p'
1999 if t.type is TYPE_END_TAG and t.name is 'li'
2000 unless is_in_li_scope 'li', NS_HTML
2003 generate_implied_end_tags 'li' # arg is exception
2004 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2007 el = open_els.shift()
2008 if el.name is 'li' and el.namespace is NS_HTML
2011 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2012 unless is_in_scope t.name, NS_HTML
2015 generate_implied_end_tags t.name # arg is exception
2016 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2019 el = open_els.shift()
2020 if el.name is t.name and el.namespace is NS_HTML
2023 if t.type is TYPE_END_TAG and h_tags[t.name]?
2026 if h_tags[el.name] is el.namespace
2029 if standard_scopers[el.name] is el.namespace
2034 generate_implied_end_tags()
2035 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2038 el = open_els.shift()
2039 if h_tags[el.name] is el.namespace
2043 if t.type is TYPE_START_TAG and t.name is 'a'
2044 # If the list of active formatting elements contains an a element
2045 # between the end of the list and the last marker on the list (or
2046 # the start of the list if there is no marker on the list), then
2047 # this is a parse error; run the adoption agency algorithm for the
2048 # tag name "a", then remove that element from the list of active
2049 # formatting elements and the stack of open elements if the
2050 # adoption agency algorithm didn't already remove it (it might not
2051 # have if the element is not in table scope).
2054 if el.type is TYPE_AFE_MARKER
2056 if el.name is 'a' and el.namespace is NS_HTML
2064 for el, i in open_els
2066 open_els.splice i, 1
2068 el = insert_html_element t
2071 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2073 el = insert_html_element t
2076 if t.type is TYPE_START_TAG and t.name is 'nobr'
2078 if is_in_scope 'nobr', NS_HTML
2080 adoption_agency 'nobr'
2082 el = insert_html_element t
2085 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2086 adoption_agency t.name
2088 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2090 insert_html_element t
2092 flag_frameset_ok = false
2094 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2095 unless is_in_scope t.name, NS_HTML
2098 generate_implied_end_tags()
2099 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2102 el = open_els.shift()
2103 if el.name is t.name and el.namespace is NS_HTML
2105 clear_afe_to_marker()
2107 if t.type is TYPE_START_TAG and t.name is 'table'
2108 unless doc.flag('quirks mode') is QUIRKS_YES
2109 close_p_if_in_button_scope() # test
2110 insert_html_element t
2111 flag_frameset_ok = false
2112 ins_mode = ins_mode_in_table
2114 if t.type is TYPE_END_TAG and t.name is 'br'
2116 # W3C: t.type = TYPE_START_TAG
2117 t = new_open_tag 'br' # WHATWG
2119 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2121 insert_html_element t
2123 t.acknowledge_self_closing()
2124 flag_frameset_ok = false
2126 if t.type is TYPE_START_TAG and t.name is 'input'
2128 insert_html_element t
2130 t.acknowledge_self_closing()
2131 unless is_input_hidden_tok t
2132 flag_frameset_ok = false
2134 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2135 # WHATWG adds 'menuitem' for this block
2136 insert_html_element t
2138 t.acknowledge_self_closing()
2140 if t.type is TYPE_START_TAG and t.name is 'hr'
2141 close_p_if_in_button_scope()
2142 insert_html_element t
2144 t.acknowledge_self_closing()
2145 flag_frameset_ok = false
2147 if t.type is TYPE_START_TAG and t.name is 'image'
2152 if t.type is TYPE_START_TAG and t.name is 'isindex'
2154 if template_tag_is_open() is false and form_element_pointer isnt null
2156 t.acknowledge_self_closing()
2157 flag_frameset_ok = false
2158 close_p_if_in_button_scope()
2159 el = insert_html_element new_open_tag 'form'
2160 unless template_tag_is_open()
2161 form_element_pointer = el
2164 el.attrs['action'] = a[1]
2166 insert_html_element new_open_tag 'hr'
2169 insert_html_element new_open_tag 'label'
2170 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2171 input_el = new_open_tag 'input'
2176 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2177 input_el.attrs_a.push [a[0], a[1]]
2178 input_el.attrs_a.push ['name', 'isindex']
2179 # fixfull this next bit is in english... internationalize?
2180 prompt ?= "This is a searchable index. Enter search keywords: "
2181 insert_character new_character_token prompt # fixfull split
2182 # TODO submit typo "balue" in spec
2183 insert_html_element input_el
2185 # insert_character '' # you can put chars here if prompt attr missing
2187 insert_html_element new_open_tag 'hr'
2190 unless template_tag_is_open()
2191 form_element_pointer = null
2193 if t.type is TYPE_START_TAG and t.name is 'textarea'
2194 insert_html_element t
2195 eat_next_token_if_newline()
2196 tok_state = tok_state_rcdata
2197 original_ins_mode = ins_mode
2198 flag_frameset_ok = false
2199 ins_mode = ins_mode_text
2201 if t.type is TYPE_START_TAG and t.name is 'xmp'
2202 close_p_if_in_button_scope()
2204 flag_frameset_ok = false
2205 parse_generic_raw_text t
2207 if t.type is TYPE_START_TAG and t.name is 'iframe'
2208 flag_frameset_ok = false
2209 parse_generic_raw_text t
2211 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2212 parse_generic_raw_text t
2214 if t.type is TYPE_START_TAG and t.name is 'select'
2216 insert_html_element t
2217 flag_frameset_ok = false
2218 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2219 ins_mode = ins_mode_in_select_in_table
2221 ins_mode = ins_mode_in_select
2223 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2224 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2227 insert_html_element t
2229 # this comment block implements the W3C spec
2230 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2231 # if is_in_scope 'ruby', NS_HTML
2232 # generate_implied_end_tags()
2233 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2235 # insert_html_element t
2237 # if t.type is TYPE_START_TAG and t.name is 'rt'
2238 # if is_in_scope 'ruby', NS_HTML
2239 # generate_implied_end_tags 'rtc' # arg is exception
2240 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2242 # insert_html_element t
2244 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2245 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2246 if is_in_scope 'ruby', NS_HTML
2247 generate_implied_end_tags()
2248 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2250 insert_html_element t
2252 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2253 if is_in_scope 'ruby', NS_HTML
2254 generate_implied_end_tags 'rtc'
2255 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2257 insert_html_element t
2260 if t.type is TYPE_START_TAG and t.name is 'math'
2262 adjust_mathml_attributes t
2263 adjust_foreign_attributes t
2264 insert_foreign_element t, NS_MATHML
2265 if t.flag 'self-closing'
2267 t.acknowledge_self_closing()
2269 if t.type is TYPE_START_TAG and t.name is 'svg'
2271 adjust_svg_attributes t
2272 adjust_foreign_attributes t
2273 insert_foreign_element t, NS_SVG
2274 if t.flag 'self-closing'
2276 t.acknowledge_self_closing()
2278 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2281 if t.type is TYPE_START_TAG # any other start tag
2283 insert_html_element t
2285 if t.type is TYPE_END_TAG # any other end tag
2286 in_body_any_other_end_tag t.name
2290 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2291 ins_mode_text = (t) ->
2292 if t.type is TYPE_TEXT
2295 if t.type is TYPE_EOF
2297 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2298 open_els[0].flag 'already started', true
2300 ins_mode = original_ins_mode
2303 if t.type is TYPE_END_TAG and t.name is 'script'
2305 ins_mode = original_ins_mode
2306 # fixfull the spec seems to assume that I'm going to run the script
2307 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2309 if t.type is TYPE_END_TAG
2311 ins_mode = original_ins_mode
2315 # the functions below implement the tokenizer states described here:
2316 # http://www.w3.org/TR/html5/syntax.html#tokenization
2318 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2319 ins_mode_in_table_else = (t) ->
2321 flag_foster_parenting = true
2323 flag_foster_parenting = false
2325 ins_mode_in_table = (t) ->
2328 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2329 pending_table_character_tokens = []
2330 original_ins_mode = ins_mode
2331 ins_mode = ins_mode_in_table_text
2334 ins_mode_in_table_else t
2342 clear_stack_to_table_context()
2344 insert_html_element t
2345 ins_mode = ins_mode_in_caption
2347 clear_stack_to_table_context()
2348 insert_html_element t
2349 ins_mode = ins_mode_in_column_group
2351 clear_stack_to_table_context()
2352 insert_html_element new_open_tag 'colgroup'
2353 ins_mode = ins_mode_in_column_group
2355 when 'tbody', 'tfoot', 'thead'
2356 clear_stack_to_table_context()
2357 insert_html_element t
2358 ins_mode = ins_mode_in_table_body
2359 when 'td', 'th', 'tr'
2360 clear_stack_to_table_context()
2361 insert_html_element new_open_tag 'tbody'
2362 ins_mode = ins_mode_in_table_body
2366 if is_in_table_scope 'table', NS_HTML
2368 el = open_els.shift()
2369 if el.name is 'table' and el.namespace is NS_HTML
2373 when 'style', 'script', 'template'
2376 unless is_input_hidden_tok t
2377 ins_mode_in_table_else t
2380 el = insert_html_element t
2382 t.acknowledge_self_closing()
2385 if form_element_pointer?
2387 if template_tag_is_open()
2389 form_element_pointer = insert_html_element t
2392 ins_mode_in_table_else t
2396 if is_in_table_scope 'table', NS_HTML
2398 el = open_els.shift()
2399 if el.name is 'table' and el.namespace is NS_HTML
2404 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2409 ins_mode_in_table_else t
2413 ins_mode_in_table_else t
2417 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2418 ins_mode_in_table_text = (t) ->
2419 if t.type is TYPE_TEXT and t.text is "\u0000"
2423 if t.type is TYPE_TEXT
2424 pending_table_character_tokens.push t
2428 for old in pending_table_character_tokens
2429 unless is_space_tok old
2433 for old in pending_table_character_tokens
2434 insert_character old
2436 for old in pending_table_character_tokens
2437 ins_mode_in_table_else old
2438 pending_table_character_tokens = []
2439 ins_mode = original_ins_mode
2443 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2444 ins_mode_in_caption = (t) ->
2445 if t.type is TYPE_END_TAG and t.name is 'caption'
2446 if is_in_table_scope 'caption', NS_HTML
2447 generate_implied_end_tags()
2448 if open_els[0].name isnt 'caption'
2451 el = open_els.shift()
2452 if el.name is 'caption' and el.namespace is NS_HTML
2454 clear_afe_to_marker()
2455 ins_mode = ins_mode_in_table
2460 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2462 if is_in_table_scope 'caption', NS_HTML
2464 el = open_els.shift()
2465 if el.name is 'caption' and el.namespace is NS_HTML
2467 clear_afe_to_marker()
2468 ins_mode = ins_mode_in_table
2470 # else fragment case
2472 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2479 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2480 ins_mode_in_column_group = (t) ->
2484 if t.type is TYPE_COMMENT
2487 if t.type is TYPE_DOCTYPE
2490 if t.type is TYPE_START_TAG and t.name is 'html'
2493 if t.type is TYPE_START_TAG and t.name is 'col'
2494 el = insert_html_element t
2496 t.acknowledge_self_closing()
2498 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2499 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML # current node must be an HTML colgroup
2501 ins_mode = ins_mode_in_table
2505 if t.type is TYPE_END_TAG and t.name is 'col'
2508 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2511 if t.type is TYPE_EOF
2515 if open_els[0].name isnt 'colgroup'
2519 ins_mode = ins_mode_in_table
2523 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2524 ins_mode_in_table_body = (t) ->
2525 if t.type is TYPE_START_TAG and t.name is 'tr'
2526 clear_stack_to_table_body_context()
2527 insert_html_element t
2528 ins_mode = ins_mode_in_row
2530 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2532 clear_stack_to_table_body_context()
2533 insert_html_element new_open_tag 'tr'
2534 ins_mode = ins_mode_in_row
2537 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2538 unless is_in_table_scope t.name, NS_HTML
2541 clear_stack_to_table_body_context()
2543 ins_mode = ins_mode_in_table
2545 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2548 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2551 if table_scopers[el.name] is el.namespace
2556 clear_stack_to_table_body_context()
2558 ins_mode = ins_mode_in_table
2561 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2568 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2569 ins_mode_in_row = (t) ->
2570 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2571 clear_stack_to_table_row_context()
2572 insert_html_element t
2573 ins_mode = ins_mode_in_cell
2576 if t.type is TYPE_END_TAG and t.name is 'tr'
2577 if is_in_table_scope 'tr', NS_HTML
2578 clear_stack_to_table_row_context()
2580 ins_mode = ins_mode_in_table_body
2584 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2585 if is_in_table_scope 'tr', NS_HTML
2586 clear_stack_to_table_row_context()
2588 ins_mode = ins_mode_in_table_body
2593 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2594 if is_in_table_scope t.name, NS_HTML
2595 if is_in_table_scope 'tr', NS_HTML
2596 clear_stack_to_table_row_context()
2598 ins_mode = ins_mode_in_table_body
2603 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2610 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2612 generate_implied_end_tags()
2613 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML # current node should be an HTML td/th
2616 el = open_els.shift()
2617 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2619 clear_afe_to_marker()
2620 ins_mode = ins_mode_in_row
2623 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2624 ins_mode_in_cell = (t) ->
2625 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2626 if is_in_table_scope t.name, NS_HTML
2627 generate_implied_end_tags()
2628 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2631 el = open_els.shift()
2632 if el.name is t.name and el.namespace is NS_HTML
2634 clear_afe_to_marker()
2635 ins_mode = ins_mode_in_row
2639 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2642 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2645 if table_scopers[el.name] is el.namespace
2653 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2656 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2657 if is_in_table_scope t.name, NS_HTML
2667 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2668 ins_mode_in_select = (t) ->
2669 if t.type is TYPE_TEXT and t.text is "\u0000"
2672 if t.type is TYPE_TEXT
2675 if t.type is TYPE_COMMENT
2678 if t.type is TYPE_DOCTYPE
2681 if t.type is TYPE_START_TAG and t.name is 'html'
2684 if t.type is TYPE_START_TAG and t.name is 'option'
2685 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2687 insert_html_element t
2689 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2690 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2692 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2694 insert_html_element t
2696 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2697 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2698 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML # node just below the current node is an HTML optgroup
2700 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2705 if t.type is TYPE_END_TAG and t.name is 'option'
2706 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2711 if t.type is TYPE_END_TAG and t.name is 'select'
2712 if is_in_select_scope 'select', NS_HTML
2714 el = open_els.shift()
2715 if el.name is 'select' and el.namespace is NS_HTML
2721 if t.type is TYPE_START_TAG and t.name is 'select'
2724 el = open_els.shift()
2725 if el.name is 'select' and el.namespace is NS_HTML
2728 # spec says that this is the same as </select> but it doesn't say
2729 # to check scope first
2731 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2733 unless is_in_select_scope 'select', NS_HTML
2736 el = open_els.shift()
2737 if el.name is 'select' and el.namespace is NS_HTML
2742 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2745 if t.type is TYPE_EOF
2752 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2753 ins_mode_in_select_in_table = (t) ->
2754 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2757 el = open_els.shift()
2758 if el.name is 'select' and el.namespace is NS_HTML
2763 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2765 unless is_in_table_scope t.name, NS_HTML
2768 el = open_els.shift()
2769 if el.name is 'select' and el.namespace is NS_HTML
2775 ins_mode_in_select t
2778 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2779 ins_mode_in_template = (t) ->
2780 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2783 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2786 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2787 template_ins_modes.shift()
2788 template_ins_modes.unshift ins_mode_in_table
2789 ins_mode = ins_mode_in_table
2792 if t.type is TYPE_START_TAG and t.name is 'col'
2793 template_ins_modes.shift()
2794 template_ins_modes.unshift ins_mode_in_column_group
2795 ins_mode = ins_mode_in_column_group
2798 if t.type is TYPE_START_TAG and t.name is 'tr'
2799 template_ins_modes.shift()
2800 template_ins_modes.unshift ins_mode_in_table_body
2801 ins_mode = ins_mode_in_table_body
2804 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2805 template_ins_modes.shift()
2806 template_ins_modes.unshift ins_mode_in_row
2807 ins_mode = ins_mode_in_row
2810 if t.type is TYPE_START_TAG
2811 template_ins_modes.shift()
2812 template_ins_modes.unshift ins_mode_in_body
2813 ins_mode = ins_mode_in_body
2816 if t.type is TYPE_END_TAG
2819 if t.type is TYPE_EOF
2820 unless template_tag_is_open()
2825 el = open_els.shift()
2826 if el.name is 'template' and el.namespace is NS_HTML
2828 clear_afe_to_marker()
2829 template_ins_modes.shift()
2834 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2835 ins_mode_after_body = (t) ->
2839 if t.type is TYPE_COMMENT
2840 first = open_els[open_els.length - 1]
2841 insert_comment t, [first, first.children.length]
2843 if t.type is TYPE_DOCTYPE
2846 if t.type is TYPE_START_TAG and t.name is 'html'
2849 if t.type is TYPE_END_TAG and t.name is 'html'
2850 if flag_fragment_parsing
2853 ins_mode = ins_mode_after_after_body
2855 if t.type is TYPE_EOF
2860 ins_mode = ins_mode_in_body
2864 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2865 ins_mode_in_frameset = (t) ->
2869 if t.type is TYPE_COMMENT
2872 if t.type is TYPE_DOCTYPE
2875 if t.type is TYPE_START_TAG and t.name is 'html'
2878 if t.type is TYPE_START_TAG and t.name is 'frameset'
2879 insert_html_element t
2881 if t.type is TYPE_END_TAG and t.name is 'frameset'
2882 if open_els.length is 1
2884 return # fragment case
2886 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2887 ins_mode = ins_mode_after_frameset
2889 if t.type is TYPE_START_TAG and t.name is 'frame'
2890 insert_html_element t
2892 t.acknowledge_self_closing()
2894 if t.type is TYPE_START_TAG and t.name is 'noframes'
2897 if t.type is TYPE_EOF
2898 if open_els.length isnt 1
2906 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2907 ins_mode_after_frameset = (t) ->
2911 if t.type is TYPE_COMMENT
2914 if t.type is TYPE_DOCTYPE
2917 if t.type is TYPE_START_TAG and t.name is 'html'
2920 if t.type is TYPE_END_TAG and t.name is 'html'
2921 ins_mode = ins_mode_after_after_frameset
2923 if t.type is TYPE_START_TAG and t.name is 'noframes'
2926 if t.type is TYPE_EOF
2933 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2934 ins_mode_after_after_body = (t) ->
2935 if t.type is TYPE_COMMENT
2936 insert_comment t, [doc, doc.children.length]
2938 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2941 if t.type is TYPE_EOF
2946 ins_mode = ins_mode_in_body
2950 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2951 ins_mode_after_after_frameset = (t) ->
2952 if t.type is TYPE_COMMENT
2953 insert_comment t, [doc, doc.children.length]
2955 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2958 if t.type is TYPE_EOF
2961 if t.type is TYPE_START_TAG and t.name is 'noframes'
2968 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2969 has_color_face_or_size = (t) ->
2971 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2974 in_foreign_content_end_script = ->
2978 in_foreign_content_other_start = (t) ->
2979 acn = adjusted_current_node()
2980 if acn.namespace is NS_MATHML
2981 adjust_mathml_attributes t
2982 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2983 t.name = svg_name_fixes[t.name]
2984 if acn.namespace is NS_SVG
2985 adjust_svg_attributes t
2986 adjust_foreign_attributes t
2987 insert_foreign_element t, acn.namespace
2988 if t.flag 'self-closing'
2989 if t.name is 'script'
2990 t.acknowledge_self_closing()
2991 in_foreign_content_end_script()
2995 t.acknowledge_self_closing()
2997 in_foreign_content = (t) ->
2998 if t.type is TYPE_TEXT and t.text is "\u0000"
3000 insert_character new_character_token "\ufffd"
3005 if t.type is TYPE_TEXT
3006 flag_frameset_ok = false
3009 if t.type is TYPE_COMMENT
3012 if t.type is TYPE_DOCTYPE
3015 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3017 if flag_fragment_parsing
3018 in_foreign_content_other_start t
3020 loop # is this safe?
3022 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3026 if t.type is TYPE_START_TAG
3027 in_foreign_content_other_start t
3029 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3030 in_foreign_content_end_script()
3032 if t.type is TYPE_END_TAG
3035 if node.name.toLowerCase() isnt t.name
3038 if node is open_els[open_els.length - 1]
3040 if node.name.toLowerCase() is t.name
3042 el = open_els.shift()
3047 if node.namespace is NS_HTML
3049 ins_mode t # explicitly call HTML insertion mode
3053 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
3055 switch c = txt.charAt(cur++)
3057 return new_text_node parse_character_reference()
3059 tok_state = tok_state_tag_open
3062 return new_text_node c
3064 return new_eof_token()
3066 return new_text_node c
3069 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3070 # not needed: tok_state_character_reference_in_data = ->
3071 # just call parse_character_reference()
3073 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
3074 tok_state_rcdata = ->
3075 switch c = txt.charAt(cur++)
3077 return new_text_node parse_character_reference()
3079 tok_state = tok_state_rcdata_less_than_sign
3082 return new_character_token "\ufffd"
3084 return new_eof_token()
3086 return new_character_token c
3089 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3090 # not needed: tok_state_character_reference_in_rcdata = ->
3091 # just call parse_character_reference()
3093 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
3094 tok_state_rawtext = ->
3095 switch c = txt.charAt(cur++)
3097 tok_state = tok_state_rawtext_less_than_sign
3100 return new_character_token "\ufffd"
3102 return new_eof_token()
3104 return new_character_token c
3107 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
3108 tok_state_script_data = ->
3109 switch c = txt.charAt(cur++)
3111 tok_state = tok_state_script_data_less_than_sign
3114 return new_character_token "\ufffd"
3116 return new_eof_token()
3118 return new_character_token c
3121 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3122 tok_state_plaintext = ->
3123 switch c = txt.charAt(cur++)
3126 return new_character_token "\ufffd"
3128 return new_eof_token()
3130 return new_character_token c
3134 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3135 tok_state_tag_open = ->
3136 c = txt.charAt(cur++)
3138 tok_state = tok_state_markup_declaration_open
3141 tok_state = tok_state_end_tag_open
3144 tok_cur_tag = new_open_tag c.toLowerCase()
3145 tok_state = tok_state_tag_name
3148 tok_cur_tag = new_open_tag c
3149 tok_state = tok_state_tag_name
3153 tok_cur_tag = new_comment_token '?' # FIXME right?
3154 tok_state = tok_state_bogus_comment
3158 tok_state = tok_state_data
3159 cur -= 1 # we didn't parse/handle the char after <
3160 return new_text_node '<'
3162 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3163 tok_state_end_tag_open = ->
3164 c = txt.charAt(cur++)
3166 tok_cur_tag = new_end_tag c.toLowerCase()
3167 tok_state = tok_state_tag_name
3170 tok_cur_tag = new_end_tag c
3171 tok_state = tok_state_tag_name
3175 tok_state = tok_state_data
3179 tok_state = tok_state_data
3180 return new_text_node '</'
3183 tok_cur_tag = new_comment_token c
3184 tok_state = tok_state_bogus_comment
3187 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3188 tok_state_tag_name = ->
3189 switch c = txt.charAt(cur++)
3190 when "\t", "\n", "\u000c", ' '
3191 tok_state = tok_state_before_attribute_name
3193 tok_state = tok_state_self_closing_start_tag
3195 tok_state = tok_state_data
3201 tok_cur_tag.name += "\ufffd"
3204 tok_state = tok_state_data
3207 tok_cur_tag.name += c.toLowerCase()
3209 tok_cur_tag.name += c
3212 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3213 tok_state_rcdata_less_than_sign = ->
3214 c = txt.charAt(cur++)
3216 temporary_buffer = ''
3217 tok_state = tok_state_rcdata_end_tag_open
3220 tok_state = tok_state_rcdata
3221 cur -= 1 # reconsume the input character
3222 return new_character_token '<'
3224 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3225 tok_state_rcdata_end_tag_open = ->
3226 c = txt.charAt(cur++)
3228 tok_cur_tag = new_end_tag c.toLowerCase()
3229 temporary_buffer += c
3230 tok_state = tok_state_rcdata_end_tag_name
3233 tok_cur_tag = new_end_tag c
3234 temporary_buffer += c
3235 tok_state = tok_state_rcdata_end_tag_name
3238 tok_state = tok_state_rcdata
3239 cur -= 1 # reconsume the input character
3240 return new_character_token "</" # fixfull separate these
3242 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3243 is_appropriate_end_tag = (t) ->
3244 # true when t is an end tag whose name equals open_els[0].name. fixfull: the spec
3245 # compares against "the tag name of the last start tag to have been emitted from this tokenizer"; open_els[0].name is assumed to match it
3246 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3248 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3249 tok_state_rcdata_end_tag_name = ->
3250 c = txt.charAt(cur++)
3251 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3252 if is_appropriate_end_tag tok_cur_tag
3253 tok_state = tok_state_before_attribute_name
3255 # else fall through to "Anything else"
3257 if is_appropriate_end_tag tok_cur_tag
3258 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3260 # else fall through to "Anything else"
3262 if is_appropriate_end_tag tok_cur_tag
3263 tok_state = tok_state_data
3265 # else fall through to "Anything else"
3267 tok_cur_tag.name += c.toLowerCase()
3268 temporary_buffer += c
3271 tok_cur_tag.name += c
3272 temporary_buffer += c
3275 tok_state = tok_state_rcdata
3276 cur -= 1 # reconsume the input character
3277 return new_character_token '</' + temporary_buffer # fixfull separate these
3279 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3280 tok_state_rawtext_less_than_sign = ->
3281 c = txt.charAt(cur++)
3283 temporary_buffer = ''
3284 tok_state = tok_state_rawtext_end_tag_open
3287 tok_state = tok_state_rawtext
3288 cur -= 1 # reconsume the input character
3289 return new_character_token '<'
3291 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3292 tok_state_rawtext_end_tag_open = ->
3293 c = txt.charAt(cur++)
3295 tok_cur_tag = new_end_tag c.toLowerCase()
3296 temporary_buffer += c
3297 tok_state = tok_state_rawtext_end_tag_name
3300 tok_cur_tag = new_end_tag c
3301 temporary_buffer += c
3302 tok_state = tok_state_rawtext_end_tag_name
3305 tok_state = tok_state_rawtext
3306 cur -= 1 # reconsume the input character
3307 return new_character_token "</" # fixfull separate these
3309 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3310 tok_state_rawtext_end_tag_name = ->
3311 c = txt.charAt(cur++)
3312 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3313 if is_appropriate_end_tag tok_cur_tag
3314 tok_state = tok_state_before_attribute_name
3316 # else fall through to "Anything else"
3318 if is_appropriate_end_tag tok_cur_tag
3319 tok_state = tok_state_self_closing_start_tag
3321 # else fall through to "Anything else"
3323 if is_appropriate_end_tag tok_cur_tag
3324 tok_state = tok_state_data
3326 # else fall through to "Anything else"
3328 tok_cur_tag.name += c.toLowerCase()
3329 temporary_buffer += c
3332 tok_cur_tag.name += c
3333 temporary_buffer += c
3336 tok_state = tok_state_rawtext
3337 cur -= 1 # reconsume the input character
3338 return new_character_token '</' + temporary_buffer # fixfull separate these
3340 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer handler for the script data less-than sign state.
# Consumes one character, then takes one of three paths:
#  - empty temporary_buffer and move to the script data end tag open state,
#  - move to the script data escape start state and return "<!" as one
#    character token, or
#  - go back to script data, un-consume the character and return '<'.
3341 tok_state_script_data_less_than_sign = ->
3342 c = txt.charAt(cur++)
3344 temporary_buffer = ''
3345 tok_state = tok_state_script_data_end_tag_open
3348 tok_state = tok_state_script_data_escape_start
3349 return new_character_token '<!' # fixfull split
3351 tok_state = tok_state_script_data
3352 cur -= 1 # Reconsume
3353 return new_character_token '<'
3355 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Tokenizer handler for the script data end tag open state.
# Consumes one character. Two paths start a new end tag in tok_cur_tag (from
# the lowercased character, or from the character as-is), append the
# character unchanged to temporary_buffer and move to the script data end tag
# name state. The remaining path returns to script data, un-consumes the
# character and returns a '</' character token.
3356 tok_state_script_data_end_tag_open = ->
3357 c = txt.charAt(cur++)
3359 tok_cur_tag = new_end_tag c.toLowerCase()
3360 temporary_buffer += c
3361 tok_state = tok_state_script_data_end_tag_name
3364 tok_cur_tag = new_end_tag c
3365 temporary_buffer += c
3366 tok_state = tok_state_script_data_end_tag_name
3369 tok_state = tok_state_script_data
3370 cur -= 1 # Reconsume
3371 return new_character_token '</'
3373 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Tokenizer handler for the script data end tag name state.
# Consumes one character. The three is_appropriate_end_tag checks guard the
# transitions to the before-attribute-name, self-closing-start-tag and data
# states. Otherwise the character is added to tok_cur_tag.name (lowercased in
# one path) and, unchanged, to temporary_buffer. The final path gives up on
# the tag: back to script data, un-consume the character and return "</"
# plus the buffered text as character data.
3374 tok_state_script_data_end_tag_name = ->
3375 c = txt.charAt(cur++)
3376 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3377 if is_appropriate_end_tag tok_cur_tag
3378 tok_state = tok_state_before_attribute_name
3382 if is_appropriate_end_tag tok_cur_tag
3383 tok_state = tok_state_self_closing_start_tag
3387 if is_appropriate_end_tag tok_cur_tag
3388 tok_state = tok_state_data
3392 tok_cur_tag.name += c.toLowerCase()
3393 temporary_buffer += c
3396 tok_cur_tag.name += c
3397 temporary_buffer += c
3400 tok_state = tok_state_script_data
3401 cur -= 1 # Reconsume
3402 return new_character_token "</#{temporary_buffer}" # fixfull split
3404 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Tokenizer handler for the script data escape start state.
# Consumes one character, then either moves to the escape start dash state
# and returns a '-' character token, or returns to script data and
# un-consumes the character.
3405 tok_state_script_data_escape_start = ->
3406 c = txt.charAt(cur++)
3408 tok_state = tok_state_script_data_escape_start_dash
3409 return new_character_token '-'
3411 tok_state = tok_state_script_data
3412 cur -= 1 # Reconsume
3415 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Tokenizer handler for the script data escape start dash state.
# Consumes one character, then either moves to the escaped dash dash state
# and returns a '-' character token, or returns to script data and
# un-consumes the character.
3416 tok_state_script_data_escape_start_dash = ->
3417 c = txt.charAt(cur++)
3419 tok_state = tok_state_script_data_escaped_dash_dash
3420 return new_character_token '-'
3422 tok_state = tok_state_script_data
3423 cur -= 1 # Reconsume
3426 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Tokenizer handler for the script data escaped state.
# Consumes one character. Visible paths: return '-' and move to the escaped
# dash state; move to the escaped less-than sign state; return U+FFFD;
# switch to the data state and un-consume the character; or return the
# character itself while staying in this state.
3427 tok_state_script_data_escaped = ->
3428 c = txt.charAt(cur++)
3430 tok_state = tok_state_script_data_escaped_dash
3431 return new_character_token '-'
3433 tok_state = tok_state_script_data_escaped_less_than_sign
3437 return new_character_token "\ufffd"
3439 tok_state = tok_state_data
3441 cur -= 1 # Reconsume
3444 return new_character_token c
3446 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Tokenizer handler for the script data escaped dash state.
# Consumes one character. Visible paths: return '-' and move to the escaped
# dash dash state; move to the escaped less-than sign state; return U+FFFD
# or the character itself and drop back to the script data escaped state; or
# switch to the data state and un-consume the character.
3447 tok_state_script_data_escaped_dash = ->
3448 c = txt.charAt(cur++)
3450 tok_state = tok_state_script_data_escaped_dash_dash
3451 return new_character_token '-'
3453 tok_state = tok_state_script_data_escaped_less_than_sign
3457 tok_state = tok_state_script_data_escaped
3458 return new_character_token "\ufffd"
3460 tok_state = tok_state_data
3462 cur -= 1 # Reconsume
3465 tok_state = tok_state_script_data_escaped
3466 return new_character_token c
3468 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Tokenizer handler for the script data escaped dash dash state.
# Consumes one character. Visible paths: return '-' without changing state;
# move to the escaped less-than sign state; return '>' and leave the escaped
# section by going back to script data; return U+FFFD or the character
# itself and drop back to the script data escaped state; or switch to the
# data state and un-consume the character.
3469 tok_state_script_data_escaped_dash_dash = ->
3470 c = txt.charAt(cur++)
3472 return new_character_token '-'
3474 tok_state = tok_state_script_data_escaped_less_than_sign
3477 tok_state = tok_state_script_data
3478 return new_character_token '>'
3481 tok_state = tok_state_script_data_escaped
3482 return new_character_token "\ufffd"
3485 tok_state = tok_state_data
3486 cur -= 1 # Reconsume
3489 tok_state = tok_state_script_data_escaped
3490 return new_character_token c
3492 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer handler for the script data escaped less-than sign state.
# Consumes one character. Visible paths:
#  - empty temporary_buffer and move to the escaped end tag open state;
#  - seed temporary_buffer with the character (lowercased in one path, as-is
#    in the other), move to the double escape start state and return "<"
#    plus the ORIGINAL character as one character token;
#  - go back to script data escaped, un-consume the character and return '<'.
3493 tok_state_script_data_escaped_less_than_sign = ->
3494 c = txt.charAt(cur++)
3496 temporary_buffer = ''
3497 tok_state = tok_state_script_data_escaped_end_tag_open
3500 temporary_buffer = c.toLowerCase() # yes, really
3501 tok_state = tok_state_script_data_double_escape_start
3502 return new_character_token "<#{c}" # fixfull split
3504 temporary_buffer = c
3505 tok_state = tok_state_script_data_double_escape_start
3506 return new_character_token "<#{c}" # fixfull split
3508 tok_state = tok_state_script_data_escaped
3509 cur -= 1 # Reconsume
3510 return new_character_token '<'
3512 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Tokenizer handler for the script data escaped end tag open state.
# Consumes one character. Two paths start a new end tag in tok_cur_tag (from
# the lowercased character, or from the character as-is), append the
# character unchanged to temporary_buffer and move to the escaped end tag
# name state. The remaining path returns to script data escaped, un-consumes
# the character and returns a '</' character token.
3513 tok_state_script_data_escaped_end_tag_open = ->
3514 c = txt.charAt(cur++)
3516 tok_cur_tag = new_end_tag c.toLowerCase()
3517 temporary_buffer += c
3518 tok_state = tok_state_script_data_escaped_end_tag_name
3521 tok_cur_tag = new_end_tag c
3522 temporary_buffer += c
3523 tok_state = tok_state_script_data_escaped_end_tag_name
3526 tok_state = tok_state_script_data_escaped
3527 cur -= 1 # Reconsume
3528 return new_character_token '</' # fixfull split
3530 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer handler for the script data escaped end tag name state.
# Consumes one character. The three is_appropriate_end_tag checks guard the
# transitions to the before-attribute-name, self-closing-start-tag and data
# states. Otherwise the character is added to tok_cur_tag.name and to
# temporary_buffer. The final path gives up on the tag: back to script data
# escaped, un-consume the character and return "</" plus the buffered text.
# NOTE(review): both append paths lowercase what goes into temporary_buffer,
# whereas the rawtext and script-data end tag name handlers in this file
# append the character unchanged. Since the buffer is re-emitted as text in
# the fallback path, this changes the case of the emitted characters.
# Confirm against the spec whether that is intended.
3531 tok_state_script_data_escaped_end_tag_name = ->
3532 c = txt.charAt(cur++)
3533 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3534 if is_appropriate_end_tag tok_cur_tag
3535 tok_state = tok_state_before_attribute_name
3539 if is_appropriate_end_tag tok_cur_tag
3540 tok_state = tok_state_self_closing_start_tag
3544 if is_appropriate_end_tag tok_cur_tag
3545 tok_state = tok_state_data
3549 tok_cur_tag.name += c.toLowerCase()
3550 temporary_buffer += c.toLowerCase()
3553 tok_cur_tag.name += c
3554 temporary_buffer += c.toLowerCase()
3557 tok_state = tok_state_script_data_escaped
3558 cur -= 1 # Reconsume
3559 return new_character_token "</#{temporary_buffer}" # fixfull split
3561 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Tokenizer handler for the script data double escape start state.
# Consumes one character. On whitespace, '/' or '>' the buffered name decides
# the next state: 'script' enters the double escaped state, anything else
# goes back to script data escaped; the delimiter itself is returned as a
# character token. Two further paths append the character to
# temporary_buffer (lowercased or as-is) and return it unchanged. The last
# path returns to script data escaped and un-consumes the character.
3562 tok_state_script_data_double_escape_start = ->
3563 c = txt.charAt(cur++)
3564 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3565 if temporary_buffer is 'script'
3566 tok_state = tok_state_script_data_double_escaped
3568 tok_state = tok_state_script_data_escaped
3569 return new_character_token c
3571 temporary_buffer += c.toLowerCase() # yes, really lowercase
3572 return new_character_token c
3574 temporary_buffer += c
3575 return new_character_token c
3577 tok_state = tok_state_script_data_escaped
3578 cur -= 1 # Reconsume
3581 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer handler for the script data double escaped state.
# Consumes one character. Visible paths: return '-' and move to the double
# escaped dash state; return '<' and move to the double escaped less-than
# sign state; return U+FFFD; switch to the data state and un-consume the
# character; or return the character itself while staying in this state.
3582 tok_state_script_data_double_escaped = ->
3583 c = txt.charAt(cur++)
3585 tok_state = tok_state_script_data_double_escaped_dash
3586 return new_character_token '-'
3588 tok_state = tok_state_script_data_double_escaped_less_than_sign
3589 return new_character_token '<'
3592 return new_character_token "\ufffd"
3595 tok_state = tok_state_data
3596 cur -= 1 # Reconsume
3599 return new_character_token c
3601 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer handler for the script data double escaped dash state.
# Consumes one character. Visible paths: return '-' and move to the double
# escaped dash dash state; return '<' and move to the double escaped
# less-than sign state; return U+FFFD or the character itself and drop back
# to the double escaped state; or switch to the data state and un-consume
# the character.
3602 tok_state_script_data_double_escaped_dash = ->
3603 c = txt.charAt(cur++)
3605 tok_state = tok_state_script_data_double_escaped_dash_dash
3606 return new_character_token '-'
3608 tok_state = tok_state_script_data_double_escaped_less_than_sign
3609 return new_character_token '<'
3612 tok_state = tok_state_script_data_double_escaped
3613 return new_character_token "\ufffd"
3616 tok_state = tok_state_data
3617 cur -= 1 # Reconsume
3620 tok_state = tok_state_script_data_double_escaped
3621 return new_character_token c
3623 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer handler for the script data double escaped dash dash state.
# Consumes one character. Visible paths: return '-' without changing state;
# return '<' and move to the double escaped less-than sign state; return '>'
# and go back to plain script data; return U+FFFD or the character itself
# and drop back to the double escaped state; or switch to the data state and
# un-consume the character.
3624 tok_state_script_data_double_escaped_dash_dash = ->
3625 c = txt.charAt(cur++)
3627 return new_character_token '-'
3629 tok_state = tok_state_script_data_double_escaped_less_than_sign
3630 return new_character_token '<'
3632 tok_state = tok_state_script_data
3633 return new_character_token '>'
3636 tok_state = tok_state_script_data_double_escaped
3637 return new_character_token "\ufffd"
3640 tok_state = tok_state_data
3641 cur -= 1 # Reconsume
3644 tok_state = tok_state_script_data_double_escaped
3645 return new_character_token c
3647 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer handler for the script data double escaped less-than sign state.
# Consumes one character, then either empties temporary_buffer, moves to the
# double escape end state and returns a '/' character token, or returns to
# the double escaped state and un-consumes the character.
3648 tok_state_script_data_double_escaped_less_than_sign = ->
3649 c = txt.charAt(cur++)
3651 temporary_buffer = ''
3652 tok_state = tok_state_script_data_double_escape_end
3653 return new_character_token '/'
3655 tok_state = tok_state_script_data_double_escaped
3656 cur -= 1 # Reconsume
3659 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer handler for the script data double escape end state.
# Mirror image of the double escape start handler: on whitespace, '/' or '>'
# a buffered name of 'script' drops back to the (single) escaped state,
# anything else stays double escaped; the delimiter is returned as a
# character token. Two further paths append the character to
# temporary_buffer (lowercased or as-is) and return it unchanged. The last
# path returns to the double escaped state and un-consumes the character.
3660 tok_state_script_data_double_escape_end = ->
3661 c = txt.charAt(cur++)
3662 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3663 if temporary_buffer is 'script'
3664 tok_state = tok_state_script_data_escaped
3666 tok_state = tok_state_script_data_double_escaped
3667 return new_character_token c
3669 temporary_buffer += c.toLowerCase() # yes, really lowercase
3670 return new_character_token c
3672 temporary_buffer += c
3673 return new_character_token c
3675 tok_state = tok_state_script_data_double_escaped
3676 cur -= 1 # Reconsume
3679 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer handler for the before attribute name state.
# Consumes one character and dispatches on it. Visible outcomes: transitions
# to the self-closing start tag state or to the data state, or the start of
# a new attribute whose first name character is U+FFFD or the lowercased
# input character.
# A new attribute is pushed onto the FRONT of tok_cur_tag.attrs_a as a
# [name, value] pair, so attrs_a[0] is always the attribute being built.
3680 tok_state_before_attribute_name = ->
3682 switch c = txt.charAt(cur++)
3683 when "\t", "\n", "\u000c", ' '
3686 tok_state = tok_state_self_closing_start_tag
3689 tok_state = tok_state_data
3695 attr_name = "\ufffd"
3696 when '"', "'", '<', '='
3701 tok_state = tok_state_data
3704 attr_name = c.toLowerCase()
3708 tok_cur_tag.attrs_a.unshift [attr_name, '']
3709 tok_state = tok_state_attribute_name
3712 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer handler for the attribute name state.
# Consumes one character and dispatches on it. Whitespace moves to the after
# attribute name state; other visible outcomes are transitions to the
# self-closing start tag, before attribute value and data states, or
# extending the current attribute's name (attrs_a[0][0]) with U+FFFD, the
# lowercased character or the character as-is.
3713 tok_state_attribute_name = ->
3714 switch c = txt.charAt(cur++)
3715 when "\t", "\n", "\u000c", ' '
3716 tok_state = tok_state_after_attribute_name
3718 tok_state = tok_state_self_closing_start_tag
3720 tok_state = tok_state_before_attribute_value
3722 tok_state = tok_state_data
3728 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3731 tok_cur_tag.attrs_a[0][0] += c
3734 tok_state = tok_state_data
3737 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3739 tok_cur_tag.attrs_a[0][0] += c
3742 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer handler for the after attribute name state.
# Consumes one character. Visible outcomes: transitions to the self-closing
# start tag, before attribute value and data states (one of the data-state
# paths un-consumes the character), or the start of a new attribute at the
# front of tok_cur_tag.attrs_a, named with the lowercased character, U+FFFD
# or the character as-is, followed by a move to the attribute name state.
3743 tok_state_after_attribute_name = ->
3744 c = txt.charAt(cur++)
3745 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3748 tok_state = tok_state_self_closing_start_tag
3751 tok_state = tok_state_before_attribute_value
3754 tok_state = tok_state_data
3757 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3758 tok_state = tok_state_attribute_name
3762 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3763 tok_state = tok_state_attribute_name
3767 tok_state = tok_state_data
3768 cur -= 1 # reconsume
3770 if c is '"' or c is "'" or c is '<'
3772 # fall through to Anything else
3774 tok_cur_tag.attrs_a.unshift [c, '']
3775 tok_state = tok_state_attribute_name
3778 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer handler for the before attribute value state.
# Consumes one character and dispatches on it. Visible outcomes: transitions
# to the double-quoted, single-quoted or unquoted attribute value states or
# to the data state. Two paths first append to the current attribute's value
# (attrs_a[0][1]), either U+FFFD or the character itself, before switching
# to the unquoted state.
3779 tok_state_before_attribute_value = ->
3780 switch c = txt.charAt(cur++)
3781 when "\t", "\n", "\u000c", ' '
3784 tok_state = tok_state_attribute_value_double_quoted
3786 tok_state = tok_state_attribute_value_unquoted
3789 tok_state = tok_state_attribute_value_single_quoted
3792 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3793 tok_state = tok_state_attribute_value_unquoted
3796 tok_state = tok_state_data
3802 tok_state = tok_state_data
3804 tok_cur_tag.attrs_a[0][1] += c
3805 tok_state = tok_state_attribute_value_unquoted
3808 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer handler for the attribute value (double-quoted) state.
# Consumes one character and dispatches on it. Visible outcomes: move to the
# after attribute value (quoted) state; append the result of
# parse_character_reference (called with '"' as the allowed character and
# in_attr = true) to the current value; append U+FFFD; switch to the data
# state; or append the character itself.
3809 tok_state_attribute_value_double_quoted = ->
3810 switch c = txt.charAt(cur++)
3812 tok_state = tok_state_after_attribute_value_quoted
3814 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3817 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3820 tok_state = tok_state_data
3822 tok_cur_tag.attrs_a[0][1] += c
3825 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer handler for the attribute value (single-quoted) state.
# Same shape as the double-quoted handler, except that "'" is passed to
# parse_character_reference as the allowed character.
3826 tok_state_attribute_value_single_quoted = ->
3827 switch c = txt.charAt(cur++)
3829 tok_state = tok_state_after_attribute_value_quoted
3831 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3834 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3837 tok_state = tok_state_data
3839 tok_cur_tag.attrs_a[0][1] += c
3842 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer handler for the attribute value (unquoted) state.
# Consumes one character and dispatches on it. Whitespace ends the value and
# moves to the before attribute name state. Other visible outcomes: append
# the result of parse_character_reference (allowed character '>',
# in_attr = true); switch to the data state; append U+FFFD; or append the
# character itself.
3843 tok_state_attribute_value_unquoted = ->
3844 switch c = txt.charAt(cur++)
3845 when "\t", "\n", "\u000c", ' '
3846 tok_state = tok_state_before_attribute_name
3848 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3850 tok_state = tok_state_data
3855 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3858 tok_state = tok_state_data
3860 # Parse Error if ', <, = or ` (backtick)
3861 tok_cur_tag.attrs_a[0][1] += c
3864 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer handler for the after attribute value (quoted) state.
# Consumes one character and dispatches on it. Whitespace moves to the
# before attribute name state; other visible outcomes are transitions to the
# self-closing start tag and data states. The last path also moves to the
# before attribute name state but un-consumes the character so that state
# handles it.
3865 tok_state_after_attribute_value_quoted = ->
3866 switch c = txt.charAt(cur++)
3867 when "\t", "\n", "\u000c", ' '
3868 tok_state = tok_state_before_attribute_name
3870 tok_state = tok_state_self_closing_start_tag
3872 tok_state = tok_state_data
3878 tok_state = tok_state_data
3881 tok_state = tok_state_before_attribute_name
3882 cur -= 1 # we didn't handle that char
3885 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer handler for the self-closing start tag state.
# Consumes one character. Visible paths: set the 'self-closing' flag on
# tok_cur_tag and return to the data state; return to the data state and
# un-consume the character; or fall back to the before attribute name state
# and un-consume the character.
3886 tok_state_self_closing_start_tag = ->
3887 c = txt.charAt(cur++)
3889 tok_cur_tag.flag 'self-closing', true
3890 tok_state = tok_state_data
3894 tok_state = tok_state_data
3895 cur -= 1 # Reconsume
3899 tok_state = tok_state_before_attribute_name
3900 cur -= 1 # Reconsume
3903 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3904 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer handler for the bogus comment state.
# Rather than stepping one character at a time, this looks ahead from cur
# for the next '>' and takes either the whole remainder of txt or the text
# up to that '>' as the comment body. NUL characters in it are replaced with
# U+FFFD, the result is appended to tok_cur_tag.text, and the tokenizer
# returns to the data state.
3905 tok_state_bogus_comment = ->
3906 next_gt = txt.indexOf '>', cur
3908 val = txt.substr cur
3911 val = txt.substr cur, (next_gt - cur)
3913 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3914 tok_cur_tag.text += val
3915 tok_state = tok_state_data
3918 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer handler for the markup declaration open state.
# Peeks at the upcoming text without a per-character read:
#  - "--" starts a new, empty comment token (comment start state);
#  - "doctype" (any case) moves to the DOCTYPE state;
#  - "[CDATA[" (exact case) moves to the CDATA section state, but only when
#    the adjusted current node exists and is not in the HTML namespace;
#  - otherwise an empty comment token is created and the bogus comment state
#    takes over.
3919 tok_state_markup_declaration_open = ->
3920 if txt.substr(cur, 2) is '--'
3922 tok_cur_tag = new_comment_token ''
3923 tok_state = tok_state_comment_start
3925 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3927 tok_state = tok_state_doctype
3929 acn = adjusted_current_node()
3930 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3932 tok_state = tok_state_cdata_section
3936 tok_cur_tag = new_comment_token ''
3937 tok_state = tok_state_bogus_comment
3940 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer handler for the comment start state.
# Consumes one character and dispatches on it. Visible outcomes: move to the
# comment start dash state; move to the comment state and return a U+FFFD
# character token; return to the data state (one such path un-consumes the
# character); or append the character to tok_cur_tag.text and move to the
# comment state.
# NOTE(review): the U+FFFD path returns a character token, while the
# neighbouring comment states append U+FFFD to tok_cur_tag.text. Confirm
# this is intended.
3941 tok_state_comment_start = ->
3942 switch c = txt.charAt(cur++)
3944 tok_state = tok_state_comment_start_dash
3947 tok_state = tok_state_comment
3948 return new_character_token "\ufffd"
3951 tok_state = tok_state_data
3955 tok_state = tok_state_data
3956 cur -= 1 # Reconsume
3959 tok_cur_tag.text += c
3960 tok_state = tok_state_comment
3963 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer handler for the comment start dash state.
# Consumes one character and dispatches on it. Visible outcomes: move to the
# comment end state; append "-" plus U+FFFD, or "-" plus the character, to
# tok_cur_tag.text and move to the comment state; or return to the data
# state (one such path un-consumes the character).
3964 tok_state_comment_start_dash = ->
3965 switch c = txt.charAt(cur++)
3967 tok_state = tok_state_comment_end
3970 tok_cur_tag.text += "-\ufffd"
3971 tok_state = tok_state_comment
3974 tok_state = tok_state_data
3978 tok_state = tok_state_data
3979 cur -= 1 # Reconsume
3982 tok_cur_tag.text += "-#{c}"
3983 tok_state = tok_state_comment
3986 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer handler for the comment state.
# Consumes one character and dispatches on it. Visible outcomes: move to the
# comment end dash state; append U+FFFD to tok_cur_tag.text; return to the
# data state and un-consume the character; or append the character itself.
3987 tok_state_comment = ->
3988 switch c = txt.charAt(cur++)
3990 tok_state = tok_state_comment_end_dash
3993 tok_cur_tag.text += "\ufffd"
3996 tok_state = tok_state_data
3997 cur -= 1 # Reconsume
4000 tok_cur_tag.text += c
4003 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer handler for the comment end dash state.
# Consumes one character and dispatches on it. Visible outcomes: move to the
# comment end state; append "-" plus U+FFFD, or "-" plus the character, to
# tok_cur_tag.text and go back to the comment state; or return to the data
# state and un-consume the character.
4004 tok_state_comment_end_dash = ->
4005 switch c = txt.charAt(cur++)
4007 tok_state = tok_state_comment_end
4010 tok_cur_tag.text += "-\ufffd"
4011 tok_state = tok_state_comment
4014 tok_state = tok_state_data
4015 cur -= 1 # Reconsume
4018 tok_cur_tag.text += "-#{c}"
4019 tok_state = tok_state_comment
4022 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer handler for the comment end state.
# Consumes one character and dispatches on it. Visible outcomes: return to
# the data state; append "--" plus U+FFFD, or "--" plus the character, to
# tok_cur_tag.text and go back to the comment state; move to the comment end
# bang state; append a single '-' and stay in this state; or return to the
# data state and un-consume the character.
4023 tok_state_comment_end = ->
4024 switch c = txt.charAt(cur++)
4026 tok_state = tok_state_data
4030 tok_cur_tag.text += "--\ufffd"
4031 tok_state = tok_state_comment
4034 tok_state = tok_state_comment_end_bang
4037 tok_cur_tag.text += '-'
4040 tok_state = tok_state_data
4041 cur -= 1 # Reconsume
4045 tok_cur_tag.text += "--#{c}"
4046 tok_state = tok_state_comment
4049 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer handler for the comment end bang state.
# Consumes one character and dispatches on it. Visible outcomes: append
# "--!" plus the character and move to the comment end dash state; return to
# the data state (one such path un-consumes the character); or append "--!"
# plus U+FFFD, or "--!" plus the character, and go back to the comment
# state.
# NOTE(review): the first path appends "--!#{c}" before entering the comment
# end dash state, which itself adds a "-" on most of its exits. Confirm the
# dash is not recorded twice.
4050 tok_state_comment_end_bang = ->
4051 switch c = txt.charAt(cur++)
4053 tok_cur_tag.text += "--!#{c}"
4054 tok_state = tok_state_comment_end_dash
4056 tok_state = tok_state_data
4060 tok_cur_tag.text += "--!\ufffd"
4061 tok_state = tok_state_comment
4064 tok_state = tok_state_data
4065 cur -= 1 # Reconsume
4068 tok_cur_tag.text += "--!#{c}"
4069 tok_state = tok_state_comment
4072 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer handler for the DOCTYPE state.
# Consumes one character and dispatches on it. Whitespace moves to the
# before DOCTYPE name state. Another path returns to the data state, builds
# an empty doctype token in the local `el`, sets its 'force-quirks' flag and
# un-consumes the character. The last path moves to the before DOCTYPE name
# state and un-consumes the character.
4073 tok_state_doctype = ->
4074 switch c = txt.charAt(cur++)
4075 when "\t", "\u000a", "\u000c", ' '
4076 tok_state = tok_state_before_doctype_name
4079 tok_state = tok_state_data
4080 el = new_doctype_token ''
4081 el.flag 'force-quirks', true
4082 cur -= 1 # Reconsume
4086 tok_state = tok_state_before_doctype_name
4087 cur -= 1 # Reconsume
4090 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer handler for the before DOCTYPE name state.
# Consumes one character. Visible outcomes: start a doctype token in
# tok_cur_tag whose name begins with the lowercased character, U+FFFD or the
# character as-is, then move to the DOCTYPE name state; or build an empty
# doctype token in the local `el`, set its 'force-quirks' flag and return to
# the data state (one of those two paths un-consumes the character).
4091 tok_state_before_doctype_name = ->
4092 c = txt.charAt(cur++)
4093 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4096 tok_cur_tag = new_doctype_token c.toLowerCase()
4097 tok_state = tok_state_doctype_name
4101 tok_cur_tag = new_doctype_token "\ufffd"
4102 tok_state = tok_state_doctype_name
4106 el = new_doctype_token ''
4107 el.flag 'force-quirks', true
4108 tok_state = tok_state_data
4112 tok_state = tok_state_data
4113 el = new_doctype_token ''
4114 el.flag 'force-quirks', true
4115 cur -= 1 # Reconsume
4118 tok_cur_tag = new_doctype_token c
4119 tok_state = tok_state_doctype_name
4122 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer handler for the DOCTYPE name state.
# Consumes one character. Whitespace moves to the after DOCTYPE name state.
# Other visible outcomes: return to the data state; extend tok_cur_tag.name
# with the lowercased character, U+FFFD or the character as-is; or return to
# the data state with 'force-quirks' set and the character un-consumed.
4123 tok_state_doctype_name = ->
4124 c = txt.charAt(cur++)
4125 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4126 tok_state = tok_state_after_doctype_name
4129 tok_state = tok_state_data
4132 tok_cur_tag.name += c.toLowerCase()
4136 tok_cur_tag.name += "\ufffd"
4140 tok_state = tok_state_data
4141 tok_cur_tag.flag 'force-quirks', true
4142 cur -= 1 # Reconsume
4145 tok_cur_tag.name += c
4148 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer handler for the after DOCTYPE name state.
# Consumes one character. Visible outcomes: return to the data state
# (optionally with 'force-quirks' set and the character un-consumed); or
# test whether the six characters starting at the one just consumed
# (cur - 1) spell "public" or "system" in any case, moving to the matching
# after-keyword state. If neither matches, 'force-quirks' is set and the
# bogus DOCTYPE state takes over.
4149 tok_state_after_doctype_name = ->
4150 c = txt.charAt(cur++)
4151 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4154 tok_state = tok_state_data
4158 tok_state = tok_state_data
4159 tok_cur_tag.flag 'force-quirks', true
4160 cur -= 1 # Reconsume
4163 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4165 tok_state = tok_state_after_doctype_public_keyword
4167 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4169 tok_state = tok_state_after_doctype_system_keyword
4172 tok_cur_tag.flag 'force-quirks', true
4173 tok_state = tok_state_bogus_doctype
4176 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer handler for the after DOCTYPE public keyword state.
# Consumes one character. Whitespace moves to the before DOCTYPE public
# identifier state. Other visible outcomes: initialise
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# public identifier state; set 'force-quirks' and return to the data state
# (one such path un-consumes the character); or set 'force-quirks' and enter
# the bogus DOCTYPE state.
4177 tok_state_after_doctype_public_keyword = ->
4178 c = txt.charAt(cur++)
4179 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4180 tok_state = tok_state_before_doctype_public_identifier
4184 tok_cur_tag.public_identifier = ''
4185 tok_state = tok_state_doctype_public_identifier_double_quoted
4189 tok_cur_tag.public_identifier = ''
4190 tok_state = tok_state_doctype_public_identifier_single_quoted
4194 tok_cur_tag.flag 'force-quirks', true
4195 tok_state = tok_state_data
4199 tok_state = tok_state_data
4200 tok_cur_tag.flag 'force-quirks', true
4201 cur -= 1 # Reconsume
4205 tok_cur_tag.flag 'force-quirks', true
4206 tok_state = tok_state_bogus_doctype
4209 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer handler for the before DOCTYPE public identifier state.
# Consumes one character. Visible outcomes: initialise
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# public identifier state; set 'force-quirks' and return to the data state
# (one such path un-consumes the character); or set 'force-quirks' and enter
# the bogus DOCTYPE state.
4210 tok_state_before_doctype_public_identifier = ->
4211 c = txt.charAt(cur++)
4212 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4216 tok_cur_tag.public_identifier = ''
4217 tok_state = tok_state_doctype_public_identifier_double_quoted
4221 tok_cur_tag.public_identifier = ''
4222 tok_state = tok_state_doctype_public_identifier_single_quoted
4226 tok_cur_tag.flag 'force-quirks', true
4227 tok_state = tok_state_data
4231 tok_state = tok_state_data
4232 tok_cur_tag.flag 'force-quirks', true
4233 cur -= 1 # Reconsume
4237 tok_cur_tag.flag 'force-quirks', true
4238 tok_state = tok_state_bogus_doctype
4242 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer handler for the DOCTYPE public identifier (double-quoted) state.
# Consumes one character. Visible outcomes: move to the after DOCTYPE public
# identifier state; append U+FFFD or the character itself to
# tok_cur_tag.public_identifier; or set 'force-quirks' and return to the
# data state (one such path un-consumes the character).
4243 tok_state_doctype_public_identifier_double_quoted = ->
4244 c = txt.charAt(cur++)
4246 tok_state = tok_state_after_doctype_public_identifier
4250 tok_cur_tag.public_identifier += "\ufffd"
4254 tok_cur_tag.flag 'force-quirks', true
4255 tok_state = tok_state_data
4259 tok_state = tok_state_data
4260 tok_cur_tag.flag 'force-quirks', true
4261 cur -= 1 # Reconsume
4264 tok_cur_tag.public_identifier += c
4267 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer handler for the DOCTYPE public identifier (single-quoted) state.
# Consumes one character. Visible outcomes: move to the after DOCTYPE public
# identifier state; append U+FFFD or the character itself to
# tok_cur_tag.public_identifier; or set 'force-quirks' and return to the
# data state (one such path un-consumes the character).
4268 tok_state_doctype_public_identifier_single_quoted = ->
4269 c = txt.charAt(cur++)
4271 tok_state = tok_state_after_doctype_public_identifier
4275 tok_cur_tag.public_identifier += "\ufffd"
4279 tok_cur_tag.flag 'force-quirks', true
4280 tok_state = tok_state_data
4284 tok_state = tok_state_data
4285 tok_cur_tag.flag 'force-quirks', true
4286 cur -= 1 # Reconsume
4289 tok_cur_tag.public_identifier += c
4292 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer handler for the after DOCTYPE public identifier state.
# Consumes one character. Whitespace moves to the "between DOCTYPE public
# and system identifiers" state. Other visible outcomes: return to the data
# state; initialise tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state; return to the data state
# with 'force-quirks' set and the character un-consumed; or set
# 'force-quirks' and enter the bogus DOCTYPE state.
4293 tok_state_after_doctype_public_identifier = ->
4294 c = txt.charAt(cur++)
4295 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4296 tok_state = tok_state_between_doctype_public_and_system_identifiers
4299 tok_state = tok_state_data
4303 tok_cur_tag.system_identifier = ''
4304 tok_state = tok_state_doctype_system_identifier_double_quoted
4308 tok_cur_tag.system_identifier = ''
4309 tok_state = tok_state_doctype_system_identifier_single_quoted
4313 tok_state = tok_state_data
4314 tok_cur_tag.flag 'force-quirks', true
4315 cur -= 1 # Reconsume
4319 tok_cur_tag.flag 'force-quirks', true
4320 tok_state = tok_state_bogus_doctype
4323 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer handler for the "between DOCTYPE public and system identifiers"
# state.
# Consumes one character. Visible outcomes: return to the data state;
# initialise tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state; return to the data state with
# 'force-quirks' set and the character un-consumed; or set 'force-quirks'
# and enter the bogus DOCTYPE state.
4324 tok_state_between_doctype_public_and_system_identifiers = ->
4325 c = txt.charAt(cur++)
4326 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4329 tok_state = tok_state_data
4333 tok_cur_tag.system_identifier = ''
4334 tok_state = tok_state_doctype_system_identifier_double_quoted
4338 tok_cur_tag.system_identifier = ''
4339 tok_state = tok_state_doctype_system_identifier_single_quoted
4343 tok_state = tok_state_data
4344 tok_cur_tag.flag 'force-quirks', true
4345 cur -= 1 # Reconsume
4349 tok_cur_tag.flag 'force-quirks', true
4350 tok_state = tok_state_bogus_doctype
4353 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer handler for the after DOCTYPE system keyword state.
# Consumes one character. Whitespace moves to the before DOCTYPE system
# identifier state. Other visible outcomes: initialise
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state; set 'force-quirks' and return to the data state
# (one such path un-consumes the character); or set 'force-quirks' and enter
# the bogus DOCTYPE state.
4354 tok_state_after_doctype_system_keyword = ->
4355 c = txt.charAt(cur++)
4356 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4357 tok_state = tok_state_before_doctype_system_identifier
4361 tok_cur_tag.system_identifier = ''
4362 tok_state = tok_state_doctype_system_identifier_double_quoted
4366 tok_cur_tag.system_identifier = ''
4367 tok_state = tok_state_doctype_system_identifier_single_quoted
4371 tok_cur_tag.flag 'force-quirks', true
4372 tok_state = tok_state_data
4376 tok_state = tok_state_data
4377 tok_cur_tag.flag 'force-quirks', true
4378 cur -= 1 # Reconsume
4382 tok_cur_tag.flag 'force-quirks', true
4383 tok_state = tok_state_bogus_doctype
4386 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer handler for the before DOCTYPE system identifier state.
# Consumes one character. Visible outcomes: initialise
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state; set 'force-quirks' and return to the data state
# (one such path un-consumes the character); or set 'force-quirks' and enter
# the bogus DOCTYPE state.
4387 tok_state_before_doctype_system_identifier = ->
4388 c = txt.charAt(cur++)
4389 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4392 tok_cur_tag.system_identifier = ''
4393 tok_state = tok_state_doctype_system_identifier_double_quoted
4396 tok_cur_tag.system_identifier = ''
4397 tok_state = tok_state_doctype_system_identifier_single_quoted
4401 tok_cur_tag.flag 'force-quirks', true
4402 tok_state = tok_state_data
4406 tok_state = tok_state_data
4407 tok_cur_tag.flag 'force-quirks', true
4408 cur -= 1 # Reconsume
4412 tok_cur_tag.flag 'force-quirks', true
4413 tok_state = tok_state_bogus_doctype
4416 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer handler for the DOCTYPE system identifier (double-quoted) state.
# Consumes one character. Visible outcomes: move to the after DOCTYPE system
# identifier state; append U+FFFD or the character itself to
# tok_cur_tag.system_identifier; or set 'force-quirks' and return to the
# data state (one such path un-consumes the character).
4417 tok_state_doctype_system_identifier_double_quoted = ->
4418 c = txt.charAt(cur++)
4420 tok_state = tok_state_after_doctype_system_identifier
4424 tok_cur_tag.system_identifier += "\ufffd"
4428 tok_cur_tag.flag 'force-quirks', true
4429 tok_state = tok_state_data
4433 tok_state = tok_state_data
4434 tok_cur_tag.flag 'force-quirks', true
4435 cur -= 1 # Reconsume
4438 tok_cur_tag.system_identifier += c
4441 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer handler for the DOCTYPE system identifier (single-quoted) state.
# Consumes one character. Visible outcomes: move to the after DOCTYPE system
# identifier state; append U+FFFD or the character itself to
# tok_cur_tag.system_identifier; or set 'force-quirks' and return to the
# data state (one such path un-consumes the character).
4442 tok_state_doctype_system_identifier_single_quoted = ->
4443 c = txt.charAt(cur++)
4445 tok_state = tok_state_after_doctype_system_identifier
4449 tok_cur_tag.system_identifier += "\ufffd"
4453 tok_cur_tag.flag 'force-quirks', true
4454 tok_state = tok_state_data
4458 tok_state = tok_state_data
4459 tok_cur_tag.flag 'force-quirks', true
4460 cur -= 1 # Reconsume
4463 tok_cur_tag.system_identifier += c
4466 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer handler for the after DOCTYPE system identifier state.
# Consumes one character. Visible outcomes: return to the data state;
# return to the data state with 'force-quirks' set and the character
# un-consumed; or enter the bogus DOCTYPE state. Unlike the other DOCTYPE
# states, that last path deliberately leaves 'force-quirks' untouched.
4467 tok_state_after_doctype_system_identifier = ->
4468 c = txt.charAt(cur++)
4469 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4472 tok_state = tok_state_data
4476 tok_state = tok_state_data
4477 tok_cur_tag.flag 'force-quirks', true
4478 cur -= 1 # Reconsume
4482 # do _not_ tok_cur_tag.flag 'force-quirks', true
4483 tok_state = tok_state_bogus_doctype
4486 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer handler for the bogus DOCTYPE state.
# Consumes one character. Both visible paths return to the data state; one
# of them un-consumes the character first.
4487 tok_state_bogus_doctype = ->
4488 c = txt.charAt(cur++)
4490 tok_state = tok_state_data
4493 tok_state = tok_state_data
4494 cur -= 1 # Reconsume
4499 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer handler for the CDATA section state.
# Switches straight back to the data state, then looks ahead from cur for
# the "]]>" terminator (held in next_gt, despite the name) and takes either
# the whole remainder of txt or the text before the terminator. NUL
# characters are replaced with U+FFFD and the section is returned as ONE
# character token.
4500 tok_state_cdata_section = ->
4501 tok_state = tok_state_data
4502 next_gt = txt.indexOf ']]>', cur
4504 val = txt.substr cur
4507 val = txt.substr cur, (next_gt - cur)
4509 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
4511 return new_character_token val # fixfull split
4514 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4515 # Don't set this as a state, just call it
4516 # returns a string (NOT a text node)
# Tries to decode a character reference starting at txt[cur].
#
# allowed_char: extra character that, like whitespace, '<' and '&', is
#               listed in the first `when` as "explicitly not a parse
#               error". The attribute value states in this file pass the
#               closing quote or '>'.
# in_attr:      passed as true by the attribute value states.
#               NOTE(review): its use is not visible in this excerpt;
#               presumably it gates the attribute-only exceptions near the
#               end of the function. Confirm.
#
# Visible phases:
#  1. numeric references: a digit run is read from `start` (an 'x' after the
#     first character is checked case-insensitively), leading zeros are
#     stripped, and the value is parsed with parseInt in `base`; an entry in
#     unicode_fixes wins, surrogates / out-of-range values and a list of
#     non-character code points are special-cased, and otherwise
#     from_code_point converts the value;
#  2. named references ending in ';', resolved via decode_named_char_ref;
#  3. legacy names without ';', looked up in legacy_char_refs shortest-first,
#     with exceptions when the next character is '=' or alphanumeric.
4517 parse_character_reference = (allowed_char = null, in_attr = false) ->
4518 if cur >= txt.length
4520 switch c = txt.charAt(cur)
4521 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4522 # explicitly not a parse error
4525 # there has to be "one or more" alnums between & and ; to be a parse error
4528 if cur + 1 >= txt.length
4530 if txt.charAt(cur + 1).toLowerCase() is 'x'
4539 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4544 if txt.charAt(start + i) is ';'
4548 code_point = txt.substr(start, i)
4549 while code_point.charAt(0) is '0' and code_point.length > 1
4550 code_point = code_point.substr 1
4551 code_point = parseInt(code_point, base)
4552 if unicode_fixes[code_point]?
4554 return unicode_fixes[code_point]
4556 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4560 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4562 return from_code_point code_point
4566 if alnum.indexOf(txt.charAt(cur + i)) is -1
4569 # exit early, because parse_error() below needs at least one alnum
4571 if txt.charAt(cur + i) is ';'
4572 decoded = decode_named_char_ref txt.substr(cur, i)
4573 i += 1 # scan past the ';' (after, so we don't pass it to decode)
4577 # else FALL THROUGH (check for match without last char(s) or ";")
4578 # no ';' terminator (only legacy char refs)
4580 for i in [2..max] # no prefix matches, so ok to check shortest first
4581 c = legacy_char_refs[txt.substr(cur, i)]
4584 if txt.charAt(cur + i) is '='
4585 # "because some legacy user agents will
4586 # misinterpret the markup in those cases"
4589 if alnum.indexOf(txt.charAt(cur + i)) > -1
4590 # this makes attributes forgiving about url args
4592 # ok, and besides the weird exceptions for attributes...
4593 # return the matching char
4594 cur += i # consume entity chars
4595 parse_error() # because no terminating ";"
4599 return # never reached
# Inspects a text token `t` and decides whether it is a newline.
# The test depends on how many input characters produced the token: when
# cur advanced by exactly one from old_cur (a literal character) both CR and
# LF count; otherwise (a character reference) only LF counts.
# NOTE(review): how `t` and `old_cur` are obtained, and what happens on a
# match, is not visible in this excerpt.
4601 eat_next_token_if_newline = ->
4606 if t.type is TYPE_TEXT
4607 # definition of a newline depends on whether it was a character ref or not
4608 if cur - old_cur is 1
4609 # not a character reference
4610 if t.text is "\u000d" or t.text is "\u000a"
4613 if t.text is "\u000a"
4619 # tree constructor initialization
4620 # see comments on TYPE_TAG/etc for the structure of this data
4623 doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4624 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4625 fragment_root = null # fragment parsing algorithm returns children of this
4627 afe = [] # active formatting elements
4628 template_ins_modes = []
4629 ins_mode = ins_mode_initial
4630 original_ins_mode = ins_mode # TODO check spec
4631 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4632 flag_frameset_ok = true
4634 flag_foster_parenting = false
4635 form_element_pointer = null
4636 temporary_buffer = null
4637 pending_table_character_tokens = []
4638 head_element_pointer = null
4639 flag_fragment_parsing = false
4640 context_element = null
4641 prev_node_id = 0 # just for debugging
4643 # tokenizer initialization
4644 tok_state = tok_state_data
4647 # fragment parsing (text arg)
4649 # this handles the fragment from the tests in the format described here:
4650 # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
4653 if f.substr(0, 5) is 'math '
4656 else if f.substr(0, 4) is 'svg '
4660 context_element = token_to_element t, ns
4661 context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4662 context_element.document.flag 'quirks mode', QUIRKS_NO
4663 # fragment parsing (Node arg)
4665 context_element = args.context
4667 # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4668 # fragment parsing algorithm
4670 flag_fragment_parsing = true
4671 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4672 # search up the tree from context, to try to find its document,
4673 # because this file only puts a "document" property on the root
4676 el = context_element
4679 old_doc = el.document
4686 doc.flag 'quirks mode', old_doc.flag 'quirks mode'
4688 if context_element.namespace is NS_HTML
4689 switch context_element.name
4690 when 'title', 'textarea'
4691 tok_state = tok_state_rcdata
4692 when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
4693 tok_state = tok_state_rawtext
4695 tok_state = tok_state_script_data
4698 tok_state = tok_state_rawtext
4700 tok_state = tok_state_plaintext
4701 fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4702 doc.children.push fragment_root
4703 fragment_root.document = doc
4704 open_els = [fragment_root]
4705 if context_element.name is 'template' and context_element.namespace is NS_HTML
4706 template_ins_modes.unshift ins_mode_in_template
4707 # fixfull create token for context (it should have its original one already)
4709 # set form_element pointer... in the foreign doc?!
4710 el = context_element
4712 if el.name is 'form' and el.namespace is NS_HTML
4713 form_element_pointer = el
4720 # text pre-processing
4721 # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4722 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4723 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4727 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4728 parse_main_loop = ->
4733 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4738 if flag_fragment_parsing
4739 return fragment_root.children
# Public API of this file: the parse_html entry point, the two debug_log_*
# functions, and the TYPE_* (node type), NS_* (namespace) and QUIRKS_*
# (quirks mode) constants, each attached to `exports` under its own name.
4742 exports.parse_html = parse_html
4743 exports.debug_log_reset = debug_log_reset
4744 exports.debug_log_each = debug_log_each
4745 exports.TYPE_TAG = TYPE_TAG
4746 exports.TYPE_TEXT = TYPE_TEXT
4747 exports.TYPE_COMMENT = TYPE_COMMENT
4748 exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4749 exports.NS_HTML = NS_HTML
4750 exports.NS_MATHML = NS_MATHML
4751 exports.NS_SVG = NS_SVG
4752 exports.QUIRKS_NO = QUIRKS_NO
4753 exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4754 exports.QUIRKS_YES = QUIRKS_YES