1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
24 # http://www.w3.org/TR/html5/syntax.html
26 # except for some places marked "WHATWG" that are implemented as described here:
28 # https://html.spec.whatwg.org/multipage/syntax.html
30 # This code passes all of the tests in the .dat files at:
32 # https://github.com/JasonWoof/html5lib-tests/tree/patch-1/tree-construction
35 ##################################
36 ## how to use this code
37 ##################################
39 # See README.md for how to pre-compile this file, or compile it in the browser.
41 # This file exports a single useful function: parse_html
43 # Once you include this file in a page (see index.html for an example) you'll
48 # wheic.parse_html({html: "<p><b>hi</p>"})
50 # Or, if you don't want <html><head><body>/etc, do this:
52 # wheic.parse_html({fragment: "body", html: "<p><b>hi</p>"})
54 # This code can _almost_ run outside the browser (eg under node.js). To get it
55 # to run without the browser would require native implementation of
56 # decode_named_char_ref(). The current implementation of that function uses the
57 # browser's DOM api, to save space (the list of valid named characters is
60 # This code is a work in progress, e.g. try searching this file for "fixfull",
66 # Jason was frequently confused by the terminology used to refer to different
67 # parts of the stacks and lists in the spec, so he made this chart to help keep
70 # stacks grow downward (current element is index=0)
72 # example: open_els = [a, b, c, d, e, f, g]
74 # "grows downwards" means it's visualized like this: (index: el, names)
76 # 6: g "start of the list", "topmost", "first"
78 # 4: e "previous" (to d), "above", "before"
79 # 3: d (previous/next are relative to this element)
80 # 2: c "next", "after", "lower", "below"
82 # 0: a "end of the list", "current node", "bottommost", "last"
84 unless module?.exports?
86 module = exports: window.wheic
88 from_code_point = (x) ->
89 if String.fromCodePoint?
90 return String.fromCodePoint x
93 return String.fromCharCode x
95 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
97 # Each node is an object of the Node class. Here are the Node types:
98 TYPE_TAG = 0 # name, {attributes}, [children]
99 TYPE_TEXT = 1 # "text"
102 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
103 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
104 TYPE_END_TAG = 5 # name
106 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
107 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
109 # namespace constants
114 # quirks mode constants
119 # queue up debug logs, so eg they can be shown only for tests that fail
127 debug_log_each = (cb) ->
128 for str in g_debug_log
134 constructor: (type, args = {}) ->
135 @type = type # one of the TYPE_* constants above
136 @name = args.name ? '' # tag name
137 @text = args.text ? '' # contents for text/comment nodes
138 @attrs = args.attrs ? {}
139 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
140 @children = args.children ? []
141 @namespace = args.namespace ? NS_HTML
142 @parent = args.parent ? null
143 @token = args.token ? null
144 @flags = args.flags ? {}
148 @id = "#{++prev_node_id}"
149 acknowledge_self_closing: ->
151 @token.flag 'did_self_close', true
153 @flag 'did_self_close', true
155 flag: (key, value = null) ->
162 # helpers: (only take args that are normally known when parser creates nodes)
# Build a TYPE_START_TAG token carrying only the tag name; every other Node
# field is left at the constructor's default.
new_open_tag = (name) ->
	new Node TYPE_START_TAG, {name}
# Build a TYPE_END_TAG token carrying only the tag name.
new_end_tag = (name) ->
	new Node TYPE_END_TAG, {name}
# Build a TYPE_TAG node (an element) with the given tag name.
new_element = (name) ->
	new Node TYPE_TAG, {name}
# Build a TYPE_TEXT node whose text is `txt`.
new_text_node = (txt) ->
	new Node TYPE_TEXT, text: txt
# Character tokens are represented by the same node type as text nodes, so
# this is simply a second name for the same factory.
new_character_token = new_text_node
# Build a TYPE_COMMENT token whose text is `txt`.
new_comment_token = (txt) ->
	new Node TYPE_COMMENT, text: txt
# Build a TYPE_DOCTYPE token with the given name.
new_doctype_token = (name) ->
	new Node TYPE_DOCTYPE, {name}
177 return new Node TYPE_EOF
179 return new Node TYPE_AFE_MARKER
# Build a TYPE_AAA_BOOKMARK placeholder node (no name, no text).
new_aaa_bookmark = ->
	new Node TYPE_AAA_BOOKMARK
# Character classes, stored as plain strings so membership is an indexOf call.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# True only when `str` is exactly one character and that character is A-Z.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) >= 0

# True only when `str` is exactly one character and that character is a-z.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) >= 0
194 # some SVG elements have dashes in them
195 tag_name_chars = alnum + "-"
197 # http://www.w3.org/TR/html5/infrastructure.html#space-character
198 space_chars = "\u0009\u000a\u000c\u000d\u0020"
200 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is exactly one
# character drawn from space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	t.text.length is 1 and space_chars.indexOf(t.text) >= 0
204 is_input_hidden_tok = (t) ->
205 return false unless t.type is TYPE_START_TAG
208 if a[1].toLowerCase() is 'hidden'
213 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
214 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
217 unicode_fixes[0x00] = "\uFFFD"
218 unicode_fixes[0x80] = "\u20AC"
219 unicode_fixes[0x82] = "\u201A"
220 unicode_fixes[0x83] = "\u0192"
221 unicode_fixes[0x84] = "\u201E"
222 unicode_fixes[0x85] = "\u2026"
223 unicode_fixes[0x86] = "\u2020"
224 unicode_fixes[0x87] = "\u2021"
225 unicode_fixes[0x88] = "\u02C6"
226 unicode_fixes[0x89] = "\u2030"
227 unicode_fixes[0x8A] = "\u0160"
228 unicode_fixes[0x8B] = "\u2039"
229 unicode_fixes[0x8C] = "\u0152"
230 unicode_fixes[0x8E] = "\u017D"
231 unicode_fixes[0x91] = "\u2018"
232 unicode_fixes[0x92] = "\u2019"
233 unicode_fixes[0x93] = "\u201C"
234 unicode_fixes[0x94] = "\u201D"
235 unicode_fixes[0x95] = "\u2022"
236 unicode_fixes[0x96] = "\u2013"
237 unicode_fixes[0x97] = "\u2014"
238 unicode_fixes[0x98] = "\u02DC"
239 unicode_fixes[0x99] = "\u2122"
240 unicode_fixes[0x9A] = "\u0161"
241 unicode_fixes[0x9B] = "\u203A"
242 unicode_fixes[0x9C] = "\u0153"
243 unicode_fixes[0x9E] = "\u017E"
244 unicode_fixes[0x9F] = "\u0178"
246 quirks_yes_pi_prefixes = [
247 "+//silmaril//dtd html pro v0r11 19970101//"
248 "-//as//dtd html 3.0 aswedit + extensions//"
249 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
250 "-//ietf//dtd html 2.0 level 1//"
251 "-//ietf//dtd html 2.0 level 2//"
252 "-//ietf//dtd html 2.0 strict level 1//"
253 "-//ietf//dtd html 2.0 strict level 2//"
254 "-//ietf//dtd html 2.0 strict//"
255 "-//ietf//dtd html 2.0//"
256 "-//ietf//dtd html 2.1e//"
257 "-//ietf//dtd html 3.0//"
258 "-//ietf//dtd html 3.2 final//"
259 "-//ietf//dtd html 3.2//"
260 "-//ietf//dtd html 3//"
261 "-//ietf//dtd html level 0//"
262 "-//ietf//dtd html level 1//"
263 "-//ietf//dtd html level 2//"
264 "-//ietf//dtd html level 3//"
265 "-//ietf//dtd html strict level 0//"
266 "-//ietf//dtd html strict level 1//"
267 "-//ietf//dtd html strict level 2//"
268 "-//ietf//dtd html strict level 3//"
269 "-//ietf//dtd html strict//"
270 "-//ietf//dtd html//"
271 "-//metrius//dtd metrius presentational//"
272 "-//microsoft//dtd internet explorer 2.0 html strict//"
273 "-//microsoft//dtd internet explorer 2.0 html//"
274 "-//microsoft//dtd internet explorer 2.0 tables//"
275 "-//microsoft//dtd internet explorer 3.0 html strict//"
276 "-//microsoft//dtd internet explorer 3.0 html//"
277 "-//microsoft//dtd internet explorer 3.0 tables//"
278 "-//netscape comm. corp.//dtd html//"
279 "-//netscape comm. corp.//dtd strict html//"
280 "-//o'reilly and associates//dtd html 2.0//"
281 "-//o'reilly and associates//dtd html extended 1.0//"
282 "-//o'reilly and associates//dtd html extended relaxed 1.0//"
283 "-//sq//dtd html 2.0 hotmetal + extensions//"
284 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
285 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
286 "-//spyglass//dtd html 2.0 extended//"
287 "-//sun microsystems corp.//dtd hotjava html//"
288 "-//sun microsystems corp.//dtd hotjava strict html//"
289 "-//w3c//dtd html 3 1995-03-24//"
290 "-//w3c//dtd html 3.2 draft//"
291 "-//w3c//dtd html 3.2 final//"
292 "-//w3c//dtd html 3.2//"
293 "-//w3c//dtd html 3.2s draft//"
294 "-//w3c//dtd html 4.0 frameset//"
295 "-//w3c//dtd html 4.0 transitional//"
296 "-//w3c//dtd html experimental 19960712//"
297 "-//w3c//dtd html experimental 970421//"
298 "-//w3c//dtd w3 html//"
299 "-//w3o//dtd w3 html 3.0//"
300 "-//webtechs//dtd mozilla html 2.0//"
301 "-//webtechs//dtd mozilla html//"
304 # These are the character references that don't need a terminating semicolon
305 # min length: 2, max: 6, none are a prefix of any other.
307 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
308 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
309 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
310 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
311 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
312 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
313 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
314 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
315 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
316 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
317 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
318 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
319 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
320 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
321 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
322 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
323 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
327 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
328 raw_text_elements = ['script', 'style']
329 escapable_raw_text_elements = ['textarea', 'title']
330 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
332 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
333 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
334 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
335 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
336 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
337 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
338 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
339 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
340 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
341 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
342 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
343 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
344 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
345 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
349 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
351 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
352 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
353 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
354 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
355 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
356 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
357 'determinant', 'diff', 'divergence', 'divide', 'domain',
358 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
359 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
360 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
361 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
362 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
363 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
364 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
365 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
366 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
367 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
368 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
369 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
370 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
371 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
372 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
373 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
374 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
375 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
376 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
377 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
378 'vectorproduct', 'xor'
380 # foreign_elements = [svg_elements..., mathml_elements...]
381 #normal_elements = All other allowed HTML elements are normal elements.
385 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
386 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
387 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
388 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
389 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
390 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
391 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
392 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
393 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
394 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
395 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
397 menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
399 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
400 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
401 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
402 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
403 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
404 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
405 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
408 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
409 'annotation-xml':NS_MATHML,
412 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
415 formatting_elements = {
416 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
417 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
421 mathml_text_integration = {
422 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when the mathml_text_integration table maps el's name to el's
# namespace (the table maps each listed name to NS_MATHML).
is_mathml_text_integration_point = (el) ->
	{name, namespace} = el
	mathml_text_integration[name] is namespace
426 is_html_integration = (el) -> # DON'T PASS A TOKEN
427 if el.namespace is NS_MATHML
428 if el.name is 'annotation-xml'
429 if el.attrs.encoding?
430 if el.attrs.encoding.toLowerCase() is 'text/html'
432 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
435 if el.namespace is NS_SVG
436 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
441 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
444 foster_parenting_targets = {
# True when special_elements maps e's name to e's namespace, i.e. both the
# name and the namespace must match the table entry.
el_is_special = (e) ->
	{name, namespace} = e
	special_elements[name] is namespace
# The three HTML elements (address, div, p) excluded by el_is_special_not_adp.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML

# True when el is listed in special_elements (name and namespace both match)
# and is not one of the adp_els entries.
el_is_special_not_adp = (el) ->
	return false if adp_els[el.name] is el.namespace
	special_elements[el.name] is el.namespace
474 altglyphdef: 'altGlyphDef'
475 altglyphitem: 'altGlyphItem'
476 animatecolor: 'animateColor'
477 animatemotion: 'animateMotion'
478 animatetransform: 'animateTransform'
481 fecolormatrix: 'feColorMatrix'
482 fecomponenttransfer: 'feComponentTransfer'
483 fecomposite: 'feComposite'
484 feconvolvematrix: 'feConvolveMatrix'
485 fediffuselighting: 'feDiffuseLighting'
486 fedisplacementmap: 'feDisplacementMap'
487 fedistantlight: 'feDistantLight'
488 fedropshadow: 'feDropShadow'
494 fegaussianblur: 'feGaussianBlur'
497 femergenode: 'feMergeNode'
498 femorphology: 'feMorphology'
500 fepointlight: 'fePointLight'
501 fespecularlighting: 'feSpecularLighting'
502 fespotlight: 'feSpotLight'
504 feturbulence: 'feTurbulence'
505 foreignobject: 'foreignObject'
507 lineargradient: 'linearGradient'
508 radialgradient: 'radialGradient'
511 svg_attribute_fixes = {
512 attributename: 'attributeName'
513 attributetype: 'attributeType'
514 basefrequency: 'baseFrequency'
515 baseprofile: 'baseProfile'
517 clippathunits: 'clipPathUnits'
518 contentscripttype: 'contentScriptType'
519 contentstyletype: 'contentStyleType'
520 diffuseconstant: 'diffuseConstant'
522 externalresourcesrequired: 'externalResourcesRequired'
523 # WHATWG removes this: filterres: 'filterRes'
524 filterunits: 'filterUnits'
526 gradienttransform: 'gradientTransform'
527 gradientunits: 'gradientUnits'
528 kernelmatrix: 'kernelMatrix'
529 kernelunitlength: 'kernelUnitLength'
530 keypoints: 'keyPoints'
531 keysplines: 'keySplines'
533 lengthadjust: 'lengthAdjust'
534 limitingconeangle: 'limitingConeAngle'
535 markerheight: 'markerHeight'
536 markerunits: 'markerUnits'
537 markerwidth: 'markerWidth'
538 maskcontentunits: 'maskContentUnits'
539 maskunits: 'maskUnits'
540 numoctaves: 'numOctaves'
541 pathlength: 'pathLength'
542 patterncontentunits: 'patternContentUnits'
543 patterntransform: 'patternTransform'
544 patternunits: 'patternUnits'
545 pointsatx: 'pointsAtX'
546 pointsaty: 'pointsAtY'
547 pointsatz: 'pointsAtZ'
548 preservealpha: 'preserveAlpha'
549 preserveaspectratio: 'preserveAspectRatio'
550 primitiveunits: 'primitiveUnits'
553 repeatcount: 'repeatCount'
554 repeatdur: 'repeatDur'
555 requiredextensions: 'requiredExtensions'
556 requiredfeatures: 'requiredFeatures'
557 specularconstant: 'specularConstant'
558 specularexponent: 'specularExponent'
559 spreadmethod: 'spreadMethod'
560 startoffset: 'startOffset'
561 stddeviation: 'stdDeviation'
562 stitchtiles: 'stitchTiles'
563 surfacescale: 'surfaceScale'
564 systemlanguage: 'systemLanguage'
565 tablevalues: 'tableValues'
568 textlength: 'textLength'
570 viewtarget: 'viewTarget'
571 xchannelselector: 'xChannelSelector'
572 ychannelselector: 'yChannelSelector'
573 zoomandpan: 'zoomAndPan'
575 foreign_attr_fixes = {
576 'xlink:actuate': 'xlink actuate'
577 'xlink:arcrole': 'xlink arcrole'
578 'xlink:href': 'xlink href'
579 'xlink:role': 'xlink role'
580 'xlink:show': 'xlink show'
581 'xlink:title': 'xlink title'
582 'xlink:type': 'xlink type'
583 'xml:base': 'xml base'
584 'xml:lang': 'xml lang'
585 'xml:space': 'xml space'
587 'xmlns:xlink': 'xmlns xlink'
589 adjust_mathml_attributes = (t) ->
591 if a[0] is 'definitionurl'
592 a[0] = 'definitionURL'
594 adjust_svg_attributes = (t) ->
596 if svg_attribute_fixes[a[0]]?
597 a[0] = svg_attribute_fixes[a[0]]
599 adjust_foreign_attributes = (t) ->
602 if foreign_attr_fixes[a[0]]?
603 a[0] = foreign_attr_fixes[a[0]]
606 # decode_named_char_ref()
608 # The list of named character references is _huge_ so ask the browser to decode
609 # for us instead of wasting bandwidth/space on including the table here.
611 # Pass without the "&" but with the ";" examples:
612 # for "&amp;" pass "amp;"
613 # for "&#x2032;" pass "x2032;"
616 textarea: document.createElement('textarea')
618 # TODO test this in IE8
619 decode_named_char_ref = (txt) ->
621 decoded = g_dncr.cache[txt]
622 return decoded if decoded?
623 g_dncr.textarea.innerHTML = txt
624 decoded = g_dncr.textarea.value
625 return null if decoded is txt
626 return g_dncr.cache[txt] = decoded
628 parse_html = (args) ->
630 cur = null # index of next char in txt to be parsed
631 # declare doc and tokenizer variables so they're in scope below
633 open_els = null # stack of open elements
634 afe = null # active formatting elements
635 template_ins_modes = null
637 original_ins_mode = null
639 tok_cur_tag = null # partially parsed tag
640 flag_scripting = null
641 flag_frameset_ok = null
643 flag_foster_parenting = null
644 form_element_pointer = null
645 temporary_buffer = null
646 pending_table_character_tokens = null
647 head_element_pointer = null
648 flag_fragment_parsing = null
649 context_element = null
659 console.log "Parse error at character #{cur} of #{txt.length}"
662 # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
663 # "Noah's Ark clause" but with three
664 afe_push = (new_el) ->
667 if el.type is TYPE_AFE_MARKER
669 if el.name is new_el.name and el.namespace is new_el.namespace
672 unless new_el.attrs[k] is v
676 for k, v of new_el.attrs
677 unless el.attrs[k] is v
689 afe.unshift new_afe_marker()
692 # the functions below implement the Tree Construction algorithm
693 # http://www.w3.org/TR/html5/syntax.html#tree-construction
695 # But first... the helpers
696 template_tag_is_open = ->
698 if el.name is 'template' and el.namespace is NS_HTML
701 is_in_scope_x = (tag_name, scope, namespace) ->
703 if el.name is tag_name and (namespace is null or namespace is el.namespace)
705 if scope[el.name] is el.namespace
708 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
710 if el.name is tag_name and (namespace is null or namespace is el.namespace)
712 if scope[el.name] is el.namespace
714 if scope2[el.name] is el.namespace
718 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
719 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
722 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
723 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
725 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Additional name -> namespace tables handed to the scope checkers below:
# button_scopers and li_scopers are used as the second table of
# is_in_scope_x_y, table_scopers as the only table of is_in_scope_x.
button_scopers =
	button: NS_HTML
li_scopers =
	ol: NS_HTML
	ul: NS_HTML
table_scopers =
	html: NS_HTML
	table: NS_HTML
	template: NS_HTML
# Scope check against standard_scopers only. `namespace` defaults to null
# and is forwarded unchanged to is_in_scope_x.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, standard_scopers, namespace
# Scope check against standard_scopers plus button_scopers, delegated to
# is_in_scope_x_y. `namespace` defaults to null and is forwarded unchanged.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
# Scope check against table_scopers only (standard_scopers is not consulted).
# `namespace` defaults to null and is forwarded unchanged to is_in_scope_x.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, table_scopers, namespace
# aka is_in_list_item_scope
# Scope check against standard_scopers plus li_scopers, delegated to
# is_in_scope_x_y. `namespace` defaults to null and is forwarded unchanged.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
739 is_in_select_scope = (tag_name, namespace = null) ->
741 if t.name is tag_name and (namespace is null or namespace is t.namespace)
743 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
746 # this checks for a particular element, not by name
747 # this requires a namespace match
748 el_is_in_scope = (needle) ->
752 if standard_scopers[el.name] is el.namespace
756 clear_to_table_stopers = {
761 clear_stack_to_table_context = ->
763 if clear_to_table_stopers[open_els[0].name]?
767 clear_to_table_body_stopers = {
774 clear_stack_to_table_body_context = ->
776 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
780 clear_to_table_row_stopers = {
785 clear_stack_to_table_row_context = ->
787 if clear_to_table_row_stopers[open_els[0].name]?
791 clear_afe_to_marker = ->
793 return unless afe.length > 0 # this happens in fragment case, ?spec error
795 if el.type is TYPE_AFE_MARKER
800 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
802 # 1. Let last be false.
804 # 2. Let node be the last node in the stack of open elements.
806 node = open_els[node_i]
807 # 3. Loop: If node is the first node in the stack of open elements,
808 # then set last to true, and, if the parser was originally created as
809 # part of the HTML fragment parsing algorithm (fragment case) set node
810 # to the context element.
812 if node_i is open_els.length - 1
814 if flag_fragment_parsing
815 node = context_element
816 # 4. If node is a select element, run these substeps:
817 if node.name is 'select' and node.namespace is NS_HTML
818 # 1. If last is true, jump to the step below labeled done.
820 # 2. Let ancestor be node.
823 # 3. Loop: If ancestor is the first node in the stack of
824 # open elements, jump to the step below labeled done.
826 if ancestor_i is open_els.length - 1
828 # 4. Let ancestor be the node before ancestor in the stack
831 ancestor = open_els[ancestor_i]
832 # 5. If ancestor is a template node, jump to the step below
834 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
836 # 6. If ancestor is a table node, switch the insertion mode
837 # to "in select in table" and abort these steps.
838 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
839 ins_mode = ins_mode_in_select_in_table
841 # 7. Jump back to the step labeled loop.
842 # 8. Done: Switch the insertion mode to "in select" and abort
844 ins_mode = ins_mode_in_select
846 # 5. If node is a td or th element and last is false, then switch
847 # the insertion mode to "in cell" and abort these steps.
848 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
849 ins_mode = ins_mode_in_cell
851 # 6. If node is a tr element, then switch the insertion mode to "in
852 # row" and abort these steps.
853 if node.name is 'tr' and node.namespace is NS_HTML
854 ins_mode = ins_mode_in_row
856 # 7. If node is a tbody, thead, or tfoot element, then switch the
857 # insertion mode to "in table body" and abort these steps.
858 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
859 ins_mode = ins_mode_in_table_body
861 # 8. If node is a caption element, then switch the insertion mode
862 # to "in caption" and abort these steps.
863 if node.name is 'caption' and node.namespace is NS_HTML
864 ins_mode = ins_mode_in_caption
866 # 9. If node is a colgroup element, then switch the insertion mode
867 # to "in column group" and abort these steps.
868 if node.name is 'colgroup' and node.namespace is NS_HTML
869 ins_mode = ins_mode_in_column_group
871 # 10. If node is a table element, then switch the insertion mode to
872 # "in table" and abort these steps.
873 if node.name is 'table' and node.namespace is NS_HTML
874 ins_mode = ins_mode_in_table
876 # 11. If node is a template element, then switch the insertion mode
877 # to the current template insertion mode and abort these steps.
878 if node.name is 'template' and node.namespace is NS_HTML
879 ins_mode = template_ins_modes[0]
881 # 12. If node is a head element and last is true, then switch the
882 # insertion mode to "in body" ("in body"! not "in head"!) and abort
883 # these steps. (fragment case)
884 if node.name is 'head' and node.namespace is NS_HTML and last
885 ins_mode = ins_mode_in_body
887 # 13. If node is a head element and last is false, then switch the
888 # insertion mode to "in head" and abort these steps.
889 if node.name is 'head' and node.namespace is NS_HTML and last is false
890 ins_mode = ins_mode_in_head
892 # 14. If node is a body element, then switch the insertion mode to
893 # "in body" and abort these steps.
894 if node.name is 'body' and node.namespace is NS_HTML
895 ins_mode = ins_mode_in_body
897 # 15. If node is a frameset element, then switch the insertion mode
898 # to "in frameset" and abort these steps. (fragment case)
899 if node.name is 'frameset' and node.namespace is NS_HTML
900 ins_mode = ins_mode_in_frameset
902 # 16. If node is an html element, run these substeps:
903 if node.name is 'html' and node.namespace is NS_HTML
904 # 1. If the head element pointer is null, switch the insertion
905 # mode to "before head" and abort these steps. (fragment case)
906 if head_element_pointer is null
907 ins_mode = ins_mode_before_head
909 # 2. Otherwise, the head element pointer is not null,
910 # switch the insertion mode to "after head" and abort these
912 ins_mode = ins_mode_after_head
914 # 17. If last is true, then switch the insertion mode to "in body"
915 # and abort these steps. (fragment case)
917 ins_mode = ins_mode_in_body
919 # 18. Let node now be the node before node in the stack of open
922 node = open_els[node_i]
923 # 19. Return to the step labeled loop.
928 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
929 adjusted_current_node = ->
930 if open_els.length is 1 and flag_fragment_parsing
931 return context_element
934 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
935 # this implementation is structured (mostly) as described at the link above.
936 # capitalized comments are the "labels" described at the link above.
938 return if afe.length is 0
939 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
944 if i is afe.length - 1
947 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
952 el = insert_html_element afe[i].token
958 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
959 # adoption agency algorithm
961 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
962 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
963 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
964 adoption_agency = (subject) ->
965 # this block implements the W3C spec
966 # # 1. If the current node is an HTML element whose tag name is subject,
967 # # then run these substeps:
969 # # 1. Let element be the current node.
971 # # 2. Pop element off the stack of open elements.
973 # # 3. If element is also in the list of active formatting elements,
974 # # remove the element from the list.
976 # # 4. Abort the adoption agency algorithm.
977 # if open_els[0].name is subject and open_els[0].namespace is NS_HTML
978 # el = open_els.shift()
979 # # remove it from the list of active formatting elements (if found)
985 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
986 # If the current node is an HTML element whose tag name is subject, and
987 # the current node is not in the list of active formatting elements,
988 # then pop the current node off the stack of open elements, and abort
990 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
991 # remove it from the list of active formatting elements (if found)
1007 # 5. Let formatting element be the last element in the list of
1008 # active formatting elements that: is between the end of the list
1009 # and the last scope marker in the list, if any, or the start of
1010 # the list otherwise, and has the tag name subject.
1012 for t, fe_of_afe in afe
1013 if t.type is TYPE_AFE_MARKER
1015 if t.name is subject
1018 # If there is no such element, then abort these steps and instead
1019 # act as described in the "any other end tag" entry above.
1021 in_body_any_other_end_tag subject
1023 # 6. If formatting element is not in the stack of open elements,
1024 # then this is a parse error; remove the element from the list, and
1025 # abort these steps.
1027 for t, fe_of_open_els in open_els
1033 # "remove it from the list" must mean afe, since it's not in open_els
1034 afe.splice fe_of_afe, 1
1036 # 7. If formatting element is in the stack of open elements, but
1037 # the element is not in scope, then this is a parse error; abort
1039 unless el_is_in_scope fe
1042 # 8. If formatting element is not the current node, this is a parse
1043 # error. (But do not abort these steps.)
1044 unless open_els[0] is fe
1047 # 9. Let furthest block be the topmost node in the stack of open
1048 # elements that is lower in the stack than formatting element, and
1049 # is an element in the special category. There might not be one.
1051 fb_of_open_els = null
1052 for t, i in open_els
1058 # and continue, to see if there's one that's more "topmost"
1059 # 10. If there is no furthest block, then the UA must first pop all
1060 # the nodes from the bottom of the stack of open elements, from the
1061 # current node up to and including formatting element, then remove
1062 # formatting element from the list of active formatting elements,
1063 # and finally abort these steps.
1066 t = open_els.shift()
1068 afe.splice fe_of_afe, 1
1070 # 11. Let common ancestor be the element immediately above
1071 # formatting element in the stack of open elements.
1072 ca = open_els[fe_of_open_els + 1] # common ancestor
1074 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1075 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1076 bookmark = new_aaa_bookmark()
1079 afe.splice i, 0, bookmark
1081 node = last_node = fb
1085 # 3. Let node be the element immediately above node in the
1086 # stack of open elements, or if node is no longer in the stack
1087 # of open elements (e.g. because it got removed by this
1088 # algorithm), the element that was immediately above node in
1089 # the stack of open elements before node was removed.
1091 for t, i in open_els
1093 node_next = open_els[i + 1]
1095 node = node_next ? node_above
1096 # TODO make sure node_above gets re-set if/when node is removed from open_els
1098 # 4. If node is formatting element, then go to the next step in
1099 # the overall algorithm.
1102 # 5. If inner loop counter is greater than three and node is in
1103 # the list of active formatting elements, then remove node from
1104 # the list of active formatting elements.
1113 # 6. If node is not in the list of active formatting elements,
1114 # then remove node from the stack of open elements and then go
1115 # back to the step labeled inner loop.
1117 for t, i in open_els
1119 node_above = open_els[i + 1]
1120 open_els.splice i, 1
1123 # 7. create an element for the token for which the element node
1124 # was created, in the HTML namespace, with common ancestor as
1125 # the intended parent; replace the entry for node in the list
1126 # of active formatting elements with an entry for the new
1127 # element, replace the entry for node in the stack of open
1128 # elements with an entry for the new element, and let node be
1130 new_node = token_to_element node.token, NS_HTML, ca
1135 for t, i in open_els
1137 node_above = open_els[i + 1]
1138 open_els[i] = new_node
1141 # 8. If last node is furthest block, then move the
1142 # aforementioned bookmark to be immediately after the new node
1143 # in the list of active formatting elements.
1151 # "after" means lower
1152 afe.splice i, 0, bookmark # "after as <-
1154 # 9. Insert last node into node, first removing it from its
1155 # previous parent node if any.
1156 if last_node.parent?
1157 for c, i in last_node.parent.children
1159 last_node.parent.children.splice i, 1
1161 node.children.push last_node
1162 last_node.parent = node
1163 # 10. Let last node be node.
1165 # 11. Return to the step labeled inner loop.
1166 # 14. Insert whatever last node ended up being in the previous step
1167 # at the appropriate place for inserting a node, but using common
1168 # ancestor as the override target.
1170 # In the case where fe is immediately followed by fb:
1171 # * inner loop exits out early (node==fe)
1173 # * last_node is still in the tree (not a duplicate)
1174 if last_node.parent?
1175 for c, i in last_node.parent.children
1177 last_node.parent.children.splice i, 1
1179 # can't use standard insert token thing, because it's already in
1180 # open_els and must stay at it's current position in open_els
1181 dest = adjusted_insertion_location ca
1182 dest[0].children.splice dest[1], 0, last_node
1183 last_node.parent = dest[0]
1184 # 15. Create an element for the token for which formatting element
1185 # was created, in the HTML namespace, with furthest block as the
1187 new_element = token_to_element fe.token, NS_HTML, fb
1188 # 16. Take all of the child nodes of furthest block and append them
1189 # to the element created in the last step.
1190 while fb.children.length
1191 t = fb.children.shift()
1192 t.parent = new_element
1193 new_element.children.push t
1194 # 17. Append that new element to furthest block.
1195 new_element.parent = fb
1196 fb.children.push new_element
1197 # 18. Remove formatting element from the list of active formatting
1198 # elements, and insert the new element into the list of active
1199 # formatting elements at the position of the aforementioned
1207 afe[i] = new_element
1209 # 19. Remove formatting element from the stack of open elements,
1210 # and insert the new element into the stack of open elements
1211 # immediately below the position of furthest block in that stack.
1212 for t, i in open_els
1214 open_els.splice i, 1
1216 for t, i in open_els
1218 open_els.splice i, 0, new_element
1220 # 20. Jump back to the step labeled outer loop.
1223 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Close an open <p>: first generate implied end tags (skipping 'p' itself),
# then pop entries off open_els, testing each popped element for an HTML 'p'.
1224 close_p_element = ->
1225 generate_implied_end_tags 'p' # arg is exception
# the current node should now be the HTML p element
1226 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
# the length guard keeps the last entry of open_els from ever being popped
1228 while open_els.length > 1 # just in case
1229 el = open_els.shift()
1230 if el.name is 'p' and el.namespace is NS_HTML
# Guarded helper: acts only when an HTML 'p' element is in button scope
# (as reported by is_in_button_scope).
# NOTE(review): presumably the guarded action is close_p_element() - confirm.
1233 close_p_if_in_button_scope = ->
1234 if is_in_button_scope 'p', NS_HTML
1238 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1239 # aka insert_a_character = (t) ->
# Insert text token `t` at the adjusted insertion location.
# dest is the [parent, index] pair returned by adjusted_insertion_location().
1240 insert_character = (t) ->
1241 dest = adjusted_insertion_location()
1242 # fixfull check for Document node
# look at the sibling just before the insertion point
1244 prev = dest[0].children[dest[1] - 1]
# NOTE(review): presumably an adjacent text node absorbs t's text - confirm
1245 if prev.type is TYPE_TEXT
# otherwise t itself becomes a child of dest[0] at index dest[1]
1248 dest[0].children.splice dest[1], 0, t
1251 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for a single token `t`.
# Examines the adjusted current node (acn) and the token type through a series
# of tests (HTML namespace, MathML text integration point, annotation-xml with
# an svg start tag, HTML integration point, EOF); the last visible statement
# hands the token to in_foreign_content.
# NOTE(review): presumably each matching test routes t to the current
# insertion mode instead - confirm.
1252 process_token = (t) ->
1253 acn = adjusted_current_node()
1257 if acn.namespace is NS_HTML
1260 if is_mathml_text_integration_point(acn)
1261 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1264 if t.type is TYPE_TEXT
1267 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1270 if is_html_integration acn
1271 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1274 if t.type is TYPE_EOF
1277 in_foreign_content t
1281 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1282 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Compute where the next node should be inserted.
# override_target: optional node to use instead of the current node
# (open_els[0]).
# Returns a two-element array [target, target_i]: the node to insert into and
# the index within target.children at which to insert.
# When flag_foster_parenting is set and target is listed in
# foster_parenting_targets, the location is chosen relative to the last
# template / last table in open_els, following the quoted spec steps.
1283 adjusted_insertion_location = (override_target = null) ->
1284 # 1. If there was an override target specified, then let target be the
1287 target = override_target
1288 else # Otherwise, let target be the current node.
1289 target = open_els[0]
1290 # 2. Determine the adjusted insertion location using the first matching
1291 # steps from the following list:
1293 # If foster parenting is enabled and target is a table, tbody, tfoot,
1294 # thead, or tr element Foster parenting happens when content is
1295 # misnested in tables.
1296 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1297 loop # once. this is here so we can ``break`` to "abort these substeps"
1298 # 1. Let last template be the last template element in the
1299 # stack of open elements, if any.
1300 last_template = null
1301 last_template_i = null
1302 for el, i in open_els
1303 if el.name is 'template' and el.namespace is NS_HTML
1307 # 2. Let last table be the last table element in the stack of
1308 # open elements, if any.
1311 for el, i in open_els
1312 if el.name is 'table' and el.namespace is NS_HTML
1316 # 3. If there is a last template and either there is no last
1317 # table, or there is one, but last template is lower (more
1318 # recently added) than last table in the stack of open
1319 # elements, then: let adjusted insertion location be inside
1320 # last template's template contents, after its last child (if
1321 # any), and abort these substeps.
# a smaller index in open_els means more recently added (index 0 is the current node)
1322 if last_template and (last_table is null or last_template_i < last_table_i)
1323 target = last_template # fixfull should be its contents
1324 target_i = target.children.length
1326 # 4. If there is no last table, then let adjusted insertion
1327 # location be inside the first element in the stack of open
1328 # elements (the html element), after its last child (if any),
1329 # and abort these substeps. (fragment case)
1330 if last_table is null
1332 target = open_els[open_els.length - 1]
1333 target_i = target.children.length
1335 # 5. If last table has a parent element, then let adjusted
1336 # insertion location be inside last table's parent element,
1337 # immediately before last table, and abort these substeps.
1338 if last_table.parent?
1339 for c, i in last_table.parent.children
1341 target = last_table.parent
1345 # 6. Let previous element be the element immediately above last
1346 # table in the stack of open elements.
1348 # huh? how could it not have a parent?
1349 previous_element = open_els[last_table_i + 1]
1350 # 7. Let adjusted insertion location be inside previous
1351 # element, after its last child (if any).
1352 target = previous_element
1353 target_i = target.children.length
1354 # Note: These steps are involved in part because it's possible
1355 # for elements, the table element in this case in particular,
1356 # to have been moved by a script around in the DOM, or indeed
1357 # removed from the DOM entirely, after the element was inserted
1359 break # don't really loop
1361 # Otherwise Let adjusted insertion location be inside target, after
1362 # its last child (if any).
1363 target_i = target.children.length
1365 # 3. If the adjusted insertion location is inside a template element,
1366 # let it instead be inside the template element's template contents,
1367 # after its last child (if any).
1368 # fixfull (template)
1370 # 4. Return the adjusted insertion location.
1371 return [target, target_i]
1373 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1374 # aka create_an_element_for_token
# Build a new element Node (TYPE_TAG) from tag token `t` in `namespace`.
# Each attribute pair `a` of the token is copied into an `attrs` hash keyed by
# a[0] with value a[1]; the originating token is kept on the node as `token`.
# NOTE(review): intended_parent is not referenced by the statements shown here.
1375 token_to_element = (t, namespace, intended_parent) ->
1376 # convert attributes into a hash
1379 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1380 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1382 # TODO 2. If the newly created element has an xmlns attribute in the
1383 # XMLNS namespace whose value is not exactly the same as the element's
1384 # namespace, that is a parse error. Similarly, if the newly created
1385 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1386 # value is not the XLink Namespace, that is a parse error.
1388 # fixfull: the spec says stuff about form pointers and ownerDocument
1392 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Create an element for `token` in `namespace` and splice it into the tree at
# the adjusted insertion location.
# NOTE(review): ail_el / ail_i are presumably ail[0] / ail[1] (node and child
# index) - confirm.
1393 insert_foreign_element = (token, namespace) ->
1394 ail = adjusted_insertion_location()
1397 el = token_to_element token, namespace, ail_el
1398 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1400 ail_el.children.splice ail_i, 0, el
1403 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Insert an element for `token` in the HTML namespace.
# Thin wrapper: delegates to insert_foreign_element with NS_HTML and hands
# back whatever that call returns.
insert_html_element = (token) ->
	insert_foreign_element token, NS_HTML
1407 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1408 # position should be [node, index_within_children]
# Insert comment token `t` into the tree.
# position: optional [node, index_within_children] pair; when it is null or
# undefined the adjusted insertion location is used instead.
1409 insert_comment = (t, position = null) ->
1410 position ?= adjusted_insertion_location()
# splice t into position[0].children at index position[1]
1411 position[0].children.splice position[1], 0, t
1415 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Start generic raw-text parsing for start tag token `t`: insert the element,
# switch the tokenizer to tok_state_rawtext, remember the current insertion
# mode in original_ins_mode, and switch to ins_mode_text.
1416 parse_generic_raw_text = (t) ->
1417 insert_html_element t
1418 tok_state = tok_state_rawtext
1419 original_ins_mode = ins_mode
1420 ins_mode = ins_mode_text
# Start generic RCDATA parsing for start tag token `t`: same sequence as
# parse_generic_raw_text, but the tokenizer is switched to tok_state_rcdata.
1422 parse_generic_rcdata_text = (t) ->
1423 insert_html_element t
1424 tok_state = tok_state_rcdata
# saved so the insertion mode in effect before the switch is not lost
1425 original_ins_mode = ins_mode
1426 ins_mode = ins_mode_text
1429 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1430 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Generate implied end tags.
# except: optional tag name that stops the loop when it is the current node.
# The loop runs while the current node (open_els[0]) is listed in
# end_tag_implied with a matching namespace and is not named `except`.
# NOTE(review): the loop body presumably pops the current node - confirm.
1431 generate_implied_end_tags = (except = null) ->
1432 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1436 # 8.2.5.4 The rules for parsing tokens in HTML content
1437 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1439 # 8.2.5.4.1 The "initial" insertion mode
1440 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Doctype test used by ins_mode_initial to choose QUIRKS_YES.
# Inspects doctype token `t`: its 'force-quirks' flag, its name, the
# lower-cased public identifier (prefix list quirks_yes_pi_prefixes plus three
# exact values), and the lower-cased system identifier.
1441 is_quirks_yes_doctype = (t) ->
1442 if t.flag 'force-quirks'
1444 if t.name isnt 'html'
1446 if t.public_identifier?
# comparisons are case-insensitive: everything is compared in lower case
1447 pi = t.public_identifier.toLowerCase()
1448 for p in quirks_yes_pi_prefixes
1449 if pi.substr(0, p.length) is p
1451 if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1453 if t.system_identifier?
1454 if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
# reached only when the system identifier is missing
1456 else if t.public_identifier?
1457 # already did this: pi = t.public_identifier.toLowerCase()
1458 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Doctype test used by ins_mode_initial to choose QUIRKS_LIMITED.
# Compares the lower-cased public identifier of doctype token `t` against the
# XHTML 1.0 Frameset/Transitional prefixes, and, when a system identifier is
# present, against the HTML 4.01 Frameset/Transitional prefixes.
1461 is_quirks_limited_doctype = (t) ->
1462 if t.public_identifier?
1463 pi = t.public_identifier.toLowerCase()
1464 if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
# NOTE(review): `pi` is only assigned inside the public_identifier branch;
# confirm the test below is nested under it so `pi` cannot be undefined here.
1466 if t.system_identifier?
1467 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
# Handler for the "initial" insertion mode; `t` is the token being processed.
# For a DOCTYPE token the document's 'quirks mode' flag is set from
# is_quirks_yes_doctype / is_quirks_limited_doctype and the insertion mode
# becomes ins_mode_before_html. The last two statements set QUIRKS_YES and
# also switch to ins_mode_before_html.
# NOTE(review): presumably that final part is the spec's "anything else"
# case - confirm.
1470 ins_mode_initial = (t) ->
1473 if t.type is TYPE_COMMENT
1477 if t.type is TYPE_DOCTYPE
1478 # fixfull syntax error from first paragraph and following bullets
1479 # fixfull set doc.doctype
1480 # fixfull is the "not an iframe srcdoc" thing relevant?
1481 if is_quirks_yes_doctype t
1482 doc.flag 'quirks mode', QUIRKS_YES
1483 else if is_quirks_limited_doctype t
1484 doc.flag 'quirks mode', QUIRKS_LIMITED
1486 ins_mode = ins_mode_before_html
1489 # fixfull not iframe srcdoc?
1491 doc.flag 'quirks mode', QUIRKS_YES
1492 ins_mode = ins_mode_before_html
1496 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handler for the "before html" insertion mode; `t` is the token being
# processed. An <html> start tag creates the root element from the token,
# appends it to doc.children, pushes it onto open_els and switches to
# ins_mode_before_head. The closing statements create an <html> element from a
# synthesized start tag (new_open_tag('html')) and make the same mode switch.
1497 ins_mode_before_html = (t) ->
1498 if t.type is TYPE_DOCTYPE
1501 if t.type is TYPE_COMMENT
1506 if t.type is TYPE_START_TAG and t.name is 'html'
1507 el = token_to_element t, NS_HTML, doc
1508 doc.children.push el
# unshift: index 0 of open_els is the current node
1510 open_els.unshift(el)
1511 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1512 ins_mode = ins_mode_before_head
1514 if t.type is TYPE_END_TAG
1515 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1516 # fall through to "anything else"
1521 el = token_to_element new_open_tag('html'), NS_HTML, doc
1522 doc.children.push el
1525 # ?fixfull browsing context
1526 ins_mode = ins_mode_before_head
1530 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handler for the "before head" insertion mode; `t` is the token being
# processed. A <head> start tag is inserted, remembered in
# head_element_pointer, and the mode becomes ins_mode_in_head. The closing
# statements do the same with a synthesized <head> start tag.
1531 ins_mode_before_head = (t) ->
1534 if t.type is TYPE_COMMENT
1537 if t.type is TYPE_DOCTYPE
1540 if t.type is TYPE_START_TAG and t.name is 'html'
1543 if t.type is TYPE_START_TAG and t.name is 'head'
1544 el = insert_html_element t
1545 head_element_pointer = el
1546 ins_mode = ins_mode_in_head
1548 if t.type is TYPE_END_TAG
1549 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1550 # fall through to Anything else below
1555 el = insert_html_element new_open_tag 'head'
1556 head_element_pointer = el
1557 ins_mode = ins_mode_in_head
1561 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode: pop the current node
# off open_els and switch to ins_mode_after_head.
# NOTE(review): `t` is presumably reprocessed in the new mode afterwards -
# confirm.
1562 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1563 open_els.shift() # spec says this will be a 'head' node
1564 ins_mode = ins_mode_after_head
# Handler for the "in head" insertion mode; `t` is the token being processed.
# Visible branches: whitespace text, comment, DOCTYPE, <html>, void head
# elements (base/basefont/bgsound/link/meta), <title> (RCDATA), raw-text
# elements (noscript with scripting, noframes, style), <noscript> without
# scripting, <script>, </head>, </body> </html> </br>, <template>,
# </template>, and a final hand-off to ins_mode_in_head_else.
# Two fixes relative to the previous revision of this handler:
# * the </template> branch now actually CALLS generate_implied_end_tags();
#   a bare identifier is only a reference in CoffeeScript and did nothing
# * the <script> branch passes the insertion-location node (ail[0]) as the
#   intended parent, matching insert_foreign_element, instead of the whole
#   [node, index] pair
1567 ins_mode_in_head = (t) ->
1568 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1571 if t.type is TYPE_COMMENT
1574 if t.type is TYPE_DOCTYPE
1577 if t.type is TYPE_START_TAG and t.name is 'html'
1580 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1581 el = insert_html_element t
1583 t.acknowledge_self_closing()
1585 if t.type is TYPE_START_TAG and t.name is 'meta'
1586 el = insert_html_element t
1588 t.acknowledge_self_closing()
1589 # fixfull encoding stuff
1591 if t.type is TYPE_START_TAG and t.name is 'title'
1592 parse_generic_rcdata_text t
1594 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1595 parse_generic_raw_text t
1597 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1598 insert_html_element t
1599 ins_mode = ins_mode_in_head_noscript
1601 if t.type is TYPE_START_TAG and t.name is 'script'
1602 ail = adjusted_insertion_location()
# ail is [node, index]; the intended parent is the node itself
1603 el = token_to_element t, NS_HTML, ail[0]
1604 el.flag 'parser-inserted', true
1605 # fixfull fragment case
1606 ail[0].children.splice ail[1], 0, el
1608 tok_state = tok_state_script_data
1609 original_ins_mode = ins_mode # make sure orig... is defined
1610 ins_mode = ins_mode_text
1612 if t.type is TYPE_END_TAG and t.name is 'head'
1613 open_els.shift() # will be a head element... spec says so
1614 ins_mode = ins_mode_after_head
1616 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1617 ins_mode_in_head_else t
1619 if t.type is TYPE_START_TAG and t.name is 'template'
1620 insert_html_element t
1622 flag_frameset_ok = false
1623 ins_mode = ins_mode_in_template
1624 template_ins_modes.unshift ins_mode_in_template
1626 if t.type is TYPE_END_TAG and t.name is 'template'
1627 if template_tag_is_open()
# explicit () is required: with no arguments CoffeeScript does not call
1628 generate_implied_end_tags()
1629 if open_els[0].name isnt 'template'
1632 el = open_els.shift()
1633 if el.name is 'template' and el.namespace is NS_HTML
1635 clear_afe_to_marker()
1636 template_ins_modes.shift()
1641 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1644 ins_mode_in_head_else t
1647 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" insertion mode; the visible
# effect is switching the insertion mode back to ins_mode_in_head.
1648 ins_mode_in_head_noscript_else = (t) ->
1651 ins_mode = ins_mode_in_head
# Handler for the "in head noscript" insertion mode; `t` is the token being
# processed. Tests, in order: DOCTYPE, <html>, </noscript> (back to
# ins_mode_in_head), whitespace/comment/selected head start tags, </br>,
# then <head>/<noscript> start tags or any other end tag; everything else goes
# to ins_mode_in_head_noscript_else.
1654 ins_mode_in_head_noscript = (t) ->
1655 if t.type is TYPE_DOCTYPE
1658 if t.type is TYPE_START_TAG and t.name is 'html'
1661 if t.type is TYPE_END_TAG and t.name is 'noscript'
1663 ins_mode = ins_mode_in_head
1665 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1668 if t.type is TYPE_END_TAG and t.name is 'br'
1669 ins_mode_in_head_noscript_else t
1671 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1675 ins_mode_in_head_noscript_else t
1678 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode: insert a <body>
# element from a synthesized start tag and switch to ins_mode_in_body.
# NOTE(review): `t` is presumably reprocessed in the new mode afterwards -
# confirm.
1679 ins_mode_after_head_else = (t) ->
1680 body_tok = new_open_tag 'body'
1681 insert_html_element body_tok
1682 ins_mode = ins_mode_in_body
# Handler for the "after head" insertion mode; `t` is the token being
# processed. <body> is inserted (frameset no longer ok) and switches to
# ins_mode_in_body; <frameset> is inserted and switches to
# ins_mode_in_frameset; head-only start tags are processed with the head
# element temporarily pushed back onto open_els; other tokens go to
# ins_mode_after_head_else.
1685 ins_mode_after_head = (t) ->
1689 if t.type is TYPE_COMMENT
1692 if t.type is TYPE_DOCTYPE
1695 if t.type is TYPE_START_TAG and t.name is 'html'
1698 if t.type is TYPE_START_TAG and t.name is 'body'
1699 insert_html_element t
1700 flag_frameset_ok = false
1701 ins_mode = ins_mode_in_body
1703 if t.type is TYPE_START_TAG and t.name is 'frameset'
1704 insert_html_element t
1705 ins_mode = ins_mode_in_frameset
1707 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
# put the head element back on the stack while the token is handled
1709 open_els.unshift head_element_pointer
# afterwards find the head element wherever it now sits and remove it again
1711 for el, i in open_els
1712 if el is head_element_pointer
1713 open_els.splice i, 1
1716 if t.type is TYPE_END_TAG and t.name is 'template'
1719 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1720 ins_mode_after_head_else t
1722 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1726 ins_mode_after_head_else t
1729 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling for the "in body" insertion mode.
# name: tag name of the end tag being processed.
# Walks `node` through the stack of open elements (open_els[i + 1] is the next
# entry after index i). When node is an HTML element called `name`, implied
# end tags are generated (except for `name`) and entries are popped off
# open_els. A node listed in special_elements with a matching namespace is
# tested separately.
# NOTE(review): presumably the special-element case is a parse error that ends
# the walk - confirm.
1730 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1733 if node.name is name and node.namespace is NS_HTML
1734 generate_implied_end_tags name # arg is exception
1735 unless node is open_els[0]
1738 el = open_els.shift()
1741 if special_elements[node.name] is node.namespace
1744 for el, i in open_els
1746 node = open_els[i + 1]
1749 ins_mode_in_body = (t) ->
1750 if t.type is TYPE_TEXT and t.text is "\u0000"
1757 if t.type is TYPE_TEXT
1760 flag_frameset_ok = false
1762 if t.type is TYPE_COMMENT
1765 if t.type is TYPE_DOCTYPE
1768 if t.type is TYPE_START_TAG and t.name is 'html'
1770 return if template_tag_is_open()
1771 root_attrs = open_els[open_els.length - 1].attrs
1773 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1776 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1779 if t.type is TYPE_START_TAG and t.name is 'body'
1781 return if open_els.length < 2
1782 second = open_els[open_els.length - 2]
1783 return unless second.namespace is NS_HTML
1784 return unless second.name is 'body'
1785 return if template_tag_is_open()
1786 flag_frameset_ok = false
1788 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1790 if t.type is TYPE_START_TAG and t.name is 'frameset'
1792 return if open_els.length < 2
1793 second_i = open_els.length - 2
1794 second = open_els[second_i]
1795 return unless second.namespace is NS_HTML
1796 return unless second.name is 'body'
1797 if flag_frameset_ok is false
1800 for el, i in second.parent.children
1802 second.parent.children.splice i, 1
1804 open_els.splice second_i, 1
1805 # pop everything except the "root html element"
1806 while open_els.length > 1
1808 insert_html_element t
1809 ins_mode = ins_mode_in_frameset
1811 if t.type is TYPE_EOF
1813 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1814 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1815 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1818 unless ok_tags[t.name] is el.namespace
1821 if template_ins_modes.length > 0
1822 ins_mode_in_template t
1826 if t.type is TYPE_END_TAG and t.name is 'body'
1827 unless is_in_scope 'body', NS_HTML
1831 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1832 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1833 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1834 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1838 unless ok_tags[t.name] is el.namespace
1841 ins_mode = ins_mode_after_body
1843 if t.type is TYPE_END_TAG and t.name is 'html'
1844 unless is_in_scope 'body', NS_HTML
1848 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1849 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1850 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1851 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1855 unless ok_tags[t.name] is el.namespace
1858 ins_mode = ins_mode_after_body
1861 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1862 close_p_if_in_button_scope()
1863 insert_html_element t
1865 if t.type is TYPE_START_TAG and h_tags[t.name]?
1866 close_p_if_in_button_scope()
1867 if h_tags[open_els[0].name] is open_els[0].namespace
1870 insert_html_element t
1872 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1873 close_p_if_in_button_scope()
1874 insert_html_element t
1875 eat_next_token_if_newline()
1876 flag_frameset_ok = false
1878 if t.type is TYPE_START_TAG and t.name is 'form'
1879 unless form_element_pointer is null or template_tag_is_open()
1882 close_p_if_in_button_scope()
1883 el = insert_html_element t
1884 unless template_tag_is_open()
1885 form_element_pointer = el
1887 if t.type is TYPE_START_TAG and t.name is 'li'
1888 flag_frameset_ok = false
1889 for node in open_els
1890 if node.name is 'li' and node.namespace is NS_HTML
1891 generate_implied_end_tags 'li' # arg is exception
1892 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1895 el = open_els.shift()
1896 if el.name is 'li' and el.namespace is NS_HTML
1899 if el_is_special_not_adp node
1901 close_p_if_in_button_scope()
1902 insert_html_element t
1904 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1905 flag_frameset_ok = false
1906 for node in open_els
1907 if node.name is 'dd' and node.namespace is NS_HTML
1908 generate_implied_end_tags 'dd' # arg is exception
1909 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1912 el = open_els.shift()
1913 if el.name is 'dd' and el.namespace is NS_HTML
1916 if node.name is 'dt' and node.namespace is NS_HTML
1917 generate_implied_end_tags 'dt' # arg is exception
1918 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1921 el = open_els.shift()
1922 if el.name is 'dt' and el.namespace is NS_HTML
1925 if el_is_special_not_adp node
1927 close_p_if_in_button_scope()
1928 insert_html_element t
1930 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1931 close_p_if_in_button_scope()
1932 insert_html_element t
1933 tok_state = tok_state_plaintext
1935 if t.type is TYPE_START_TAG and t.name is 'button'
1936 if is_in_scope 'button', NS_HTML
1938 generate_implied_end_tags()
1940 el = open_els.shift()
1941 if el.name is 'button' and el.namespace is NS_HTML
1944 insert_html_element t
1945 flag_frameset_ok = false
1947 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1948 unless is_in_scope t.name, NS_HTML
1951 generate_implied_end_tags()
1952 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1955 el = open_els.shift()
1956 if el.name is t.name and el.namespace is NS_HTML
1959 if t.type is TYPE_END_TAG and t.name is 'form'
1960 unless template_tag_is_open()
1961 node = form_element_pointer
1962 form_element_pointer = null
1963 if node is null or not el_is_in_scope node
1966 generate_implied_end_tags()
1967 if open_els[0] isnt node
1969 for el, i in open_els
1971 open_els.splice i, 1
1974 unless is_in_scope 'form', NS_HTML
1977 generate_implied_end_tags()
1978 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1981 el = open_els.shift()
1982 if el.name is 'form' and el.namespace is NS_HTML
1985 if t.type is TYPE_END_TAG and t.name is 'p'
1986 unless is_in_button_scope 'p', NS_HTML
1988 insert_html_element new_open_tag 'p'
1991 if t.type is TYPE_END_TAG and t.name is 'li'
1992 unless is_in_li_scope 'li', NS_HTML
1995 generate_implied_end_tags 'li' # arg is exception
1996 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1999 el = open_els.shift()
2000 if el.name is 'li' and el.namespace is NS_HTML
2003 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2004 unless is_in_scope t.name, NS_HTML
2007 generate_implied_end_tags t.name # arg is exception
2008 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2011 el = open_els.shift()
2012 if el.name is t.name and el.namespace is NS_HTML
2015 if t.type is TYPE_END_TAG and h_tags[t.name]?
2018 if h_tags[el.name] is el.namespace
2021 if standard_scopers[el.name] is el.namespace
2026 generate_implied_end_tags()
2027 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2030 el = open_els.shift()
2031 if h_tags[el.name] is el.namespace
2035 if t.type is TYPE_START_TAG and t.name is 'a'
2036 # If the list of active formatting elements contains an a element
2037 # between the end of the list and the last marker on the list (or
2038 # the start of the list if there is no marker on the list), then
2039 # this is a parse error; run the adoption agency algorithm for the
2040 # tag name "a", then remove that element from the list of active
2041 # formatting elements and the stack of open elements if the
2042 # adoption agency algorithm didn't already remove it (it might not
2043 # have if the element is not in table scope).
2046 if el.type is TYPE_AFE_MARKER
2048 if el.name is 'a' and el.namespace is NS_HTML
2056 for el, i in open_els
2058 open_els.splice i, 1
2060 el = insert_html_element t
2063 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2065 el = insert_html_element t
2068 if t.type is TYPE_START_TAG and t.name is 'nobr'
2070 if is_in_scope 'nobr', NS_HTML
2072 adoption_agency 'nobr'
2074 el = insert_html_element t
2077 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2078 adoption_agency t.name
2080 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2082 insert_html_element t
2084 flag_frameset_ok = false
2086 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2087 unless is_in_scope t.name, NS_HTML
2090 generate_implied_end_tags()
2091 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2094 el = open_els.shift()
2095 if el.name is t.name and el.namespace is NS_HTML
2097 clear_afe_to_marker()
2099 if t.type is TYPE_START_TAG and t.name is 'table'
2100 unless doc.flag('quirks mode') is QUIRKS_YES
2101 close_p_if_in_button_scope() # test
2102 insert_html_element t
2103 flag_frameset_ok = false
2104 ins_mode = ins_mode_in_table
2106 if t.type is TYPE_END_TAG and t.name is 'br'
2108 # W3C: t.type = TYPE_START_TAG
2109 t = new_open_tag 'br' # WHATWG
2111 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2113 insert_html_element t
2115 t.acknowledge_self_closing()
2116 flag_frameset_ok = false
2118 if t.type is TYPE_START_TAG and t.name is 'input'
2120 insert_html_element t
2122 t.acknowledge_self_closing()
2123 unless is_input_hidden_tok t
2124 flag_frameset_ok = false
2126 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2127 # WHATWG adds 'menuitem' for this block
2128 insert_html_element t
2130 t.acknowledge_self_closing()
2132 if t.type is TYPE_START_TAG and t.name is 'hr'
2133 close_p_if_in_button_scope()
2134 insert_html_element t
2136 t.acknowledge_self_closing()
2137 flag_frameset_ok = false
2139 if t.type is TYPE_START_TAG and t.name is 'image'
2144 if t.type is TYPE_START_TAG and t.name is 'isindex'
2146 if template_tag_is_open() is false and form_element_pointer isnt null
2148 t.acknowledge_self_closing()
2149 flag_frameset_ok = false
2150 close_p_if_in_button_scope()
2151 el = insert_html_element new_open_tag 'form'
2152 unless template_tag_is_open()
2153 form_element_pointer = el
2156 el.attrs['action'] = a[1]
2158 insert_html_element new_open_tag 'hr'
2161 insert_html_element new_open_tag 'label'
2162 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2163 input_el = new_open_tag 'input'
2168 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2169 input_el.attrs_a.push [a[0], a[1]]
2170 input_el.attrs_a.push ['name', 'isindex']
2171 # fixfull this next bit is in english... internationalize?
2172 prompt ?= "This is a searchable index. Enter search keywords: "
2173 insert_character new_character_token prompt # fixfull split
2174 # TODO submit typo "balue" in spec
2175 insert_html_element input_el
2177 # insert_character '' # you can put chars here if prompt attr missing
2179 insert_html_element new_open_tag 'hr'
2182 unless template_tag_is_open()
2183 form_element_pointer = null
2185 if t.type is TYPE_START_TAG and t.name is 'textarea'
2186 insert_html_element t
2187 eat_next_token_if_newline()
2188 tok_state = tok_state_rcdata
2189 original_ins_mode = ins_mode
2190 flag_frameset_ok = false
2191 ins_mode = ins_mode_text
2193 if t.type is TYPE_START_TAG and t.name is 'xmp'
2194 close_p_if_in_button_scope()
2196 flag_frameset_ok = false
2197 parse_generic_raw_text t
2199 if t.type is TYPE_START_TAG and t.name is 'iframe'
2200 flag_frameset_ok = false
2201 parse_generic_raw_text t
2203 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2204 parse_generic_raw_text t
2206 if t.type is TYPE_START_TAG and t.name is 'select'
2208 insert_html_element t
2209 flag_frameset_ok = false
2210 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2211 ins_mode = ins_mode_in_select_in_table
2213 ins_mode = ins_mode_in_select
2215 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2216 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2219 insert_html_element t
2221 # this comment block implements the W3C spec
2222 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2223 # if is_in_scope 'ruby', NS_HTML
2224 # generate_implied_end_tags()
2225 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2227 # insert_html_element t
2229 # if t.type is TYPE_START_TAG and t.name is 'rt'
2230 # if is_in_scope 'ruby', NS_HTML
2231 # generate_implied_end_tags 'rtc' # arg is exception
2232 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2234 # insert_html_element t
2236 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2237 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2238 if is_in_scope 'ruby', NS_HTML
2239 generate_implied_end_tags()
2240 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2242 insert_html_element t
2244 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2245 if is_in_scope 'ruby', NS_HTML
2246 generate_implied_end_tags 'rtc'
2247 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2249 insert_html_element t
2252 if t.type is TYPE_START_TAG and t.name is 'math'
2254 adjust_mathml_attributes t
2255 adjust_foreign_attributes t
2256 insert_foreign_element t, NS_MATHML
2257 if t.flag 'self-closing'
2259 t.acknowledge_self_closing()
2261 if t.type is TYPE_START_TAG and t.name is 'svg'
2263 adjust_svg_attributes t
2264 adjust_foreign_attributes t
2265 insert_foreign_element t, NS_SVG
2266 if t.flag 'self-closing'
2268 t.acknowledge_self_closing()
2270 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2273 if t.type is TYPE_START_TAG # any other start tag
2275 insert_html_element t
2277 if t.type is TYPE_END_TAG # any other end tag
2278 in_body_any_other_end_tag t.name
2282 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2283 ins_mode_text = (t) ->
2284 if t.type is TYPE_TEXT
2287 if t.type is TYPE_EOF
2289 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2290 open_els[0].flag 'already started', true
2292 ins_mode = original_ins_mode
2295 if t.type is TYPE_END_TAG and t.name is 'script'
2297 ins_mode = original_ins_mode
2298 # fixfull the spec seems to assume that I'm going to run the script
2299 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2301 if t.type is TYPE_END_TAG
2303 ins_mode = original_ins_mode
2307 # the functions below implement the tokenizer states described here:
2308 # http://www.w3.org/TR/html5/syntax.html#tokenization
2310 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2311 ins_mode_in_table_else = (t) ->
2313 flag_foster_parenting = true
2315 flag_foster_parenting = false
2317 ins_mode_in_table = (t) ->
2320 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2321 pending_table_character_tokens = []
2322 original_ins_mode = ins_mode
2323 ins_mode = ins_mode_in_table_text
2326 ins_mode_in_table_else t
2334 clear_stack_to_table_context()
2336 insert_html_element t
2337 ins_mode = ins_mode_in_caption
2339 clear_stack_to_table_context()
2340 insert_html_element t
2341 ins_mode = ins_mode_in_column_group
2343 clear_stack_to_table_context()
2344 insert_html_element new_open_tag 'colgroup'
2345 ins_mode = ins_mode_in_column_group
2347 when 'tbody', 'tfoot', 'thead'
2348 clear_stack_to_table_context()
2349 insert_html_element t
2350 ins_mode = ins_mode_in_table_body
2351 when 'td', 'th', 'tr'
2352 clear_stack_to_table_context()
2353 insert_html_element new_open_tag 'tbody'
2354 ins_mode = ins_mode_in_table_body
2358 if is_in_table_scope 'table', NS_HTML
2360 el = open_els.shift()
2361 if el.name is 'table' and el.namespace is NS_HTML
2365 when 'style', 'script', 'template'
2368 unless is_input_hidden_tok t
2369 ins_mode_in_table_else t
2372 el = insert_html_element t
2374 t.acknowledge_self_closing()
2377 if form_element_pointer?
2379 if template_tag_is_open()
2381 form_element_pointer = insert_html_element t
2384 ins_mode_in_table_else t
2388 if is_in_table_scope 'table', NS_HTML
2390 el = open_els.shift()
2391 if el.name is 'table' and el.namespace is NS_HTML
2396 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2401 ins_mode_in_table_else t
2405 ins_mode_in_table_else t
2409 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2410 ins_mode_in_table_text = (t) ->
2411 if t.type is TYPE_TEXT and t.text is "\u0000"
2415 if t.type is TYPE_TEXT
2416 pending_table_character_tokens.push t
2420 for old in pending_table_character_tokens
2421 unless is_space_tok old
2425 for old in pending_table_character_tokens
2426 insert_character old
2428 for old in pending_table_character_tokens
2429 ins_mode_in_table_else old
2430 pending_table_character_tokens = []
2431 ins_mode = original_ins_mode
2435 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2436 ins_mode_in_caption = (t) ->
2437 if t.type is TYPE_END_TAG and t.name is 'caption'
2438 if is_in_table_scope 'caption', NS_HTML
2439 generate_implied_end_tags()
2440 if open_els[0].name isnt 'caption'
2443 el = open_els.shift()
2444 if el.name is 'caption' and el.namespace is NS_HTML
2446 clear_afe_to_marker()
2447 ins_mode = ins_mode_in_table
2452 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2454 if is_in_table_scope 'caption', NS_HTML
2456 el = open_els.shift()
2457 if el.name is 'caption' and el.namespace is NS_HTML
2459 clear_afe_to_marker()
2460 ins_mode = ins_mode_in_table
2462 # else fragment case
2464 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2471 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2472 ins_mode_in_column_group = (t) -> # handles token t while the parser is in the "in column group" insertion mode
2476 if t.type is TYPE_COMMENT
2479 if t.type is TYPE_DOCTYPE
2482 if t.type is TYPE_START_TAG and t.name is 'html'
2485 if t.type is TYPE_START_TAG and t.name is 'col'
2486 el = insert_html_element t
2488 t.acknowledge_self_closing()
2490 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2491 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML # test the current node (open_els[0]); the stack array itself has no .namespace, so the old test could never pass
2493 ins_mode = ins_mode_in_table
2497 if t.type is TYPE_END_TAG and t.name is 'col'
2500 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2503 if t.type is TYPE_EOF
2507 if open_els[0].name isnt 'colgroup'
2511 ins_mode = ins_mode_in_table
2515 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2516 ins_mode_in_table_body = (t) ->
2517 if t.type is TYPE_START_TAG and t.name is 'tr'
2518 clear_stack_to_table_body_context()
2519 insert_html_element t
2520 ins_mode = ins_mode_in_row
2522 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2524 clear_stack_to_table_body_context()
2525 insert_html_element new_open_tag 'tr'
2526 ins_mode = ins_mode_in_row
2529 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2530 unless is_in_table_scope t.name, NS_HTML
2533 clear_stack_to_table_body_context()
2535 ins_mode = ins_mode_in_table
2537 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2540 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2543 if table_scopers[el.name] is el.namespace
2548 clear_stack_to_table_body_context()
2550 ins_mode = ins_mode_in_table
2553 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2560 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2561 ins_mode_in_row = (t) ->
2562 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2563 clear_stack_to_table_row_context()
2564 insert_html_element t
2565 ins_mode = ins_mode_in_cell
2568 if t.type is TYPE_END_TAG and t.name is 'tr'
2569 if is_in_table_scope 'tr', NS_HTML
2570 clear_stack_to_table_row_context()
2572 ins_mode = ins_mode_in_table_body
2576 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2577 if is_in_table_scope 'tr', NS_HTML
2578 clear_stack_to_table_row_context()
2580 ins_mode = ins_mode_in_table_body
2585 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2586 if is_in_table_scope t.name, NS_HTML
2587 if is_in_table_scope 'tr', NS_HTML
2588 clear_stack_to_table_row_context()
2590 ins_mode = ins_mode_in_table_body
2595 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2602 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2604 generate_implied_end_tags()
2605 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML # compare the current node's name; comparing the node object itself to 'th' was never true
2608 el = open_els.shift()
2609 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th') # el is the element just removed from the stack
2611 clear_afe_to_marker()
2612 ins_mode = ins_mode_in_row # after the cell is closed, tokens are handled by the "in row" mode
2615 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2616 ins_mode_in_cell = (t) ->
2617 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2618 if is_in_table_scope t.name, NS_HTML
2619 generate_implied_end_tags()
2620 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2623 el = open_els.shift()
2624 if el.name is t.name and el.namespace is NS_HTML
2626 clear_afe_to_marker()
2627 ins_mode = ins_mode_in_row
2631 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2634 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2637 if table_scopers[el.name] is el.namespace
2645 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2648 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2649 if is_in_table_scope t.name, NS_HTML
2659 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2660 ins_mode_in_select = (t) -> # handles token t while the parser is in the "in select" insertion mode
2661 if t.type is TYPE_TEXT and t.text is "\u0000"
2664 if t.type is TYPE_TEXT
2667 if t.type is TYPE_COMMENT
2670 if t.type is TYPE_DOCTYPE
2673 if t.type is TYPE_START_TAG and t.name is 'html'
2676 if t.type is TYPE_START_TAG and t.name is 'option'
2677 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2679 insert_html_element t
2681 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2682 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2684 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2686 insert_html_element t
2688 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2689 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2690 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML # check name AND namespace of the node just below the current node; open_els[0]'s namespace was already tested on the line above
2692 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2697 if t.type is TYPE_END_TAG and t.name is 'option'
2698 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2703 if t.type is TYPE_END_TAG and t.name is 'select'
2704 if is_in_select_scope 'select', NS_HTML
2706 el = open_els.shift()
2707 if el.name is 'select' and el.namespace is NS_HTML
2713 if t.type is TYPE_START_TAG and t.name is 'select'
2716 el = open_els.shift()
2717 if el.name is 'select' and el.namespace is NS_HTML
2720 # spec says that this is the same as </select> but it doesn't say
2721 # to check scope first
2723 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2725 unless is_in_select_scope 'select', NS_HTML
2728 el = open_els.shift()
2729 if el.name is 'select' and el.namespace is NS_HTML
2734 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2737 if t.type is TYPE_EOF
2744 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2745 ins_mode_in_select_in_table = (t) ->
2746 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2749 el = open_els.shift()
2750 if el.name is 'select' and el.namespace is NS_HTML
2755 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2757 unless is_in_table_scope t.name, NS_HTML
2760 el = open_els.shift()
2761 if el.name is 'select' and el.namespace is NS_HTML
2767 ins_mode_in_select t
2770 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2771 ins_mode_in_template = (t) ->
2772 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2775 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2778 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2779 template_ins_modes.shift()
2780 template_ins_modes.unshift ins_mode_in_table
2781 ins_mode = ins_mode_in_table
2784 if t.type is TYPE_START_TAG and t.name is 'col'
2785 template_ins_modes.shift()
2786 template_ins_modes.unshift ins_mode_in_column_group
2787 ins_mode = ins_mode_in_column_group
2790 if t.type is TYPE_START_TAG and t.name is 'tr'
2791 template_ins_modes.shift()
2792 template_ins_modes.unshift ins_mode_in_table_body
2793 ins_mode = ins_mode_in_table_body
2796 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2797 template_ins_modes.shift()
2798 template_ins_modes.unshift ins_mode_in_row
2799 ins_mode = ins_mode_in_row
2802 if t.type is TYPE_START_TAG
2803 template_ins_modes.shift()
2804 template_ins_modes.unshift ins_mode_in_body
2805 ins_mode = ins_mode_in_body
2808 if t.type is TYPE_END_TAG
2811 if t.type is TYPE_EOF
2812 unless template_tag_is_open()
2817 el = open_els.shift()
2818 if el.name is 'template' and el.namespace is NS_HTML
2820 clear_afe_to_marker()
2821 template_ins_modes.shift()
2826 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2827 ins_mode_after_body = (t) ->
2831 if t.type is TYPE_COMMENT
2832 first = open_els[open_els.length - 1]
2833 insert_comment t, [first, first.children.length]
2835 if t.type is TYPE_DOCTYPE
2838 if t.type is TYPE_START_TAG and t.name is 'html'
2841 if t.type is TYPE_END_TAG and t.name is 'html'
2842 if flag_fragment_parsing
2845 ins_mode = ins_mode_after_after_body
2847 if t.type is TYPE_EOF
2852 ins_mode = ins_mode_in_body
2856 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2857 ins_mode_in_frameset = (t) ->
2861 if t.type is TYPE_COMMENT
2864 if t.type is TYPE_DOCTYPE
2867 if t.type is TYPE_START_TAG and t.name is 'html'
2870 if t.type is TYPE_START_TAG and t.name is 'frameset'
2871 insert_html_element t
2873 if t.type is TYPE_END_TAG and t.name is 'frameset'
2874 if open_els.length is 1
2876 return # fragment case
2878 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2879 ins_mode = ins_mode_after_frameset
2881 if t.type is TYPE_START_TAG and t.name is 'frame'
2882 insert_html_element t
2884 t.acknowledge_self_closing()
2886 if t.type is TYPE_START_TAG and t.name is 'noframes'
2889 if t.type is TYPE_EOF
2890 if open_els.length isnt 1
2898 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2899 ins_mode_after_frameset = (t) ->
2903 if t.type is TYPE_COMMENT
2906 if t.type is TYPE_DOCTYPE
2909 if t.type is TYPE_START_TAG and t.name is 'html'
2912 if t.type is TYPE_END_TAG and t.name is 'html'
2913 ins_mode = ins_mode_after_after_frameset
2915 if t.type is TYPE_START_TAG and t.name is 'noframes'
2918 if t.type is TYPE_EOF
2925 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2926 ins_mode_after_after_body = (t) ->
2927 if t.type is TYPE_COMMENT
2928 insert_comment t, [doc, doc.children.length]
2930 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2933 if t.type is TYPE_EOF
2938 ins_mode = ins_mode_in_body
2942 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2943 ins_mode_after_after_frameset = (t) ->
2944 if t.type is TYPE_COMMENT
2945 insert_comment t, [doc, doc.children.length]
2947 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2950 if t.type is TYPE_EOF
2953 if t.type is TYPE_START_TAG and t.name is 'noframes'
2960 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2961 has_color_face_or_size = (t) ->
2963 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2966 in_foreign_content_end_script = ->
2970 in_foreign_content_other_start = (t) ->
2971 acn = adjusted_current_node()
2972 if acn.namespace is NS_MATHML
2973 adjust_mathml_attributes t
2974 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2975 t.name = svg_name_fixes[t.name]
2976 if acn.namespace is NS_SVG
2977 adjust_svg_attributes t
2978 adjust_foreign_attributes t
2979 insert_foreign_element t, acn.namespace
2980 if t.flag 'self-closing'
2981 if t.name is 'script'
2982 t.acknowledge_self_closing()
2983 in_foreign_content_end_script()
2987 t.acknowledge_self_closing()
2989 in_foreign_content = (t) ->
2990 if t.type is TYPE_TEXT and t.text is "\u0000"
2992 insert_character new_character_token "\ufffd"
2997 if t.type is TYPE_TEXT
2998 flag_frameset_ok = false
3001 if t.type is TYPE_COMMENT
3004 if t.type is TYPE_DOCTYPE
3007 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3009 if flag_fragment_parsing
3010 in_foreign_content_other_start t
3012 loop # is this safe?
3014 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3018 if t.type is TYPE_START_TAG
3019 in_foreign_content_other_start t
3021 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3022 in_foreign_content_end_script()
3024 if t.type is TYPE_END_TAG
3027 if node.name.toLowerCase() isnt t.name
3030 if node is open_els[open_els.length - 1]
3032 if node.name.toLowerCase() is t.name
3034 el = open_els.shift()
3039 if node.namespace is NS_HTML
3041 ins_mode t # explicitly call HTML insertion mode
3045 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
3047 switch c = txt.charAt(cur++)
3049 return new_text_node parse_character_reference()
3051 tok_state = tok_state_tag_open
3054 return new_text_node c
3056 return new_eof_token()
3058 return new_text_node c
3061 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3062 # not needed: tok_state_character_reference_in_data = ->
3063 # just call parse_character_reference()
3065 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
3066 tok_state_rcdata = ->
3067 switch c = txt.charAt(cur++)
3069 return new_text_node parse_character_reference()
3071 tok_state = tok_state_rcdata_less_than_sign
3074 return new_character_token "\ufffd"
3076 return new_eof_token()
3078 return new_character_token c
3081 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3082 # not needed: tok_state_character_reference_in_rcdata = ->
3083 # just call parse_character_reference()
3085 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
3086 tok_state_rawtext = ->
3087 switch c = txt.charAt(cur++)
3089 tok_state = tok_state_rawtext_less_than_sign
3092 return new_character_token "\ufffd"
3094 return new_eof_token()
3096 return new_character_token c
3099 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
3100 tok_state_script_data = ->
3101 switch c = txt.charAt(cur++)
3103 tok_state = tok_state_script_data_less_than_sign
3106 return new_character_token "\ufffd"
3108 return new_eof_token()
3110 return new_character_token c
3113 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3114 tok_state_plaintext = ->
3115 switch c = txt.charAt(cur++)
3118 return new_character_token "\ufffd"
3120 return new_eof_token()
3122 return new_character_token c
3126 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3127 tok_state_tag_open = ->
3128 c = txt.charAt(cur++)
3130 tok_state = tok_state_markup_declaration_open
3133 tok_state = tok_state_end_tag_open
3136 tok_cur_tag = new_open_tag c.toLowerCase()
3137 tok_state = tok_state_tag_name
3140 tok_cur_tag = new_open_tag c
3141 tok_state = tok_state_tag_name
3145 tok_cur_tag = new_comment_token '?' # FIXME right?
3146 tok_state = tok_state_bogus_comment
3150 tok_state = tok_state_data
3151 cur -= 1 # we didn't parse/handle the char after <
3152 return new_text_node '<'
3154 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3155 tok_state_end_tag_open = ->
3156 c = txt.charAt(cur++)
3158 tok_cur_tag = new_end_tag c.toLowerCase()
3159 tok_state = tok_state_tag_name
3162 tok_cur_tag = new_end_tag c
3163 tok_state = tok_state_tag_name
3167 tok_state = tok_state_data
3171 tok_state = tok_state_data
3172 return new_text_node '</'
3175 tok_cur_tag = new_comment_token c
3176 tok_state = tok_state_bogus_comment
3179 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3180 tok_state_tag_name = ->
3181 switch c = txt.charAt(cur++)
3182 when "\t", "\n", "\u000c", ' '
3183 tok_state = tok_state_before_attribute_name
3185 tok_state = tok_state_self_closing_start_tag
3187 tok_state = tok_state_data
3193 tok_cur_tag.name += "\ufffd"
3196 tok_state = tok_state_data
3199 tok_cur_tag.name += c.toLowerCase()
3201 tok_cur_tag.name += c
3204 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3205 tok_state_rcdata_less_than_sign = ->
3206 c = txt.charAt(cur++)
3208 temporary_buffer = ''
3209 tok_state = tok_state_rcdata_end_tag_open
3212 tok_state = tok_state_rcdata
3213 cur -= 1 # reconsume the input character
3214 return new_character_token '<'
3216 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3217 tok_state_rcdata_end_tag_open = ->
3218 c = txt.charAt(cur++)
3220 tok_cur_tag = new_end_tag c.toLowerCase()
3221 temporary_buffer += c
3222 tok_state = tok_state_rcdata_end_tag_name
3225 tok_cur_tag = new_end_tag c
3226 temporary_buffer += c
3227 tok_state = tok_state_rcdata_end_tag_name
3230 tok_state = tok_state_rcdata
3231 cur -= 1 # reconsume the input character
3232 return new_character_token "</" # fixfull separate these
3234 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3235 is_appropriate_end_tag = (t) -> # returns true when token t is an end tag whose name equals the name of the current node (open_els[0])
3236 # fixfull: this assumes that open_els[0].name is "the tag name of the last
3237 # start tag to have been emitted from this tokenizer", which is what the spec's definition of an appropriate end tag actually compares against
3238 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3240 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3241 tok_state_rcdata_end_tag_name = ->
3242 c = txt.charAt(cur++)
3243 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3244 if is_appropriate_end_tag tok_cur_tag
3245 tok_state = tok_state_before_attribute_name
3247 # else fall through to "Anything else"
3249 if is_appropriate_end_tag tok_cur_tag
3250 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3252 # else fall through to "Anything else"
3254 if is_appropriate_end_tag tok_cur_tag
3255 tok_state = tok_state_data
3257 # else fall through to "Anything else"
3259 tok_cur_tag.name += c.toLowerCase()
3260 temporary_buffer += c
3263 tok_cur_tag.name += c
3264 temporary_buffer += c
3267 tok_state = tok_state_rcdata
3268 cur -= 1 # reconsume the input character
3269 return new_character_token '</' + temporary_buffer # fixfull separate these
3271 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3272 tok_state_rawtext_less_than_sign = ->
3273 c = txt.charAt(cur++)
3275 temporary_buffer = ''
3276 tok_state = tok_state_rawtext_end_tag_open
3279 tok_state = tok_state_rawtext
3280 cur -= 1 # reconsume the input character
3281 return new_character_token '<'
3283 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3284 tok_state_rawtext_end_tag_open = ->
3285 c = txt.charAt(cur++)
3287 tok_cur_tag = new_end_tag c.toLowerCase()
3288 temporary_buffer += c
3289 tok_state = tok_state_rawtext_end_tag_name
3292 tok_cur_tag = new_end_tag c
3293 temporary_buffer += c
3294 tok_state = tok_state_rawtext_end_tag_name
3297 tok_state = tok_state_rawtext
3298 cur -= 1 # reconsume the input character
3299 return new_character_token "</" # fixfull separate these
3301 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3302 tok_state_rawtext_end_tag_name = ->
3303 c = txt.charAt(cur++)
3304 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3305 if is_appropriate_end_tag tok_cur_tag
3306 tok_state = tok_state_before_attribute_name
3308 # else fall through to "Anything else"
3310 if is_appropriate_end_tag tok_cur_tag
3311 tok_state = tok_state_self_closing_start_tag
3313 # else fall through to "Anything else"
3315 if is_appropriate_end_tag tok_cur_tag
3316 tok_state = tok_state_data
3318 # else fall through to "Anything else"
3320 tok_cur_tag.name += c.toLowerCase()
3321 temporary_buffer += c
3324 tok_cur_tag.name += c
3325 temporary_buffer += c
3328 tok_state = tok_state_rawtext
3329 cur -= 1 # reconsume the input character
3330 return new_character_token '</' + temporary_buffer # fixfull separate these
3332 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3333 tok_state_script_data_less_than_sign = ->
3334 c = txt.charAt(cur++)
3336 temporary_buffer = ''
3337 tok_state = tok_state_script_data_end_tag_open
3340 tok_state = tok_state_script_data_escape_start
3341 return new_character_token '<!' # fixfull split
3343 tok_state = tok_state_script_data
3344 cur -= 1 # Reconsume
3345 return new_character_token '<'
3347 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Tokenizer handler for the "script data end tag open" state. Two visible
# branches create an end tag token from the consumed character (lowercased in
# the first), append the character to temporary_buffer and continue in
# tok_state_script_data_end_tag_name. The fallback returns to
# tok_state_script_data, un-consumes the character and emits "</" as text.
3348 tok_state_script_data_end_tag_open = ->
3349 c = txt.charAt(cur++)
3351 tok_cur_tag = new_end_tag c.toLowerCase()
3352 temporary_buffer += c
3353 tok_state = tok_state_script_data_end_tag_name
3356 tok_cur_tag = new_end_tag c
3357 temporary_buffer += c
3358 tok_state = tok_state_script_data_end_tag_name
3361 tok_state = tok_state_script_data
3362 cur -= 1 # Reconsume
3363 return new_character_token '</'
3365 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Tokenizer handler for the "script data end tag name" state. Mirrors the
# RAWTEXT end-tag-name handler: three visible branches check
# is_appropriate_end_tag(tok_cur_tag) before leaving for the
# before-attribute-name, self-closing-start-tag or data state; two branches
# grow tok_cur_tag.name (lowercased in the first) and temporary_buffer; the
# fallback returns to tok_state_script_data, un-consumes the character and
# emits "</" plus temporary_buffer as text.
3366 tok_state_script_data_end_tag_name = ->
3367 c = txt.charAt(cur++)
3368 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3369 if is_appropriate_end_tag tok_cur_tag
3370 tok_state = tok_state_before_attribute_name
3374 if is_appropriate_end_tag tok_cur_tag
3375 tok_state = tok_state_self_closing_start_tag
3379 if is_appropriate_end_tag tok_cur_tag
3380 tok_state = tok_state_data
3384 tok_cur_tag.name += c.toLowerCase()
3385 temporary_buffer += c
3388 tok_cur_tag.name += c
3389 temporary_buffer += c
3392 tok_state = tok_state_script_data
3393 cur -= 1 # Reconsume
3394 return new_character_token "</#{temporary_buffer}" # fixfull split
3396 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Tokenizer handler for the "script data escape start" state. One visible
# branch advances to tok_state_script_data_escape_start_dash and emits '-';
# the other returns to tok_state_script_data and un-consumes the character.
3397 tok_state_script_data_escape_start = ->
3398 c = txt.charAt(cur++)
3400 tok_state = tok_state_script_data_escape_start_dash
3401 return new_character_token '-'
3403 tok_state = tok_state_script_data
3404 cur -= 1 # Reconsume
3407 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Tokenizer handler for the "script data escape start dash" state. One visible
# branch advances to tok_state_script_data_escaped_dash_dash and emits '-';
# the other returns to tok_state_script_data and un-consumes the character.
3408 tok_state_script_data_escape_start_dash = ->
3409 c = txt.charAt(cur++)
3411 tok_state = tok_state_script_data_escaped_dash_dash
3412 return new_character_token '-'
3414 tok_state = tok_state_script_data
3415 cur -= 1 # Reconsume
3418 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Tokenizer handler for the "script data escaped" state. Visible branches:
# move to the escaped-dash state emitting '-'; move to the
# escaped-less-than-sign state; emit U+FFFD; switch to tok_state_data and
# un-consume the character; otherwise emit the consumed character as text.
# NOTE(review): the conditions guarding each branch are not present in this listing.
3419 tok_state_script_data_escaped = ->
3420 c = txt.charAt(cur++)
3422 tok_state = tok_state_script_data_escaped_dash
3423 return new_character_token '-'
3425 tok_state = tok_state_script_data_escaped_less_than_sign
3429 return new_character_token "\ufffd"
3431 tok_state = tok_state_data
3433 cur -= 1 # Reconsume
3436 return new_character_token c
3438 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
# Tokenizer handler for the "script data escaped dash" state. Visible
# branches: advance to the escaped-dash-dash state emitting '-'; go to the
# escaped-less-than-sign state; return to the escaped state emitting U+FFFD;
# switch to tok_state_data and un-consume; otherwise return to the escaped
# state and emit the consumed character.
3439 tok_state_script_data_escaped_dash = ->
3440 c = txt.charAt(cur++)
3442 tok_state = tok_state_script_data_escaped_dash_dash
3443 return new_character_token '-'
3445 tok_state = tok_state_script_data_escaped_less_than_sign
3449 tok_state = tok_state_script_data_escaped
3450 return new_character_token "\ufffd"
3452 tok_state = tok_state_data
3454 cur -= 1 # Reconsume
3457 tok_state = tok_state_script_data_escaped
3458 return new_character_token c
3460 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
# Tokenizer handler for the "script data escaped dash dash" state. Visible
# branches: stay here and emit '-'; go to the escaped-less-than-sign state;
# leave escaping by returning to tok_state_script_data and emitting '>';
# return to the escaped state emitting U+FFFD; switch to tok_state_data and
# un-consume; otherwise return to the escaped state emitting the character.
3461 tok_state_script_data_escaped_dash_dash = ->
3462 c = txt.charAt(cur++)
3464 return new_character_token '-'
3466 tok_state = tok_state_script_data_escaped_less_than_sign
3469 tok_state = tok_state_script_data
3470 return new_character_token '>'
3473 tok_state = tok_state_script_data_escaped
3474 return new_character_token "\ufffd"
3477 tok_state = tok_state_data
3478 cur -= 1 # Reconsume
3481 tok_state = tok_state_script_data_escaped
3482 return new_character_token c
3484 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer handler for the "script data escaped less-than sign" state.
# Visible branches: reset temporary_buffer and go to the escaped end-tag-open
# state; seed temporary_buffer with the consumed character (lowercased in the
# first of two branches), go to the double-escape-start state and emit "<"
# plus the character as text; otherwise return to the escaped state,
# un-consume the character and emit "<".
3485 tok_state_script_data_escaped_less_than_sign = ->
3486 c = txt.charAt(cur++)
3488 temporary_buffer = ''
3489 tok_state = tok_state_script_data_escaped_end_tag_open
3492 temporary_buffer = c.toLowerCase() # yes, really
3493 tok_state = tok_state_script_data_double_escape_start
3494 return new_character_token "<#{c}" # fixfull split
3496 temporary_buffer = c
3497 tok_state = tok_state_script_data_double_escape_start
3498 return new_character_token "<#{c}" # fixfull split
3500 tok_state = tok_state_script_data_escaped
3501 cur -= 1 # Reconsume
3502 return new_character_token '<'
3504 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
# Tokenizer handler for the "script data escaped end tag open" state. Two
# visible branches create an end tag token from the consumed character
# (lowercased in the first), append it to temporary_buffer and continue in the
# escaped end-tag-name state. The fallback returns to the escaped state,
# un-consumes the character and emits "</" as text.
3505 tok_state_script_data_escaped_end_tag_open = ->
3506 c = txt.charAt(cur++)
3508 tok_cur_tag = new_end_tag c.toLowerCase()
3509 temporary_buffer += c
3510 tok_state = tok_state_script_data_escaped_end_tag_name
3513 tok_cur_tag = new_end_tag c
3514 temporary_buffer += c
3515 tok_state = tok_state_script_data_escaped_end_tag_name
3518 tok_state = tok_state_script_data_escaped
3519 cur -= 1 # Reconsume
3520 return new_character_token '</' # fixfull split
3522 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer handler for the "script data escaped end tag name" state. Three
# visible branches check is_appropriate_end_tag(tok_cur_tag) before leaving
# for the before-attribute-name, self-closing-start-tag or data state. Two
# branches grow tok_cur_tag.name (lowercased in the first) and
# temporary_buffer. The fallback returns to the escaped state, un-consumes
# the character and emits "</" plus temporary_buffer as text.
#
# temporary_buffer must hold the characters exactly as they appeared in the
# input, because the fallback re-emits them as script text. Only the tag name
# is lowercased. This matches the RAWTEXT and script-data end-tag-name
# handlers, which append the raw character.
3523 tok_state_script_data_escaped_end_tag_name = ->
3524 c = txt.charAt(cur++)
3525 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3526 if is_appropriate_end_tag tok_cur_tag
3527 tok_state = tok_state_before_attribute_name
3531 if is_appropriate_end_tag tok_cur_tag
3532 tok_state = tok_state_self_closing_start_tag
3536 if is_appropriate_end_tag tok_cur_tag
3537 tok_state = tok_state_data
3541 tok_cur_tag.name += c.toLowerCase()
3542 temporary_buffer += c
3545 tok_cur_tag.name += c
3546 temporary_buffer += c
3549 tok_state = tok_state_script_data_escaped
3550 cur -= 1 # Reconsume
3551 return new_character_token "</#{temporary_buffer}" # fixfull split
3553 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
# Tokenizer handler for the "script data double escape start" state. On
# whitespace, '/' or '>' it picks the double-escaped state when
# temporary_buffer is exactly 'script', otherwise the escaped state, and emits
# the character. Two further visible branches append the character to
# temporary_buffer (lowercased in the first) and emit it. The fallback
# returns to the escaped state and un-consumes the character.
3554 tok_state_script_data_double_escape_start = ->
3555 c = txt.charAt(cur++)
3556 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3557 if temporary_buffer is 'script'
3558 tok_state = tok_state_script_data_double_escaped
3560 tok_state = tok_state_script_data_escaped
3561 return new_character_token c
3563 temporary_buffer += c.toLowerCase() # yes, really lowercase
3564 return new_character_token c
3566 temporary_buffer += c
3567 return new_character_token c
3569 tok_state = tok_state_script_data_escaped
3570 cur -= 1 # Reconsume
3573 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer handler for the "script data double escaped" state. Visible
# branches: go to the double-escaped-dash state emitting '-'; go to the
# double-escaped-less-than-sign state emitting '<'; emit U+FFFD; switch to
# tok_state_data and un-consume; otherwise emit the consumed character.
# NOTE(review): the conditions guarding each branch are not present in this listing.
3574 tok_state_script_data_double_escaped = ->
3575 c = txt.charAt(cur++)
3577 tok_state = tok_state_script_data_double_escaped_dash
3578 return new_character_token '-'
3580 tok_state = tok_state_script_data_double_escaped_less_than_sign
3581 return new_character_token '<'
3584 return new_character_token "\ufffd"
3587 tok_state = tok_state_data
3588 cur -= 1 # Reconsume
3591 return new_character_token c
3593 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer handler for the "script data double escaped dash" state. Visible
# branches: advance to the double-escaped-dash-dash state emitting '-'; go to
# the double-escaped-less-than-sign state emitting '<'; return to the
# double-escaped state emitting U+FFFD; switch to tok_state_data and
# un-consume; otherwise return to the double-escaped state emitting the
# consumed character.
3594 tok_state_script_data_double_escaped_dash = ->
3595 c = txt.charAt(cur++)
3597 tok_state = tok_state_script_data_double_escaped_dash_dash
3598 return new_character_token '-'
3600 tok_state = tok_state_script_data_double_escaped_less_than_sign
3601 return new_character_token '<'
3604 tok_state = tok_state_script_data_double_escaped
3605 return new_character_token "\ufffd"
3608 tok_state = tok_state_data
3609 cur -= 1 # Reconsume
3612 tok_state = tok_state_script_data_double_escaped
3613 return new_character_token c
3615 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer handler for the "script data double escaped dash dash" state.
# Visible branches: stay here emitting '-'; go to the
# double-escaped-less-than-sign state emitting '<'; return to
# tok_state_script_data emitting '>'; return to the double-escaped state
# emitting U+FFFD; switch to tok_state_data and un-consume; otherwise return
# to the double-escaped state emitting the consumed character.
3616 tok_state_script_data_double_escaped_dash_dash = ->
3617 c = txt.charAt(cur++)
3619 return new_character_token '-'
3621 tok_state = tok_state_script_data_double_escaped_less_than_sign
3622 return new_character_token '<'
3624 tok_state = tok_state_script_data
3625 return new_character_token '>'
3628 tok_state = tok_state_script_data_double_escaped
3629 return new_character_token "\ufffd"
3632 tok_state = tok_state_data
3633 cur -= 1 # Reconsume
3636 tok_state = tok_state_script_data_double_escaped
3637 return new_character_token c
3639 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer handler for the "script data double escaped less-than sign"
# state. One visible branch resets temporary_buffer, moves to the
# double-escape-end state and emits '/'; the other returns to the
# double-escaped state and un-consumes the character.
3640 tok_state_script_data_double_escaped_less_than_sign = ->
3641 c = txt.charAt(cur++)
3643 temporary_buffer = ''
3644 tok_state = tok_state_script_data_double_escape_end
3645 return new_character_token '/'
3647 tok_state = tok_state_script_data_double_escaped
3648 cur -= 1 # Reconsume
3651 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer handler for the "script data double escape end" state. Inverse of
# the double-escape-start handler: on whitespace, '/' or '>' it drops back to
# the (single) escaped state when temporary_buffer is exactly 'script',
# otherwise it stays double-escaped, and emits the character. Two further
# visible branches append to temporary_buffer (lowercased in the first) and
# emit the character. The fallback returns to the double-escaped state and
# un-consumes the character.
3652 tok_state_script_data_double_escape_end = ->
3653 c = txt.charAt(cur++)
3654 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3655 if temporary_buffer is 'script'
3656 tok_state = tok_state_script_data_escaped
3658 tok_state = tok_state_script_data_double_escaped
3659 return new_character_token c
3661 temporary_buffer += c.toLowerCase() # yes, really lowercase
3662 return new_character_token c
3664 temporary_buffer += c
3665 return new_character_token c
3667 tok_state = tok_state_script_data_double_escaped
3668 cur -= 1 # Reconsume
3671 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer handler for the "before attribute name" state. Visible branches
# switch to the self-closing-start-tag or data state, or pick an initial
# attr_name (U+FFFD in one branch, the lowercased character in another).
# A new [attr_name, ''] pair is then inserted at the FRONT of
# tok_cur_tag.attrs_a (unshift) and the attribute-name state takes over.
3672 tok_state_before_attribute_name = ->
3674 switch c = txt.charAt(cur++)
3675 when "\t", "\n", "\u000c", ' '
3678 tok_state = tok_state_self_closing_start_tag
3681 tok_state = tok_state_data
3687 attr_name = "\ufffd"
3688 when '"', "'", '<', '='
3693 tok_state = tok_state_data
3696 attr_name = c.toLowerCase()
3700 tok_cur_tag.attrs_a.unshift [attr_name, '']
3701 tok_state = tok_state_attribute_name
3704 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer handler for the "attribute name" state. The attribute being built
# is tok_cur_tag.attrs_a[0], and its name is element [0] of that pair. Visible
# branches leave for the after-attribute-name (on whitespace),
# self-closing-start-tag, before-attribute-value or data state, or append to
# the name: U+FFFD, the raw character, or the lowercased character.
3705 tok_state_attribute_name = ->
3706 switch c = txt.charAt(cur++)
3707 when "\t", "\n", "\u000c", ' '
3708 tok_state = tok_state_after_attribute_name
3710 tok_state = tok_state_self_closing_start_tag
3712 tok_state = tok_state_before_attribute_value
3714 tok_state = tok_state_data
3720 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3723 tok_cur_tag.attrs_a[0][0] += c
3726 tok_state = tok_state_data
3729 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3731 tok_cur_tag.attrs_a[0][0] += c
3734 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer handler for the "after attribute name" state. Visible branches
# leave for the self-closing-start-tag, before-attribute-value or data state
# (one data branch also un-consumes the character), or start a new attribute
# by unshifting a [name, ''] pair onto tok_cur_tag.attrs_a. The name is the
# lowercased character, U+FFFD, or the raw character. The new attribute is
# then continued in the attribute-name state.
3735 tok_state_after_attribute_name = ->
3736 c = txt.charAt(cur++)
3737 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3740 tok_state = tok_state_self_closing_start_tag
3743 tok_state = tok_state_before_attribute_value
3746 tok_state = tok_state_data
3749 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3750 tok_state = tok_state_attribute_name
3754 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3755 tok_state = tok_state_attribute_name
3759 tok_state = tok_state_data
3760 cur -= 1 # reconsume
3762 if c is '"' or c is "'" or c is '<'
3764 # fall through to Anything else
3766 tok_cur_tag.attrs_a.unshift [c, '']
3767 tok_state = tok_state_attribute_name
3770 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer handler for the "before attribute value" state. Chooses how the
# value of the current attribute (tok_cur_tag.attrs_a[0][1]) will be read.
# Visible branches select the double-quoted, single-quoted or unquoted value
# state; two of the unquoted branches first append U+FFFD or the raw
# character to the value. The remaining branches return to tok_state_data.
3771 tok_state_before_attribute_value = ->
3772 switch c = txt.charAt(cur++)
3773 when "\t", "\n", "\u000c", ' '
3776 tok_state = tok_state_attribute_value_double_quoted
3778 tok_state = tok_state_attribute_value_unquoted
3781 tok_state = tok_state_attribute_value_single_quoted
3784 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3785 tok_state = tok_state_attribute_value_unquoted
3788 tok_state = tok_state_data
3794 tok_state = tok_state_data
3796 tok_cur_tag.attrs_a[0][1] += c
3797 tok_state = tok_state_attribute_value_unquoted
3800 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer handler for the double-quoted attribute value state. Appends to
# the current attribute's value (tok_cur_tag.attrs_a[0][1]): the result of
# parse_character_reference('"', true), U+FFFD, or the raw character. Other
# visible branches leave for the after-attribute-value-quoted or data state.
3801 tok_state_attribute_value_double_quoted = ->
3802 switch c = txt.charAt(cur++)
3804 tok_state = tok_state_after_attribute_value_quoted
3806 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3809 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3812 tok_state = tok_state_data
3814 tok_cur_tag.attrs_a[0][1] += c
3817 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer handler for the single-quoted attribute value state. Same shape
# as the double-quoted handler, except that character references are parsed
# with "'" as the additional allowed character.
3818 tok_state_attribute_value_single_quoted = ->
3819 switch c = txt.charAt(cur++)
3821 tok_state = tok_state_after_attribute_value_quoted
3823 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3826 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3829 tok_state = tok_state_data
3831 tok_cur_tag.attrs_a[0][1] += c
3834 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer handler for the unquoted attribute value state. Whitespace ends
# the value and returns to the before-attribute-name state. Other visible
# branches append to the current attribute's value the result of
# parse_character_reference('>', true), U+FFFD, or the raw character, or
# return to tok_state_data.
3835 tok_state_attribute_value_unquoted = ->
3836 switch c = txt.charAt(cur++)
3837 when "\t", "\n", "\u000c", ' '
3838 tok_state = tok_state_before_attribute_name
3840 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3842 tok_state = tok_state_data
3847 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3850 tok_state = tok_state_data
3852 # Parse Error if ', <, = or ` (backtick)
3853 tok_cur_tag.attrs_a[0][1] += c
3856 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer handler for the "after attribute value (quoted)" state.
# Whitespace moves to the before-attribute-name state. Other visible branches
# go to the self-closing-start-tag or data state. The fallback also selects
# before-attribute-name but un-consumes the character so that state sees it.
3857 tok_state_after_attribute_value_quoted = ->
3858 switch c = txt.charAt(cur++)
3859 when "\t", "\n", "\u000c", ' '
3860 tok_state = tok_state_before_attribute_name
3862 tok_state = tok_state_self_closing_start_tag
3864 tok_state = tok_state_data
3870 tok_state = tok_state_data
3873 tok_state = tok_state_before_attribute_name
3874 cur -= 1 # we didn't handle that char
3877 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer handler for the "self-closing start tag" state. One visible
# branch sets the 'self-closing' flag on tok_cur_tag and returns to the data
# state; another returns to the data state and un-consumes the character; the
# fallback goes to before-attribute-name and un-consumes the character.
3878 tok_state_self_closing_start_tag = ->
3879 c = txt.charAt(cur++)
3881 tok_cur_tag.flag 'self-closing', true
3882 tok_state = tok_state_data
3886 tok_state = tok_state_data
3887 cur -= 1 # Reconsume
3891 tok_state = tok_state_before_attribute_name
3892 cur -= 1 # Reconsume
3895 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3896 # WARNING: put a comment token in tok_cur_tag before setting this state
# Rather than stepping one character at a time, this handler searches ahead
# for the next '>' starting at cur. The text taken is either everything from
# cur onward or the span up to that '>' (presumably the former applies when
# no '>' exists; the test is not present in this listing). U+0000 is replaced
# by U+FFFD, the text is appended to tok_cur_tag.text, and tokenization
# returns to the data state.
3897 tok_state_bogus_comment = ->
3898 next_gt = txt.indexOf '>', cur
3900 val = txt.substr cur
3903 val = txt.substr cur, (next_gt - cur)
3905 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3906 tok_cur_tag.text += val
3907 tok_state = tok_state_data
3910 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer handler for the "markup declaration open" state. Peeks at the
# upcoming text without consuming a character first:
#   '--'        starts an empty comment token (comment-start state)
#   'doctype'   (case-insensitive) goes to the doctype state
#   '[CDATA['   (exact case) goes to the CDATA section state, but only when
#               the adjusted current node exists and is not in the HTML namespace
# Anything else becomes a bogus comment with an empty comment token.
3911 tok_state_markup_declaration_open = ->
3912 if txt.substr(cur, 2) is '--'
3914 tok_cur_tag = new_comment_token ''
3915 tok_state = tok_state_comment_start
3917 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3919 tok_state = tok_state_doctype
3921 acn = adjusted_current_node()
3922 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3924 tok_state = tok_state_cdata_section
3928 tok_cur_tag = new_comment_token ''
3929 tok_state = tok_state_bogus_comment
3932 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer handler for the "comment start" state. Visible branches: go to
# the comment-start-dash state; append U+FFFD to the comment and continue in
# the comment state; return to the data state (one branch also un-consumes
# the character); otherwise append the character to the comment text and
# continue in the comment state.
#
# The U+FFFD branch appends to tok_cur_tag.text, like every other comment
# state. It must not emit a character token, which would put the replacement
# character into the document as text and drop it from the comment.
3933 tok_state_comment_start = ->
3934 switch c = txt.charAt(cur++)
3936 tok_state = tok_state_comment_start_dash
3939 tok_cur_tag.text += "\ufffd"
3940 tok_state = tok_state_comment
3943 tok_state = tok_state_data
3947 tok_state = tok_state_data
3948 cur -= 1 # Reconsume
3951 tok_cur_tag.text += c
3952 tok_state = tok_state_comment
3955 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer handler for the "comment start dash" state. Visible branches: go
# to the comment-end state; append "-" plus U+FFFD and continue in the
# comment state; return to the data state (one branch also un-consumes the
# character); otherwise append "-" plus the character and continue in the
# comment state. The "-" is the dash that led into this state.
3956 tok_state_comment_start_dash = ->
3957 switch c = txt.charAt(cur++)
3959 tok_state = tok_state_comment_end
3962 tok_cur_tag.text += "-\ufffd"
3963 tok_state = tok_state_comment
3966 tok_state = tok_state_data
3970 tok_state = tok_state_data
3971 cur -= 1 # Reconsume
3974 tok_cur_tag.text += "-#{c}"
3975 tok_state = tok_state_comment
3978 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer handler for the "comment" state. Visible branches: go to the
# comment-end-dash state; append U+FFFD to the comment text; return to the
# data state and un-consume the character; otherwise append the character to
# the comment text.
3979 tok_state_comment = ->
3980 switch c = txt.charAt(cur++)
3982 tok_state = tok_state_comment_end_dash
3985 tok_cur_tag.text += "\ufffd"
3988 tok_state = tok_state_data
3989 cur -= 1 # Reconsume
3992 tok_cur_tag.text += c
3995 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer handler for the "comment end dash" state. Visible branches: go to
# the comment-end state; append "-" plus U+FFFD and return to the comment
# state; return to the data state and un-consume the character; otherwise
# append "-" plus the character and return to the comment state. The "-" is
# the pending dash that led into this state.
3996 tok_state_comment_end_dash = ->
3997 switch c = txt.charAt(cur++)
3999 tok_state = tok_state_comment_end
4002 tok_cur_tag.text += "-\ufffd"
4003 tok_state = tok_state_comment
4006 tok_state = tok_state_data
4007 cur -= 1 # Reconsume
4010 tok_cur_tag.text += "-#{c}"
4011 tok_state = tok_state_comment
4014 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer handler for the "comment end" state (two dashes already seen).
# Visible branches: return to the data state; append "--" plus U+FFFD and go
# back to the comment state; go to the comment-end-bang state; append a
# single '-' and stay here; return to the data state and un-consume the
# character; otherwise append "--" plus the character and go back to the
# comment state.
4015 tok_state_comment_end = ->
4016 switch c = txt.charAt(cur++)
4018 tok_state = tok_state_data
4022 tok_cur_tag.text += "--\ufffd"
4023 tok_state = tok_state_comment
4026 tok_state = tok_state_comment_end_bang
4029 tok_cur_tag.text += '-'
4032 tok_state = tok_state_data
4033 cur -= 1 # Reconsume
4037 tok_cur_tag.text += "--#{c}"
4038 tok_state = tok_state_comment
4041 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer handler for the "comment end bang" state ("--!" already seen).
# Visible branches: append "--!" and go to the comment-end-dash state; return
# to the data state; append "--!" plus U+FFFD and go back to the comment
# state; return to the data state and un-consume the character; otherwise
# append "--!" plus the character and go back to the comment state.
#
# The branch that enters comment-end-dash appends only "--!". The dash just
# consumed is accounted for by the comment-end-dash state, which appends the
# "-" itself if the comment does not end there. Appending it here as well
# would duplicate it in the comment text.
4042 tok_state_comment_end_bang = ->
4043 switch c = txt.charAt(cur++)
4045 tok_cur_tag.text += "--!"
4046 tok_state = tok_state_comment_end_dash
4048 tok_state = tok_state_data
4052 tok_cur_tag.text += "--!\ufffd"
4053 tok_state = tok_state_comment
4056 tok_state = tok_state_data
4057 cur -= 1 # Reconsume
4060 tok_cur_tag.text += "--!#{c}"
4061 tok_state = tok_state_comment
4064 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer handler for the "DOCTYPE" state. Whitespace moves to the
# before-doctype-name state. One visible branch returns to the data state,
# builds an empty-named doctype token with 'force-quirks' set and un-consumes
# the character. The fallback goes to before-doctype-name and un-consumes the
# character.
4065 tok_state_doctype = ->
4066 switch c = txt.charAt(cur++)
4067 when "\t", "\u000a", "\u000c", ' '
4068 tok_state = tok_state_before_doctype_name
4071 tok_state = tok_state_data
4072 el = new_doctype_token ''
4073 el.flag 'force-quirks', true
4074 cur -= 1 # Reconsume
4078 tok_state = tok_state_before_doctype_name
4079 cur -= 1 # Reconsume
4082 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer handler for the "before DOCTYPE name" state. Visible branches
# create the doctype token in tok_cur_tag, named with the lowercased
# character, U+FFFD or the raw character, and continue in the doctype-name
# state. Two branches instead build an empty-named doctype token with
# 'force-quirks' set and return to the data state; the second of them also
# un-consumes the character.
4083 tok_state_before_doctype_name = ->
4084 c = txt.charAt(cur++)
4085 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4088 tok_cur_tag = new_doctype_token c.toLowerCase()
4089 tok_state = tok_state_doctype_name
4093 tok_cur_tag = new_doctype_token "\ufffd"
4094 tok_state = tok_state_doctype_name
4098 el = new_doctype_token ''
4099 el.flag 'force-quirks', true
4100 tok_state = tok_state_data
4104 tok_state = tok_state_data
4105 el = new_doctype_token ''
4106 el.flag 'force-quirks', true
4107 cur -= 1 # Reconsume
4110 tok_cur_tag = new_doctype_token c
4111 tok_state = tok_state_doctype_name
4114 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer handler for the "DOCTYPE name" state. Whitespace moves to the
# after-doctype-name state. Other visible branches return to the data state,
# append to tok_cur_tag.name (lowercased character, U+FFFD, or raw
# character), or set 'force-quirks', return to the data state and un-consume
# the character.
4115 tok_state_doctype_name = ->
4116 c = txt.charAt(cur++)
4117 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4118 tok_state = tok_state_after_doctype_name
4121 tok_state = tok_state_data
4124 tok_cur_tag.name += c.toLowerCase()
4128 tok_cur_tag.name += "\ufffd"
4132 tok_state = tok_state_data
4133 tok_cur_tag.flag 'force-quirks', true
4134 cur -= 1 # Reconsume
4137 tok_cur_tag.name += c
4140 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer handler for the "after DOCTYPE name" state. Besides the
# data-state exits (one of which sets 'force-quirks' and un-consumes the
# character), it looks for the keywords 'public' and 'system',
# case-insensitively. The 6-character window starts at cur - 1 because the
# first letter has already been consumed into c. A keyword match selects the
# matching after-keyword state; otherwise 'force-quirks' is set and the
# bogus-doctype state takes over.
4141 tok_state_after_doctype_name = ->
4142 c = txt.charAt(cur++)
4143 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4146 tok_state = tok_state_data
4150 tok_state = tok_state_data
4151 tok_cur_tag.flag 'force-quirks', true
4152 cur -= 1 # Reconsume
4155 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4157 tok_state = tok_state_after_doctype_public_keyword
4159 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4161 tok_state = tok_state_after_doctype_system_keyword
4164 tok_cur_tag.flag 'force-quirks', true
4165 tok_state = tok_state_bogus_doctype
4168 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer handler for the "after DOCTYPE public keyword" state. Whitespace
# moves to the before-public-identifier state. Two visible branches reset
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# public identifier state. The remaining branches set 'force-quirks' and
# either return to the data state (one also un-consumes the character) or
# fall into the bogus-doctype state.
4169 tok_state_after_doctype_public_keyword = ->
4170 c = txt.charAt(cur++)
4171 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4172 tok_state = tok_state_before_doctype_public_identifier
4176 tok_cur_tag.public_identifier = ''
4177 tok_state = tok_state_doctype_public_identifier_double_quoted
4181 tok_cur_tag.public_identifier = ''
4182 tok_state = tok_state_doctype_public_identifier_single_quoted
4186 tok_cur_tag.flag 'force-quirks', true
4187 tok_state = tok_state_data
4191 tok_state = tok_state_data
4192 tok_cur_tag.flag 'force-quirks', true
4193 cur -= 1 # Reconsume
4197 tok_cur_tag.flag 'force-quirks', true
4198 tok_state = tok_state_bogus_doctype
4201 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer handler for the "before DOCTYPE public identifier" state. Two
# visible branches reset tok_cur_tag.public_identifier to '' and enter the
# double- or single-quoted public identifier state. The remaining branches
# set 'force-quirks' and either return to the data state (one also
# un-consumes the character) or fall into the bogus-doctype state.
4202 tok_state_before_doctype_public_identifier = ->
4203 c = txt.charAt(cur++)
4204 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4208 tok_cur_tag.public_identifier = ''
4209 tok_state = tok_state_doctype_public_identifier_double_quoted
4213 tok_cur_tag.public_identifier = ''
4214 tok_state = tok_state_doctype_public_identifier_single_quoted
4218 tok_cur_tag.flag 'force-quirks', true
4219 tok_state = tok_state_data
4223 tok_state = tok_state_data
4224 tok_cur_tag.flag 'force-quirks', true
4225 cur -= 1 # Reconsume
4229 tok_cur_tag.flag 'force-quirks', true
4230 tok_state = tok_state_bogus_doctype
4234 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer handler for the double-quoted DOCTYPE public identifier state.
# Visible branches: move to the after-public-identifier state; append U+FFFD
# to tok_cur_tag.public_identifier; set 'force-quirks' and return to the data
# state (one such branch also un-consumes the character); otherwise append
# the character to the identifier.
4235 tok_state_doctype_public_identifier_double_quoted = ->
4236 c = txt.charAt(cur++)
4238 tok_state = tok_state_after_doctype_public_identifier
4242 tok_cur_tag.public_identifier += "\ufffd"
4246 tok_cur_tag.flag 'force-quirks', true
4247 tok_state = tok_state_data
4251 tok_state = tok_state_data
4252 tok_cur_tag.flag 'force-quirks', true
4253 cur -= 1 # Reconsume
4256 tok_cur_tag.public_identifier += c
4259 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer handler for the single-quoted DOCTYPE public identifier state.
# The visible statements are the same as in the double-quoted handler:
# finish the identifier, append U+FFFD or the character to
# tok_cur_tag.public_identifier, or set 'force-quirks' and return to the data
# state (one such branch also un-consumes the character).
4260 tok_state_doctype_public_identifier_single_quoted = ->
4261 c = txt.charAt(cur++)
4263 tok_state = tok_state_after_doctype_public_identifier
4267 tok_cur_tag.public_identifier += "\ufffd"
4271 tok_cur_tag.flag 'force-quirks', true
4272 tok_state = tok_state_data
4276 tok_state = tok_state_data
4277 tok_cur_tag.flag 'force-quirks', true
4278 cur -= 1 # Reconsume
4281 tok_cur_tag.public_identifier += c
4284 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer handler for the "after DOCTYPE public identifier" state.
# Whitespace moves to the between-public-and-system-identifiers state.
# Visible branches: return to the data state; reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state; set 'force-quirks', return to the data state and
# un-consume the character; otherwise set 'force-quirks' and enter the
# bogus-doctype state.
4285 tok_state_after_doctype_public_identifier = ->
4286 c = txt.charAt(cur++)
4287 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4288 tok_state = tok_state_between_doctype_public_and_system_identifiers
4291 tok_state = tok_state_data
4295 tok_cur_tag.system_identifier = ''
4296 tok_state = tok_state_doctype_system_identifier_double_quoted
4300 tok_cur_tag.system_identifier = ''
4301 tok_state = tok_state_doctype_system_identifier_single_quoted
4305 tok_state = tok_state_data
4306 tok_cur_tag.flag 'force-quirks', true
4307 cur -= 1 # Reconsume
4311 tok_cur_tag.flag 'force-quirks', true
4312 tok_state = tok_state_bogus_doctype
4315 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer handler for the "between DOCTYPE public and system identifiers"
# state. Visible branches: return to the data state; reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state; set 'force-quirks', return to the data state and
# un-consume the character; otherwise set 'force-quirks' and enter the
# bogus-doctype state.
4316 tok_state_between_doctype_public_and_system_identifiers = ->
4317 c = txt.charAt(cur++)
4318 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4321 tok_state = tok_state_data
4325 tok_cur_tag.system_identifier = ''
4326 tok_state = tok_state_doctype_system_identifier_double_quoted
4330 tok_cur_tag.system_identifier = ''
4331 tok_state = tok_state_doctype_system_identifier_single_quoted
4335 tok_state = tok_state_data
4336 tok_cur_tag.flag 'force-quirks', true
4337 cur -= 1 # Reconsume
4341 tok_cur_tag.flag 'force-quirks', true
4342 tok_state = tok_state_bogus_doctype
4345 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer handler for the "after DOCTYPE system keyword" state. Whitespace
# moves to the before-system-identifier state. Two visible branches reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state. The remaining branches set 'force-quirks' and
# either return to the data state (one also un-consumes the character) or
# fall into the bogus-doctype state.
4346 tok_state_after_doctype_system_keyword = ->
4347 c = txt.charAt(cur++)
4348 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4349 tok_state = tok_state_before_doctype_system_identifier
4353 tok_cur_tag.system_identifier = ''
4354 tok_state = tok_state_doctype_system_identifier_double_quoted
4358 tok_cur_tag.system_identifier = ''
4359 tok_state = tok_state_doctype_system_identifier_single_quoted
4363 tok_cur_tag.flag 'force-quirks', true
4364 tok_state = tok_state_data
4368 tok_state = tok_state_data
4369 tok_cur_tag.flag 'force-quirks', true
4370 cur -= 1 # Reconsume
4374 tok_cur_tag.flag 'force-quirks', true
4375 tok_state = tok_state_bogus_doctype
4378 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer handler for the "before DOCTYPE system identifier" state. Two
# visible branches reset tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state. The remaining branches
# set 'force-quirks' and either return to the data state (one also
# un-consumes the character) or fall into the bogus-doctype state.
4379 tok_state_before_doctype_system_identifier = ->
4380 c = txt.charAt(cur++)
4381 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4384 tok_cur_tag.system_identifier = ''
4385 tok_state = tok_state_doctype_system_identifier_double_quoted
4388 tok_cur_tag.system_identifier = ''
4389 tok_state = tok_state_doctype_system_identifier_single_quoted
4393 tok_cur_tag.flag 'force-quirks', true
4394 tok_state = tok_state_data
4398 tok_state = tok_state_data
4399 tok_cur_tag.flag 'force-quirks', true
4400 cur -= 1 # Reconsume
4404 tok_cur_tag.flag 'force-quirks', true
4405 tok_state = tok_state_bogus_doctype
4408 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer handler for the double-quoted DOCTYPE system identifier state.
# Visible branches: move to the after-system-identifier state; append U+FFFD
# to tok_cur_tag.system_identifier; set 'force-quirks' and return to the data
# state (one such branch also un-consumes the character); otherwise append
# the character to the identifier.
4409 tok_state_doctype_system_identifier_double_quoted = ->
4410 c = txt.charAt(cur++)
4412 tok_state = tok_state_after_doctype_system_identifier
4416 tok_cur_tag.system_identifier += "\ufffd"
4420 tok_cur_tag.flag 'force-quirks', true
4421 tok_state = tok_state_data
4425 tok_state = tok_state_data
4426 tok_cur_tag.flag 'force-quirks', true
4427 cur -= 1 # Reconsume
4430 tok_cur_tag.system_identifier += c
4433 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer handler for the single-quoted DOCTYPE system identifier state.
# The visible statements are the same as in the double-quoted handler:
# finish the identifier, append U+FFFD or the character to
# tok_cur_tag.system_identifier, or set 'force-quirks' and return to the data
# state (one such branch also un-consumes the character).
4434 tok_state_doctype_system_identifier_single_quoted = ->
4435 c = txt.charAt(cur++)
4437 tok_state = tok_state_after_doctype_system_identifier
4441 tok_cur_tag.system_identifier += "\ufffd"
4445 tok_cur_tag.flag 'force-quirks', true
4446 tok_state = tok_state_data
4450 tok_state = tok_state_data
4451 tok_cur_tag.flag 'force-quirks', true
4452 cur -= 1 # Reconsume
4455 tok_cur_tag.system_identifier += c
4458 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer handler for the "after DOCTYPE system identifier" state. Visible
# branches: return to the data state; set 'force-quirks', return to the data
# state and un-consume the character; otherwise enter the bogus-doctype
# state. Unlike the other doctype states, that last branch deliberately
# leaves 'force-quirks' unset.
4459 tok_state_after_doctype_system_identifier = ->
4460 c = txt.charAt(cur++)
4461 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4464 tok_state = tok_state_data
4468 tok_state = tok_state_data
4469 tok_cur_tag.flag 'force-quirks', true
4470 cur -= 1 # Reconsume
4474 # do _not_ tok_cur_tag.flag 'force-quirks', true
4475 tok_state = tok_state_bogus_doctype
4478 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer handler for the "bogus DOCTYPE" state. Both visible branches
# return to the data state; the second also un-consumes the character.
# NOTE(review): the branch conditions, and what happens to characters that
# match neither branch, are not visible in this listing.
4479 tok_state_bogus_doctype = ->
4480 c = txt.charAt(cur++)
4482 tok_state = tok_state_data
4485 tok_state = tok_state_data
4486 cur -= 1 # Reconsume
4491 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer handler for the CDATA section state. Switches back to the data
# state up front, then searches ahead for the terminator "]]>" starting at
# cur. The text taken is either everything from cur onward or the span up to
# the terminator (presumably the former applies when no terminator exists;
# the test is not present in this listing). U+0000 is replaced by U+FFFD and
# the whole span is emitted as a single character token.
4492 tok_state_cdata_section = ->
4493 tok_state = tok_state_data
4494 next_gt = txt.indexOf ']]>', cur
4496 val = txt.substr cur
4499 val = txt.substr cur, (next_gt - cur)
4501 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
4503 return new_character_token val # fixfull split
4506 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4507 # Don't set this as a state, just call it
4508 # returns a string (NOT a text node)
#
# Parameters:
#   allowed_char - extra character that, like whitespace, '<', '&' or end of
#                  input, means "not a character reference" without being a
#                  parse error. The attribute-value states pass their closing
#                  delimiter ('"', "'" or '>').
#   in_attr      - the attribute-value states pass true. NOTE(review): the
#                  lines that read it are not visible in this listing;
#                  presumably it gates the "=" / alphanumeric exceptions for
#                  unterminated legacy references near the end.
#
# Visible processing, in order:
#   * Numeric references: an 'x' (either case) after the first character
#     selects hexadecimal. Digits are collected while they belong to charset,
#     an optional ';' is checked, leading zeros are stripped and the value is
#     parsed with parseInt in the chosen base. unicode_fixes supplies
#     replacements for specific code points. Surrogates (0xD800-0xDFFF) and
#     values above 0x10FFFF get separate handling. Control and noncharacter
#     code points are detected by the long range test. Otherwise the result
#     is from_code_point(code_point).
#   * Named references terminated by ';' are decoded via decode_named_char_ref.
#   * Named references without ';' are looked up in legacy_char_refs, trying
#     prefix lengths from 2 up to max. On a match the entity characters are
#     consumed and parse_error() is called for the missing ';'.
4509 parse_character_reference = (allowed_char = null, in_attr = false) ->
4510 if cur >= txt.length
4512 switch c = txt.charAt(cur)
4513 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4514 # explicitly not a parse error
4517 # there has to be "one or more" alnums between & and ; to be a parse error
4520 if cur + 1 >= txt.length
4522 if txt.charAt(cur + 1).toLowerCase() is 'x'
4531 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4536 if txt.charAt(start + i) is ';'
4540 code_point = txt.substr(start, i)
4541 while code_point.charAt(0) is '0' and code_point.length > 1
4542 code_point = code_point.substr 1
4543 code_point = parseInt(code_point, base)
4544 if unicode_fixes[code_point]?
4546 return unicode_fixes[code_point]
4548 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4552 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4554 return from_code_point code_point
4558 if alnum.indexOf(txt.charAt(cur + i)) is -1
4561 # exit early, because parse_error() below needs at least one alnum
4563 if txt.charAt(cur + i) is ';'
4564 i += 1 # include ';' terminator in value
4565 decoded = decode_named_char_ref txt.substr(cur, i)
4572 # no ';' terminator (only legacy char refs)
4574 for i in [2..max] # no prefix matches, so ok to check shortest first
4575 c = legacy_char_refs[txt.substr(cur, i)]
4578 if txt.charAt(cur + i) is '='
4579 # "because some legacy user agents will
4580 # misinterpret the markup in those cases"
4583 if alnum.indexOf(txt.charAt(cur + i)) > -1
4584 # this makes attributes forgiving about url args
4586 # ok, and besides the weird exceptions for attributes...
4587 # return the matching char
4588 cur += i # consume entity chars
4589 parse_error() # because no terminating ";"
4593 return # never reached
# Tests whether a token t is a newline text token. When exactly one input
# character was consumed to produce it (cur - old_cur is 1) the token came
# from a literal character, and either CR or LF counts. Otherwise it came
# from a character reference, and only LF counts.
# NOTE(review): how t and old_cur are obtained, and what is done when the
# test succeeds or fails, is on lines not present in this listing. Judging by
# the name, a newline token is presumably discarded and anything else is put
# back; confirm against the full source.
4595 eat_next_token_if_newline = ->
4600 if t.type is TYPE_TEXT
4601 # definition of a newline depends on whether it was a character ref or not
4602 if cur - old_cur is 1
4603 # not a character reference
4604 if t.text is "\u000d" or t.text is "\u000a"
4607 if t.text is "\u000a"
4613 # tree constructor initialization
4614 # see comments on TYPE_TAG/etc for the structure of this data
# Per-parse state for the tree constructor. These are plain variables, not
# object fields, so the handler functions defined earlier read and write
# them directly.
4617 doc = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4618 doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4619 fragment_root = null # fragment parsing algorithm returns children of this
4621 afe = [] # active formatting elements
4622 template_ins_modes = []
4623 ins_mode = ins_mode_initial
4624 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to enabled when the caller does not pass args.scripting
4625 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4626 flag_frameset_ok = true
4628 flag_foster_parenting = false
4629 form_element_pointer = null
4630 temporary_buffer = null
4631 pending_table_character_tokens = []
4632 head_element_pointer = null
4633 flag_fragment_parsing = false
4634 context_element = null
4635 prev_node_id = 0 # just for debugging
4637 # tokenizer initialization
4638 tok_state = tok_state_data
4641 # fragment parsing (text arg)
4643 # this handles the fragment from the tests in the format described here:
4644 # https://github.com/html5lib/html5lib-tests/blob/master/tree-construction/README.md
# A textual context may carry a 'math ' or 'svg ' prefix, which presumably
# selects the MathML or SVG namespace (the assignments are not visible in
# this listing). The context element is given its own no-quirks document.
4647 if f.substr(0, 5) is 'math '
4650 else if f.substr(0, 4) is 'svg '
4654 context_element = token_to_element t, ns
4655 context_element.document = new Node TYPE_TAG, name: 'document', namespace: NS_HTML
4656 context_element.document.flag 'quirks mode', QUIRKS_NO
4657 # fragment parsing (Node arg)
4659 context_element = args.context
4661 # http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4662 # fragment parsing algorithm
4664 flag_fragment_parsing = true
4665 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4666 # search up the tree from context, to try to find its document,
4667 # because this file only puts a "document" property on the root
4670 el = context_element
4673 old_doc = el.document
# the fragment's document inherits the quirks mode of the context's document
4680 doc.flag 'quirks mode', old_doc.flag 'quirks mode'
# the initial tokenizer state depends on the (HTML-namespace) context element
4682 if context_element.namespace is NS_HTML
4683 switch context_element.name
4684 when 'title', 'textarea'
4685 tok_state = tok_state_rcdata
4686 when 'style', 'xmp', 'iframe', 'noembed', 'noframes'
4687 tok_state = tok_state_rawtext
4689 tok_state = tok_state_script_data
4692 tok_state = tok_state_rawtext
4694 tok_state = tok_state_plaintext
# an 'html' root element is created for the fragment; it is the only entry
# on the stack of open elements at the start
4695 fragment_root = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4696 doc.children.push fragment_root
4697 fragment_root.document = doc
4698 open_els = [fragment_root]
4699 if context_element.name is 'template' and context_element.namespace is NS_HTML
4700 template_ins_modes.unshift ins_mode_in_template
4701 # fixfull create token for context (it should have its original one already)
4703 # set form_element pointer... in the foreign doc?!
4704 el = context_element
4706 if el.name is 'form' and el.namespace is NS_HTML
4707 form_element_pointer = el
4714 # text pre-processing
4715 # FIXME check http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# normalize line endings: CRLF pairs first, then any remaining lone CR
4716 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4717 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4721 # http://www.w3.org/TR/html5/syntax.html#tree-construction
# Main tokenizer / tree-construction loop.
# NOTE(review): the loop body is not present in this listing. The visible
# tail shows that in fragment mode the children of fragment_root are
# returned.
4722 parse_main_loop = ->
4727 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4732 if flag_fragment_parsing
4733 return fragment_root.children
# Public API: the parser entry point, the two debug-log helpers, and the
# node-type, namespace and quirks-mode constants.
4736 module.exports.parse_html = parse_html
4737 module.exports.debug_log_reset = debug_log_reset
4738 module.exports.debug_log_each = debug_log_each
4739 module.exports.TYPE_TAG = TYPE_TAG
4740 module.exports.TYPE_TEXT = TYPE_TEXT
4741 module.exports.TYPE_COMMENT = TYPE_COMMENT
4742 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4743 module.exports.NS_HTML = NS_HTML
4744 module.exports.NS_MATHML = NS_MATHML
4745 module.exports.NS_SVG = NS_SVG
4746 module.exports.QUIRKS_NO = QUIRKS_NO
4747 module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4748 module.exports.QUIRKS_YES = QUIRKS_YES