1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
21 # The implementation is a pretty direct implementation of the parsing algorithm
23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
25 # Deviations from that spec:
27 # Purposeful: search this file for "WTAG"
29 # Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
# the spec uses many different words to indicate which ends of lists/stacks
# they are talking about (and relative movement within the lists/stacks). This
# section explains. I'm implementing "lists" (afe and open_els) the same way
39 # stacks grow downward (current element is index=0)
41 # example: open_els = [a, b, c, d, e, f, g]
43 # "grows downwards" means it's visualized like this: (index: el, names)
45 # 6: g "start of the list", "topmost", "first"
47 # 4: e "previous" (to d), "above", "before"
48 # 3: d (previous/next are relative to this element)
49 # 2: c "next", "after", "lower", "below"
51 # 0: a "end of the list", "current node", "bottommost", "last"
55 # note: to get this to run outside a browser, you'll have to write a native
56 # implementation of decode_named_char_ref()
57 unless module?.exports?
59 module = exports: window.wheic
# Convert a Unicode code point (an integer) into a JavaScript string.
#
# x: the code point to convert
# Returns a string of one UTF-16 code unit, or two (a surrogate pair) when x
# is 0x10000 or above.
from_code_point = (x) ->
	# prefer the native implementation when the runtime has one
	if String.fromCodePoint?
		return String.fromCodePoint x
	# below 0x10000 a code point is a single UTF-16 code unit
	if x < 0x10000
		return String.fromCharCode x
	# surrogate pairs encode the offset from 0x10000, not the raw code point
	x -= 0x10000
	return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
# Each node is an object of the Node class. Here are the Node types:
71 TYPE_TAG = 0 # name, {attributes}, [children]
72 TYPE_TEXT = 1 # "text"
# the following types are emitted by the tokenizer, but shouldn't end up in the tree:
76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
77 TYPE_END_TAG = 5 # name
79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
92 debug_log_each = (cb) ->
93 for str in g_debug_log
98 constructor: (type, args = {}) ->
99 @type = type # one of the TYPE_* constants above
100 @name = args.name ? '' # tag name
101 @text = args.text ? '' # contents for text/comment nodes
102 @attrs = args.attrs ? {}
103 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
104 @children = args.children ? []
105 @namespace = args.namespace ? NS_HTML
106 @parent = args.parent ? null
107 @token = args.token ? null
108 @flags = args.flags ? {}
112 @id = "#{++prev_node_id}"
113 acknowledge_self_closing: ->
115 @token.flag 'did_self_close', true
117 @flag 'did_self_close', true
118 flag: (key, value = null) ->
123 serialize: (shallow = false, show_ids = false) -> # for unit tests
128 ret += JSON.stringify @name
143 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
149 ret += c.serialize shallow, show_ids
153 ret += JSON.stringify @text
156 ret += JSON.stringify @text
158 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
161 when TYPE_AAA_BOOKMARK
162 ret += 'aaa_bookmark'
165 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
168 # helpers: (only take args that are normally known when parser creates nodes)
# Factory helpers: each builds a Node of one fixed type from the single
# argument the parser normally has on hand when it creates that node/token.

# start-tag token carrying the tag name
new_open_tag = (name) ->
	new Node TYPE_START_TAG, {name: name}

# end-tag token carrying the tag name
new_end_tag = (name) ->
	new Node TYPE_END_TAG, {name: name}

# element node carrying the tag name
new_element = (name) ->
	new Node TYPE_TAG, {name: name}

# text node carrying its text content
new_text_node = (txt) ->
	new Node TYPE_TEXT, {text: txt}

# character tokens are built exactly like text nodes
new_character_token = new_text_node

# comment token carrying the comment text
new_comment_token = (txt) ->
	new Node TYPE_COMMENT, {text: txt}

# doctype token carrying the doctype name
new_doctype_token = (name) ->
	new Node TYPE_DOCTYPE, {name: name}
183 return new Node TYPE_EOF
185 return new Node TYPE_AFE_MARKER
# Build a fresh TYPE_AAA_BOOKMARK node (no name, text or attributes).
new_aaa_bookmark = ->
	new Node TYPE_AAA_BOOKMARK
# ASCII character classes, each stored as a plain string of its members.
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
digits = "0123456789"
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# true only when str has length 1 and appears in uc_alpha
is_uc_alpha = (str) ->
	return false unless str.length is 1
	return uc_alpha.indexOf(str) > -1

# true only when str has length 1 and appears in lc_alpha
is_lc_alpha = (str) ->
	return false unless str.length is 1
	return lc_alpha.indexOf(str) > -1
200 # some SVG elements have dashes in them
201 tag_name_chars = alnum + "-"
203 # http://www.w3.org/TR/html5/infrastructure.html#space-character
204 space_chars = "\u0009\u000a\u000c\u000d\u0020"
206 return txt.length is 1 and space_chars.indexOf(txt) > -1
# true when token t is a TYPE_TEXT token whose text is exactly one character
# and that character appears in space_chars
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	return false unless t.text.length is 1
	return space_chars.indexOf(t.text) > -1
210 is_input_hidden_tok = (t) ->
211 return false unless t.type is TYPE_START_TAG
214 if a[1].toLowerCase() is 'hidden'
219 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
220 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
223 unicode_fixes[0x00] = "\uFFFD"
224 unicode_fixes[0x80] = "\u20AC"
225 unicode_fixes[0x82] = "\u201A"
226 unicode_fixes[0x83] = "\u0192"
227 unicode_fixes[0x84] = "\u201E"
228 unicode_fixes[0x85] = "\u2026"
229 unicode_fixes[0x86] = "\u2020"
230 unicode_fixes[0x87] = "\u2021"
231 unicode_fixes[0x88] = "\u02C6"
232 unicode_fixes[0x89] = "\u2030"
233 unicode_fixes[0x8A] = "\u0160"
234 unicode_fixes[0x8B] = "\u2039"
235 unicode_fixes[0x8C] = "\u0152"
236 unicode_fixes[0x8E] = "\u017D"
237 unicode_fixes[0x91] = "\u2018"
238 unicode_fixes[0x92] = "\u2019"
239 unicode_fixes[0x93] = "\u201C"
240 unicode_fixes[0x94] = "\u201D"
241 unicode_fixes[0x95] = "\u2022"
242 unicode_fixes[0x96] = "\u2013"
243 unicode_fixes[0x97] = "\u2014"
244 unicode_fixes[0x98] = "\u02DC"
245 unicode_fixes[0x99] = "\u2122"
246 unicode_fixes[0x9A] = "\u0161"
247 unicode_fixes[0x9B] = "\u203A"
248 unicode_fixes[0x9C] = "\u0153"
249 unicode_fixes[0x9E] = "\u017E"
250 unicode_fixes[0x9F] = "\u0178"
252 # These are the character references that don't need a terminating semicolon
253 # min length: 2, max: 6, none are a prefix of any other.
255 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
256 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
257 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
258 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
259 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
260 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
261 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
262 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
263 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
264 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
265 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
266 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
267 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
268 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
269 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
270 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
271 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Element-name lists, one per category named by the variable.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
	'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script'
	'style'
]
escapable_raw_text_elements = [
	'textarea'
	'title'
]
278 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
280 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
281 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
282 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
283 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
284 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
285 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
286 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
287 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
288 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
289 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
290 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
291 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
292 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
293 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
297 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
299 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
300 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
301 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
302 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
303 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
304 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
305 'determinant', 'diff', 'divergence', 'divide', 'domain',
306 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
307 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
308 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
309 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
310 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
311 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
312 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
313 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
314 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
315 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
316 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
317 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
318 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
319 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
320 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
321 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
322 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
323 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
324 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
325 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
326 'vectorproduct', 'xor'
328 # foreign_elements = [svg_elements..., mathml_elements...]
329 #normal_elements = All other allowed HTML elements are normal elements.
333 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
334 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
335 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
336 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
337 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
338 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
339 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
340 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
341 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
342 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
343 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
345 menu:NS_HTML,menuitem:NS_HTML, # WATWG adds these
347 meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
348 noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
349 plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
350 select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
351 table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
352 textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
353 tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
356 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
357 'annotation-xml':NS_MATHML,
360 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
363 formatting_elements = {
364 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
365 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
369 mathml_text_integration = {
370 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# true when el's name is listed in mathml_text_integration and the namespace
# recorded there is el's own namespace
is_mathml_text_integration_point = (el) ->
	listed_ns = mathml_text_integration[el.name]
	return listed_ns is el.namespace
374 is_html_integration = (el) -> # DON'T PASS A TOKEN
375 if el.namespace is NS_MATHML
376 if el.name is 'annotation-xml'
377 if el.attrs.encoding?
378 if el.attrs.encoding.toLowerCase() is 'text/html'
380 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
383 if el.namespace is NS_SVG
384 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
389 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
392 foster_parenting_targets = {
# true when special_elements lists e's name under e's own namespace
el_is_special = (e) ->
	listed_ns = special_elements[e.name]
	return listed_ns is e.namespace

# the three names that el_is_special_not_adp excludes
adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }

# like el_is_special, but false for any element matched by adp_els
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	return adp_els[el.name] isnt el.namespace
422 altglyphdef: 'altGlyphDef'
423 altglyphitem: 'altGlyphItem'
424 animatecolor: 'animateColor'
425 animatemotion: 'animateMotion'
426 animatetransform: 'animateTransform'
429 fecolormatrix: 'feColorMatrix'
430 fecomponenttransfer: 'feComponentTransfer'
431 fecomposite: 'feComposite'
432 feconvolvematrix: 'feConvolveMatrix'
433 fediffuselighting: 'feDiffuseLighting'
434 fedisplacementmap: 'feDisplacementMap'
435 fedistantlight: 'feDistantLight'
436 fedropshadow: 'feDropShadow'
442 fegaussianblur: 'feGaussianBlur'
445 femergenode: 'feMergeNode'
446 femorphology: 'feMorphology'
448 fepointlight: 'fePointLight'
449 fespecularlighting: 'feSpecularLighting'
450 fespotlight: 'feSpotLight'
452 feturbulence: 'feTurbulence'
453 foreignobject: 'foreignObject'
455 lineargradient: 'linearGradient'
456 radialgradient: 'radialGradient'
459 svg_attribute_fixes = {
460 attributename: 'attributeName'
461 attributetype: 'attributeType'
462 basefrequency: 'baseFrequency'
463 baseprofile: 'baseProfile'
465 clippathunits: 'clipPathUnits'
466 contentscripttype: 'contentScriptType'
467 contentstyletype: 'contentStyleType'
468 diffuseconstant: 'diffuseConstant'
470 externalresourcesrequired: 'externalResourcesRequired'
471 filterres: 'filterRes'
472 filterunits: 'filterUnits'
474 gradienttransform: 'gradientTransform'
475 gradientunits: 'gradientUnits'
476 kernelmatrix: 'kernelMatrix'
477 kernelunitlength: 'kernelUnitLength'
478 keypoints: 'keyPoints'
479 keysplines: 'keySplines'
481 lengthadjust: 'lengthAdjust'
482 limitingconeangle: 'limitingConeAngle'
483 markerheight: 'markerHeight'
484 markerunits: 'markerUnits'
485 markerwidth: 'markerWidth'
486 maskcontentunits: 'maskContentUnits'
487 maskunits: 'maskUnits'
488 numoctaves: 'numOctaves'
489 pathlength: 'pathLength'
490 patterncontentunits: 'patternContentUnits'
491 patterntransform: 'patternTransform'
492 patternunits: 'patternUnits'
493 pointsatx: 'pointsAtX'
494 pointsaty: 'pointsAtY'
495 pointsatz: 'pointsAtZ'
496 preservealpha: 'preserveAlpha'
497 preserveaspectratio: 'preserveAspectRatio'
498 primitiveunits: 'primitiveUnits'
501 repeatcount: 'repeatCount'
502 repeatdur: 'repeatDur'
503 requiredextensions: 'requiredExtensions'
504 requiredfeatures: 'requiredFeatures'
505 specularconstant: 'specularConstant'
506 specularexponent: 'specularExponent'
507 spreadmethod: 'spreadMethod'
508 startoffset: 'startOffset'
509 stddeviation: 'stdDeviation'
510 stitchtiles: 'stitchTiles'
511 surfacescale: 'surfaceScale'
512 systemlanguage: 'systemLanguage'
513 tablevalues: 'tableValues'
516 textlength: 'textLength'
518 viewtarget: 'viewTarget'
519 xchannelselector: 'xChannelSelector'
520 ychannelselector: 'yChannelSelector'
521 zoomandpan: 'zoomAndPan'
523 foreign_attr_fixes = {
524 'xlink:actuate': 'xlink actuate'
525 'xlink:arcrole': 'xlink arcrole'
526 'xlink:href': 'xlink href'
527 'xlink:role': 'xlink role'
528 'xlink:show': 'xlink show'
529 'xlink:title': 'xlink title'
530 'xlink:type': 'xlink type'
531 'xml:base': 'xml base'
532 'xml:lang': 'xml lang'
533 'xml:space': 'xml space'
535 'xmlns:xlink': 'xmlns xlink'
537 adjust_mathml_attributes = (t) ->
539 if a[0] is 'definitionurl'
540 a[0] = 'definitionURL'
542 adjust_svg_attributes = (t) ->
544 if svg_attribute_fixes[a[0]]?
545 a[0] = svg_attribute_fixes[a[0]]
547 adjust_foreign_attributes = (t) ->
550 if foreign_attr_fixes[a[0]]?
551 a[0] = foreign_attr_fixes[a[0]]
554 # decode_named_char_ref()
556 # The list of named character references is _huge_ so ask the browser to decode
557 # for us instead of wasting bandwidth/space on including the table here.
559 # Pass without the "&" but with the ";" examples:
560 # for "&" pass "amp;"
561 # for "′" pass "x2032;"
564 textarea: document.createElement('textarea')
566 # TODO test this in IE8
# Decode one character reference by round-tripping it through the shared
# textarea element held in g_dncr.
#
# txt: the reference without its leading "&" but with the trailing ";"
#      (for "&amp;" pass "amp;")
# Returns the decoded text, or null when the textarea hands the reference
# back unchanged. Successful results are memoised in g_dncr.cache.
decode_named_char_ref = (txt) ->
	# callers omit the "&"; restore it, otherwise the text is never treated
	# as a reference and every lookup would come back null
	txt = "&#{txt}"
	decoded = g_dncr.cache[txt]
	return decoded if decoded?
	g_dncr.textarea.innerHTML = txt
	decoded = g_dncr.textarea.value
	# unchanged text means nothing was decoded
	return null if decoded is txt
	return g_dncr.cache[txt] = decoded
576 parse_html = (args) ->
578 cur = null # index of next char in txt to be parsed
579 # declare doc and tokenizer variables so they're in scope below
581 open_els = null # stack of open elements
582 afe = null # active formatting elements
583 template_ins_modes = null
585 original_ins_mode = null
587 tok_cur_tag = null # partially parsed tag
588 flag_scripting = null
589 flag_frameset_ok = null
591 flag_foster_parenting = null
592 form_element_pointer = null
593 temporary_buffer = null
594 pending_table_character_tokens = null
595 head_element_pointer = null
596 flag_fragment_parsing = null
597 context_element = null
606 console.log "Parse error at character #{cur} of #{txt.length}"
608 afe_push = (new_el) ->
611 if el.name is new_el.name and el.namespace is new_el.namespace
613 continue unless new_el.attrs[k] is v
614 for k, v of new_el.attrs
615 continue unless el.attrs[k] is v
622 afe.unshift new_afe_marker()
# the functions below implement the Tree Construction algorithm
625 # http://www.w3.org/TR/html5/syntax.html#tree-construction
627 # But first... the helpers
628 template_tag_is_open = ->
630 if t.name is 'template' and t.namespace is NS_HTML
633 is_in_scope_x = (tag_name, scope, namespace) ->
635 if t.name is tag_name and (namespace is null or namespace is t.namespace)
637 if scope[t.name] is t.namespace
640 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
642 if t.name is tag_name and (namespace is null or namespace is t.namespace)
644 if scope[t.name] is t.namespace
646 if scope2[t.name] is t.namespace
650 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
651 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
654 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
655 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
657 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# additional scope-boundary tables (element name -> namespace)
button_scopers = { button: NS_HTML }
li_scopers = { ol: NS_HTML, ul: NS_HTML }
table_scopers = { html: NS_HTML, table: NS_HTML, template: NS_HTML }

# Each wrapper below asks whether tag_name is in a particular kind of scope.
# A null namespace is passed straight through to the underlying check.

# scope bounded by standard_scopers
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, standard_scopers, namespace

# scope bounded by standard_scopers plus button_scopers
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace

# scope bounded by table_scopers only
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x tag_name, table_scopers, namespace

# aka is_in_list_item_scope: standard_scopers plus li_scopers
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
671 is_in_select_scope = (tag_name, namespace = null) ->
673 if t.name is tag_name and (namespace is null or namespace is t.namespace)
675 if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
678 # this checks for a particular element, not by name
679 # this requires a namespace match
680 el_is_in_scope = (needle) ->
684 if standard_scopers[el.name] is el.namespace
688 clear_to_table_stopers = {
693 clear_stack_to_table_context = ->
695 if clear_to_table_stopers[open_els[0].name]?
699 clear_to_table_body_stopers = {
706 clear_stack_to_table_body_context = ->
708 if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
712 clear_to_table_row_stopers = {
717 clear_stack_to_table_row_context = ->
719 if clear_to_table_row_stopers[open_els[0].name]?
723 clear_afe_to_marker = ->
725 return unless afe.length > 0 # this happens in fragment case, ?spec error
727 if el.type is TYPE_AFE_MARKER
732 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
734 # 1. Let last be false.
736 # 2. Let node be the last node in the stack of open elements.
738 node = open_els[node_i]
739 # 3. Loop: If node is the first node in the stack of open elements,
740 # then set last to true, and, if the parser was originally created as
741 # part of the HTML fragment parsing algorithm (fragment case) set node
742 # to the context element.
744 if node_i is open_els.length - 1
746 # fixfull (fragment case)
748 # 4. If node is a select element, run these substeps:
749 if node.name is 'select' and node.namespace is NS_HTML
750 # 1. If last is true, jump to the step below labeled done.
752 # 2. Let ancestor be node.
755 # 3. Loop: If ancestor is the first node in the stack of
756 # open elements, jump to the step below labeled done.
758 if ancestor_i is open_els.length - 1
760 # 4. Let ancestor be the node before ancestor in the stack
763 ancestor = open_els[ancestor_i]
764 # 5. If ancestor is a template node, jump to the step below
766 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
768 # 6. If ancestor is a table node, switch the insertion mode
769 # to "in select in table" and abort these steps.
770 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
771 ins_mode = ins_mode_in_select_in_table
773 # 7. Jump back to the step labeled loop.
774 # 8. Done: Switch the insertion mode to "in select" and abort
776 ins_mode = ins_mode_in_select
778 # 5. If node is a td or th element and last is false, then switch
779 # the insertion mode to "in cell" and abort these steps.
780 if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
781 ins_mode = ins_mode_in_cell
783 # 6. If node is a tr element, then switch the insertion mode to "in
784 # row" and abort these steps.
785 if node.name is 'tr' and node.namespace is NS_HTML
786 ins_mode = ins_mode_in_row
788 # 7. If node is a tbody, thead, or tfoot element, then switch the
789 # insertion mode to "in table body" and abort these steps.
790 if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
791 ins_mode = ins_mode_in_table_body
793 # 8. If node is a caption element, then switch the insertion mode
794 # to "in caption" and abort these steps.
795 if node.name is 'caption' and node.namespace is NS_HTML
796 ins_mode = ins_mode_in_caption
798 # 9. If node is a colgroup element, then switch the insertion mode
799 # to "in column group" and abort these steps.
800 if node.name is 'colgroup' and node.namespace is NS_HTML
801 ins_mode = ins_mode_in_column_group
803 # 10. If node is a table element, then switch the insertion mode to
804 # "in table" and abort these steps.
805 if node.name is 'table' and node.namespace is NS_HTML
806 ins_mode = ins_mode_in_table
808 # 11. If node is a template element, then switch the insertion mode
809 # to the current template insertion mode and abort these steps.
810 if node.name is 'template' and node.namespace is NS_HTML
811 ins_mode = template_ins_modes[0]
813 # 12. If node is a head element and last is true, then switch the
814 # insertion mode to "in body" ("in body"! not "in head"!) and abort
815 # these steps. (fragment case)
816 if node.name is 'head' and node.namespace is NS_HTML and last
817 ins_mode = ins_mode_in_body
819 # 13. If node is a head element and last is false, then switch the
820 # insertion mode to "in head" and abort these steps.
821 if node.name is 'head' and node.namespace is NS_HTML and last is false
822 ins_mode = ins_mode_in_head
824 # 14. If node is a body element, then switch the insertion mode to
825 # "in body" and abort these steps.
826 if node.name is 'body' and node.namespace is NS_HTML
827 ins_mode = ins_mode_in_body
829 # 15. If node is a frameset element, then switch the insertion mode
830 # to "in frameset" and abort these steps. (fragment case)
831 if node.name is 'frameset' and node.namespace is NS_HTML
832 ins_mode = ins_mode_in_frameset
834 # 16. If node is an html element, run these substeps:
835 if node.name is 'html' and node.namespace is NS_HTML
836 # 1. If the head element pointer is null, switch the insertion
837 # mode to "before head" and abort these steps. (fragment case)
838 if head_element_pointer is null
839 ins_mode = ins_mode_before_head
841 # 2. Otherwise, the head element pointer is not null,
842 # switch the insertion mode to "after head" and abort these
844 ins_mode = ins_mode_after_head
846 # 17. If last is true, then switch the insertion mode to "in body"
847 # and abort these steps. (fragment case)
849 ins_mode = ins_mode_in_body
851 # 18. Let node now be the node before node in the stack of open
854 node = open_els[node_i]
855 # 19. Return to the step labeled loop.
859 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
860 adjusted_current_node = ->
861 if open_els.length is 1 and flag_fragment_parsing
862 return context_element
865 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
866 # this implementation is structured (mostly) as described at the link above.
867 # capitalized comments are the "labels" described at the link above.
869 return if afe.length is 0
870 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
875 if i is afe.length - 1
878 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
883 el = insert_html_element afe[i].token
888 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
889 # adoption agency algorithm
891 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
892 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
893 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
894 adoption_agency = (subject) ->
895 debug_log "adoption_agency()"
896 debug_log "tree: #{serialize_els doc.children, false, true}"
897 debug_log "open_els: #{serialize_els open_els, true, true}"
898 debug_log "afe: #{serialize_els afe, true, true}"
899 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
902 # remove it from the list of active formatting elements (if found)
907 debug_log "aaa: starting off with subject on top of stack, exiting"
914 # 5. Let formatting element be the last element in the list of
915 # active formatting elements that: is between the end of the list
916 # and the last scope marker in the list, if any, or the start of
917 # the list otherwise, and has the tag name subject.
919 for t, fe_of_afe in afe
920 if t.type is TYPE_AFE_MARKER
925 # If there is no such element, then abort these steps and instead
926 # act as described in the "any other end tag" entry above.
928 debug_log "aaa: fe not found in afe"
929 in_body_any_other_end_tag subject
931 # 6. If formatting element is not in the stack of open elements,
932 # then this is a parse error; remove the element from the list, and
935 for t, fe_of_open_els in open_els
940 debug_log "aaa: fe not found in open_els"
942 # "remove it from the list" must mean afe, since it's not in open_els
943 afe.splice fe_of_afe, 1
945 # 7. If formatting element is in the stack of open elements, but
946 # the element is not in scope, then this is a parse error; abort
948 unless el_is_in_scope fe
949 debug_log "aaa: fe not in scope"
952 # 8. If formatting element is not the current node, this is a parse
953 # error. (But do not abort these steps.)
954 unless open_els[0] is fe
957 # 9. Let furthest block be the topmost node in the stack of open
958 # elements that is lower in the stack than formatting element, and
959 # is an element in the special category. There might not be one.
961 fb_of_open_els = null
968 # and continue, to see if there's one that's more "topmost"
969 # 10. If there is no furthest block, then the UA must first pop all
970 # the nodes from the bottom of the stack of open elements, from the
971 # current node up to and including formatting element, then remove
972 # formatting element from the list of active formatting elements,
973 # and finally abort these steps.
975 debug_log "aaa: no fb"
979 afe.splice fe_of_afe, 1
981 # 11. Let common ancestor be the element immediately above
982 # formatting element in the stack of open elements.
983 ca = open_els[fe_of_open_els + 1] # common ancestor
985 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
986 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
987 bookmark = new_aaa_bookmark()
990 afe.splice i, 0, bookmark
992 node = last_node = fb
996 # 3. Let node be the element immediately above node in the
997 # stack of open elements, or if node is no longer in the stack
998 # of open elements (e.g. because it got removed by this
999 # algorithm), the element that was immediately above node in
1000 # the stack of open elements before node was removed.
1002 for t, i in open_els
1004 node_next = open_els[i + 1]
1006 node = node_next ? node_above
1007 debug_log "inner loop #{inner}"
1008 debug_log "tree: #{serialize_els doc.children, false, true}"
1009 debug_log "open_els: #{serialize_els open_els, true, true}"
1010 debug_log "afe: #{serialize_els afe, true, true}"
1011 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1012 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1013 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1014 debug_log "node: #{node.serialize true, true}"
1015 # TODO make sure node_above gets re-set if/when node is removed from open_els
1017 # 4. If node is formatting element, then go to the next step in
1018 # the overall algorithm.
1021 debug_log "the meat"
1022 # 5. If inner loop counter is greater than three and node is in
1023 # the list of active formatting elements, then remove node from
1024 # the list of active formatting elements.
1030 debug_log "max out inner"
1035 # 6. If node is not in the list of active formatting elements,
1036 # then remove node from the stack of open elements and then go
1037 # back to the step labeled inner loop.
1039 debug_log "not in afe"
1040 for t, i in open_els
1042 node_above = open_els[i + 1]
1043 open_els.splice i, 1
1046 debug_log "the bones"
1047 # 7. create an element for the token for which the element node
1048 # was created, in the HTML namespace, with common ancestor as
1049 # the intended parent; replace the entry for node in the list
1050 # of active formatting elements with an entry for the new
1051 # element, replace the entry for node in the stack of open
1052 # elements with an entry for the new element, and let node be
1054 new_node = token_to_element node.token, NS_HTML, ca
1058 debug_log "replaced in afe"
1060 for t, i in open_els
1062 node_above = open_els[i + 1]
1063 open_els[i] = new_node
1064 debug_log "replaced in open_els"
1067 # 8. If last node is furthest block, then move the
1068 # aforementioned bookmark to be immediately after the new node
1069 # in the list of active formatting elements.
1074 debug_log "removed bookmark"
1078 # "after" means lower
1079 afe.splice i, 0, bookmark # "after as <-
1080 debug_log "placed bookmark after node"
1081 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1083 # 9. Insert last node into node, first removing it from its
1084 # previous parent node if any.
1085 if last_node.parent?
1086 debug_log "last_node has parent"
1087 for c, i in last_node.parent.children
1089 debug_log "removing last_node from parent"
1090 last_node.parent.children.splice i, 1
1092 node.children.push last_node
1093 last_node.parent = node
1094 # 10. Let last node be node.
1097 # 11. Return to the step labeled inner loop.
1098 # 14. Insert whatever last node ended up being in the previous step
1099 # at the appropriate place for inserting a node, but using common
1100 # ancestor as the override target.
1102 # In the case where fe is immediately followed by fb:
1103 # * inner loop exits out early (node==fe)
1105 # * last_node is still in the tree (not a duplicate)
1106 if last_node.parent?
1107 debug_log "FEFIRST? last_node has parent"
1108 for c, i in last_node.parent.children
1110 debug_log "removing last_node from parent"
1111 last_node.parent.children.splice i, 1
1114 debug_log "after aaa inner loop"
1115 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1119 debug_log "tree: #{serialize_els doc.children, false, true}"
1124 # can't use standard insert token thing, because it's already in
1125 # open_els and must stay at its current position in open_els
1126 dest = adjusted_insertion_location ca
1127 dest[0].children.splice dest[1], 0, last_node
1128 last_node.parent = dest[0]
1131 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1132 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1133 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1134 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1135 debug_log "tree: #{serialize_els doc.children, false, true}"
1137 # 15. Create an element for the token for which formatting element
1138 # was created, in the HTML namespace, with furthest block as the
1140 new_element = token_to_element fe.token, NS_HTML, fb
1141 # 16. Take all of the child nodes of furthest block and append them
1142 # to the element created in the last step.
1143 while fb.children.length
1144 t = fb.children.shift()
1145 t.parent = new_element
1146 new_element.children.push t
1147 # 17. Append that new element to furthest block.
1148 new_element.parent = fb
1149 fb.children.push new_element
1150 # 18. Remove formatting element from the list of active formatting
1151 # elements, and insert the new element into the list of active
1152 # formatting elements at the position of the aforementioned
1160 afe[i] = new_element
1162 # 19. Remove formatting element from the stack of open elements,
1163 # and insert the new element into the stack of open elements
1164 # immediately below the position of furthest block in that stack.
1165 for t, i in open_els
1167 open_els.splice i, 1
1169 for t, i in open_els
1171 open_els.splice i, 0, new_element
1173 # 20. Jump back to the step labeled outer loop.
1174 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1175 debug_log "tree: #{serialize_els doc.children, false, true}"
1176 debug_log "open_els: #{serialize_els open_els, true, true}"
1177 debug_log "afe: #{serialize_els afe, true, true}"
1178 debug_log "AAA DONE"
1180 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Close the current p element (spec: "close a p element").
# Implied end tags are generated first, with 'p' itself exempted. The stack
# of open elements is then popped from the front (index 0 is the current
# node), and each popped element is tested for being an HTML-namespace p.
1181 close_p_element = ->
1182 generate_implied_end_tags 'p' # arg is exception
1183 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML # current node is not the expected p
1185 while open_els.length > 1 # just in case: never pop the last remaining entry
1186 el = open_els.shift()
1187 if el.name is 'p' and el.namespace is NS_HTML
# Guarded variant used by the in-body rules: tests whether an HTML-namespace
# p element is in button scope (via is_in_button_scope) before acting.
# NOTE(review): presumably invokes close_p_element when the test passes -- confirm.
1189 close_p_if_in_button_scope = ->
1190 if is_in_button_scope 'p', NS_HTML
1193 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1194 # aka insert_a_character = (t) ->
# Insert character token t at the adjusted insertion location.
# dest is the [parent_node, index] pair from adjusted_insertion_location().
1195 insert_character = (t) ->
1196 dest = adjusted_insertion_location()
1197 # fixfull check for Document node
1199 prev = dest[0].children[dest[1] - 1] # sibling immediately before the insertion index
1200 if prev.type is TYPE_TEXT # NOTE(review): presumably merges t into the preceding text node -- confirm
1203 dest[0].children.splice dest[1], 0, t # insert t as a new child at the insertion index
1206 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for token t.
# Looks at the adjusted current node (acn) and at the token's type/name:
# HTML namespace, MathML text integration point, annotation-xml with an svg
# start tag, HTML integration point, and EOF are each tested in turn. The
# final statement hands t to in_foreign_content.
# NOTE(review): presumably the earlier branches dispatch to the current
# insertion mode instead -- confirm.
1207 process_token = (t) ->
1208 acn = adjusted_current_node()
1212 if acn.namespace is NS_HTML
1215 if is_mathml_text_integration_point(acn)
1216 if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1219 if t.type is TYPE_TEXT
1222 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1225 if is_html_integration acn
1226 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1229 if t.type is TYPE_EOF
1232 in_foreign_content t
1236 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1237 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Compute the "appropriate place for inserting a node".
# override_target: optional node to use instead of the current node
#   (open_els[0]) as the starting target.
# Returns a two-element array [target, target_i]: the parent node and the
#   index within target.children at which the new node should be spliced in.
# When foster parenting is enabled and the target's name maps to its
# namespace in foster_parenting_targets, the location is derived from the
# most recent template/table in open_els; otherwise it is after the target's
# last child.
1238 adjusted_insertion_location = (override_target = null) ->
1239 # 1. If there was an override target specified, then let target be the
1242 target = override_target
1243 else # Otherwise, let target be the current node.
1244 target = open_els[0]
1245 # 2. Determine the adjusted insertion location using the first matching
1246 # steps from the following list:
1248 # If foster parenting is enabled and target is a table, tbody, tfoot,
1249 # thead, or tr element: Foster parenting happens when content is
1250 # misnested in tables.
1251 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1252 loop # once. this is here so we can ``break`` to "abort these substeps"
1253 # 1. Let last template be the last template element in the
1254 # stack of open elements, if any.
1255 last_template = null
1256 last_template_i = null
1257 for el, i in open_els
1258 if el.name is 'template' and el.namespace is NS_HTML
1262 # 2. Let last table be the last table element in the stack of
1263 # open elements, if any.
1266 for el, i in open_els
1267 if el.name is 'table' and el.namespace is NS_HTML
1271 # 3. If there is a last template and either there is no last
1272 # table, or there is one, but last template is lower (more
1273 # recently added) than last table in the stack of open
1274 # elements, then: let adjusted insertion location be inside
1275 # last template's template contents, after its last child (if
1276 # any), and abort these substeps.
1277 if last_template and (last_table is null or last_template_i < last_table_i) # smaller index = more recently added
1278 target = last_template # fixfull should be its contents
1279 target_i = target.children.length
1281 # 4. If there is no last table, then let adjusted insertion
1282 # location be inside the first element in the stack of open
1283 # elements (the html element), after its last child (if any),
1284 # and abort these substeps. (fragment case)
1285 if last_table is null
1287 target = open_els[open_els.length - 1] # highest index = first element pushed (the html element)
1288 target_i = target.children.length
1290 # 5. If last table has a parent element, then let adjusted
1291 # insertion location be inside last table's parent element,
1292 # immediately before last table, and abort these substeps.
1293 if last_table.parent?
1294 for c, i in last_table.parent.children
1296 target = last_table.parent
1300 # 6. Let previous element be the element immediately above last
1301 # table in the stack of open elements.
1303 # huh? how could it not have a parent?
1304 previous_element = open_els[last_table_i + 1] # "above" = next higher index
1305 # 7. Let adjusted insertion location be inside previous
1306 # element, after its last child (if any).
1307 target = previous_element
1308 target_i = target.children.length
1309 # Note: These steps are involved in part because it's possible
1310 # for elements, the table element in this case in particular,
1311 # to have been moved by a script around in the DOM, or indeed
1312 # removed from the DOM entirely, after the element was inserted
1314 break # don't really loop
1316 # Otherwise Let adjusted insertion location be inside target, after
1317 # its last child (if any).
1318 target_i = target.children.length
1320 # 3. If the adjusted insertion location is inside a template element,
1321 # let it instead be inside the template element's template contents,
1322 # after its last child (if any).
1323 # fixfull (template)
1325 # 4. Return the adjusted insertion location.
1326 return [target, target_i]
1328 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1329 # aka create_an_element_for_token
# Create an element node for tag token t.
# t: tag token; its name becomes the element name and the token itself is
#   stored on the node (token: t) so the element can be re-created later.
# namespace: namespace constant assigned to the new element.
# intended_parent: accepted for parity with the spec; not referenced in the
#   statements shown here.
# Attribute [name, value] pairs are copied into a plain hash, so a later
# pair with the same name overwrites an earlier one.
1330 token_to_element = (t, namespace, intended_parent) ->
1331 # convert attributes into a hash
1334 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1335 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1337 # TODO 2. If the newly created element has an xmlns attribute in the
1338 # XMLNS namespace whose value is not exactly the same as the element's
1339 # namespace, that is a parse error. Similarly, if the newly created
1340 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1341 # value is not the XLink Namespace, that is a parse error.
1343 # fixfull: the spec says stuff about form pointers and ownerDocument
1347 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Create an element for token in the given namespace and splice it into the
# tree at the adjusted insertion location.
# NOTE(review): ail_el / ail_i are presumably the node and index unpacked
# from ail; callers use this function's result as the new element -- confirm.
1348 insert_foreign_element = (token, namespace) ->
1349 ail = adjusted_insertion_location()
1352 el = token_to_element token, namespace, ail_el
1353 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1355 ail_el.children.splice ail_i, 0, el # insert el among ail_el's children at index ail_i
1358 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Insert an element for token in the HTML namespace; thin wrapper that
# delegates to insert_foreign_element with NS_HTML.
1359 insert_html_element = (token) ->
1360 insert_foreign_element token, NS_HTML
1362 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1363 # position should be [node, index_within_children]
# Insert comment token t into the tree.
# position: optional [node, index_within_children] pair; when omitted (null)
#   the adjusted insertion location is used instead.
1364 insert_comment = (t, position = null) ->
1365 position ?= adjusted_insertion_location()
1366 position[0].children.splice position[1], 0, t # splice t into the parent's children at the given index
1369 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Generic raw text element parsing: insert an HTML element for t, switch the
# tokenizer to the RAWTEXT state, remember the current insertion mode in
# original_ins_mode, and switch the insertion mode to "text".
1370 parse_generic_raw_text = (t) ->
1371 insert_html_element t
1372 tok_state = tok_state_rawtext
1373 original_ins_mode = ins_mode # saved before ins_mode is overwritten on the next line
1374 ins_mode = ins_mode_text
# Generic RCDATA element parsing: same steps as parse_generic_raw_text, but
# the tokenizer is switched to the RCDATA state instead of RAWTEXT.
1375 parse_generic_rcdata_text = (t) ->
1376 insert_html_element t
1377 tok_state = tok_state_rcdata
1378 original_ins_mode = ins_mode # saved before ins_mode is overwritten on the next line
1379 ins_mode = ins_mode_text
1381 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1382 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Generate implied end tags.
# except: optional element name that must NOT be closed implicitly.
# Loops for as long as the current node (open_els[0]) has a name that maps
# to its own namespace in end_tag_implied and that name is not `except`.
# NOTE(review): assumes open_els is non-empty; open_els[0] is dereferenced
# without a length check.
1383 generate_implied_end_tags = (except = null) ->
1384 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1387 # 8.2.5.4 The rules for parsing tokens in HTML content
1388 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1390 # 8.2.5.4.1 The "initial" insertion mode
1391 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# "initial" insertion mode handler for token t.
# Comment and DOCTYPE tokens get dedicated branches; both the DOCTYPE branch
# and the trailing statement switch the insertion mode to "before html".
1392 ins_mode_initial = (t) ->
1395 if t.type is TYPE_COMMENT
1399 if t.type is TYPE_DOCTYPE
1400 # FIXME check identifiers, set quirks, etc
1403 ins_mode = ins_mode_before_html
1406 #fixfull (iframe, quirks)
1407 ins_mode = ins_mode_before_html
1411 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# "before html" insertion mode handler for token t.
# An explicit <html> start tag becomes the root element: it is appended to
# doc.children and pushed onto the front of open_els. For other tokens a
# synthetic 'html' open tag is created and its element appended to doc.
# Either way the insertion mode becomes "before head".
1412 ins_mode_before_html = (t) ->
1413 if t.type is TYPE_DOCTYPE
1416 if t.type is TYPE_COMMENT
1421 if t.type is TYPE_START_TAG and t.name is 'html'
1422 el = token_to_element t, NS_HTML, doc
1423 doc.children.push el
1424 open_els.unshift(el) # front of open_els is the current node
1425 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1426 ins_mode = ins_mode_before_head
1428 if t.type is TYPE_END_TAG
1429 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1430 # fall through to "anything else"
1435 html_tok = new_open_tag 'html' # synthesize the missing <html> start tag
1436 el = token_to_element html_tok, NS_HTML, doc
1437 doc.children.push el
1439 # ?fixfull browsing context
1440 ins_mode = ins_mode_before_head
1444 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# "before head" insertion mode handler for token t.
# A <head> start tag is inserted and remembered in head_element_pointer;
# for other tokens a synthetic 'head' open tag is inserted and remembered
# the same way. Both paths switch the insertion mode to "in head".
1445 ins_mode_before_head = (t) ->
1448 if t.type is TYPE_COMMENT
1451 if t.type is TYPE_DOCTYPE
1454 if t.type is TYPE_START_TAG and t.name is 'html'
1457 if t.type is TYPE_START_TAG and t.name is 'head'
1458 el = insert_html_element t
1459 head_element_pointer = el
1460 ins_mode = ins_mode_in_head
1462 if t.type is TYPE_END_TAG
1463 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1464 # fall through to Anything else below
1469 head_tok = new_open_tag 'head' # synthesize the missing <head> start tag
1470 el = insert_html_element head_tok
1471 head_element_pointer = el
1472 ins_mode = ins_mode_in_head
1475 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" insertion mode: pop the current
# node off open_els and switch the insertion mode to "after head".
1476 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1477 open_els.shift() # spec says this will be a 'head' node
1478 ins_mode = ins_mode_after_head
# "in head" insertion mode handler for token t.
# Branches, in order: whitespace text, comment, DOCTYPE, <html>, void head
# elements (base/basefont/bgsound/link), <meta>, <title> (RCDATA),
# noscript-with-scripting/noframes/style (raw text), noscript without
# scripting, <script>, </head>, </body>|</html>|</br>, <template>,
# </template>, a stray <head> or any other end tag, and finally the
# "anything else" fallback (ins_mode_in_head_else).
1480 ins_mode_in_head = (t) ->
1481 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1484 if t.type is TYPE_COMMENT
1487 if t.type is TYPE_DOCTYPE
1490 if t.type is TYPE_START_TAG and t.name is 'html'
1493 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1494 el = insert_html_element t
1496 t.acknowledge_self_closing()
1498 if t.type is TYPE_START_TAG and t.name is 'meta'
1499 el = insert_html_element t
1501 t.acknowledge_self_closing()
1502 # fixfull encoding stuff
1504 if t.type is TYPE_START_TAG and t.name is 'title'
1505 parse_generic_rcdata_text t
1507 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1508 parse_generic_raw_text t
1510 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1511 insert_html_element t
1512 ins_mode = ins_mode_in_head_noscript
1514 if t.type is TYPE_START_TAG and t.name is 'script'
1515 ail = adjusted_insertion_location()
1516 el = token_to_element t, NS_HTML, ail[0] # intended parent is the node, not the [node, index] pair
1517 el.flag 'parser-inserted', true
1518 # fixfull fragment case
1519 ail[0].children.splice ail[1], 0, el
1521 tok_state = tok_state_script_data
1522 original_ins_mode = ins_mode # make sure orig... is defined
1523 ins_mode = ins_mode_text
1525 if t.type is TYPE_END_TAG and t.name is 'head'
1526 open_els.shift() # will be a head element... spec says so
1527 ins_mode = ins_mode_after_head
1529 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1530 ins_mode_in_head_else t
1532 if t.type is TYPE_START_TAG and t.name is 'template'
1533 insert_html_element t
1535 flag_frameset_ok = false
1536 ins_mode = ins_mode_in_template
1537 template_ins_modes.unshift ins_mode_in_template
1539 if t.type is TYPE_END_TAG and t.name is 'template'
1540 if template_tag_is_open()
1541 generate_implied_end_tags() # parentheses required: a bare name is only a reference, not a call
1542 if open_els[0].name isnt 'template'
1545 el = open_els.shift()
1546 if el.name is 'template' and el.namespace is NS_HTML
1548 clear_afe_to_marker()
1549 template_ins_modes.shift()
1554 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1557 ins_mode_in_head_else t
1559 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" insertion mode; the
# statement shown switches the insertion mode back to "in head".
1560 ins_mode_in_head_noscript_else = (t) ->
1563 ins_mode = ins_mode_in_head
# "in head noscript" insertion mode handler for token t.
# </noscript> switches back to "in head". Whitespace, comments and the
# listed head-level start tags share one branch. </br> and the final
# fallback both go through ins_mode_in_head_noscript_else. A nested
# <head>/<noscript> start tag or any other end tag has its own branch.
1565 ins_mode_in_head_noscript = (t) ->
1566 if t.type is TYPE_DOCTYPE
1569 if t.type is TYPE_START_TAG and t.name is 'html'
1572 if t.type is TYPE_END_TAG and t.name is 'noscript'
1574 ins_mode = ins_mode_in_head
1576 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1579 if t.type is TYPE_END_TAG and t.name is 'br'
1580 ins_mode_in_head_noscript_else t
1582 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1586 ins_mode_in_head_noscript_else t
1591 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" insertion mode: insert an
# element for a synthetic 'body' open tag and switch the insertion mode to
# "in body".
1592 ins_mode_after_head_else = (t) ->
1593 body_tok = new_open_tag 'body' # synthesize the missing <body> start tag
1594 insert_html_element body_tok
1595 ins_mode = ins_mode_in_body
# "after head" insertion mode handler for token t.
# <body> and <frameset> start tags are inserted and select the "in body" /
# "in frameset" modes. Head-level start tags (base ... title) are handled
# with the head element temporarily pushed back onto open_els; afterwards
# that head element is searched for in open_els and removed again.
# </body>, </html>, </br> and the final fallback go through
# ins_mode_after_head_else.
1598 ins_mode_after_head = (t) ->
1602 if t.type is TYPE_COMMENT
1605 if t.type is TYPE_DOCTYPE
1608 if t.type is TYPE_START_TAG and t.name is 'html'
1611 if t.type is TYPE_START_TAG and t.name is 'body'
1612 insert_html_element t
1613 flag_frameset_ok = false
1614 ins_mode = ins_mode_in_body
1616 if t.type is TYPE_START_TAG and t.name is 'frameset'
1617 insert_html_element t
1618 ins_mode = ins_mode_in_frameset
1620 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1622 open_els.unshift head_element_pointer
1624 for el, i in open_els # `in` iterates array values; `of` would bind el to the index key and never match
1625 if el is head_element_pointer
1626 open_els.splice i, 1
1628 console.log "warning: 23904 couldn't find head element in open_els"
1630 if t.type is TYPE_END_TAG and t.name is 'template'
1633 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1634 ins_mode_after_head_else t
1636 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1640 ins_mode_after_head_else t
1642 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" insertion mode.
# name: tag name of the end tag being processed.
# Walks open_els starting at the current node (index 0). At the first
# HTML-namespace element whose name matches, implied end tags are generated
# (exempting `name`) and a parse error is reported unless the match was the
# current node. Each non-matching element is tested against special_elements.
1643 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1644 for el, i in open_els
1645 if el.name is name and el.namespace is NS_HTML
1646 generate_implied_end_tags name # arg is exception
1647 parse_error() unless i is 0 # i is 0 means the match was the current node
1652 if special_elements[el.name] is el.namespace
1656 ins_mode_in_body = (t) ->
1657 if t.type is TYPE_TEXT and t.text is "\u0000"
1664 if t.type is TYPE_TEXT
1667 flag_frameset_ok = false
1669 if t.type is TYPE_COMMENT
1672 if t.type is TYPE_DOCTYPE
1675 if t.type is TYPE_START_TAG and t.name is 'html'
1677 return if template_tag_is_open()
1678 root_attrs = open_els[open_els.length - 1].attrs
1680 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1683 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1686 if t.type is TYPE_START_TAG and t.name is 'body'
1688 return if open_els.length < 2
1689 second = open_els[open_els.length - 2]
1690 return unless second.namespace is NS_HTML
1691 return unless second.name is 'body'
1692 return if template_tag_is_open()
1693 flag_frameset_ok = false
1695 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1697 if t.type is TYPE_START_TAG and t.name is 'frameset'
1699 return if open_els.length < 2
1700 second_i = open_els.length - 2
1701 second = open_els[second_i]
1702 return unless second.namespace is NS_HTML
1703 return unless second.name is 'body'
1704 if flag_frameset_ok is false
1707 for el, i in second.parent.children
1709 second.parent.children.splice i, 1
1711 open_els.splice second_i, 1
1712 # pop everything except the "root html element"
1713 while open_els.length > 1
1715 insert_html_element t
1716 ins_mode = ins_mode_in_frameset
1718 if t.type is TYPE_EOF
1720 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1721 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1722 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1725 unless ok_tags[t.name] is el.namespace
1728 if template_ins_modes.length > 0
1729 ins_mode_in_template t
1733 if t.type is TYPE_END_TAG and t.name is 'body'
1734 unless is_in_scope 'body', NS_HTML
1738 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1739 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1740 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1741 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1745 unless ok_tags[t.name] is el.namespace
1748 ins_mode = ins_mode_after_body
1750 if t.type is TYPE_END_TAG and t.name is 'html'
1751 unless is_in_scope 'body', NS_HTML
1755 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1756 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1757 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1758 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1762 unless ok_tags[t.name] is el.namespace
1765 ins_mode = ins_mode_after_body
1768 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1769 close_p_if_in_button_scope()
1770 insert_html_element t
1772 if t.type is TYPE_START_TAG and h_tags[t.name]?
1773 close_p_if_in_button_scope()
1774 if h_tags[open_els[0].name] is open_els[0].namespace
1777 insert_html_element t
1779 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1780 close_p_if_in_button_scope()
1781 insert_html_element t
1782 # spec: If the next token is a "LF" (U+000A) character token, then
1783 # ignore that token and move on to the next one. (Newlines at the
1784 # start of pre blocks are ignored as an authoring convenience.)
1785 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1787 flag_frameset_ok = false
1789 if t.type is TYPE_START_TAG and t.name is 'form'
1790 unless form_element_pointer is null or template_tag_is_open()
1793 close_p_if_in_button_scope()
1794 el = insert_html_element t
1795 unless template_tag_is_open()
1796 form_element_pointer = el
1798 if t.type is TYPE_START_TAG and t.name is 'li'
1799 flag_frameset_ok = false
1800 for node in open_els
1801 if node.name is 'li' and node.namespace is NS_HTML
1802 generate_implied_end_tags 'li' # arg is exception
1803 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1806 el = open_els.shift()
1807 if el.name is 'li' and el.namespace is NS_HTML
1810 if el_is_special_not_adp node
1812 close_p_if_in_button_scope()
1813 insert_html_element t
1815 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1816 flag_frameset_ok = false
1817 for node in open_els
1818 if node.name is 'dd' and node.namespace is NS_HTML
1819 generate_implied_end_tags 'dd' # arg is exception
1820 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1823 el = open_els.shift()
1824 if el.name is 'dd' and el.namespace is NS_HTML
1827 if node.name is 'dt' and node.namespace is NS_HTML
1828 generate_implied_end_tags 'dt' # arg is exception
1829 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1832 el = open_els.shift()
1833 if el.name is 'dt' and el.namespace is NS_HTML
1836 if el_is_special_not_adp node
1838 close_p_if_in_button_scope()
1839 insert_html_element t
1841 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1842 close_p_if_in_button_scope()
1843 insert_html_element t
1844 tok_state = tok_state_plaintext
1846 if t.type is TYPE_START_TAG and t.name is 'button'
1847 if is_in_scope 'button', NS_HTML
1849 generate_implied_end_tags()
1851 el = open_els.shift()
1852 if el.name is 'button' and el.namespace is NS_HTML
1855 insert_html_element t
1856 flag_frameset_ok = false
1858 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1859 unless is_in_scope t.name, NS_HTML
1862 generate_implied_end_tags()
1863 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1866 el = open_els.shift()
1867 if el.name is t.name and el.namespace is NS_HTML
1870 if t.type is TYPE_END_TAG and t.name is 'form'
1871 unless template_tag_is_open()
1872 node = form_element_pointer
1873 form_element_pointer = null
1874 if node is null or not el_is_in_scope node
1877 generate_implied_end_tags()
1878 if open_els[0] isnt node
1880 for el, i in open_els
1882 open_els.splice i, 1
1885 unless is_in_scope 'form', NS_HTML
1888 generate_implied_end_tags()
1889 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1892 el = open_els.shift()
1893 if el.name is 'form' and el.namespace is NS_HTML
1896 if t.type is TYPE_END_TAG and t.name is 'p'
1897 unless is_in_button_scope 'p', NS_HTML
1899 insert_html_element new_open_tag 'p'
1902 if t.type is TYPE_END_TAG and t.name is 'li'
1903 unless is_in_li_scope 'li', NS_HTML
1906 generate_implied_end_tags 'li' # arg is exception
1907 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1910 el = open_els.shift()
1911 if el.name is 'li' and el.namespace is NS_HTML
1914 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1915 unless is_in_scope t.name, NS_HTML
1918 generate_implied_end_tags t.name # arg is exception
1919 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1922 el = open_els.shift()
1923 if el.name is t.name and el.namespace is NS_HTML
1926 if t.type is TYPE_END_TAG and h_tags[t.name]?
1929 if h_tags[el.name] is el.namespace
1932 if standard_scopers[el.name] is el.namespace
1937 generate_implied_end_tags()
1938 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1941 el = open_els.shift()
1942 if h_tags[el.name] is el.namespace
1946 if t.type is TYPE_START_TAG and t.name is 'a'
1947 # If the list of active formatting elements contains an a element
1948 # between the end of the list and the last marker on the list (or
1949 # the start of the list if there is no marker on the list), then
1950 # this is a parse error; run the adoption agency algorithm for the
1951 # tag name "a", then remove that element from the list of active
1952 # formatting elements and the stack of open elements if the
1953 # adoption agency algorithm didn't already remove it (it might not
1954 # have if the element is not in table scope).
1957 if el.type is TYPE_AFE_MARKER
1959 if el.name is 'a' and el.namespace is NS_HTML
1967 for el, i in open_els
1969 open_els.splice i, 1
1971 el = insert_html_element t
1974 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1976 el = insert_html_element t
1979 if t.type is TYPE_START_TAG and t.name is 'nobr'
1981 el = insert_html_element t
1984 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1985 adoption_agency t.name
1987 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1989 insert_html_element t
1991 flag_frameset_ok = false
1993 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1994 unless is_in_scope t.name, NS_HTML
1997 generate_implied_end_tags()
1998 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2001 el = open_els.shift()
2002 if el.name is t.name and el.namespace is NS_HTML
2004 clear_afe_to_marker()
2006 if t.type is TYPE_START_TAG and t.name is 'table'
2007 close_p_if_in_button_scope() # fixfull quirksmode thing
2008 insert_html_element t
2009 flag_frameset_ok = false
2010 ins_mode = ins_mode_in_table
2012 if t.type is TYPE_END_TAG and t.name is 'br'
2014 t.type is TYPE_START_TAG
2016 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2018 insert_html_element t
2020 t.acknowledge_self_closing()
2021 flag_frameset_ok = false
2023 if t.type is TYPE_START_TAG and t.name is 'input'
2025 insert_html_element t
2027 t.acknowledge_self_closing()
2028 unless is_input_hidden_tok t
2029 flag_frameset_ok = false
2031 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2032 insert_html_element t
2034 t.acknowledge_self_closing()
2036 if t.type is TYPE_START_TAG and t.name is 'hr'
2037 close_p_if_in_button_scope()
2038 insert_html_element t
2040 t.acknowledge_self_closing()
2041 flag_frameset_ok = false
2043 if t.type is TYPE_START_TAG and t.name is 'image'
2048 if t.type is TYPE_START_TAG and t.name is 'isindex'
2050 if template_tag_is_open() is false and form_element_pointer isnt null
2052 t.acknowledge_self_closing()
2053 flag_frameset_ok = false
2054 close_p_if_in_button_scope()
2055 el = insert_html_element new_open_tag 'form'
2056 unless template_tag_is_open()
2057 form_element_pointer = el
2060 el.attrs['action'] = a[1]
2062 insert_html_element new_open_tag 'hr'
2065 insert_html_element new_open_tag 'label'
2066 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2067 input_el = new_open_tag 'input'
2072 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2073 input_el.attrs_a.push [a[0], a[1]]
2074 input_el.attrs_a.push ['name', 'isindex']
2075 # fixfull this next bit is in english... internationalize?
2076 prompt ?= "This is a searchable index. Enter search keywords: "
2077 insert_character new_character_token prompt # fixfull split
2078 # TODO submit typo "balue" in spec
2079 insert_html_element input_el
2081 # insert_character '' # you can put chars here if prompt attr missing
2083 insert_html_element new_open_tag 'hr'
2086 unless template_tag_is_open()
2087 form_element_pointer = null
2089 if t.type is TYPE_START_TAG and t.name is 'textarea'
2090 insert_html_element t
2091 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2093 tok_state = tok_state_rcdata
2094 original_ins_mode = ins_mode
2095 flag_frameset_ok = false
2096 ins_mode = ins_mode_text
2098 if t.type is TYPE_START_TAG and t.name is 'xmp'
2099 close_p_if_in_button_scope()
2101 flag_frameset_ok = false
2102 parse_generic_raw_text t
2104 if t.type is TYPE_START_TAG and t.name is 'iframe'
2105 flag_frameset_ok = false
2106 parse_generic_raw_text t
2108 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2109 parse_generic_raw_text t
2111 if t.type is TYPE_START_TAG and t.name is 'select'
2113 insert_html_element t
2114 flag_frameset_ok = false
2115 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2116 ins_mode = ins_mode_in_select_in_table
2118 ins_mode = ins_mode_in_select
2120 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2121 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2124 insert_html_element t
2126 # this comment block implements the W3C spec
2127 # if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2128 # if is_in_scope 'ruby', NS_HTML
2129 # generate_implied_end_tags()
2130 # unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2132 # insert_html_element t
2134 # if t.type is TYPE_START_TAG and t.name is 'rt'
2135 # if is_in_scope 'ruby', NS_HTML
2136 # generate_implied_end_tags 'rtc' # arg is exception
2137 # unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2139 # insert_html_element t
2141 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2142 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2143 if is_in_scope 'ruby', NS_HTML
2144 generate_implied_end_tags()
2145 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2147 insert_html_element t
2149 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2150 if is_in_scope 'ruby', NS_HTML
2151 generate_implied_end_tags 'rtc'
2152 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2154 insert_html_element t
2157 if t.type is TYPE_START_TAG and t.name is 'math'
2159 adjust_mathml_attributes t
2160 adjust_foreign_attributes t
2161 insert_foreign_element t, NS_MATHML
2162 if t.flag 'self-closing'
2164 t.acknowledge_self_closing()
2166 if t.type is TYPE_START_TAG and t.name is 'svg'
2168 adjust_svg_attributes t
2169 adjust_foreign_attributes t
2170 insert_foreign_element t, NS_SVG
2171 if t.flag 'self-closing'
2173 t.acknowledge_self_closing()
2175 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2178 if t.type is TYPE_START_TAG # any other start tag
2180 insert_html_element t
2182 if t.type is TYPE_END_TAG # any other end tag
2183 in_body_any_other_end_tag t.name
2187 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Insertion-mode handler for the "text" mode. Takes one token `t` and
# dispatches on its type: text, EOF, a </script> end tag, or any other end tag.
# The EOF and end-tag branches switch ins_mode back to original_ins_mode.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2188 ins_mode_text = (t) ->
2189 if t.type is TYPE_TEXT
2192 if t.type is TYPE_EOF
2194 if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
# mark the current (script) element via its 'already started' flag
2195 open_els[0].flag 'already started', true
2197 ins_mode = original_ins_mode
2200 if t.type is TYPE_END_TAG and t.name is 'script'
2202 ins_mode = original_ins_mode
2203 # fixfull the spec seems to assume that I'm going to run the script
2204 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2206 if t.type is TYPE_END_TAG
2208 ins_mode = original_ins_mode
# reached only when no branch above handled the token
2210 console.log 'warning: end of ins_mode_text reached'
2212 # the functions below implement the tokenizer states described here:
2213 # http://www.w3.org/TR/html5/syntax.html#tokenization
2215 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
# "Anything else" fallback shared by the table insertion modes (called from
# ins_mode_in_table and ins_mode_in_table_text). Sets flag_foster_parenting to
# true and later resets it to false; the statement(s) executed in between are
# not shown in this listing.
2216 ins_mode_in_table_else = (t) ->
2218 flag_foster_parenting = true
2220 flag_foster_parenting = false
# Insertion-mode handler for the "in table" mode. Takes one token `t`.
# Visible behaviour:
#  - when the current node is a table/tbody/tfoot/thead/tr element, resets
#    pending_table_character_tokens, saves ins_mode in original_ins_mode and
#    switches to ins_mode_in_table_text
#  - table-structure tags clear the stack to a table context, insert an
#    element (the token itself, or an implied 'colgroup'/'tbody') and switch to
#    the matching insertion mode
#  - several branches pop open_els until a 'table' element is found
#  - everything else is routed to ins_mode_in_table_else
# NOTE(review): the `switch`/`when` headers for several branches are missing
# from this listing (gaps in numbering); confirm against the full file.
2222 ins_mode_in_table = (t) ->
2225 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2226 pending_table_character_tokens = []
2227 original_ins_mode = ins_mode
2228 ins_mode = ins_mode_in_table_text
2231 ins_mode_in_table_else t
2239 clear_stack_to_table_context()
2241 insert_html_element t
2242 ins_mode = ins_mode_in_caption
2244 clear_stack_to_table_context()
2245 insert_html_element t
2246 ins_mode = ins_mode_in_column_group
2248 clear_stack_to_table_context()
# an implied colgroup element is inserted rather than the token itself
2249 insert_html_element new_open_tag 'colgroup'
2250 ins_mode = ins_mode_in_column_group
2252 when 'tbody', 'tfoot', 'thead'
2253 clear_stack_to_table_context()
2254 insert_html_element t
2255 ins_mode = ins_mode_in_table_body
2256 when 'td', 'th', 'tr'
2257 clear_stack_to_table_context()
# an implied tbody element is inserted rather than the token itself
2258 insert_html_element new_open_tag 'tbody'
2259 ins_mode = ins_mode_in_table_body
2263 if is_in_table_scope 'table', NS_HTML
2265 el = open_els.shift()
2266 if el.name is 'table' and el.namespace is NS_HTML
2270 when 'style', 'script', 'template'
2273 unless is_input_hidden_tok t
2274 ins_mode_in_table_else t
2277 el = insert_html_element t
2279 t.acknowledge_self_closing()
2282 if form_element_pointer?
2284 if template_tag_is_open()
2286 form_element_pointer = insert_html_element t
2289 ins_mode_in_table_else t
2293 if is_in_table_scope 'table', NS_HTML
2295 el = open_els.shift()
2296 if el.name is 'table' and el.namespace is NS_HTML
2301 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2306 ins_mode_in_table_else t
2310 ins_mode_in_table_else t
2313 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Insertion-mode handler for the "in table text" mode. Takes one token `t`.
# Text tokens are buffered in pending_table_character_tokens (U+0000 text is
# tested for separately first). For any other token the buffered tokens are
# scanned with is_space_tok, then either inserted via insert_character or
# passed one by one to ins_mode_in_table_else; finally the buffer is reset
# and ins_mode is restored from original_ins_mode.
# NOTE(review): the lines choosing between the two flush loops are not shown
# in this listing.
2314 ins_mode_in_table_text = (t) ->
2315 if t.type is TYPE_TEXT and t.text is "\u0000"
2319 if t.type is TYPE_TEXT
2320 pending_table_character_tokens.push t
2324 for old in pending_table_character_tokens
2325 unless is_space_tok old
2329 for old in pending_table_character_tokens
2330 insert_character old
2332 for old in pending_table_character_tokens
2333 ins_mode_in_table_else old
2334 pending_table_character_tokens = []
2335 ins_mode = original_ins_mode
2338 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
# Insertion-mode handler for the "in caption" mode. Takes one token `t`.
# Both the </caption> branch and the table-structure-tag / </table> branch
# require a 'caption' element in table scope, pop open_els until a caption
# element is removed, clear the active formatting elements up to the last
# marker and switch to ins_mode_in_table. A further group of end tags is
# matched by the last visible `if`.
# NOTE(review): bodies of some branches are not shown in this listing.
2339 ins_mode_in_caption = (t) ->
2340 if t.type is TYPE_END_TAG and t.name is 'caption'
2341 if is_in_table_scope 'caption', NS_HTML
2342 generate_implied_end_tags()
2343 if open_els[0].name isnt 'caption'
2346 el = open_els.shift()
2347 if el.name is 'caption' and el.namespace is NS_HTML
2349 clear_afe_to_marker()
2350 ins_mode = ins_mode_in_table
2355 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2357 if is_in_table_scope 'caption', NS_HTML
2359 el = open_els.shift()
2360 if el.name is 'caption' and el.namespace is NS_HTML
2362 clear_afe_to_marker()
2363 ins_mode = ins_mode_in_table
2365 # else fragment case
2367 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2373 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
# Insertion-mode handler for the "in column group" mode. Takes one token `t`.
# Visible branches: comment, doctype, <html>, <col> (inserted and its
# self-closing flag acknowledged), </colgroup>, </col>, <template>/</template>,
# EOF, and a final fallback that returns to ins_mode_in_table when the
# current node is a colgroup.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2374 ins_mode_in_column_group = (t) ->
2378 if t.type is TYPE_COMMENT
2381 if t.type is TYPE_DOCTYPE
2384 if t.type is TYPE_START_TAG and t.name is 'html'
2387 if t.type is TYPE_START_TAG and t.name is 'col'
2388 el = insert_html_element t
2390 t.acknowledge_self_closing()
2392 if t.type is TYPE_END_TAG and t.name is 'colgroup'
# the namespace must be read from the current node (open_els[0]), not from
# the open_els array itself (whose .namespace is always undefined)
2393 if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML
2395 ins_mode = ins_mode_in_table
2399 if t.type is TYPE_END_TAG and t.name is 'col'
2402 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2405 if t.type is TYPE_EOF
2409 if open_els[0].name isnt 'colgroup'
2413 ins_mode = ins_mode_in_table
2417 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
# Insertion-mode handler for the "in table body" mode. Takes one token `t`.
# Visible behaviour:
#  - <tr>: clear the stack to a table-body context, insert the element and
#    switch to ins_mode_in_row
#  - <th>/<td>: same, but an implied 'tr' element is inserted
#  - </tbody>, </tfoot>, </thead>: require the element in table scope, clear
#    the stack to a table-body context and switch to ins_mode_in_table
#  - table-structure start tags / </table>: scan for a tbody/tfoot/thead
#    element (stopping at table_scopers entries), then as above
#  - a further group of end tags is matched by the last visible `if`
# NOTE(review): the loop header that binds `el` in the scope scan is not
# shown in this listing.
2418 ins_mode_in_table_body = (t) ->
2419 if t.type is TYPE_START_TAG and t.name is 'tr'
2420 clear_stack_to_table_body_context()
2421 insert_html_element t
2422 ins_mode = ins_mode_in_row
2424 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2426 clear_stack_to_table_body_context()
2427 insert_html_element new_open_tag 'tr'
2428 ins_mode = ins_mode_in_row
2431 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2432 unless is_in_table_scope t.name, NS_HTML
2435 clear_stack_to_table_body_context()
2437 ins_mode = ins_mode_in_table
2439 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2442 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2445 if table_scopers[el.name] is el.namespace
2450 clear_stack_to_table_body_context()
2452 ins_mode = ins_mode_in_table
2455 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2461 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
# Insertion-mode handler for the "in row" mode. Takes one token `t`.
# Visible behaviour:
#  - <th>/<td>: clear the stack to a table-row context, insert the element
#    and switch to ins_mode_in_cell
#  - </tr>, table-structure start tags / </table>, and </tbody>/</tfoot>/
#    </thead>: when a 'tr' is in table scope, clear the stack to a table-row
#    context and switch to ins_mode_in_table_body
#  - a further group of end tags is matched by the last visible `if`
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2462 ins_mode_in_row = (t) ->
2463 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2464 clear_stack_to_table_row_context()
2465 insert_html_element t
2466 ins_mode = ins_mode_in_cell
2469 if t.type is TYPE_END_TAG and t.name is 'tr'
2470 if is_in_table_scope 'tr', NS_HTML
2471 clear_stack_to_table_row_context()
2473 ins_mode = ins_mode_in_table_body
2477 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2478 if is_in_table_scope 'tr', NS_HTML
2479 clear_stack_to_table_row_context()
2481 ins_mode = ins_mode_in_table_body
2486 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2487 if is_in_table_scope t.name, NS_HTML
2488 if is_in_table_scope 'tr', NS_HTML
2489 clear_stack_to_table_row_context()
2491 ins_mode = ins_mode_in_table_body
2496 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2502 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Body of the "close the cell" helper: generate implied end tags, check that
# the current node is an HTML td/th element, pop open_els until a td/th
# element has been removed, clear the active formatting elements up to the
# last marker and switch to ins_mode_in_row.
# NOTE(review): the helper's `name = ->` header line is not shown in this
# listing.
2504 generate_implied_end_tags()
# compare the element's name (not the element object itself) against 'th'
2505 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2508 el = open_els.shift()
2509 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2511 clear_afe_to_marker()
2512 ins_mode = ins_mode_in_row
2514 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
# Insertion-mode handler for the "in cell" mode. Takes one token `t`.
# Visible behaviour:
#  - </td>, </th>: when that element is in table scope, generate implied end
#    tags, pop open_els until the matching element is removed, clear the
#    active formatting elements to the last marker, switch to ins_mode_in_row
#  - table-structure start tags: scan for a td/th element (stopping at
#    table_scopers entries)
#  - two further groups of end tags are matched by the last visible `if`s
# NOTE(review): the loop header that binds `el` in the scope scan and the
# remaining branch bodies are not shown in this listing.
2515 ins_mode_in_cell = (t) ->
2516 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2517 if is_in_table_scope t.name, NS_HTML
2518 generate_implied_end_tags()
2519 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2522 el = open_els.shift()
2523 if el.name is t.name and el.namespace is NS_HTML
2525 clear_afe_to_marker()
2526 ins_mode = ins_mode_in_row
2530 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2533 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2536 if table_scopers[el.name] is el.namespace
2544 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2547 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2548 if is_in_table_scope t.name, NS_HTML
2557 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
# Insertion-mode handler for the "in select" mode. Takes one token `t`.
# Visible branches: U+0000 text, other text, comment, doctype, <html>,
# <option>, <optgroup>, </optgroup>, </option>, </select>, a nested <select>,
# <input>/<keygen>/<textarea>, <script>/<template>, and EOF. Several branches
# pop open_els until a 'select' element has been removed.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2558 ins_mode_in_select = (t) ->
2559 if t.type is TYPE_TEXT and t.text is "\u0000"
2562 if t.type is TYPE_TEXT
2565 if t.type is TYPE_COMMENT
2568 if t.type is TYPE_DOCTYPE
2571 if t.type is TYPE_START_TAG and t.name is 'html'
2574 if t.type is TYPE_START_TAG and t.name is 'option'
2575 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2577 insert_html_element t
2579 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2580 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2582 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2584 insert_html_element t
2586 if t.type is TYPE_END_TAG and t.name is 'optgroup'
# namespace comparison uses `is` (equality); `in` would test membership
2587 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
# both the name and the namespace are checked on the node just above the
# current node (open_els[1])
2588 if open_els[1].name is 'optgroup' and open_els[1].namespace is NS_HTML
2590 if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2595 if t.type is TYPE_END_TAG and t.name is 'option'
2596 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2601 if t.type is TYPE_END_TAG and t.name is 'select'
2602 if is_in_select_scope 'select', NS_HTML
2604 el = open_els.shift()
2605 if el.name is 'select' and el.namespace is NS_HTML
2611 if t.type is TYPE_START_TAG and t.name is 'select'
2614 el = open_els.shift()
2615 if el.name is 'select' and el.namespace is NS_HTML
2618 # spec says that this is the same as </select> but it doesn't say
2619 # to check scope first
2621 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2623 if is_in_select_scope 'select', NS_HTML
2626 el = open_els.shift()
2627 if el.name is 'select' and el.namespace is NS_HTML
2632 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2635 if t.type is TYPE_EOF
2642 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
# Insertion-mode handler for the "in select in table" mode. Takes one token
# `t`. Table-related start tags and end tags pop open_els until a 'select'
# element has been removed (end tags are first checked against table scope);
# every other token is delegated to ins_mode_in_select.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2643 ins_mode_in_select_in_table = (t) ->
2644 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2647 el = open_els.shift()
2648 if el.name is 'select' and el.namespace is NS_HTML
2653 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2655 unless is_in_table_scope t.name, NS_HTML
2658 el = open_els.shift()
2659 if el.name is 'select' and el.namespace is NS_HTML
2665 ins_mode_in_select t
2668 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
# Insertion-mode handler for the "in template" mode. Takes one token `t`.
# For start tags, the entry at index 0 of template_ins_modes is replaced with
# the insertion mode chosen by tag name (in table, in column group, in table
# body, in row, or in body) and ins_mode is set to that same mode. On EOF with
# a template still open, open_els is popped until a 'template' element is
# removed, the active formatting elements are cleared to the last marker and
# one entry is removed from template_ins_modes.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2669 ins_mode_in_template = (t) ->
2670 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2673 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2676 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2677 template_ins_modes.shift()
2678 template_ins_modes.unshift ins_mode_in_table
2679 ins_mode = ins_mode_in_table
2682 if t.type is TYPE_START_TAG and t.name is 'col'
2683 template_ins_modes.shift()
2684 template_ins_modes.unshift ins_mode_in_column_group
2685 ins_mode = ins_mode_in_column_group
2688 if t.type is TYPE_START_TAG and t.name is 'tr'
2689 template_ins_modes.shift()
2690 template_ins_modes.unshift ins_mode_in_table_body
2691 ins_mode = ins_mode_in_table_body
2694 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2695 template_ins_modes.shift()
2696 template_ins_modes.unshift ins_mode_in_row
2697 ins_mode = ins_mode_in_row
2700 if t.type is TYPE_START_TAG
2701 template_ins_modes.shift()
2702 template_ins_modes.unshift ins_mode_in_body
2703 ins_mode = ins_mode_in_body
2706 if t.type is TYPE_END_TAG
2709 if t.type is TYPE_EOF
2710 unless template_tag_is_open()
2715 el = open_els.shift()
2716 if el.name is 'template' and el.namespace is NS_HTML
2718 clear_afe_to_marker()
2719 template_ins_modes.shift()
2723 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
# Insertion-mode handler for the "after body" mode. Takes one token `t`.
# Visible branches: comment, doctype, <html>, </html> (switches to
# ins_mode_after_after_body unless fragment parsing), EOF, and a fallback that
# switches back to ins_mode_in_body.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2724 ins_mode_after_body = (t) ->
2728 if t.type is TYPE_COMMENT
# The comment goes into the FIRST element of the stack of open elements (the
# html element). open_els keeps the current node at index 0, so the first
# element is the one at the highest index.
2729 insert_comment t, [open_els[open_els.length - 1], open_els[open_els.length - 1].children.length]
2731 if t.type is TYPE_DOCTYPE
2734 if t.type is TYPE_START_TAG and t.name is 'html'
2737 if t.type is TYPE_END_TAG and t.name is 'html'
2738 if flag_fragment_parsing
2741 ins_mode = ins_mode_after_after_body
2743 if t.type is TYPE_EOF
2748 ins_mode = ins_mode_in_body
2751 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
# Insertion-mode handler for the "in frameset" mode. Takes one token `t`.
# Visible branches: comment, doctype, <html>, <frameset> (inserted),
# </frameset> (returns early in the fragment case when only one element is
# open; otherwise may switch to ins_mode_after_frameset), <frame> (inserted
# and self-closing flag acknowledged), <noframes>, and EOF.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2752 ins_mode_in_frameset = (t) ->
2756 if t.type is TYPE_COMMENT
2759 if t.type is TYPE_DOCTYPE
2762 if t.type is TYPE_START_TAG and t.name is 'html'
2765 if t.type is TYPE_START_TAG and t.name is 'frameset'
2766 insert_html_element t
2768 if t.type is TYPE_END_TAG and t.name is 'frameset'
2769 if open_els.length is 1
2771 return # fragment case
# only leave frameset mode when not fragment-parsing and the current node is
# no longer a frameset
2773 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2774 ins_mode = ins_mode_after_frameset
2776 if t.type is TYPE_START_TAG and t.name is 'frame'
2777 insert_html_element t
2779 t.acknowledge_self_closing()
2781 if t.type is TYPE_START_TAG and t.name is 'noframes'
2784 if t.type is TYPE_EOF
2785 if open_els.length isnt 1
2793 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion-mode handler for the "after frameset" mode. Takes one token `t`.
# Visible branches: comment, doctype, <html>, </html> (switches to
# ins_mode_after_after_frameset), <noframes>, and EOF.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2794 ins_mode_after_frameset = (t) ->
2798 if t.type is TYPE_COMMENT
2801 if t.type is TYPE_DOCTYPE
2804 if t.type is TYPE_START_TAG and t.name is 'html'
2807 if t.type is TYPE_END_TAG and t.name is 'html'
# the parser's current-mode variable is ins_mode; assigning to any other name
# would only create an unused local and leave the mode unchanged
2808 ins_mode = ins_mode_after_after_frameset
2810 if t.type is TYPE_START_TAG and t.name is 'noframes'
2813 if t.type is TYPE_EOF
2820 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
# Insertion-mode handler for the "after after body" mode. Takes one token `t`.
# Comments are appended as the last child of `doc`; doctype, whitespace and
# <html> start tags share one branch; EOF has its own branch; the fallback
# switches back to ins_mode_in_body.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2821 ins_mode_after_after_body = (t) ->
2822 if t.type is TYPE_COMMENT
2823 insert_comment t, [doc, doc.children.length]
2825 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2828 if t.type is TYPE_EOF
2833 ins_mode = ins_mode_in_body
2837 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
# Insertion-mode handler for the "after after frameset" mode. Takes one token
# `t`. Comments are appended as the last child of `doc`; doctype, whitespace
# and <html> start tags share one branch; EOF and <noframes> have their own.
# NOTE(review): this listing has gaps in its numbering, so some statements of
# each branch are not shown here.
2838 ins_mode_after_after_frameset = (t) ->
2839 if t.type is TYPE_COMMENT
2840 insert_comment t, [doc, doc.children.length]
2842 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2845 if t.type is TYPE_EOF
2848 if t.type is TYPE_START_TAG and t.name is 'noframes'
2855 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
# Predicate used by in_foreign_content for <font> start tags: tests an
# attribute name (a[0]) against 'color', 'face' and 'size'.
# NOTE(review): the loop that binds `a` and the return statements are not
# shown in this listing; presumably it iterates the token's attributes.
2856 has_color_face_or_size = (t) ->
2858 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
# Helper called by in_foreign_content (for an SVG </script> end tag) and by
# in_foreign_content_other_start (for a self-closing script start tag).
# NOTE(review): the body of this function is not shown in this listing.
2861 in_foreign_content_end_script = ->
# Handles "any other start tag" in foreign content. Based on the namespace of
# the adjusted current node it adjusts MathML attributes, or fixes the SVG tag
# name via svg_name_fixes and adjusts SVG attributes; then adjusts foreign
# attributes and inserts the element in that namespace. For self-closing
# tokens the flag is acknowledged, and a 'script' token additionally triggers
# in_foreign_content_end_script.
# NOTE(review): this listing has gaps in its numbering, so some statements
# (e.g. around the second acknowledge_self_closing) are not shown here.
2865 in_foreign_content_other_start = (t) ->
2866 acn = adjusted_current_node()
2867 if acn.namespace is NS_MATHML
2868 adjust_mathml_attributes t
2869 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2870 t.name = svg_name_fixes[t.name]
2871 if acn.namespace is NS_SVG
2872 adjust_svg_attributes t
2873 adjust_foreign_attributes t
2874 insert_foreign_element t, acn.namespace
2875 if t.flag 'self-closing'
2876 if t.name is 'script'
2877 t.acknowledge_self_closing()
2878 in_foreign_content_end_script()
2882 t.acknowledge_self_closing()
# Token handler for foreign (MathML/SVG) content. Takes one token `t`.
# Visible behaviour:
#  - U+0000 text is replaced by a U+FFFD character token
#  - other text clears flag_frameset_ok
#  - a fixed list of HTML start tags (plus <font> with color/face/size) is
#    treated as a breakout: in the fragment case it is handled like any other
#    start tag, otherwise a loop runs until the current node is a MathML text
#    integration point, an HTML integration point, or an HTML element
#  - other start tags go to in_foreign_content_other_start
#  - </script> on an SVG script element calls in_foreign_content_end_script
#  - other end tags walk the stack comparing lowercased node names with
#    t.name, falling back to the current HTML insertion mode (ins_mode t)
# NOTE(review): this listing has gaps in its numbering; loop headers and
# several branch bodies are not shown here.
2884 in_foreign_content = (t) ->
2885 if t.type is TYPE_TEXT and t.text is "\u0000"
2887 insert_character new_character_token "\ufffd"
2892 if t.type is TYPE_TEXT
2893 flag_frameset_ok = false
2896 if t.type is TYPE_COMMENT
2899 if t.type is TYPE_DOCTYPE
2902 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2904 if flag_fragment_parsing
2905 in_foreign_content_other_start t
2907 loop # is this safe?
2909 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
2913 if t.type is TYPE_START_TAG
2914 in_foreign_content_other_start t
2916 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2917 in_foreign_content_end_script()
2919 if t.type is TYPE_END_TAG
2922 if node.name.toLowerCase() isnt t.name
# open_els[open_els.length - 1] is the first (topmost) element of the stack
2925 if node is open_els[open_els.length - 1]
2927 if node.name.toLowerCase() is t.name
2929 el = open_els.shift()
2934 if node.namespace is NS_HTML
2936 ins_mode t # explicitly call HTML insertion mode
2939 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
# Body of the tokenizer "data" state: consumes one character from txt
# (advancing cur) and either returns a text node (character reference,
# U+FFFD replacement, or the character itself), returns an EOF token, or
# switches to tok_state_tag_open.
# NOTE(review): the `name = ->` header and the `when` labels of the switch
# are not shown in this listing.
2941 switch c = txt.charAt(cur++)
2943 return new_text_node parse_character_reference()
2945 tok_state = tok_state_tag_open
2948 return new_text_node "\ufffd"
2950 return new_eof_token()
2952 return new_text_node c
2955 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2956 # not needed: tok_state_character_reference_in_data = ->
2957 # just call parse_character_reference()
2959 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
# Tokenizer "RCDATA" state: consumes one character from txt (advancing cur)
# and either returns a token (character reference, U+FFFD replacement, EOF, or
# the character itself) or switches to tok_state_rcdata_less_than_sign.
# NOTE(review): the `when` labels of the switch are not shown in this listing.
2960 tok_state_rcdata = ->
2961 switch c = txt.charAt(cur++)
2963 return new_text_node parse_character_reference()
2965 tok_state = tok_state_rcdata_less_than_sign
2968 return new_character_token "\ufffd"
2970 return new_eof_token()
2972 return new_character_token c
2975 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2976 # not needed: tok_state_character_reference_in_rcdata = ->
2977 # just call parse_character_reference()
2979 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
# Tokenizer "RAWTEXT" state: consumes one character from txt (advancing cur)
# and either returns a character token (U+FFFD replacement or the character
# itself), returns an EOF token, or switches to
# tok_state_rawtext_less_than_sign.
# NOTE(review): the `when` labels of the switch are not shown in this listing.
2980 tok_state_rawtext = ->
2981 switch c = txt.charAt(cur++)
2983 tok_state = tok_state_rawtext_less_than_sign
2986 return new_character_token "\ufffd"
2988 return new_eof_token()
2990 return new_character_token c
2993 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
# Tokenizer "script data" state: consumes one character from txt (advancing
# cur) and either returns a character token (U+FFFD replacement or the
# character itself), returns an EOF token, or switches to
# tok_state_script_data_less_than_sign.
# NOTE(review): the `when` labels of the switch are not shown in this listing.
2994 tok_state_script_data = ->
2995 switch c = txt.charAt(cur++)
2997 tok_state = tok_state_script_data_less_than_sign
3000 return new_character_token "\ufffd"
3002 return new_eof_token()
3004 return new_character_token c
3007 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
# Tokenizer "PLAINTEXT" state: consumes one character from txt (advancing cur)
# and returns a U+FFFD character token, an EOF token, or a character token for
# the character itself. No visible branch changes tok_state.
# NOTE(review): the `when` labels of the switch are not shown in this listing.
3008 tok_state_plaintext = ->
3009 switch c = txt.charAt(cur++)
3012 return new_character_token "\ufffd"
3014 return new_eof_token()
3016 return new_character_token c
3020 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
# Tokenizer "tag open" state (entered after '<'): consumes one character and
# switches to markup-declaration-open, end-tag-open, bogus-comment (starting a
# comment token with '?'), or tag-name (starting a new open tag, lowercasing
# upper-case letters). Otherwise it returns to the data state, un-consumes the
# character and emits a literal '<' text node.
# NOTE(review): the `when` labels and some branch lines are not shown in this
# listing.
3021 tok_state_tag_open = ->
3022 switch c = txt.charAt(cur++)
3024 tok_state = tok_state_markup_declaration_open
3026 tok_state = tok_state_end_tag_open
3029 tok_cur_tag = new_comment_token '?'
3030 tok_state = tok_state_bogus_comment
3033 tok_cur_tag = new_open_tag c
3034 tok_state = tok_state_tag_name
3035 else if is_uc_alpha(c)
3036 tok_cur_tag = new_open_tag c.toLowerCase()
3037 tok_state = tok_state_tag_name
3040 tok_state = tok_state_data
3041 cur -= 1 # we didn't parse/handle the char after <
3042 return new_text_node '<'
3045 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
# Tokenizer "end tag open" state (entered after '</'): consumes one character
# and either returns to the data state (one branch emitting a literal '</'
# text node), starts a new end tag and switches to tok_state_tag_name
# (lowercasing upper-case letters), or starts a comment token with '/' and
# switches to tok_state_bogus_comment.
# NOTE(review): the `when` labels and some branch lines are not shown in this
# listing.
3046 tok_state_end_tag_open = ->
3047 switch c = txt.charAt(cur++)
3050 tok_state = tok_state_data
3053 tok_state = tok_state_data
3054 return new_text_node '</'
3057 tok_cur_tag = new_end_tag c.toLowerCase()
3058 tok_state = tok_state_tag_name
3059 else if is_lc_alpha(c)
3060 tok_cur_tag = new_end_tag c
3061 tok_state = tok_state_tag_name
3064 tok_cur_tag = new_comment_token '/'
3065 tok_state = tok_state_bogus_comment
3068 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
# Tokenizer "tag name" state: consumes one character. Tab, LF, FF and space
# switch to tok_state_before_attribute_name; other branches switch to the
# self-closing-start-tag or data state, or append to tok_cur_tag.name
# (U+FFFD, the lowercased character, or the character itself).
# NOTE(review): most `when` labels and some branch lines are not shown in
# this listing.
3069 tok_state_tag_name = ->
3070 switch c = txt.charAt(cur++)
3071 when "\t", "\n", "\u000c", ' '
3072 tok_state = tok_state_before_attribute_name
3074 tok_state = tok_state_self_closing_start_tag
3076 tok_state = tok_state_data
3082 tok_cur_tag.name += "\ufffd"
3085 tok_state = tok_state_data
3088 tok_cur_tag.name += c.toLowerCase()
3090 tok_cur_tag.name += c
3093 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
# Tokenizer "RCDATA less-than sign" state: consumes one character. One branch
# resets temporary_buffer and switches to tok_state_rcdata_end_tag_open;
# otherwise it returns to tok_state_rcdata, reconsumes the character and emits
# a '<' character token.
# NOTE(review): the condition selecting the first branch is not shown in this
# listing.
3094 tok_state_rcdata_less_than_sign = ->
3095 c = txt.charAt(cur++)
3097 temporary_buffer = ''
3098 tok_state = tok_state_rcdata_end_tag_open
3101 tok_state = tok_state_rcdata
3102 cur -= 1 # reconsume the input character
3103 return new_character_token '<'
3105 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
# Tokenizer "RCDATA end tag open" state: consumes one character. Two branches
# start a new end tag (one lowercasing the character), append the original
# character to temporary_buffer and switch to tok_state_rcdata_end_tag_name;
# otherwise it returns to tok_state_rcdata, reconsumes the character and emits
# "</" as a single character token.
# NOTE(review): the conditions selecting the branches are not shown in this
# listing.
3106 tok_state_rcdata_end_tag_open = ->
3107 c = txt.charAt(cur++)
3109 tok_cur_tag = new_end_tag c.toLowerCase()
3110 temporary_buffer += c
3111 tok_state = tok_state_rcdata_end_tag_name
3114 tok_cur_tag = new_end_tag c
3115 temporary_buffer += c
3116 tok_state = tok_state_rcdata_end_tag_name
3119 tok_state = tok_state_rcdata
3120 cur -= 1 # reconsume the input character
3121 return new_character_token "</" # fixfull separate these
3123 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
# Returns true when token `t` is an end tag whose name equals the name of the
# current node (open_els[0]). Also writes a debug_log line with the token's
# type and name and a serialization of open_els.
3124 is_appropriate_end_tag = (t) ->
3125 # spec says to check against "the tag name of the last start tag to
3126 # have been emitted from this tokenizer", but this is only called from
3127 # the various "raw" states, so it's hopefully ok to assume that
3128 # open_els[0].name will work instead TODO: verify this after the script
3129 # data states are implemented
3130 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3131 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3133 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
# Tokenizer "RCDATA end tag name" state: consumes one character. For
# whitespace and two further cases it checks is_appropriate_end_tag on
# tok_cur_tag and, when appropriate, switches to the before-attribute-name,
# self-closing-start-tag or data state respectively. Letter branches append to
# tok_cur_tag.name and temporary_buffer. The final fallback returns to
# tok_state_rcdata, reconsumes the character and emits '</' plus the buffered
# text as one character token.
# NOTE(review): several conditions and return statements are not shown in
# this listing.
3134 tok_state_rcdata_end_tag_name = ->
3135 c = txt.charAt(cur++)
3136 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3137 if is_appropriate_end_tag tok_cur_tag
3138 tok_state = tok_state_before_attribute_name
3140 # else fall through to "Anything else"
3142 if is_appropriate_end_tag tok_cur_tag
3143 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3145 # else fall through to "Anything else"
3147 if is_appropriate_end_tag tok_cur_tag
3148 tok_state = tok_state_data
3150 # else fall through to "Anything else"
3152 tok_cur_tag.name += c.toLowerCase()
3153 temporary_buffer += c
3156 tok_cur_tag.name += c
3157 temporary_buffer += c
3160 tok_state = tok_state_rcdata
3161 cur -= 1 # reconsume the input character
3162 return new_character_token '</' + temporary_buffer # fixfull separate these
3164 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
# Tokenizer "RAWTEXT less-than sign" state: consumes one character. One branch
# resets temporary_buffer and switches to tok_state_rawtext_end_tag_open;
# otherwise it returns to tok_state_rawtext, reconsumes the character and
# emits a '<' character token.
# NOTE(review): the condition selecting the first branch is not shown in this
# listing.
3165 tok_state_rawtext_less_than_sign = ->
3166 c = txt.charAt(cur++)
3168 temporary_buffer = ''
3169 tok_state = tok_state_rawtext_end_tag_open
3172 tok_state = tok_state_rawtext
3173 cur -= 1 # reconsume the input character
3174 return new_character_token '<'
3176 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
# Tokenizer "RAWTEXT end tag open" state: consumes one character. Two branches
# start a new end tag (one lowercasing the character), append the original
# character to temporary_buffer and switch to tok_state_rawtext_end_tag_name;
# otherwise it returns to tok_state_rawtext, reconsumes the character and
# emits "</" as a single character token.
# NOTE(review): the conditions selecting the branches are not shown in this
# listing.
3177 tok_state_rawtext_end_tag_open = ->
3178 c = txt.charAt(cur++)
3180 tok_cur_tag = new_end_tag c.toLowerCase()
3181 temporary_buffer += c
3182 tok_state = tok_state_rawtext_end_tag_name
3185 tok_cur_tag = new_end_tag c
3186 temporary_buffer += c
3187 tok_state = tok_state_rawtext_end_tag_name
3190 tok_state = tok_state_rawtext
3191 cur -= 1 # reconsume the input character
3192 return new_character_token "</" # fixfull separate these
3194 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
# Tokenizer "RAWTEXT end tag name" state: consumes one character. For
# whitespace and two further cases it checks is_appropriate_end_tag on
# tok_cur_tag and, when appropriate, switches to the before-attribute-name,
# self-closing-start-tag or data state respectively. Letter branches append to
# tok_cur_tag.name and temporary_buffer. The final fallback returns to
# tok_state_rawtext, reconsumes the character and emits '</' plus the buffered
# text as one character token.
# NOTE(review): several conditions and return statements are not shown in
# this listing.
3195 tok_state_rawtext_end_tag_name = ->
3196 c = txt.charAt(cur++)
3197 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3198 if is_appropriate_end_tag tok_cur_tag
3199 tok_state = tok_state_before_attribute_name
3201 # else fall through to "Anything else"
3203 if is_appropriate_end_tag tok_cur_tag
3204 tok_state = tok_state_self_closing_start_tag
3206 # else fall through to "Anything else"
3208 if is_appropriate_end_tag tok_cur_tag
3209 tok_state = tok_state_data
3211 # else fall through to "Anything else"
3213 tok_cur_tag.name += c.toLowerCase()
3214 temporary_buffer += c
3217 tok_cur_tag.name += c
3218 temporary_buffer += c
3221 tok_state = tok_state_rawtext
3222 cur -= 1 # reconsume the input character
3223 return new_character_token '</' + temporary_buffer # fixfull separate these
3225 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
# Tokenizer "script data less-than sign" state: consumes one character. One
# branch resets temporary_buffer and switches to
# tok_state_script_data_end_tag_open; another switches to
# tok_state_script_data_escape_start and emits '<!' as a single character
# token; otherwise it returns to tok_state_script_data, reconsumes the
# character and emits '<'.
# NOTE(review): the conditions selecting the branches are not shown in this
# listing.
3226 tok_state_script_data_less_than_sign = ->
3227 c = txt.charAt(cur++)
3229 temporary_buffer = ''
3230 tok_state = tok_state_script_data_end_tag_open
3233 tok_state = tok_state_script_data_escape_start
3234 return new_character_token '<!' # fixfull split
3236 tok_state = tok_state_script_data
3237 cur -= 1 # Reconsume
3238 return new_character_token '<'
3240 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
# Tokenizer "script data end tag open" state: consumes one character. Two
# branches start a new end tag (one lowercasing the character), append the
# original character to temporary_buffer and switch to
# tok_state_script_data_end_tag_name; otherwise it returns to
# tok_state_script_data, reconsumes the character and emits '</'.
# NOTE(review): the conditions selecting the branches are not shown in this
# listing.
3241 tok_state_script_data_end_tag_open = ->
3242 c = txt.charAt(cur++)
3244 tok_cur_tag = new_end_tag c.toLowerCase()
3245 temporary_buffer += c
3246 tok_state = tok_state_script_data_end_tag_name
3249 tok_cur_tag = new_end_tag c
3250 temporary_buffer += c
3251 tok_state = tok_state_script_data_end_tag_name
3254 tok_state = tok_state_script_data
3255 cur -= 1 # Reconsume
3256 return new_character_token '</'
3258 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
# Tokenizer "script data end tag name" state: consumes one character. For
# whitespace and two further cases it checks is_appropriate_end_tag on
# tok_cur_tag and, when appropriate, switches to the before-attribute-name,
# self-closing-start-tag or data state respectively. Letter branches append to
# tok_cur_tag.name and temporary_buffer. The final fallback returns to
# tok_state_script_data, reconsumes the character and emits '</' plus the
# buffered text as one character token.
# NOTE(review): several conditions and return statements are not shown in
# this listing.
3259 tok_state_script_data_end_tag_name = ->
3260 c = txt.charAt(cur++)
3261 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3262 if is_appropriate_end_tag tok_cur_tag
3263 tok_state = tok_state_before_attribute_name
3267 if is_appropriate_end_tag tok_cur_tag
3268 tok_state = tok_state_self_closing_start_tag
3272 if is_appropriate_end_tag tok_cur_tag
3273 tok_state = tok_state_data
3277 tok_cur_tag.name += c.toLowerCase()
3278 temporary_buffer += c
3281 tok_cur_tag.name += c
3282 temporary_buffer += c
3285 tok_state = tok_state_script_data
3286 cur -= 1 # Reconsume
3287 return new_character_token "</#{temporary_buffer}" # fixfull split
3289 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
# Tokenizer "script data escape start" state: consumes one character. One
# branch switches to tok_state_script_data_escape_start_dash and emits '-';
# otherwise it returns to tok_state_script_data and reconsumes the character.
# NOTE(review): the condition selecting the first branch is not shown in this
# listing.
3290 tok_state_script_data_escape_start = ->
3291 c = txt.charAt(cur++)
3293 tok_state = tok_state_script_data_escape_start_dash
3294 return new_character_token '-'
3296 tok_state = tok_state_script_data
3297 cur -= 1 # Reconsume
3300 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
# Tokenizer "script data escape start dash" state: consumes one character.
# One branch switches to tok_state_script_data_escaped_dash_dash and emits
# '-'; otherwise it returns to tok_state_script_data and reconsumes the
# character.
# NOTE(review): the condition selecting the first branch is not shown in this
# listing.
3301 tok_state_script_data_escape_start_dash = ->
3302 c = txt.charAt(cur++)
3304 tok_state = tok_state_script_data_escaped_dash_dash
3305 return new_character_token '-'
3307 tok_state = tok_state_script_data
3308 cur -= 1 # Reconsume
3311 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
# Tokenizer "script data escaped" state: consumes one character. Visible
# branches switch to tok_state_script_data_escaped_dash (emitting '-'), to
# tok_state_script_data_escaped_less_than_sign, emit a U+FFFD character token,
# return to tok_state_data with the character reconsumed, or emit the
# character itself.
# NOTE(review): the conditions selecting the branches are not shown in this
# listing.
3312 tok_state_script_data_escaped = ->
3313 c = txt.charAt(cur++)
3315 tok_state = tok_state_script_data_escaped_dash
3316 return new_character_token '-'
3318 tok_state = tok_state_script_data_escaped_less_than_sign
3322 return new_character_token "\ufffd"
3324 tok_state = tok_state_data
3326 cur -= 1 # Reconsume
3329 return new_character_token c
3331 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3332 tok_state_script_data_escaped_dash = ->
# Tokenizer handler for the "script data escaped dash" state.
# Reads one character (advancing cur). Visible outcomes, in order: emit '-'
# and enter the escaped-dash-dash state; enter the escaped-less-than-sign
# state; emit U+FFFD and return to the escaped state; switch to the data
# state and rewind cur; finally, return to the escaped state and emit c.
# NOTE(review): guards are presumably '-', '<', NUL and end-of-input -- confirm.
3333 c = txt.charAt(cur++)
3335 tok_state = tok_state_script_data_escaped_dash_dash
3336 return new_character_token '-'
3338 tok_state = tok_state_script_data_escaped_less_than_sign
3342 tok_state = tok_state_script_data_escaped
3343 return new_character_token "\ufffd"
3345 tok_state = tok_state_data
3347 cur -= 1 # Reconsume
3350 tok_state = tok_state_script_data_escaped
3351 return new_character_token c
3353 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3354 tok_state_script_data_escaped_dash_dash = ->
# Tokenizer handler for the "script data escaped dash dash" state.
# Reads one character (advancing cur). Visible outcomes, in order: emit '-'
# without changing state; enter the escaped-less-than-sign state; emit '>'
# and return to the plain script data state; emit U+FFFD and go to the
# escaped state; switch to the data state and rewind cur; finally, go to the
# escaped state and emit c.
# NOTE(review): guards are presumably '-', '<', '>', NUL and end-of-input --
# confirm.
3355 c = txt.charAt(cur++)
3357 return new_character_token '-'
3359 tok_state = tok_state_script_data_escaped_less_than_sign
3362 tok_state = tok_state_script_data
3363 return new_character_token '>'
3366 tok_state = tok_state_script_data_escaped
3367 return new_character_token "\ufffd"
3370 tok_state = tok_state_data
3371 cur -= 1 # Reconsume
3374 tok_state = tok_state_script_data_escaped
3375 return new_character_token c
3377 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3378 tok_state_script_data_escaped_less_than_sign = ->
# Tokenizer handler for the "script data escaped less-than sign" state.
# Reads one character (advancing cur). Visible outcomes, in order: clear
# temporary_buffer and enter the escaped-end-tag-open state; two branches
# that seed temporary_buffer with c (lowercased in the first) and enter the
# double-escape-start state while emitting "<" + c as one token; and a
# fallback that returns to the escaped state and rewinds cur.
# NOTE(review): guards are presumably '/', uppercase letter, lowercase letter
# -- confirm.
3379 c = txt.charAt(cur++)
3381 temporary_buffer = ''
3382 tok_state = tok_state_script_data_escaped_end_tag_open
3385 temporary_buffer = c.toLowerCase() # yes, really
3386 tok_state = tok_state_script_data_double_escape_start
3387 return new_character_token "<#{c}" # fixfull split
3389 temporary_buffer = c
3390 tok_state = tok_state_script_data_double_escape_start
3391 return new_character_token "<#{c}" # fixfull split
# NOTE(review): this fallback both rewinds cur and emits c, so c looks like
# it is emitted twice while no '<' is emitted here; the spec's "anything
# else" case emits '<' and reconsumes -- confirm whether '<' was meant.
3393 tok_state = tok_state_script_data_escaped
3394 cur -= 1 # Reconsume
3395 return new_character_token c
3397 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3398 tok_state_script_data_escaped_end_tag_open = ->
# Tokenizer handler for the "script data escaped end tag open" state.
# Reads one character (advancing cur). Two branches start a new end tag
# token named after c (lowercased in the first), append c to
# temporary_buffer and enter the escaped-end-tag-name state. The fallback
# returns to the escaped state, rewinds cur and emits "</" as one token.
# NOTE(review): the two branches are presumably uppercase / lowercase
# letters -- confirm.
3399 c = txt.charAt(cur++)
3401 tok_cur_tag = new_end_tag c.toLowerCase()
3402 temporary_buffer += c
3403 tok_state = tok_state_script_data_escaped_end_tag_name
3406 tok_cur_tag = new_end_tag c
3407 temporary_buffer += c
3408 tok_state = tok_state_script_data_escaped_end_tag_name
3411 tok_state = tok_state_script_data_escaped
3412 cur -= 1 # Reconsume
3413 return new_character_token '</' # fixfull split
3415 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3416 tok_state_script_data_escaped_end_tag_name = ->
# Tokenizer handler for the "script data escaped end tag name" state.
# Reads one character (advancing cur). Three checks against
# is_appropriate_end_tag(tok_cur_tag) hand over to the before-attribute-name,
# self-closing-start-tag and data states (the first on whitespace; the other
# two presumably on '/' and '>' -- confirm). Letter branches extend
# tok_cur_tag.name and temporary_buffer. The fallback returns to the escaped
# state, rewinds cur and emits "</" plus the buffered text as one token.
3417 c = txt.charAt(cur++)
3418 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3419 if is_appropriate_end_tag tok_cur_tag
3420 tok_state = tok_state_before_attribute_name
3424 if is_appropriate_end_tag tok_cur_tag
3425 tok_state = tok_state_self_closing_start_tag
3429 if is_appropriate_end_tag tok_cur_tag
3430 tok_state = tok_state_data
# NOTE(review): unlike tok_state_script_data_end_tag_name, the buffer gets
# c.toLowerCase() here, so the fallback token below would not preserve the
# input's original letter case -- confirm whether plain c was intended.
3434 tok_cur_tag.name += c.toLowerCase()
3435 temporary_buffer += c.toLowerCase()
3438 tok_cur_tag.name += c
3439 temporary_buffer += c.toLowerCase()
3442 tok_state = tok_state_script_data_escaped
3443 cur -= 1 # Reconsume
3444 return new_character_token "</#{temporary_buffer}" # fixfull split
3446 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3447 tok_state_script_data_double_escape_start = ->
# Tokenizer handler for the "script data double escape start" state.
# Reads one character (advancing cur). On whitespace, '/' or '>', the next
# state depends on whether temporary_buffer spells 'script' (double-escaped
# state if so, escaped state shown as the alternative), and c is emitted.
# Letter branches append c to temporary_buffer (lowercased in the first)
# and emit c. The fallback returns to the escaped state and rewinds cur.
3448 c = txt.charAt(cur++)
3449 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3450 if temporary_buffer is 'script'
3451 tok_state = tok_state_script_data_double_escaped
3453 tok_state = tok_state_script_data_escaped
3454 return new_character_token c
3456 temporary_buffer += c.toLowerCase() # yes, really lowercase
3457 return new_character_token c
3459 temporary_buffer += c
3460 return new_character_token c
3462 tok_state = tok_state_script_data_escaped
3463 cur -= 1 # Reconsume
3466 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3467 tok_state_script_data_double_escaped = ->
# Tokenizer handler for the "script data double escaped" state.
# Reads one character (advancing cur). Visible outcomes, in order: emit '-'
# and enter the double-escaped-dash state; emit '<' and enter the
# double-escaped-less-than-sign state; emit U+FFFD; switch to the data state
# and rewind cur; finally, emit c unchanged.
# NOTE(review): guards are presumably '-', '<', NUL and end-of-input -- confirm.
3468 c = txt.charAt(cur++)
3470 tok_state = tok_state_script_data_double_escaped_dash
3471 return new_character_token '-'
3473 tok_state = tok_state_script_data_double_escaped_less_than_sign
3474 return new_character_token '<'
3477 return new_character_token "\ufffd"
3480 tok_state = tok_state_data
3481 cur -= 1 # Reconsume
3484 return new_character_token c
3486 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3487 tok_state_script_data_double_escaped_dash = ->
# Tokenizer handler for the "script data double escaped dash" state.
# Reads one character (advancing cur). Visible outcomes, in order: emit '-'
# and enter the double-escaped-dash-dash state; emit '<' and enter the
# double-escaped-less-than-sign state; emit U+FFFD and return to the
# double-escaped state; switch to the data state and rewind cur; finally,
# return to the double-escaped state and emit c.
# NOTE(review): guards are presumably '-', '<', NUL and end-of-input -- confirm.
3488 c = txt.charAt(cur++)
3490 tok_state = tok_state_script_data_double_escaped_dash_dash
3491 return new_character_token '-'
3493 tok_state = tok_state_script_data_double_escaped_less_than_sign
3494 return new_character_token '<'
3497 tok_state = tok_state_script_data_double_escaped
3498 return new_character_token "\ufffd"
3501 tok_state = tok_state_data
3502 cur -= 1 # Reconsume
3505 tok_state = tok_state_script_data_double_escaped
3506 return new_character_token c
3508 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3509 tok_state_script_data_double_escaped_dash_dash = ->
# Tokenizer handler for the "script data double escaped dash dash" state.
# Reads one character (advancing cur). Visible outcomes, in order: emit '-'
# without changing state; emit '<' and enter the
# double-escaped-less-than-sign state; emit '>' and return to the plain
# script data state; emit U+FFFD and go to the double-escaped state; switch
# to the data state and rewind cur; finally, go to the double-escaped state
# and emit c.
# NOTE(review): guards are presumably '-', '<', '>', NUL and end-of-input --
# confirm.
3510 c = txt.charAt(cur++)
3512 return new_character_token '-'
3514 tok_state = tok_state_script_data_double_escaped_less_than_sign
3515 return new_character_token '<'
3517 tok_state = tok_state_script_data
3518 return new_character_token '>'
3521 tok_state = tok_state_script_data_double_escaped
3522 return new_character_token "\ufffd"
3525 tok_state = tok_state_data
3526 cur -= 1 # Reconsume
3529 tok_state = tok_state_script_data_double_escaped
3530 return new_character_token c
3532 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3533 tok_state_script_data_double_escaped_less_than_sign = ->
# Tokenizer handler for the "script data double escaped less-than sign"
# state. Reads one character (advancing cur). One outcome clears
# temporary_buffer, enters the double-escape-end state and emits '/'
# (presumably when c is '/' -- confirm); the other returns to the
# double-escaped state and rewinds cur.
3534 c = txt.charAt(cur++)
3536 temporary_buffer = ''
3537 tok_state = tok_state_script_data_double_escape_end
3538 return new_character_token '/'
3540 tok_state = tok_state_script_data_double_escaped
3541 cur -= 1 # Reconsume
3544 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3545 tok_state_script_data_double_escape_end = ->
# Tokenizer handler for the "script data double escape end" state.
# Reads one character (advancing cur). On whitespace, '/' or '>', the next
# state depends on whether temporary_buffer spells 'script' (escaped state
# if so, double-escaped state shown as the alternative), and c is emitted.
# Letter branches append c to temporary_buffer (lowercased in the first)
# and emit c. The fallback returns to the double-escaped state and rewinds
# cur.
3546 c = txt.charAt(cur++)
3547 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3548 if temporary_buffer is 'script'
3549 tok_state = tok_state_script_data_escaped
3551 tok_state = tok_state_script_data_double_escaped
3552 return new_character_token c
3554 temporary_buffer += c.toLowerCase() # yes, really lowercase
3555 return new_character_token c
3557 temporary_buffer += c
3558 return new_character_token c
3560 tok_state = tok_state_script_data_double_escaped
3561 cur -= 1 # Reconsume
3564 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3565 tok_state_before_attribute_name = ->
# Tokenizer handler for the "before attribute name" state.
# Switches on the next character of txt (advancing cur). Visible outcomes:
# a whitespace case; transitions to the self-closing-start-tag and data
# states; and several cases that pick an initial attr_name (U+FFFD, or c
# lowercased). The tail of the function pushes a new [name, value] pair onto
# the FRONT of tok_cur_tag.attrs_a and enters the attribute-name state, so
# the attribute being built is always attrs_a[0].
3567 switch c = txt.charAt(cur++)
3568 when "\t", "\n", "\u000c", ' '
3571 tok_state = tok_state_self_closing_start_tag
3574 tok_state = tok_state_data
3580 attr_name = "\ufffd"
3581 when '"', "'", '<', '='
3586 tok_state = tok_state_data
3589 attr_name = c.toLowerCase()
# start the new attribute with an empty value
3593 tok_cur_tag.attrs_a.unshift [attr_name, '']
3594 tok_state = tok_state_attribute_name
3597 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3598 tok_state_attribute_name = ->
# Tokenizer handler for the "attribute name" state.
# Switches on the next character of txt (advancing cur). Whitespace moves to
# the after-attribute-name state; other visible cases move to the
# self-closing-start-tag, before-attribute-value and data states
# (presumably '/', '=' and '>' -- confirm). The remaining cases extend the
# current attribute's name, tok_cur_tag.attrs_a[0][0], with U+FFFD, with c,
# or with c lowercased.
3599 switch c = txt.charAt(cur++)
3600 when "\t", "\n", "\u000c", ' '
3601 tok_state = tok_state_after_attribute_name
3603 tok_state = tok_state_self_closing_start_tag
3605 tok_state = tok_state_before_attribute_value
3607 tok_state = tok_state_data
3613 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3616 tok_cur_tag.attrs_a[0][0] += c
3619 tok_state = tok_state_data
3622 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3624 tok_cur_tag.attrs_a[0][0] += c
3627 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3628 tok_state_after_attribute_name = ->
# Tokenizer handler for the "after attribute name" state.
# Reads one character (advancing cur). Visible outcomes: a whitespace
# check; transitions to the self-closing-start-tag, before-attribute-value
# and data states; branches that start a NEW attribute at the front of
# tok_cur_tag.attrs_a (named c lowercased, U+FFFD, or c) and enter the
# attribute-name state; and a branch that switches to the data state and
# rewinds cur.
3629 c = txt.charAt(cur++)
3630 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3633 tok_state = tok_state_self_closing_start_tag
3636 tok_state = tok_state_before_attribute_value
3639 tok_state = tok_state_data
3642 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3643 tok_state = tok_state_attribute_name
3647 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3648 tok_state = tok_state_attribute_name
3652 tok_state = tok_state_data
3653 cur -= 1 # reconsume
# quotes and '<' get special treatment first, then are still used as the
# first character of a new attribute name
3655 if c is '"' or c is "'" or c is '<'
3657 # fall through to Anything else
3659 tok_cur_tag.attrs_a.unshift [c, '']
3660 tok_state = tok_state_attribute_name
3662 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3663 tok_state_before_attribute_value = ->
# Tokenizer handler for the "before attribute value" state.
# Switches on the next character of txt (advancing cur). Visible outcomes:
# a whitespace case; transitions to the double-quoted, unquoted and
# single-quoted attribute value states; a case that starts the value with
# U+FFFD; two transitions to the data state; and a final case that starts
# the value with c. Values are written to tok_cur_tag.attrs_a[0][1].
3664 switch c = txt.charAt(cur++)
3665 when "\t", "\n", "\u000c", ' '
3668 tok_state = tok_state_attribute_value_double_quoted
3670 tok_state = tok_state_attribute_value_unquoted
3673 tok_state = tok_state_attribute_value_single_quoted
3676 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3677 tok_state = tok_state_attribute_value_unquoted
3680 tok_state = tok_state_data
3686 tok_state = tok_state_data
3688 tok_cur_tag.attrs_a[0][1] += c
3689 tok_state = tok_state_attribute_value_unquoted
3692 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3693 tok_state_attribute_value_double_quoted = ->
# Tokenizer handler for the "attribute value (double-quoted)" state.
# Switches on the next character of txt (advancing cur). Visible outcomes:
# move to the after-attribute-value-(quoted) state; append the result of
# parse_character_reference (called with '"' as the extra allowed character
# and in_attr = true); append U+FFFD; switch to the data state; otherwise
# append c. All appends go to tok_cur_tag.attrs_a[0][1].
3694 switch c = txt.charAt(cur++)
3696 tok_state = tok_state_after_attribute_value_quoted
3698 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3701 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3704 tok_state = tok_state_data
3706 tok_cur_tag.attrs_a[0][1] += c
3709 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3710 tok_state_attribute_value_single_quoted = ->
# Tokenizer handler for the "attribute value (single-quoted)" state.
# Switches on the next character of txt (advancing cur). Visible outcomes:
# move to the after-attribute-value-(quoted) state; append the result of
# parse_character_reference (called with "'" as the extra allowed character
# and in_attr = true); append U+FFFD; switch to the data state; otherwise
# append c. All appends go to tok_cur_tag.attrs_a[0][1].
3711 switch c = txt.charAt(cur++)
3713 tok_state = tok_state_after_attribute_value_quoted
3715 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3718 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3721 tok_state = tok_state_data
3723 tok_cur_tag.attrs_a[0][1] += c
3726 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3727 tok_state_attribute_value_unquoted = ->
# Tokenizer handler for the "attribute value (unquoted)" state.
# Switches on the next character of txt (advancing cur). Whitespace ends the
# value and returns to the before-attribute-name state. Other visible
# outcomes: append the result of parse_character_reference (called with '>'
# as the extra allowed character and in_attr = true); two transitions to
# the data state; append U+FFFD; otherwise append c. All appends go to
# tok_cur_tag.attrs_a[0][1].
3728 switch c = txt.charAt(cur++)
3729 when "\t", "\n", "\u000c", ' '
3730 tok_state = tok_state_before_attribute_name
3732 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3734 tok_state = tok_state_data
3739 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3742 tok_state = tok_state_data
3744 # Parse Error if ', <, = or ` (backtick)
3745 tok_cur_tag.attrs_a[0][1] += c
3748 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3749 tok_state_after_attribute_value_quoted = ->
# Tokenizer handler for the "after attribute value (quoted)" state.
# Switches on the next character of txt (advancing cur). Whitespace returns
# to the before-attribute-name state; other visible cases move to the
# self-closing-start-tag state or the data state (twice). The last case
# returns to the before-attribute-name state and rewinds cur so that state
# handles the character.
3750 switch c = txt.charAt(cur++)
3751 when "\t", "\n", "\u000c", ' '
3752 tok_state = tok_state_before_attribute_name
3754 tok_state = tok_state_self_closing_start_tag
3756 tok_state = tok_state_data
3762 tok_state = tok_state_data
3765 tok_state = tok_state_before_attribute_name
3766 cur -= 1 # we didn't handle that char
3769 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3770 tok_state_self_closing_start_tag = ->
# Tokenizer handler for the "self-closing start tag" state.
# Reads one character (advancing cur). One outcome sets the 'self-closing'
# flag on tok_cur_tag and returns to the data state (presumably when c is
# '>' -- confirm). The other two rewind cur and continue in the data state
# or in the before-attribute-name state respectively.
3771 c = txt.charAt(cur++)
3773 tok_cur_tag.flag 'self-closing', true
3774 tok_state = tok_state_data
3778 tok_state = tok_state_data
3779 cur -= 1 # Reconsume
3783 tok_state = tok_state_before_attribute_name
3784 cur -= 1 # Reconsume
3787 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3788 # WARNING: put a comment token in tok_cur_tag before setting this state
3789 tok_state_bogus_comment = ->
# Tokenizer handler for the "bogus comment" state.
# Unlike the per-character states, this looks ahead for the next '>' at or
# after cur and takes the comment text in one step: either everything from
# cur to the end of txt, or the slice up to (not including) that '>'
# (presumably chosen by whether indexOf found a match -- confirm).
# NUL characters in the slice become U+FFFD, the slice is appended to
# tok_cur_tag.text, and the tokenizer returns to the data state.
3790 next_gt = txt.indexOf '>', cur
3792 val = txt.substr cur
3795 val = txt.substr cur, (next_gt - cur)
3797 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3798 tok_cur_tag.text += val
3799 tok_state = tok_state_data
3802 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3803 tok_state_markup_declaration_open = ->
# Tokenizer handler for the "markup declaration open" state (just after "<!").
# Peeks at the upcoming text without a per-character read:
#  - "--" starts a comment: a fresh comment token goes in tok_cur_tag and the
#    comment-start state is entered;
#  - "doctype" (compared case-insensitively) enters the doctype state;
#  - "[CDATA[" (exact case) enters the CDATA section state, but only when
#    there is an adjusted current node whose namespace is not NS_HTML;
#  - otherwise a fresh comment token is created and the bogus-comment state
#    takes over.
3804 if txt.substr(cur, 2) is '--'
3806 tok_cur_tag = new_comment_token ''
3807 tok_state = tok_state_comment_start
3809 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3811 tok_state = tok_state_doctype
3813 acn = adjusted_current_node()
3814 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3816 tok_state = tok_state_cdata_section
3820 tok_cur_tag = new_comment_token ''
3821 tok_state = tok_state_bogus_comment
3824 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3825 tok_state_comment_start = ->
# Tokenizer handler for the "comment start" state.
# Switches on the next character of txt (advancing cur). Visible outcomes:
# enter the comment-start-dash state; enter the comment state while
# returning a U+FFFD character token; switch to the data state; switch to
# the data state and rewind cur; otherwise append c to tok_cur_tag.text and
# enter the comment state.
# NOTE(review): the U+FFFD branch returns a character token instead of
# appending to tok_cur_tag.text as tok_state_comment_start_dash and
# tok_state_comment do -- confirm whether that is intended.
3826 switch c = txt.charAt(cur++)
3828 tok_state = tok_state_comment_start_dash
3831 tok_state = tok_state_comment
3832 return new_character_token "\ufffd"
3835 tok_state = tok_state_data
3839 tok_state = tok_state_data
3840 cur -= 1 # Reconsume
3843 tok_cur_tag.text += c
3844 tok_state = tok_state_comment
3847 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3848 tok_state_comment_start_dash = ->
# Tokenizer handler for the "comment start dash" state.
# Switches on the next character of txt (advancing cur). Visible outcomes:
# enter the comment-end state; append "-" + U+FFFD to tok_cur_tag.text and
# enter the comment state; switch to the data state; switch to the data
# state and rewind cur; otherwise append "-" + c and enter the comment
# state. The extra "-" restores the dash that led into this state.
3849 switch c = txt.charAt(cur++)
3851 tok_state = tok_state_comment_end
3854 tok_cur_tag.text += "-\ufffd"
3855 tok_state = tok_state_comment
3858 tok_state = tok_state_data
3862 tok_state = tok_state_data
3863 cur -= 1 # Reconsume
3866 tok_cur_tag.text += "-#{c}"
3867 tok_state = tok_state_comment
3870 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3871 tok_state_comment = ->
# Tokenizer handler for the "comment" state (body of a comment).
# Switches on the next character of txt (advancing cur). Visible outcomes:
# enter the comment-end-dash state; append U+FFFD to tok_cur_tag.text;
# switch to the data state and rewind cur; otherwise append c to
# tok_cur_tag.text.
3872 switch c = txt.charAt(cur++)
3874 tok_state = tok_state_comment_end_dash
3877 tok_cur_tag.text += "\ufffd"
3880 tok_state = tok_state_data
3881 cur -= 1 # Reconsume
3884 tok_cur_tag.text += c
3887 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3888 tok_state_comment_end_dash = ->
# Tokenizer handler for the "comment end dash" state.
# Switches on the next character of txt (advancing cur). Visible outcomes:
# enter the comment-end state; append "-" + U+FFFD to tok_cur_tag.text and
# return to the comment state; switch to the data state and rewind cur;
# otherwise append "-" + c and return to the comment state.
3889 switch c = txt.charAt(cur++)
3891 tok_state = tok_state_comment_end
3894 tok_cur_tag.text += "-\ufffd"
3895 tok_state = tok_state_comment
3898 tok_state = tok_state_data
3899 cur -= 1 # Reconsume
3902 tok_cur_tag.text += "-#{c}"
3903 tok_state = tok_state_comment
3906 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3907 tok_state_comment_end = ->
# Tokenizer handler for the "comment end" state (after "--" inside a comment).
# Switches on the next character of txt (advancing cur). Visible outcomes:
# switch to the data state; append "--" + U+FFFD to tok_cur_tag.text and
# return to the comment state; enter the comment-end-bang state; append a
# single "-" and stay in this state; switch to the data state and rewind
# cur; otherwise append "--" + c and return to the comment state.
3908 switch c = txt.charAt(cur++)
3910 tok_state = tok_state_data
3914 tok_cur_tag.text += "--\ufffd"
3915 tok_state = tok_state_comment
3918 tok_state = tok_state_comment_end_bang
3921 tok_cur_tag.text += '-'
3924 tok_state = tok_state_data
3925 cur -= 1 # Reconsume
3929 tok_cur_tag.text += "--#{c}"
3930 tok_state = tok_state_comment
3933 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3934 tok_state_comment_end_bang = ->
# Tokenizer handler for the "comment end bang" state (after "--!").
# Switches on the next character of txt (advancing cur). Visible outcomes:
# append "--!" + c and enter the comment-end-dash state; switch to the data
# state; append "--!" + U+FFFD and return to the comment state; switch to
# the data state and rewind cur; otherwise append "--!" + c and return to
# the comment state.
3935 switch c = txt.charAt(cur++)
# NOTE(review): if this first case handles '-', the spec appends only "--!"
# before the comment-end-dash state (which supplies its own '-'), so
# including c here may add one dash too many -- confirm.
3937 tok_cur_tag.text += "--!#{c}"
3938 tok_state = tok_state_comment_end_dash
3940 tok_state = tok_state_data
3944 tok_cur_tag.text += "--!\ufffd"
3945 tok_state = tok_state_comment
3948 tok_state = tok_state_data
3949 cur -= 1 # Reconsume
3952 tok_cur_tag.text += "--!#{c}"
3953 tok_state = tok_state_comment
3956 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3957 tok_state_doctype = ->
# Tokenizer handler for the "DOCTYPE" state.
# Switches on the next character of txt (advancing cur). Whitespace enters
# the before-doctype-name state. Another case returns to the data state,
# builds an empty-named doctype token flagged 'force-quirks', and rewinds
# cur (presumably the end-of-input case -- confirm). The last case enters
# the before-doctype-name state and rewinds cur.
3958 switch c = txt.charAt(cur++)
3959 when "\t", "\u000a", "\u000c", ' '
3960 tok_state = tok_state_before_doctype_name
3963 tok_state = tok_state_data
3964 el = new_doctype_token ''
3965 el.flag 'force-quirks', true
3966 cur -= 1 # Reconsume
3970 tok_state = tok_state_before_doctype_name
3971 cur -= 1 # Reconsume
3974 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
3975 tok_state_before_doctype_name = ->
# Tokenizer handler for the "before DOCTYPE name" state.
# Reads one character (advancing cur). After a whitespace check, visible
# outcomes are: start a doctype token in tok_cur_tag named c lowercased, or
# U+FFFD, or c as-is, and enter the doctype-name state; or build an
# empty-named doctype token flagged 'force-quirks' and return to the data
# state (once without and once with rewinding cur).
3976 c = txt.charAt(cur++)
3977 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3980 tok_cur_tag = new_doctype_token c.toLowerCase()
3981 tok_state = tok_state_doctype_name
3985 tok_cur_tag = new_doctype_token "\ufffd"
3986 tok_state = tok_state_doctype_name
3990 el = new_doctype_token ''
3991 el.flag 'force-quirks', true
3992 tok_state = tok_state_data
3996 tok_state = tok_state_data
3997 el = new_doctype_token ''
3998 el.flag 'force-quirks', true
3999 cur -= 1 # Reconsume
4002 tok_cur_tag = new_doctype_token c
4003 tok_state = tok_state_doctype_name
4006 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4007 tok_state_doctype_name = ->
# Tokenizer handler for the "DOCTYPE name" state.
# Reads one character (advancing cur). Whitespace enters the
# after-doctype-name state. Other visible outcomes: switch to the data
# state; append c lowercased, U+FFFD, or c to tok_cur_tag.name; or switch
# to the data state, set 'force-quirks' on tok_cur_tag and rewind cur.
4008 c = txt.charAt(cur++)
4009 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4010 tok_state = tok_state_after_doctype_name
4013 tok_state = tok_state_data
4016 tok_cur_tag.name += c.toLowerCase()
4020 tok_cur_tag.name += "\ufffd"
4024 tok_state = tok_state_data
4025 tok_cur_tag.flag 'force-quirks', true
4026 cur -= 1 # Reconsume
4029 tok_cur_tag.name += c
4032 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4033 tok_state_after_doctype_name = ->
# Tokenizer handler for the "after DOCTYPE name" state.
# Reads one character (advancing cur). After a whitespace check, visible
# outcomes are: switch to the data state; switch to the data state with
# 'force-quirks' set and cur rewound; recognise the keywords "public" or
# "system" (case-insensitively) and enter the matching after-keyword state;
# otherwise set 'force-quirks' and enter the bogus-doctype state.
4034 c = txt.charAt(cur++)
4035 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4038 tok_state = tok_state_data
4042 tok_state = tok_state_data
4043 tok_cur_tag.flag 'force-quirks', true
4044 cur -= 1 # Reconsume
# cur - 1 because c has already been consumed: the keyword starts at c
4047 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4049 tok_state = tok_state_after_doctype_public_keyword
4051 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4053 tok_state = tok_state_after_doctype_system_keyword
4056 tok_cur_tag.flag 'force-quirks', true
4057 tok_state = tok_state_bogus_doctype
4060 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4061 tok_state_after_doctype_public_keyword = ->
# Tokenizer handler for the "after DOCTYPE public keyword" state.
# Reads one character (advancing cur). Whitespace enters the
# before-doctype-public-identifier state. Other visible outcomes: reset
# tok_cur_tag.public_identifier to '' and enter the double- or
# single-quoted public identifier state; set 'force-quirks' and switch to
# the data state (once also rewinding cur); otherwise set 'force-quirks'
# and enter the bogus-doctype state.
4062 c = txt.charAt(cur++)
4063 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4064 tok_state = tok_state_before_doctype_public_identifier
4068 tok_cur_tag.public_identifier = ''
4069 tok_state = tok_state_doctype_public_identifier_double_quoted
4073 tok_cur_tag.public_identifier = ''
4074 tok_state = tok_state_doctype_public_identifier_single_quoted
4078 tok_cur_tag.flag 'force-quirks', true
4079 tok_state = tok_state_data
4083 tok_state = tok_state_data
4084 tok_cur_tag.flag 'force-quirks', true
4085 cur -= 1 # Reconsume
4089 tok_cur_tag.flag 'force-quirks', true
4090 tok_state = tok_state_bogus_doctype
4093 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4094 tok_state_before_doctype_public_identifier = ->
# Tokenizer handler for the "before DOCTYPE public identifier" state.
# Reads one character (advancing cur). After a whitespace check, visible
# outcomes are: reset tok_cur_tag.public_identifier to '' and enter the
# double- or single-quoted public identifier state; set 'force-quirks' and
# switch to the data state (once also rewinding cur); otherwise set
# 'force-quirks' and enter the bogus-doctype state.
4095 c = txt.charAt(cur++)
4096 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4100 tok_cur_tag.public_identifier = ''
4101 tok_state = tok_state_doctype_public_identifier_double_quoted
4105 tok_cur_tag.public_identifier = ''
4106 tok_state = tok_state_doctype_public_identifier_single_quoted
4110 tok_cur_tag.flag 'force-quirks', true
4111 tok_state = tok_state_data
4115 tok_state = tok_state_data
4116 tok_cur_tag.flag 'force-quirks', true
4117 cur -= 1 # Reconsume
4121 tok_cur_tag.flag 'force-quirks', true
4122 tok_state = tok_state_bogus_doctype
4126 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4127 tok_state_doctype_public_identifier_double_quoted = ->
# Tokenizer handler for the "DOCTYPE public identifier (double-quoted)"
# state. Reads one character (advancing cur). Visible outcomes: enter the
# after-doctype-public-identifier state; append U+FFFD to
# tok_cur_tag.public_identifier; set 'force-quirks' and switch to the data
# state (once also rewinding cur); otherwise append c to the identifier.
4128 c = txt.charAt(cur++)
4130 tok_state = tok_state_after_doctype_public_identifier
4134 tok_cur_tag.public_identifier += "\ufffd"
4138 tok_cur_tag.flag 'force-quirks', true
4139 tok_state = tok_state_data
4143 tok_state = tok_state_data
4144 tok_cur_tag.flag 'force-quirks', true
4145 cur -= 1 # Reconsume
4148 tok_cur_tag.public_identifier += c
4151 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4152 tok_state_doctype_public_identifier_single_quoted = ->
# Tokenizer handler for the "DOCTYPE public identifier (single-quoted)"
# state. Reads one character (advancing cur). Visible outcomes: enter the
# after-doctype-public-identifier state; append U+FFFD to
# tok_cur_tag.public_identifier; set 'force-quirks' and switch to the data
# state (once also rewinding cur); otherwise append c to the identifier.
4153 c = txt.charAt(cur++)
4155 tok_state = tok_state_after_doctype_public_identifier
4159 tok_cur_tag.public_identifier += "\ufffd"
4163 tok_cur_tag.flag 'force-quirks', true
4164 tok_state = tok_state_data
4168 tok_state = tok_state_data
4169 tok_cur_tag.flag 'force-quirks', true
4170 cur -= 1 # Reconsume
4173 tok_cur_tag.public_identifier += c
4176 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4177 tok_state_after_doctype_public_identifier = ->
# Tokenizer handler for the "after DOCTYPE public identifier" state.
# Reads one character (advancing cur). Whitespace enters the
# between-public-and-system-identifiers state. Other visible outcomes:
# switch to the data state; reset tok_cur_tag.system_identifier to '' and
# enter the double- or single-quoted system identifier state; switch to the
# data state with 'force-quirks' set and cur rewound; otherwise set
# 'force-quirks' and enter the bogus-doctype state.
4178 c = txt.charAt(cur++)
4179 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4180 tok_state = tok_state_between_doctype_public_and_system_identifiers
4183 tok_state = tok_state_data
4187 tok_cur_tag.system_identifier = ''
4188 tok_state = tok_state_doctype_system_identifier_double_quoted
4192 tok_cur_tag.system_identifier = ''
4193 tok_state = tok_state_doctype_system_identifier_single_quoted
4197 tok_state = tok_state_data
4198 tok_cur_tag.flag 'force-quirks', true
4199 cur -= 1 # Reconsume
4203 tok_cur_tag.flag 'force-quirks', true
4204 tok_state = tok_state_bogus_doctype
4207 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4208 tok_state_between_doctype_public_and_system_identifiers = ->
# Tokenizer handler for the "between DOCTYPE public and system identifiers"
# state. Reads one character (advancing cur). After a whitespace check,
# visible outcomes are: switch to the data state; reset
# tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state; switch to the data state with
# 'force-quirks' set and cur rewound; otherwise set 'force-quirks' and
# enter the bogus-doctype state.
4209 c = txt.charAt(cur++)
4210 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4213 tok_state = tok_state_data
4217 tok_cur_tag.system_identifier = ''
4218 tok_state = tok_state_doctype_system_identifier_double_quoted
4222 tok_cur_tag.system_identifier = ''
4223 tok_state = tok_state_doctype_system_identifier_single_quoted
4227 tok_state = tok_state_data
4228 tok_cur_tag.flag 'force-quirks', true
4229 cur -= 1 # Reconsume
4233 tok_cur_tag.flag 'force-quirks', true
4234 tok_state = tok_state_bogus_doctype
4237 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4238 tok_state_after_doctype_system_keyword = ->
# Tokenizer handler for the "after DOCTYPE system keyword" state.
# Reads one character (advancing cur). Whitespace enters the
# before-doctype-system-identifier state. Other visible outcomes: reset
# tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state; set 'force-quirks' and switch to
# the data state (once also rewinding cur); otherwise set 'force-quirks'
# and enter the bogus-doctype state.
4239 c = txt.charAt(cur++)
4240 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4241 tok_state = tok_state_before_doctype_system_identifier
4245 tok_cur_tag.system_identifier = ''
4246 tok_state = tok_state_doctype_system_identifier_double_quoted
4250 tok_cur_tag.system_identifier = ''
4251 tok_state = tok_state_doctype_system_identifier_single_quoted
4255 tok_cur_tag.flag 'force-quirks', true
4256 tok_state = tok_state_data
4260 tok_state = tok_state_data
4261 tok_cur_tag.flag 'force-quirks', true
4262 cur -= 1 # Reconsume
4266 tok_cur_tag.flag 'force-quirks', true
4267 tok_state = tok_state_bogus_doctype
4270 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4271 tok_state_before_doctype_system_identifier = ->
# Tokenizer handler for the "before DOCTYPE system identifier" state.
# Reads one character (advancing cur). After a whitespace check, visible
# outcomes are: reset tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state; set 'force-quirks' and
# switch to the data state (once also rewinding cur); otherwise set
# 'force-quirks' and enter the bogus-doctype state.
4272 c = txt.charAt(cur++)
4273 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4276 tok_cur_tag.system_identifier = ''
4277 tok_state = tok_state_doctype_system_identifier_double_quoted
4280 tok_cur_tag.system_identifier = ''
4281 tok_state = tok_state_doctype_system_identifier_single_quoted
4285 tok_cur_tag.flag 'force-quirks', true
4286 tok_state = tok_state_data
4290 tok_state = tok_state_data
4291 tok_cur_tag.flag 'force-quirks', true
4292 cur -= 1 # Reconsume
4296 tok_cur_tag.flag 'force-quirks', true
4297 tok_state = tok_state_bogus_doctype
4300 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4301 tok_state_doctype_system_identifier_double_quoted = ->
# Tokenizer handler for the "DOCTYPE system identifier (double-quoted)"
# state. Reads one character (advancing cur). Visible outcomes: enter the
# after-doctype-system-identifier state; append U+FFFD to
# tok_cur_tag.system_identifier; set 'force-quirks' and switch to the data
# state (once also rewinding cur); otherwise append c to the identifier.
4302 c = txt.charAt(cur++)
4304 tok_state = tok_state_after_doctype_system_identifier
4308 tok_cur_tag.system_identifier += "\ufffd"
4312 tok_cur_tag.flag 'force-quirks', true
4313 tok_state = tok_state_data
4317 tok_state = tok_state_data
4318 tok_cur_tag.flag 'force-quirks', true
4319 cur -= 1 # Reconsume
4322 tok_cur_tag.system_identifier += c
4325 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4326 tok_state_doctype_system_identifier_single_quoted = ->
# Tokenizer handler for the "DOCTYPE system identifier (single-quoted)"
# state. Reads one character (advancing cur). Visible outcomes: enter the
# after-doctype-system-identifier state; append U+FFFD to
# tok_cur_tag.system_identifier; set 'force-quirks' and switch to the data
# state (once also rewinding cur); otherwise append c to the identifier.
4327 c = txt.charAt(cur++)
4329 tok_state = tok_state_after_doctype_system_identifier
4333 tok_cur_tag.system_identifier += "\ufffd"
4337 tok_cur_tag.flag 'force-quirks', true
4338 tok_state = tok_state_data
4342 tok_state = tok_state_data
4343 tok_cur_tag.flag 'force-quirks', true
4344 cur -= 1 # Reconsume
4347 tok_cur_tag.system_identifier += c
4350 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4351 tok_state_after_doctype_system_identifier = ->
# Tokenizer handler for the "after DOCTYPE system identifier" state.
# Reads one character (advancing cur). After a whitespace check, visible
# outcomes are: switch to the data state; switch to the data state with
# 'force-quirks' set and cur rewound; otherwise enter the bogus-doctype
# state -- deliberately WITHOUT setting 'force-quirks', unlike the other
# doctype states' fallbacks.
4352 c = txt.charAt(cur++)
4353 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4356 tok_state = tok_state_data
4360 tok_state = tok_state_data
4361 tok_cur_tag.flag 'force-quirks', true
4362 cur -= 1 # Reconsume
4366 # do _not_ tok_cur_tag.flag 'force-quirks', true
4367 tok_state = tok_state_bogus_doctype
4370 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4371 tok_state_bogus_doctype = ->
# Tokenizer handler for the "bogus DOCTYPE" state.
# Reads one character (advancing cur). Two visible outcomes return to the
# data state, the second also rewinding cur (presumably '>' and
# end-of-input respectively -- confirm). No other effect on the token is
# visible here.
4372 c = txt.charAt(cur++)
4374 tok_state = tok_state_data
4377 tok_state = tok_state_data
4378 cur -= 1 # Reconsume
4383 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4384 tok_state_cdata_section = ->
# Tokenizer handler for the "CDATA section" state.
# Returns to the data state immediately, then takes the whole section in one
# step: it searches for the "]]>" terminator at or after cur and uses
# either the rest of txt or the slice up to (not including) the terminator
# (presumably chosen by whether indexOf found a match -- confirm). The
# slice is returned as a single character token.
4385 tok_state = tok_state_data
# despite the name, next_gt holds the index of "]]>", not of a lone '>'
4386 next_gt = txt.indexOf ']]>', cur
4388 val = txt.substr cur
4391 val = txt.substr cur, (next_gt - cur)
4393 return new_character_token val # fixfull split
4395 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4396 # Don't set this as a state, just call it
4397 # returns a string (NOT a text node)
4398 parse_character_reference = (allowed_char = null, in_attr = false) ->
# Attempts to consume a character reference starting at txt[cur] (called
# after the '&' has been read, per the call sites in the attribute value
# states).
#
# allowed_char: an extra character that, like whitespace, '<' and '&',
#               means "this is not a character reference" (default null)
# in_attr:      true when called from an attribute value state (default
#               false); NOTE(review): presumably gates the '=' / alnum
#               exceptions near the end -- the guard is not visible here,
#               confirm.
#
# Three phases are visible below: an early bail-out section, numeric
# references (decimal or hex), and named references (with or without the
# ';' terminator).
4399 if cur >= txt.length
# peek only: cur is not advanced by this read
4401 switch c = txt.charAt(cur)
4402 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4403 # explicitly not a parse error
4406 # there has to be "one or more" alnums between & and ; to be a parse error
4409 if cur + 1 >= txt.length
# an 'x' or 'X' after the '#' selects hexadecimal
4411 if txt.charAt(cur + 1).toLowerCase() is 'x'
# count how many characters from start belong to the digit set for this base
4420 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4425 if txt.charAt(start + i) is ';'
4429 code_point = txt.substr(start, i)
# strip leading zeros, keeping at least one digit
4430 while code_point.charAt(0) is '0' and code_point.length > 1
4431 code_point = code_point.substr 1
4432 code_point = parseInt(code_point, base)
# table of code points that map to a replacement string
4433 if unicode_fixes[code_point]?
4435 return unicode_fixes[code_point]
# surrogate range, or beyond the last Unicode code point
4437 if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
# control characters and noncharacters; what happens for them is not visible
# here, but the code point is still returned below
4441 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4443 return from_code_point code_point
# named references: scan the run of alphanumerics starting at cur
4447 if alnum.indexOf(txt.charAt(cur + i)) is -1
4450 # exit early, because parse_error() below needs at least one alnum
4452 if txt.charAt(cur + i) is ';'
4453 i += 1 # include ';' terminator in value
4454 decoded = decode_named_char_ref txt.substr(cur, i)
4461 # no ';' terminator (only legacy char refs)
4463 for i in [2..max] # no prefix matches, so ok to check shortest first
4464 c = legacy_char_refs[txt.substr(cur, i)]
4467 if txt.charAt(cur + i) is '='
4468 # "because some legacy user agents will
4469 # misinterpret the markup in those cases"
4472 if alnum.indexOf(txt.charAt(cur + i)) > -1
4473 # this makes attributes forgiving about url args
4475 # ok, and besides the weird exceptions for attributes...
4476 # return the matching char
4477 cur += i # consume entity chars
4478 parse_error() # because no terminating ";"
4482 return # never reached
4484 # tree constructor initialization
4485 # see comments on TYPE_TAG/etc for the structure of this data
# root node that the tree constructor builds into: an 'html' tag in the HTML
# namespace
4488 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4490 afe = [] # active formatting elements
# stack of template insertion modes
4491 template_ins_modes = []
# current insertion mode of the tree constructor, starting in "initial"
4492 ins_mode = ins_mode_initial
4493 original_ins_mode = ins_mode # TODO check spec
# scripting defaults to enabled unless the caller passes args.scripting
4494 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4495 flag_frameset_ok = true
4497 flag_foster_parenting = false
4498 form_element_pointer = null
# scratch string shared by the script-data tokenizer states
4499 temporary_buffer = null
4500 pending_table_character_tokens = []
4501 head_element_pointer = null
4502 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4503 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4505 # tokenizer initialization
4506 tok_state = tok_state_data
4508 # text pre-processing
4509 # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
# NULs become U+FFFD up front, so tokenizer states never see a raw NUL
4510 txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
# normalise line endings: CRLF first, then any remaining lone CR, to LF
4511 txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4512 txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
# NOTE(review): special-cases one named test input; looks like a debugging
# hook -- confirm it is still wanted.
4514 if args.name is "plain-text-unsafe.dat #4"
4517 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4522 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
4525 serialize_els = (els, shallow, show_ids) ->
# Builds a string by appending each element's own serialize(shallow,
# show_ids) output to serialized.
# NOTE(review): the loop over els and the separator/return handling are
# assumed from the parameter names -- confirm.
4531 serialized += t.serialize shallow, show_ids
# Public API: the parser entry point, debug-log helpers, and the node-type
# and namespace constants callers need to interpret the parsed tree.
4534 module.exports.parse_html = parse_html
4535 module.exports.debug_log_reset = debug_log_reset
4536 module.exports.debug_log_each = debug_log_each
4537 module.exports.TYPE_TAG = TYPE_TAG
4538 module.exports.TYPE_TEXT = TYPE_TEXT
4539 module.exports.TYPE_COMMENT = TYPE_COMMENT
4540 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4541 module.exports.NS_HTML = NS_HTML
4542 module.exports.NS_MATHML = NS_MATHML
4543 module.exports.NS_SVG = NS_SVG