1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses a many different words do indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section splains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an obect of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
95 @flags = args.flags ? {}
99 @id = "#{++prev_node_id}"
100 acknowledge_self_closing: ->
102 @token.flag 'did_self_close'
104 @flag 'did_self_close', true
105 flag: (key, value = null) ->
110 serialize: (shallow = false, show_ids = false) -> # for unit tests
115 ret += JSON.stringify @name
130 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
136 ret += c.serialize shallow, show_ids
140 ret += JSON.stringify @text
143 ret += JSON.stringify @text
145 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
148 when TYPE_AAA_BOOKMARK
149 ret += 'aaa_bookmark'
152 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
155 # helpers: (only take args that are normally known when parser creates nodes)
156 new_open_tag = (name) ->
157 return new Node TYPE_START_TAG, name: name
158 new_end_tag = (name) ->
159 return new Node TYPE_END_TAG, name: name
160 new_element = (name) ->
161 return new Node TYPE_TAG, name: name
162 new_text_node = (txt) ->
163 return new Node TYPE_TEXT, text: txt
164 new_character_token = new_text_node
165 new_comment_token = (txt) ->
166 return new Node TYPE_COMMENT, text: txt
167 new_doctype_token = (name) ->
168 return new Node TYPE_DOCTYPE, name: name
170 return new Node TYPE_EOF
172 return new Node TYPE_AFE_MARKER
173 new_aaa_bookmark = ->
174 return new Node TYPE_AAA_BOOKMARK
176 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
177 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
178 digits = "0123456789"
179 alnum = lc_alpha + uc_alpha + digits
180 hex_chars = digits + "abcdefABCDEF"
182 is_uc_alpha = (str) ->
183 return str.length is 1 and uc_alpha.indexOf(str) > -1
184 is_lc_alpha = (str) ->
185 return str.length is 1 and lc_alpha.indexOf(str) > -1
187 # some SVG elements have dashes in them
188 tag_name_chars = alnum + "-"
190 # http://www.w3.org/TR/html5/infrastructure.html#space-character
191 space_chars = "\u0009\u000a\u000c\u000d\u0020"
193 return txt.length is 1 and space_chars.indexOf(txt) > -1
194 is_space_tok = (t) ->
195 return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
197 is_input_hidden_tok = (t) ->
198 return unless t.type is TYPE_START_TAG
201 if a[1].toLowerCase() is 'hidden'
206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
212 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
232 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
233 raw_text_elements = ['script', 'style']
234 escapable_raw_text_elements = ['textarea', 'title']
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
237 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
256 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262 'determinant', 'diff', 'divergence', 'divide', 'domain',
263 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283 'vectorproduct', 'xor'
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
290 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
291 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
292 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
293 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
294 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
295 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
296 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
297 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
298 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
299 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
300 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
301 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
302 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
303 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
304 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
305 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
306 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
307 wbr:NS_HTML, xmp:NS_HTML,
310 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
311 'annotation-xml':NS_MATHML,
314 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
317 formatting_elements = {
318 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
319 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
323 mathml_text_integration = {
324 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
326 is_mathml_text_integration_point = (el) ->
327 return mathml_text_integration[el.name] = el.namespace
328 is_html_integration = (el) -> # DON'T PASS A TOKEN
329 if el.namespace is NS_MATHML and el.name is 'annotation-xml'
330 if el.attrs.encoding?
331 if el.attrs.encoding.toLowerCase() is 'text/html'
333 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
336 if el.namespace is NS_SVG
337 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
342 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
346 foster_parenting_targets = {
369 el_is_special = (e) ->
370 return special_elements[e.name] is e.namespace
372 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
373 el_is_special_not_adp = (el) ->
374 return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
378 altglyphdef: 'altGlyphDef'
379 altglyphitem: 'altGlyphItem'
380 animatecolor: 'animateColor'
381 animatemotion: 'animateMotion'
382 animatetransform: 'animateTransform'
385 fecolormatrix: 'feColorMatrix'
386 fecomponenttransfer: 'feComponentTransfer'
387 fecomposite: 'feComposite'
388 feconvolvematrix: 'feConvolveMatrix'
389 fediffuselighting: 'feDiffuseLighting'
390 fedisplacementmap: 'feDisplacementMap'
391 fedistantlight: 'feDistantLight'
392 fedropshadow: 'feDropShadow'
398 fegaussianblur: 'feGaussianBlur'
401 femergenode: 'feMergeNode'
402 femorphology: 'feMorphology'
404 fepointlight: 'fePointLight'
405 fespecularlighting: 'feSpecularLighting'
406 fespotlight: 'feSpotLight'
408 feturbulence: 'feTurbulence'
409 foreignobject: 'foreignObject'
411 lineargradient: 'linearGradient'
412 radialgradient: 'radialGradient'
415 svg_attribute_fixes = {
416 attributename: 'attributeName'
417 attributetype: 'attributeType'
418 basefrequency: 'baseFrequency'
419 baseprofile: 'baseProfile'
421 clippathunits: 'clipPathUnits'
422 contentscripttype: 'contentScriptType'
423 contentstyletype: 'contentStyleType'
424 diffuseconstant: 'diffuseConstant'
426 externalresourcesrequired: 'externalResourcesRequired'
427 filterres: 'filterRes'
428 filterunits: 'filterUnits'
430 gradienttransform: 'gradientTransform'
431 gradientunits: 'gradientUnits'
432 kernelmatrix: 'kernelMatrix'
433 kernelunitlength: 'kernelUnitLength'
434 keypoints: 'keyPoints'
435 keysplines: 'keySplines'
437 lengthadjust: 'lengthAdjust'
438 limitingconeangle: 'limitingConeAngle'
439 markerheight: 'markerHeight'
440 markerunits: 'markerUnits'
441 markerwidth: 'markerWidth'
442 maskcontentunits: 'maskContentUnits'
443 maskunits: 'maskUnits'
444 numoctaves: 'numOctaves'
445 pathlength: 'pathLength'
446 patterncontentunits: 'patternContentUnits'
447 patterntransform: 'patternTransform'
448 patternunits: 'patternUnits'
449 pointsatx: 'pointsAtX'
450 pointsaty: 'pointsAtY'
451 pointsatz: 'pointsAtZ'
452 preservealpha: 'preserveAlpha'
453 preserveaspectratio: 'preserveAspectRatio'
454 primitiveunits: 'primitiveUnits'
457 repeatcount: 'repeatCount'
458 repeatdur: 'repeatDur'
459 requiredextensions: 'requiredExtensions'
460 requiredfeatures: 'requiredFeatures'
461 specularconstant: 'specularConstant'
462 specularexponent: 'specularExponent'
463 spreadmethod: 'spreadMethod'
464 startoffset: 'startOffset'
465 stddeviation: 'stdDeviation'
466 stitchtiles: 'stitchTiles'
467 surfacescale: 'surfaceScale'
468 systemlanguage: 'systemLanguage'
469 tablevalues: 'tableValues'
472 textlength: 'textLength'
474 viewtarget: 'viewTarget'
475 xchannelselector: 'xChannelSelector'
476 ychannelselector: 'yChannelSelector'
477 zoomandpan: 'zoomAndPan'
479 adjust_mathml_attributes = (t) ->
481 if a[0] is 'definitionurl'
482 a[0] = 'definitionURL'
484 adjust_svg_attributes = (t) ->
486 if svg_attribute_fixes[a[0]]?
487 a[0] = svg_attribute_fixes[a[0]]
489 adjust_foreign_attributes = (t) ->
493 # decode_named_char_ref()
495 # The list of named character references is _huge_ so ask the browser to decode
496 # for us instead of wasting bandwidth/space on including the table here.
498 # Pass without the "&" but with the ";" examples:
499 # for "&" pass "amp;"
500 # for "′" pass "x2032;"
503 textarea: document.createElement('textarea')
505 # TODO test this in IE8
506 decode_named_char_ref = (txt) ->
508 decoded = g_dncr.cache[txt]
509 return decoded if decoded?
510 g_dncr.textarea.innerHTML = txt
511 decoded = g_dncr.textarea.value
512 return null if decoded is txt
513 return g_dncr.cache[txt] = decoded
515 parse_html = (txt, parse_error_cb = null) ->
516 cur = 0 # index of next char in txt to be parsed
517 # declare doc and tokenizer variables so they're in scope below
519 open_els = null # stack of open elements
520 afe = null # active formatting elements
521 template_ins_modes = null
523 original_ins_mode = null
525 tok_cur_tag = null # partially parsed tag
526 flag_scripting = null
527 flag_frameset_ok = null
529 flag_foster_parenting = null
530 form_element_pointer = null
531 temporary_buffer = null
532 pending_table_character_tokens = null
533 head_element_pointer = null
534 flag_fragment_parsing = null
535 context_element = null
544 console.log "Parse error at character #{cur} of #{txt.length}"
546 afe_push = (new_el) ->
549 if el.name is new_el.name and el.namespace is new_el.namespace
551 continue unless new_el.attrs[k] is v
552 for k, v of new_el.attrs
553 continue unless el.attrs[k] is v
560 afe.unshift new_afe_marker()
562 # the functions below impliment the Tree Contstruction algorithm
563 # http://www.w3.org/TR/html5/syntax.html#tree-construction
565 # But first... the helpers
566 template_tag_is_open = ->
568 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
571 is_in_scope_x = (tag_name, scope, namespace) ->
573 if t.name is tag_name and (namespace is null or namespace is t.namespace)
575 if scope[t.name] is t.namespace
578 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
580 if t.name is tag_name and (namespace is null or namespace is t.namespace)
582 if scope[t.name] is t.namespace
584 if scope2[t.name] is t.namespace
588 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
589 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
590 template: NS_HTML, mi: NS_MATHML,
592 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
593 'annotation-xml': NS_MATHML,
595 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
597 button_scopers = button: NS_HTML
598 li_scopers = ol: NS_HTML, ul: NS_HTML
599 table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
600 is_in_scope = (tag_name, namespace = null) ->
601 return is_in_scope_x tag_name, standard_scopers, namespace
602 is_in_button_scope = (tag_name, namespace = null) ->
603 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
604 is_in_table_scope = (tag_name, namespace = null) ->
605 return is_in_scope_x tag_name, table_scopers, namespace
606 # aka is_in_list_item_scope
607 is_in_li_scope = (tag_name, namespace = null) ->
608 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
609 is_in_select_scope = (tag_name, namespace = null) ->
611 if t.name is tag_name and (namespace is null or namespace is t.namespace)
613 if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
616 # this checks for a particular element, not by name
617 el_is_in_scope = (el) ->
621 if standard_scopers[t.name] is t.namespace
625 clear_to_table_stopers = {
630 clear_stack_to_table_context = ->
632 if clear_to_table_stopers[open_els[0].name]?
636 clear_to_table_body_stopers = {
643 clear_stack_to_table_body_context = ->
645 if clear_to_table_body_stopers[open_els[0].name]?
649 clear_to_table_row_stopers = {
654 clear_stack_to_table_row_context = ->
656 if clear_to_table_row_stopers[open_els[0].name]?
660 clear_afe_to_marker = ->
662 return unless afe.length > 0 # this happens in fragment case, ?spec error
664 if el.type is TYPE_AFE_MARKER
669 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
671 # 1. Let last be false.
673 # 2. Let node be the last node in the stack of open elements.
675 node = open_els[node_i]
676 # 3. Loop: If node is the first node in the stack of open elements,
677 # then set last to true, and, if the parser was originally created as
678 # part of the HTML fragment parsing algorithm (fragment case) set node
679 # to the context element.
681 if node_i is open_els.length - 1
683 # fixfull (fragment case)
685 # 4. If node is a select element, run these substeps:
686 if node.name is 'select'
687 # 1. If last is true, jump to the step below labeled done.
689 # 2. Let ancestor be node.
692 # 3. Loop: If ancestor is the first node in the stack of
693 # open elements, jump to the step below labeled done.
695 if ancestor_i is open_els.length - 1
697 # 4. Let ancestor be the node before ancestor in the stack
700 ancestor = open_els[ancestor_i]
701 # 5. If ancestor is a template node, jump to the step below
703 if ancestor.name is 'template'
705 # 6. If ancestor is a table node, switch the insertion mode
706 # to "in select in table" and abort these steps.
707 if ancestor.name is 'table'
708 ins_mode = ins_mode_in_select_in_table
710 # 7. Jump back to the step labeled loop.
711 # 8. Done: Switch the insertion mode to "in select" and abort
713 ins_mode = ins_mode_in_select
715 # 5. If node is a td or th element and last is false, then switch
716 # the insertion mode to "in cell" and abort these steps.
717 if (node.name is 'td' or node.name is 'th') and last is false
718 ins_mode = ins_mode_in_cell
720 # 6. If node is a tr element, then switch the insertion mode to "in
721 # row" and abort these steps.
723 ins_mode = ins_mode_in_row
725 # 7. If node is a tbody, thead, or tfoot element, then switch the
726 # insertion mode to "in table body" and abort these steps.
727 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
728 ins_mode = ins_mode_in_table_body
730 # 8. If node is a caption element, then switch the insertion mode
731 # to "in caption" and abort these steps.
732 if node.name is 'caption'
733 ins_mode = ins_mode_in_caption
735 # 9. If node is a colgroup element, then switch the insertion mode
736 # to "in column group" and abort these steps.
737 if node.name is 'colgroup'
738 ins_mode = ins_mode_in_column_group
740 # 10. If node is a table element, then switch the insertion mode to
741 # "in table" and abort these steps.
742 if node.name is 'table'
743 ins_mode = ins_mode_in_table
745 # 11. If node is a template element, then switch the insertion mode
746 # to the current template insertion mode and abort these steps.
747 # fixfull (template insertion mode stack)
749 # 12. If node is a head element and last is true, then switch the
750 # insertion mode to "in body" ("in body"! not "in head"!) and abort
751 # these steps. (fragment case)
752 if node.name is 'head' and last
753 ins_mode = ins_mode_in_body
755 # 13. If node is a head element and last is false, then switch the
756 # insertion mode to "in head" and abort these steps.
757 if node.name is 'head' and last is false
758 ins_mode = ins_mode_in_head
760 # 14. If node is a body element, then switch the insertion mode to
761 # "in body" and abort these steps.
762 if node.name is 'body'
763 ins_mode = ins_mode_in_body
765 # 15. If node is a frameset element, then switch the insertion mode
766 # to "in frameset" and abort these steps. (fragment case)
767 if node.name is 'frameset'
768 ins_mode = ins_mode_in_frameset
770 # 16. If node is an html element, run these substeps:
771 if node.name is 'html'
772 # 1. If the head element pointer is null, switch the insertion
773 # mode to "before head" and abort these steps. (fragment case)
774 if head_element_pointer is null
775 ins_mode = ins_mode_before_head
777 # 2. Otherwise, the head element pointer is not null,
778 # switch the insertion mode to "after head" and abort these
780 ins_mode = ins_mode_after_head
782 # 17. If last is true, then switch the insertion mode to "in body"
783 # and abort these steps. (fragment case)
785 ins_mode = ins_mode_in_body
787 # 18. Let node now be the node before node in the stack of open
790 node = open_els[node_i]
791 # 19. Return to the step labeled loop.
795 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
796 adjusted_current_node = ->
797 if open_els.length is 1 and flag_fragment_parsing
798 return context_element
801 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
802 # this implementation is structured (mostly) as described at the link above.
803 # capitalized comments are the "labels" described at the link above.
805 return if afe.length is 0
806 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
811 if i is afe.length - 1
814 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
819 el = insert_html_element afe[i].token
824 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
825 # adoption agency algorithm
827 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
828 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
829 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
830 adoption_agency = (subject) ->
831 debug_log "adoption_agency()"
832 debug_log "tree: #{serialize_els doc.children, false, true}"
833 debug_log "open_els: #{serialize_els open_els, true, true}"
834 debug_log "afe: #{serialize_els afe, true, true}"
835 if open_els[0].name is subject
838 # remove it from the list of active formatting elements (if found)
843 debug_log "aaa: starting off with subject on top of stack, exiting"
850 # 5. Let formatting element be the last element in the list of
851 # active formatting elements that: is between the end of the list
852 # and the last scope marker in the list, if any, or the start of
853 # the list otherwise, and has the tag name subject.
855 for t, fe_of_afe in afe
856 if t.type is TYPE_AFE_MARKER
861 # If there is no such element, then abort these steps and instead
862 # act as described in the "any other end tag" entry above.
864 debug_log "aaa: fe not found in afe"
865 in_body_any_other_end_tag subject
867 # 6. If formatting element is not in the stack of open elements,
868 # then this is a parse error; remove the element from the list, and
871 for t, fe_of_open_els in open_els
876 debug_log "aaa: fe not found in open_els"
878 # "remove it from the list" must mean afe, since it's not in open_els
879 afe.splice fe_of_afe, 1
881 # 7. If formatting element is in the stack of open elements, but
882 # the element is not in scope, then this is a parse error; abort
884 unless el_is_in_scope fe
885 debug_log "aaa: fe not in scope"
888 # 8. If formatting element is not the current node, this is a parse
889 # error. (But do not abort these steps.)
890 unless open_els[0] is fe
893 # 9. Let furthest block be the topmost node in the stack of open
894 # elements that is lower in the stack than formatting element, and
895 # is an element in the special category. There might not be one.
897 fb_of_open_els = null
904 # and continue, to see if there's one that's more "topmost"
905 # 10. If there is no furthest block, then the UA must first pop all
906 # the nodes from the bottom of the stack of open elements, from the
907 # current node up to and including formatting element, then remove
908 # formatting element from the list of active formatting elements,
909 # and finally abort these steps.
911 debug_log "aaa: no fb"
915 afe.splice fe_of_afe, 1
917 # 11. Let common ancestor be the element immediately above
918 # formatting element in the stack of open elements.
919 ca = open_els[fe_of_open_els + 1] # common ancestor
921 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
922 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
923 bookmark = new_aaa_bookmark()
926 afe.splice i, 0, bookmark
928 node = last_node = fb
932 # 3. Let node be the element immediately above node in the
933 # stack of open elements, or if node is no longer in the stack
934 # of open elements (e.g. because it got removed by this
935 # algorithm), the element that was immediately above node in
936 # the stack of open elements before node was removed.
940 node_next = open_els[i + 1]
942 node = node_next ? node_above
943 debug_log "inner loop #{inner}"
944 debug_log "tree: #{serialize_els doc.children, false, true}"
945 debug_log "open_els: #{serialize_els open_els, true, true}"
946 debug_log "afe: #{serialize_els afe, true, true}"
947 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
948 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
949 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
950 debug_log "node: #{node.serialize true, true}"
951 # TODO make sure node_above gets re-set if/when node is removed from open_els
953 # 4. If node is formatting element, then go to the next step in
954 # the overall algorithm.
958 # 5. If inner loop counter is greater than three and node is in
959 # the list of active formatting elements, then remove node from
960 # the list of active formatting elements.
966 debug_log "max out inner"
971 # 6. If node is not in the list of active formatting elements,
972 # then remove node from the stack of open elements and then go
973 # back to the step labeled inner loop.
975 debug_log "not in afe"
978 node_above = open_els[i + 1]
982 debug_log "the bones"
983 # 7. create an element for the token for which the element node
984 # was created, in the HTML namespace, with common ancestor as
985 # the intended parent; replace the entry for node in the list
986 # of active formatting elements with an entry for the new
987 # element, replace the entry for node in the stack of open
988 # elements with an entry for the new element, and let node be
990 new_node = token_to_element node.token, NS_HTML, ca
994 debug_log "replaced in afe"
998 node_above = open_els[i + 1]
999 open_els[i] = new_node
1000 debug_log "replaced in open_els"
1003 # 8. If last node is furthest block, then move the
1004 # aforementioned bookmark to be immediately after the new node
1005 # in the list of active formatting elements.
1010 debug_log "removed bookmark"
1014 # "after" means lower
1015 afe.splice i, 0, bookmark # "after as <-
1016 debug_log "placed bookmark after node"
1017 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1019 # 9. Insert last node into node, first removing it from its
1020 # previous parent node if any.
1021 if last_node.parent?
1022 debug_log "last_node has parent"
1023 for c, i in last_node.parent.children
1025 debug_log "removing last_node from parent"
1026 last_node.parent.children.splice i, 1
1028 node.children.push last_node
1029 last_node.parent = node
1030 # 10. Let last node be node.
1033 # 11. Return to the step labeled inner loop.
1034 # 14. Insert whatever last node ended up being in the previous step
1035 # at the appropriate place for inserting a node, but using common
1036 # ancestor as the override target.
1038 # In the case where fe is immediately followed by fb:
1039 # * inner loop exits out early (node==fe)
1041 # * last_node is still in the tree (not a duplicate)
1042 if last_node.parent?
1043 debug_log "FEFIRST? last_node has parent"
1044 for c, i in last_node.parent.children
1046 debug_log "removing last_node from parent"
1047 last_node.parent.children.splice i, 1
1050 debug_log "after aaa inner loop"
1051 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1052 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1053 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1054 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1055 debug_log "tree: #{serialize_els doc.children, false, true}"
1060 # can't use standard insert token thing, because it's already in
1061 # open_els and must stay at it's current position in open_els
1062 dest = adjusted_insertion_location ca
1063 dest[0].children.splice dest[1], 0, last_node
1064 last_node.parent = dest[0]
1067 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1068 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1069 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1070 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1071 debug_log "tree: #{serialize_els doc.children, false, true}"
1073 # 15. Create an element for the token for which formatting element
1074 # was created, in the HTML namespace, with furthest block as the
1076 new_element = token_to_element fe.token, NS_HTML, fb
1077 # 16. Take all of the child nodes of furthest block and append them
1078 # to the element created in the last step.
1079 while fb.children.length
1080 t = fb.children.shift()
1081 t.parent = new_element
1082 new_element.children.push t
1083 # 17. Append that new element to furthest block.
1084 new_element.parent = fb
1085 fb.children.push new_element
1086 # 18. Remove formatting element from the list of active formatting
1087 # elements, and insert the new element into the list of active
1088 # formatting elements at the position of the aforementioned
1096 afe[i] = new_element
1098 # 19. Remove formatting element from the stack of open elements,
1099 # and insert the new element into the stack of open elements
1100 # immediately below the position of furthest block in that stack.
1101 for t, i in open_els
1103 open_els.splice i, 1
1105 for t, i in open_els
1107 open_els.splice i, 0, new_element
1109 # 20. Jump back to the step labeled outer loop.
1110 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1111 debug_log "tree: #{serialize_els doc.children, false, true}"
1112 debug_log "open_els: #{serialize_els open_els, true, true}"
1113 debug_log "afe: #{serialize_els afe, true, true}"
1114 debug_log "AAA DONE"
1116 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1117 close_p_element = ->
1118 generate_implied_end_tags 'p' # arg is exception
1119 if open_els[0].name isnt 'p'
1121 while open_els.length > 1 # just in case
1122 el = open_els.shift()
1125 close_p_if_in_button_scope = ->
1126 if is_in_button_scope 'p'
1129 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1130 # aka insert_a_character = (t) ->
1131 insert_character = (t) ->
1132 dest = adjusted_insertion_location()
1133 # fixfull check for Document node
1135 prev = dest[0].children[dest[1] - 1]
1136 if prev.type is TYPE_TEXT
1139 dest[0].children.splice dest[1], 0, t
1142 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1143 process_token = (t) ->
1144 acn = adjusted_current_node()
1148 if acn.namespace is NS_HTML
1151 if is_mathml_text_integration_point(acn)
1152 if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1155 if t.type is TYPE_TEXT
1158 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1161 if is_html_integration acn
1162 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1165 if t.type is TYPE_EOF
1168 in_foreign_content t
1172 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1173 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1174 adjusted_insertion_location = (override_target = null) ->
1175 # 1. If there was an override target specified, then let target be the
1178 target = override_target
1179 else # Otherwise, let target be the current node.
1180 target = open_els[0]
1181 # 2. Determine the adjusted insertion location using the first matching
1182 # steps from the following list:
1184 # If foster parenting is enabled and target is a table, tbody, tfoot,
1185 # thead, or tr element Foster parenting happens when content is
1186 # misnested in tables.
1187 if flag_foster_parenting and foster_parenting_targets[target.name]
1188 loop # once. this is here so we can ``break`` to "abort these substeps"
1189 # 1. Let last template be the last template element in the
1190 # stack of open elements, if any.
1191 last_template = null
1192 last_template_i = null
1193 for el, i in open_els
1194 if el.name is 'template' and el.namespace is NS_HTML
1198 # 2. Let last table be the last table element in the stack of
1199 # open elements, if any.
1202 for el, i in open_els
1203 if el.name is 'table' and el.namespace is NS_HTML
1207 # 3. If there is a last template and either there is no last
1208 # table, or there is one, but last template is lower (more
1209 # recently added) than last table in the stack of open
1210 # elements, then: let adjusted insertion location be inside
1211 # last template's template contents, after its last child (if
1212 # any), and abort these substeps.
1213 if last_template and (last_table is null or last_template_i < last_table_i)
1214 target = last_template # fixfull should be it's contents
1215 target_i = target.children.length
1217 # 4. If there is no last table, then let adjusted insertion
1218 # location be inside the first element in the stack of open
1219 # elements (the html element), after its last child (if any),
1220 # and abort these substeps. (fragment case)
1221 if last_table is null
1223 target = open_els[open_els.length - 1]
1224 target_i = target.children.length
1226 # 5. If last table has a parent element, then let adjusted
1227 # insertion location be inside last table's parent element,
1228 # immediately before last table, and abort these substeps.
1229 if last_table.parent?
1230 for c, i in last_table.parent.children
1232 target = last_table.parent
1236 # 6. Let previous element be the element immediately above last
1237 # table in the stack of open elements.
1239 # huh? how could it not have a parent?
1240 previous_element = open_els[last_table_i + 1]
1241 # 7. Let adjusted insertion location be inside previous
1242 # element, after its last child (if any).
1243 target = previous_element
1244 target_i = target.children.length
1245 # Note: These steps are involved in part because it's possible
1246 # for elements, the table element in this case in particular,
1247 # to have been moved by a script around in the DOM, or indeed
1248 # removed from the DOM entirely, after the element was inserted
1250 break # don't really loop
1252 # Otherwise Let adjusted insertion location be inside target, after
1253 # its last child (if any).
1254 target_i = target.children.length
1256 # 3. If the adjusted insertion location is inside a template element,
1257 # let it instead be inside the template element's template contents,
1258 # after its last child (if any).
1259 # fixfull (template)
1261 # 4. Return the adjusted insertion location.
1262 return [target, target_i]
1264 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1265 # aka create_an_element_for_token
1266 token_to_element = (t, namespace, intended_parent) ->
1267 # convert attributes into a hash
1270 attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1271 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1273 # TODO 2. If the newly created element has an xmlns attribute in the
1274 # XMLNS namespace whose value is not exactly the same as the element's
1275 # namespace, that is a parse error. Similarly, if the newly created
1276 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1277 # value is not the XLink Namespace, that is a parse error.
1279 # fixfull: the spec says stuff about form pointers and ownerDocument
1283 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1284 insert_foreign_element = (token, namespace) ->
1285 ail = adjusted_insertion_location()
1288 el = token_to_element token, namespace, ail_el
1289 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1291 ail_el.children.splice ail_i, 0, el
1294 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1295 insert_html_element = (token) ->
1296 insert_foreign_element token, NS_HTML
1298 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1299 # position should be [node, index_within_children]
1300 insert_comment = (t, position = null) ->
1301 position ?= adjusted_insertion_location()
1302 position[0].children.splice position[1], 0, t
1305 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1306 parse_generic_raw_text = (t) ->
1307 insert_html_element t
1308 tok_state = tok_state_rawtext
1309 original_ins_mode = ins_mode
1310 ins_mode = ins_mode_text
1311 parse_generic_rcdata_text = (t) ->
1312 insert_html_element t
1313 tok_state = tok_state_rcdata
1314 original_ins_mode = ins_mode
1315 ins_mode = ins_mode_text
1317 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1318 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1319 generate_implied_end_tags = (except = null) ->
1320 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1323 # 8.2.5.4 The rules for parsing tokens in HTML content
1324 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1326 # 8.2.5.4.1 The "initial" insertion mode
1327 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1328 ins_mode_initial = (t) ->
1331 if t.type is TYPE_COMMENT
1335 if t.type is TYPE_DOCTYPE
1336 # FIXME check identifiers, set quirks, etc
1339 ins_mode = ins_mode_before_html
1342 #fixfull (iframe, quirks)
1343 ins_mode = ins_mode_before_html
1347 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1348 ins_mode_before_html = (t) ->
1349 if t.type is TYPE_DOCTYPE
1352 if t.type is TYPE_COMMENT
1357 if t.type is TYPE_START_TAG and t.name is 'html'
1358 el = token_to_element t, NS_HTML, doc
1359 doc.children.push el
1360 open_els.unshift(el)
1361 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1362 ins_mode = ins_mode_before_head
1364 if t.type is TYPE_END_TAG
1365 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1366 # fall through to "anything else"
1371 html_tok = new_open_tag 'html'
1372 el = token_to_element html_tok, NS_HTML, doc
1373 doc.children.push el
1375 # ?fixfull browsing context
1376 ins_mode = ins_mode_before_head
1380 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1381 ins_mode_before_head = (t) ->
1384 if t.type is TYPE_COMMENT
1387 if t.type is TYPE_DOCTYPE
1390 if t.type is TYPE_START_TAG and t.name is 'html'
1393 if t.type is TYPE_START_TAG and t.name is 'head'
1394 el = insert_html_element t
1395 head_element_pointer = el
1396 ins_mode = ins_mode_in_head
1397 if t.type is TYPE_END_TAG
1398 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1399 # fall through to Anything else below
1404 head_tok = new_open_tag 'head'
1405 el = insert_html_element head_tok
1406 head_element_pointer = el
1407 ins_mode = ins_mode_in_head
1410 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1411 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1412 open_els.shift() # spec says this will be a 'head' node
1413 ins_mode = ins_mode_after_head
1415 ins_mode_in_head = (t) ->
1416 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1419 if t.type is TYPE_COMMENT
1422 if t.type is TYPE_DOCTYPE
1425 if t.type is TYPE_START_TAG and t.name is 'html'
1428 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1429 el = insert_html_element t
1431 t.acknowledge_self_closing()
1433 if t.type is TYPE_START_TAG and t.name is 'meta'
1434 el = insert_html_element t
1436 t.acknowledge_self_closing()
1437 # fixfull encoding stuff
1439 if t.type is TYPE_START_TAG and t.name is 'title'
1440 parse_generic_rcdata_text t
1442 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1443 parse_generic_raw_text t
1445 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1446 insert_html_element t
1447 ins_mode = ins_mode_in_head_noscript
1449 if t.type is TYPE_START_TAG and t.name is 'script'
1450 ail = adjusted_insertion_location()
1451 el = token_to_element t, NS_HTML, ail
1452 el.flag 'parser-inserted', true
1453 # fixfull frament case
1454 ail[0].children.splice ail[1], 0, el
1456 tok_state = tok_state_script_data
1457 original_ins_mode = ins_mode # make sure orig... is defined
1458 ins_mode = ins_mode_text
1460 if t.type is TYPE_END_TAG and t.name is 'head'
1461 open_els.shift() # will be a head element... spec says so
1462 ins_mode = ins_mode_after_head
1464 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1465 ins_mode_in_head_else t
1467 if t.type is TYPE_START_TAG and t.name is 'template'
1468 insert_html_element t
1470 flag_frameset_ok = false
1471 ins_mode = ins_mode_in_template
1472 template_ins_modes.unshift ins_mode_in_template
1474 if t.type is TYPE_END_TAG and t.name is 'template'
1475 if template_tag_is_open()
1476 generate_implied_end_tags
1477 if open_els[0].name isnt 'template'
1480 el = open_els.shift()
1481 if el.name is 'template'
1483 clear_afe_to_marker()
1484 template_ins_modes.shift()
1489 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1492 ins_mode_in_head_else t
1494 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1495 ins_mode_in_head_noscript_else = (t) ->
1498 ins_mode = ins_mode_in_head
1500 ins_mode_in_head_noscript = (t) ->
1501 if t.type is TYPE_DOCTYPE
1504 if t.type is TYPE_START_TAG
1507 if t.type is TYPE_END_TAG and t.name is 'noscript'
1509 ins_mode = ins_mode_in_head
1511 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1514 if t.type is TYPE_END_TAG and t.name is 'br'
1515 ins_mode_in_head_noscript_else t
1517 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1521 ins_mode_in_head_noscript_else t
1526 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1527 ins_mode_after_head_else = (t) ->
1528 body_tok = new_open_tag 'body'
1529 insert_html_element body_tok
1530 ins_mode = ins_mode_in_body
1533 ins_mode_after_head = (t) ->
1537 if t.type is TYPE_COMMENT
1540 if t.type is TYPE_DOCTYPE
1543 if t.type is TYPE_START_TAG and t.name is 'html'
1546 if t.type is TYPE_START_TAG and t.name is 'body'
1547 insert_html_element t
1548 flag_frameset_ok = false
1549 ins_mode = ins_mode_in_body
1551 if t.type is TYPE_START_TAG and t.name is 'frameset'
1552 insert_html_element t
1553 ins_mode = ins_mode_in_frameset
1555 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1557 open_els.unshift head_element_pointer
1559 for el, i of open_els
1560 if el is head_element_pointer
1561 open_els.splice i, 1
1563 console.log "warning: 23904 couldn't find head element in open_els"
1565 if t.type is TYPE_END_TAG and t.name is 'template'
1568 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1569 ins_mode_after_head_else t
1571 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1575 ins_mode_after_head_else t
1577 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1578 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1579 for el, i in open_els
1580 if el.namespace is NS_HTML and el.name is name
1581 generate_implied_end_tags name # arg is exception
1582 parse_error() unless i is 0
1587 if special_elements[el.name] is el.namespace
1591 ins_mode_in_body = (t) ->
1592 if t.type is TYPE_TEXT and t.text is "\u0000"
1599 if t.type is TYPE_TEXT
1602 flag_frameset_ok = false
1604 if t.type is TYPE_COMMENT
1607 if t.type is TYPE_DOCTYPE
1610 if t.type is TYPE_START_TAG and t.name is 'html'
1612 return if template_tag_is_open()
1613 root_attrs = open_els[open_els.length - 1].attrs
1615 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1618 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1621 if t.type is TYPE_START_TAG and t.name is 'body'
1623 return if open_els.length < 2
1624 second = open_els[open_els.length - 2]
1625 return unless second.ns is NS_HTML
1626 return unless second.name is 'body'
1627 return if template_tag_is_open()
1628 frameset_ok_flag = false
1630 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1632 if t.type is TYPE_START_TAG and t.name is 'frameset'
1634 return if open_els.length < 2
1635 second_i = open_els.length - 2
1636 second = open_els[second_i]
1637 return unless second.ns is NS_HTML
1638 return unless second.name is 'body'
1639 flag_frameset_ok = false
1641 for el, i in second.parent.children
1643 second.parent.children.splice i, 1
1645 open_els.splice second_i, 1
1646 # pop everything except the "root html element"
1647 while open_els.length > 1
1649 insert_html_element t
1650 ins_mode = ins_mode_in_frameset
1652 if t.type is TYPE_EOF
1654 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1655 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1656 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1659 unless ok_tags[t.name] is el.namespace
1662 if template_ins_modes.length > 0
1663 ins_mode_in_template t
1667 if t.type is TYPE_END_TAG and t.name is 'body'
1668 unless is_in_scope 'body'
1672 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1673 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1674 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1675 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1679 unless ok_tags[t.name] is el.namespace
1682 ins_mode = ins_mode_after_body
1684 if t.type is TYPE_END_TAG and t.name is 'html'
1685 unless is_in_scope 'body'
1689 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1690 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1691 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1692 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1696 unless ok_tags[t.name] is el.namespace
1699 ins_mode = ins_mode_after_body
1702 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1703 close_p_if_in_button_scope()
1704 insert_html_element t
1706 if t.type is TYPE_START_TAG and h_tags[t.name]?
1707 close_p_if_in_button_scope()
1708 if h_tags[open_els[0]] is NS_HTML
1711 insert_html_element t
1713 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1714 close_p_if_in_button_scope()
1715 insert_html_element t
1716 # spec: If the next token is a "LF" (U+000A) character token, then
1717 # ignore that token and move on to the next one. (Newlines at the
1718 # start of pre blocks are ignored as an authoring convenience.)
1719 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1721 flag_frameset_ok = false
1723 if t.type is TYPE_START_TAG and t.name is 'form'
1724 unless form_element_pointer is null or template_tag_is_open()
1727 close_p_if_in_button_scope()
1728 el = insert_html_element t
1729 unless template_tag_is_open()
1730 form_element_pointer = el
1732 if t.type is TYPE_START_TAG and t.name is 'li'
1733 flag_frameset_ok = false
1734 for node in open_els
1735 if node.name is 'li' and node.namespace is NS_HTML
1736 generate_implied_end_tags 'li' # arg is exception
1737 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1740 el = open_els.shift()
1741 if el.name is 'li' and el.namespace is NS_HTML
1744 if el_is_special_not_adp node
1746 close_p_if_in_button_scope()
1747 insert_html_element t
1749 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1750 flag_frameset_ok = false
1751 for node in open_els
1752 if node.name is 'dd' and node.namespace is NS_HTML
1753 generate_implied_end_tags 'dd' # arg is exception
1754 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1757 el = open_els.shift()
1758 if el.name is 'dd' and el.namespace is NS_HTML
1761 if node.name is 'dt' and node.namespace is NS_HTML
1762 generate_implied_end_tags 'dt' # arg is exception
1763 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1766 el = open_els.shift()
1767 if el.name is 'dt' and el.namespace is NS_HTML
1770 if el_is_special_not_adp node
1772 close_p_if_in_button_scope()
1773 insert_html_element t
1775 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1776 close_p_if_in_button_scope()
1777 insert_html_element t
1778 tok_state = tok_state_plaintext
1780 if t.type is TYPE_START_TAG and t.name is 'button'
1781 if is_in_scope 'button', NS_HTML
1783 generate_implied_end_tags()
1785 el = open_els.shift()
1786 if el.name is 'button' and el.namespace is NS_HTML
1789 insert_html_element t
1790 flag_frameset_ok = false
1792 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1793 unless is_in_scope t.name, NS_HTML
1796 generate_implied_end_tags()
1797 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1800 el = open_els.shift()
1801 if el.name is t.name and el.namespace is NS_HTML
1804 if t.type is TYPE_END_TAG and t.name is 'form'
1805 unless template_tag_is_open()
1806 node = form_element_pointer
1807 form_element_pointer = null
1808 if node is null or not el_is_in_scope node
1811 generate_implied_end_tags()
1812 if open_els[0] isnt node
1814 for el, i in open_els
1816 open_els.splice i, 1
1819 unless is_in_scope 'form', NS_HTML
1822 generate_implied_end_tags()
1823 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1826 el = open_els.shift()
1827 if el.name is 'form' and el.namespace is NS_HTML
1830 if t.type is TYPE_END_TAG and t.name is 'p'
1831 unless is_in_button_scope 'p', NS_HTML
1833 insert_html_element new_open_tag 'p'
1836 if t.type is TYPE_END_TAG and t.name is 'li'
1837 unless is_in_li_scope 'li', NS_HTML
1840 generate_implied_end_tags 'li' # arg is exception
1841 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1844 el = open_els.shift()
1845 if el.name is 'li' and el.namespace is NS_HTML
1848 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1849 unless is_in_scope t.name, NS_HTML
1852 generate_implied_end_tags t.name # arg is exception
1853 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1856 el = open_els.shift()
1857 if el.name is t.name and el.namespace is NS_HTML
1860 if t.type is TYPE_END_TAG and h_tags[t.name]?
1863 if h_tags[el.name] is el.namespace
1866 if standard_scopers[el.name] is el.namespace
1871 generate_implied_end_tags()
1872 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1875 el = open_els.shift()
1876 if h_tags[el.name] is el.namespace
1880 if t.type is TYPE_START_TAG and t.name is 'a'
1881 # If the list of active formatting elements contains an a element
1882 # between the end of the list and the last marker on the list (or
1883 # the start of the list if there is no marker on the list), then
1884 # this is a parse error; run the adoption agency algorithm for the
1885 # tag name "a", then remove that element from the list of active
1886 # formatting elements and the stack of open elements if the
1887 # adoption agency algorithm didn't already remove it (it might not
1888 # have if the element is not in table scope).
1891 if el.type is TYPE_AFE_MARKER
1893 if el.name is 'a' and el.namespace is NS_HTML
1901 for el, i in open_els
1903 open_els.splice i, 1
1905 el = insert_html_element t
1908 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1910 el = insert_html_element t
1913 if t.type is TYPE_START_TAG and t.name is 'nobr'
1915 el = insert_html_element t
1918 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1919 adoption_agency t.name
1921 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1923 insert_html_element t
1925 flag_frameset_ok = false
1927 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1928 unless is_in_scope t.name, NS_HTML
1931 generate_implied_end_tags()
1932 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1935 el = open_els.shift()
1936 if el.name is t.name and el.namespace is NS_HTML
1938 clear_afe_to_marker()
1940 if t.type is TYPE_START_TAG and t.name is 'table'
1941 close_p_if_in_button_scope() # fixfull quirksmode thing
1942 insert_html_element t
1943 flag_frameset_ok = false
1944 ins_mode = ins_mode_in_table
1946 if t.type is TYPE_END_TAG and t.name is 'br'
1948 t.type is TYPE_START_TAG
1950 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1952 insert_html_element t
1954 t.acknowledge_self_closing()
1955 flag_frameset_ok = false
1957 if t.type is TYPE_START_TAG and t.name is 'input'
1959 insert_html_element t
1961 t.acknowledge_self_closing()
1962 unless is_input_hidden_tok t
1963 flag_frameset_ok = false
1965 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1966 insert_html_element t
1968 t.acknowledge_self_closing()
1970 if t.type is TYPE_START_TAG and t.name is 'hr'
1971 close_p_if_in_button_scope()
1972 insert_html_element t
1974 t.acknowledge_self_closing()
1975 flag_frameset_ok = false
1977 if t.type is TYPE_START_TAG and t.name is 'image'
1982 if t.type is TYPE_START_TAG and t.name is 'isindex'
1984 if template_tag_is_open() is false and form_element_pointer isnt null
1986 t.acknowledge_self_closing()
1987 flag_frameset_ok = false
1988 close_p_if_in_button_scope()
1989 el = insert_html_element new_open_tag 'form'
1990 unless template_tag_is_open()
1991 form_element_pointer = el
1994 el.attrs['action'] = a[1]
1996 insert_html_element new_open_tag 'hr'
1999 insert_html_element new_open_tag 'label'
2000 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2001 input_el = new_open_tag 'input'
2006 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2007 input_el.attrs_a.push [a[0], a[1]]
2008 input_el.attrs_a.push ['name', 'isindex']
2009 # fixfull this next bit is in english... internationalize?
2010 prompt ?= "This is a searchable index. Enter search keywords: "
2011 insert_character new_character_token prompt # fixfull split
2012 # TODO submit typo "balue" in spec
2013 insert_html_element input_el
2015 # insert_character '' # you can put chars here if promt attr missing
2017 insert_html_element new_open_tag 'hr'
2020 unless template_tag_is_open()
2021 form_element_pointer = null
2023 if t.type is TYPE_START_TAG and t.name is 'textarea'
2024 insert_html_element t
2025 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2027 tok_state = tok_state_rcdata
2028 original_ins_mode = ins_mode
2029 flag_frameset_ok = false
2030 ins_mode = ins_mode_text
2032 if t.type is TYPE_START_TAG and t.name is 'xmp'
2033 close_p_if_in_button_scope()
2035 flag_frameset_ok = false
2036 parse_generic_raw_text t
2038 if t.type is TYPE_START_TAG and t.name is 'iframe'
2039 flag_frameset_ok = false
2040 parse_generic_raw_text t
2042 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2043 parse_generic_raw_text t
2045 if t.type is TYPE_START_TAG and t.name is 'select'
2047 insert_html_element t
2048 flag_frameset_ok = false
2049 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2050 ins_mode = ins_mode_in_select_in_table
2052 ins_mode = ins_mode_in_select
2054 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2055 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2058 insert_html_element t
2060 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2061 if is_in_scope 'ruby', NS_HTML
2062 generate_implied_end_tags()
2063 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2065 insert_html_element t
2067 if t.type is TYPE_START_TAG and t.name is 'rt'
2068 if is_in_scope 'ruby', NS_HTML
2069 generate_implied_end_tags 'rtc' # arg is exception
2070 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2072 insert_html_element t
2074 if t.type is TYPE_START_TAG and t.name is 'math'
2076 adjust_mathml_attributes t
2077 adjust_foreign_attributes t
2078 insert_foreign_element t, NS_MATHML
2079 if t.flag 'self-closing'
2081 t.acknowledge_self_closing()
2083 if t.type is TYPE_START_TAG and t.name is 'svg'
2085 adjust_svg_attributes t
2086 adjust_foreign_attributes t
2087 insert_foreign_element t, NS_SVG
2088 if t.flag 'self-closing'
2090 t.acknowledge_self_closing()
2092 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2095 if t.type is TYPE_START_TAG # any other start tag
2097 insert_html_element t
2099 if t.type is TYPE_END_TAG # any other end tag
2100 in_body_any_other_end_tag t.name
2104 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2105 ins_mode_text = (t) ->
2106 if t.type is TYPE_TEXT
2109 if t.type is TYPE_EOF
2111 if open_els[0].name is 'script'
2112 open_els[0].flag 'already started', true
2114 ins_mode = original_ins_mode
2117 if t.type is TYPE_END_TAG and t.name is 'script'
2119 ins_mode = original_ins_mode
2120 # fixfull the spec seems to assume that I'm going to run the script
2121 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2123 if t.type is TYPE_END_TAG
2125 ins_mode = original_ins_mode
2127 console.log 'warning: end of ins_mode_text reached'
2129 # the functions below implement the tokenizer stats described here:
2130 # http://www.w3.org/TR/html5/syntax.html#tokenization
2132 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2133 ins_mode_in_table_else = (t) ->
2135 flag_foster_parenting = true
2137 flag_foster_parenting = false
2139 can_in_table = { # FIXME do this inline like everywhere else
2146 ins_mode_in_table = (t) ->
2149 if can_in_table[t.name]
2150 original_ins_mode = ins_mode
2151 ins_mode = ins_mode_in_table_text
2154 ins_mode_in_table_else t
2162 clear_stack_to_table_context()
2164 insert_html_element t
2165 ins_mode = ins_mode_in_caption
2167 clear_stack_to_table_context()
2168 insert_html_element t
2169 ins_mode = ins_mode_in_column_group
2171 clear_stack_to_table_context()
2172 insert_html_element new_open_tag 'colgroup'
2173 ins_mode = ins_mode_in_column_group
2175 when 'tbody', 'tfoot', 'thead'
2176 clear_stack_to_table_context()
2177 insert_html_element t
2178 ins_mode = ins_mode_in_table_body
2179 when 'td', 'th', 'tr'
2180 clear_stack_to_table_context()
2181 insert_html_element new_open_tag 'tbody'
2182 ins_mode = ins_mode_in_table_body
2186 if is_in_table_scope 'table'
2188 el = open_els.shift()
2189 if el.name is 'table'
2193 when 'style', 'script', 'template'
2196 unless is_input_hidden_tok t
2197 ins_mode_in_table_else t
2200 el = insert_html_element t
2202 t.acknowledge_self_closing()
2205 if form_element_pointer?
2207 if template_tag_is_open()
2209 form_element_pointer = insert_html_element t
2212 ins_mode_in_table_else t
2216 if is_in_table_scope 'table'
2218 el = open_els.shift()
2219 if el.name is 'table'
2224 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2229 ins_mode_in_table_else t
2233 ins_mode_in_table_else t
2236 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2237 ins_mode_in_table_text = (t) ->
2238 if t.type is TYPE_TEXT and t.text is "\u0000"
2239 # huh? I thought the tokenizer didn't emit these
2242 if t.type is TYPE_TEXT
2243 pending_table_character_tokens.push t
2247 for old in pending_table_character_tokens
2248 unless is_space_tok old
2252 for old in pending_table_character_tokens
2253 insert_character old
2255 for old in pending_table_character_tokens
2256 ins_mode_table_else old
2257 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2258 ins_mode = original_ins_mode
2261 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2262 ins_mode_in_caption = (t) ->
2263 if t.type is TYPE_END_TAG and t.name is 'caption'
2264 if is_in_table_scope 'caption'
2265 generate_implied_end_tags()
2266 if open_els[0].name isnt 'caption'
2269 el = open_els.shift()
2270 if el.name is 'caption'
2272 clear_afe_to_marker()
2273 ins_mode = ins_mode_in_table
2278 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2280 if is_in_table_scope 'caption'
2282 el = open_els.shift()
2283 if el.name is 'caption'
2285 clear_afe_to_marker()
2286 ins_mode = ins_mode_in_table
2288 # else fragment case
2290 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2296 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2297 ins_mode_in_column_group = (t) ->
2301 if t.type is TYPE_COMMENT
2304 if t.type is TYPE_DOCTYPE
2307 if t.type is TYPE_START_TAG and t.name is 'html'
2310 if t.type is TYPE_START_TAG and t.name is 'col'
2311 el = insert_html_element t
2313 t.acknowledge_self_closing()
2315 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2316 if open_els[0].name is 'colgroup'
2318 ins_mode = ins_mode_in_table
2322 if t.type is TYPE_END_TAG and t.name is 'col'
2325 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2328 if t.type is TYPE_EOF
2332 if open_els[0].name isnt 'colgroup'
2336 ins_mode = ins_mode_in_table
2340 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2341 ins_mode_in_table_body = (t) ->
2342 if t.type is TYPE_START_TAG and t.name is 'tr'
2343 clear_stack_to_table_body_context()
2344 insert_html_element t
2345 ins_mode = ins_mode_in_row
2347 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2349 clear_stack_to_table_body_context()
2350 insert_html_element new_open_tag 'tr'
2351 ins_mode = ins_mode_in_row
2354 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2355 unless is_in_table_scope t.name # fixfull check namespace
2358 clear_stack_to_table_body_context()
2360 ins_mode = ins_mode_in_table
2362 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2365 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2368 if table_scopers[el.name]
2373 clear_stack_to_table_body_context()
2375 ins_mode = ins_mode_in_table
2378 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2384 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2385 ins_mode_in_row = (t) ->
2386 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2387 clear_stack_to_table_row_context()
2388 insert_html_element t
2389 ins_mode = ins_mode_in_cell
2392 if t.type is TYPE_END_TAG and t.name is 'tr'
2393 if is_in_table_scope 'tr'
2394 clear_stack_to_table_row_context()
2396 ins_mode = ins_mode_in_table_body
2400 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2401 if is_in_table_scope 'tr'
2402 clear_stack_to_table_row_context()
2404 ins_mode = ins_mode_in_table_body
2409 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2410 if is_in_table_scope t.name # fixfull namespace
2411 if is_in_table_scope 'tr'
2412 clear_stack_to_table_row_context()
2414 ins_mode = ins_mode_in_table_body
2419 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2425 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2427 generate_implied_end_tags()
2428 unless open_els[0].name is 'td' or open_els[0] is 'th'
2431 el = open_els.shift()
2432 if el.name is 'td' or el.name is 'th'
2434 clear_afe_to_marker()
2435 ins_mode = ins_mode_in_row
2437 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2438 ins_mode_in_cell = (t) ->
2439 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2440 if is_in_table_scope t.name
2441 generate_implied_end_tags()
2442 if open_els[0].name isnt t.name
2445 el = open_els.shift()
2446 if el.name is t.name
2448 clear_afe_to_marker()
2449 ins_mode = ins_mode_in_row
2453 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2456 if el.name is 'td' or el.name is 'th'
2459 if table_scopers[el.name]
2467 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2470 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2471 if is_in_table_scope t.name # fixfull namespace
2480 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2481 ins_mode_in_select = (t) ->
2482 if t.type is TYPE_TEXT and t.text is "\u0000"
2485 if t.type is TYPE_TEXT
2488 if t.type is TYPE_COMMENT
2491 if t.type is TYPE_DOCTYPE
2494 if t.type is TYPE_START_TAG and t.name is 'html'
2497 if t.type is TYPE_START_TAG and t.name is 'option'
2498 if open_els[0].name is 'option'
2500 insert_html_element t
2502 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2503 if open_els[0].name is 'option'
2505 if open_els[0].name is 'optgroup'
2507 insert_html_element t
2509 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2510 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2512 if open_els[0].name is 'optgroup'
2517 if t.type is TYPE_END_TAG and t.name is 'option'
2518 if open_els[0].name is 'option'
2523 if t.type is TYPE_END_TAG and t.name is 'select'
2524 if is_in_select_scope 'select'
2526 el = open_els.shift()
2527 if el.name is 'select'
2533 if t.type is TYPE_START_TAG and t.name is 'select'
2536 el = open_els.shift()
2537 if el.name is 'select'
2540 # spec says that this is the same as </select> but it doesn't say
2541 # to check scope first
2543 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2545 if is_in_select_scope 'select'
2548 el = open_els.shift()
2549 if el.name is 'select'
2554 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2557 if t.type is TYPE_EOF
2564 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2565 ins_mode_in_select_in_table = (t) ->
2566 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2569 el = open_els.shift()
2570 if el.name is 'select'
2575 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2577 unless is_in_table_scope t.name, NS_HTML
2580 el = open_els.shift()
2581 if el.name is 'select'
2587 ins_mode_in_select t
2590 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2591 ins_mode_in_template = (t) ->
2592 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2595 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2598 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2599 template_ins_modes.shift()
2600 template_ins_modes.unshift ins_mode_in_table
2601 ins_mode = ins_mode_in_table
2604 if t.type is TYPE_START_TAG and t.name is 'col'
2605 template_ins_modes.shift()
2606 template_ins_modes.unshift ins_mode_in_column_group
2607 ins_mode = ins_mode_in_column_group
2610 if t.type is TYPE_START_TAG and t.name is 'tr'
2611 template_ins_modes.shift()
2612 template_ins_modes.unshift ins_mode_in_table_body
2613 ins_mode = ins_mode_in_table_body
2616 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2617 template_ins_modes.shift()
2618 template_ins_modes.unshift ins_mode_in_row
2619 ins_mode = ins_mode_in_row
2622 if t.type is TYPE_START_TAG
2623 template_ins_modes.shift()
2624 template_ins_modes.unshift ins_mode_in_body
2625 ins_mode = ins_mode_in_body
2628 if t.type is TYPE_END_TAG
2631 if t.type is TYPE_EOF
2632 unless template_tag_is_open()
2637 el = open_els.shift()
2638 if el.name is 'template' # fixfull check namespace
2640 clear_afe_to_marker()
2641 template_ins_modes.shift()
2645 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2646 ins_mode_after_body = (t) ->
2650 if t.type is TYPE_COMMENT
2651 insert_comment t, [open_els[0], open_els[0].children.length]
2653 if t.type is TYPE_DOCTYPE
2656 if t.type is TYPE_START_TAG and t.name is 'html'
2659 if t.type is TYPE_END_TAG and t.name is 'html'
2660 # fixfull fragment case
2661 ins_mode = ins_mode_after_after_body
2663 if t.type is TYPE_EOF
2668 ins_mode = ins_mode_in_body
2671 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2672 ins_mode_in_frameset = (t) ->
2676 if t.type is TYPE_COMMENT
2679 if t.type is TYPE_DOCTYPE
2682 if t.type is TYPE_START_TAG and t.name is 'html'
2685 if t.type is TYPE_START_TAG and t.name is 'frameset'
2686 insert_html_element t
2688 if t.type is TYPE_END_TAG and t.name is 'frameset'
2689 # TODO ?correct for: "if the current node is the root html element"
2690 if open_els.length is 1
2692 return # fragment case
2694 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2695 ins_mode = ins_mode_after_frameset
2697 if t.type is TYPE_START_TAG and t.name is 'frame'
2698 insert_html_element t
2700 t.acknowledge_self_closing()
2702 if t.type is TYPE_START_TAG and t.name is 'noframes'
2705 if t.type is TYPE_EOF
2706 # TODO ?correct for: "if the current node is not the root html element"
2707 if open_els.length isnt 1
2715 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2716 ins_mode_after_frameset = (t) ->
2720 if t.type is TYPE_COMMENT
2723 if t.type is TYPE_DOCTYPE
2726 if t.type is TYPE_START_TAG and t.name is 'html'
2729 if t.type is TYPE_END_TAG and t.name is 'html'
2730 insert_mode = ins_mode_after_after_frameset
2732 if t.type is TYPE_START_TAG and t.name is 'noframes'
2735 if t.type is TYPE_EOF
2742 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2743 ins_mode_after_after_body = (t) ->
2744 if t.type is TYPE_COMMENT
2745 insert_comment t, [doc, doc.children.length]
2747 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2750 if t.type is TYPE_EOF
2755 ins_mode = ins_mode_in_body
2758 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2759 ins_mode_after_after_frameset = (t) ->
2760 if t.type is TYPE_COMMENT
2761 insert_comment t, [doc, doc.children.length]
2763 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2766 if t.type is TYPE_EOF
2769 if t.type is TYPE_START_TAG and t.name is 'noframes'
2776 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2777 has_color_face_or_size = (t) ->
2779 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2782 in_foreign_content_end_script = ->
2786 in_foreign_content_other_start = (t) ->
2787 acn = adjusted_current_node()
2788 if acn.namespace is NS_MATHML
2789 adjust_mathml_attributes t
2790 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2791 t.name = svg_name_fixes[t.name]
2792 if acn.namespace is NS_SVG
2793 adjust_svg_attributes t
2794 adjust_foreign_attributes t
2795 insert_foreign_element t, acn.namespace
2796 if t.flag 'self-closing'
2797 if t.name is 'script'
2798 t.acknowledge_self_closing()
2799 in_foreign_content_end_script()
2802 t.acknowledge_self_closing()
2804 in_foreign_content = (t) ->
2805 if t.type is TYPE_TEXT and t.text is "\u0000"
2807 insert_character new_character_token "\ufffd"
2812 if t.type is TYPE_TEXT
2813 flag_frameset_ok = false
2816 if t.type is TYPE_COMMENT
2819 if t.type is TYPE_DOCTYPE
2822 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2824 if flag_fragment_parsing
2825 in_foreign_content_other_start t
2827 loop # is this safe?
2830 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2834 if t.type is TYPE_START_TAG
2835 in_foreign_content_other_start t
2837 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2838 in_foreign_content_end_script()
2840 if t.type is TYPE_END_TAG
2841 if open_els[0].name.toLowerCase() isnt t.name
2843 for node in open_els
2844 if node is open_els[open_els.length - 1]
2846 if node.name.toLowerCase() is t.name
2848 el = open_els.shift()
2851 if node.namespace is NS_HTML
2853 ins_mode t # explicitly call HTML insertion mode
2856 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2858 switch c = txt.charAt(cur++)
2860 return new_text_node parse_character_reference()
2862 tok_state = tok_state_tag_open
2865 return new_text_node c
2867 return new_eof_token()
2869 return new_text_node c
2872 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2873 # not needed: tok_state_character_reference_in_data = ->
2874 # just call parse_character_reference()
2876 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2877 tok_state_rcdata = ->
2878 switch c = txt.charAt(cur++)
2880 return new_text_node parse_character_reference()
2882 tok_state = tok_state_rcdata_less_than_sign
2885 return new_character_token "\ufffd"
2887 return new_eof_token()
2889 return new_character_token c
2892 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2893 # not needed: tok_state_character_reference_in_rcdata = ->
2894 # just call parse_character_reference()
2896 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2897 tok_state_rawtext = ->
2898 switch c = txt.charAt(cur++)
2900 tok_state = tok_state_rawtext_less_than_sign
2903 return new_character_token "\ufffd"
2905 return new_eof_token()
2907 return new_character_token c
2910 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2911 tok_state_script_data = ->
2912 switch c = txt.charAt(cur++)
2914 tok_state = tok_state_script_data_less_than_sign
2917 return new_character_token "\ufffd"
2919 return new_eof_token()
2921 return new_character_token c
2924 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2925 tok_state_plaintext = ->
2926 switch c = txt.charAt(cur++)
2929 return new_character_token "\ufffd"
2931 return new_eof_token()
2933 return new_character_token c
2937 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2938 tok_state_tag_open = ->
2939 switch c = txt.charAt(cur++)
2941 tok_state = tok_state_markup_declaration_open
2943 tok_state = tok_state_end_tag_open
2946 tok_cur_tag = new_comment_token '?'
2947 tok_state = tok_state_bogus_comment
2950 tok_cur_tag = new_open_tag c
2951 tok_state = tok_state_tag_name
2952 else if is_uc_alpha(c)
2953 tok_cur_tag = new_open_tag c.toLowerCase()
2954 tok_state = tok_state_tag_name
2957 tok_state = tok_state_data
2958 cur -= 1 # we didn't parse/handle the char after <
2959 return new_text_node '<'
2962 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2963 tok_state_end_tag_open = ->
2964 switch c = txt.charAt(cur++)
2967 tok_state = tok_state_data
2970 tok_state = tok_state_data
2971 return new_text_node '</'
2974 tok_cur_tag = new_end_tag c.toLowerCase()
2975 tok_state = tok_state_tag_name
2976 else if is_lc_alpha(c)
2977 tok_cur_tag = new_end_tag c
2978 tok_state = tok_state_tag_name
2981 tok_cur_tag = new_comment_token '/'
2982 tok_state = tok_state_bogus_comment
2985 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2986 tok_state_tag_name = ->
2987 switch c = txt.charAt(cur++)
2988 when "\t", "\n", "\u000c", ' '
2989 tok_state = tok_state_before_attribute_name
2991 tok_state = tok_state_self_closing_start_tag
2993 tok_state = tok_state_data
2999 tok_cur_tag.name += "\ufffd"
3002 tok_state = tok_state_data
3005 tok_cur_tag.name += c.toLowerCase()
3007 tok_cur_tag.name += c
3010 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3011 tok_state_rcdata_less_than_sign = ->
3012 c = txt.charAt(cur++)
3014 temporary_buffer = ''
3015 tok_state = tok_state_rcdata_end_tag_open
3018 tok_state = tok_state_rcdata
3019 cur -= 1 # reconsume the input character
3020 return new_character_token '<'
3022 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3023 tok_state_rcdata_end_tag_open = ->
3024 c = txt.charAt(cur++)
3026 tok_cur_tag = new_end_tag c.toLowerCase()
3027 temporary_buffer += c
3028 tok_state = tok_state_rcdata_end_tag_name
3031 tok_cur_tag = new_end_tag c
3032 temporary_buffer += c
3033 tok_state = tok_state_rcdata_end_tag_name
3036 tok_state = tok_state_rcdata
3037 cur -= 1 # reconsume the input character
3038 return new_character_token "</" # fixfull separate these
3040 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3041 is_appropriate_end_tag = (t) ->
3042 # spec says to check against "the tag name of the last start tag to
3043 # have been emitted from this tokenizer", but this is only called from
3044 # the various "raw" states, which I'm pretty sure all push the start
3045 # token onto open_els. TODO: verify this after the script data states
3047 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3048 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3050 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3051 tok_state_rcdata_end_tag_name = ->
3052 c = txt.charAt(cur++)
3053 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3054 if is_appropriate_end_tag tok_cur_tag
3055 tok_state = tok_state_before_attribute_name
3057 # else fall through to "Anything else"
3059 if is_appropriate_end_tag tok_cur_tag
3060 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3062 # else fall through to "Anything else"
3064 if is_appropriate_end_tag tok_cur_tag
3065 tok_state = tok_state_data
3067 # else fall through to "Anything else"
3069 tok_cur_tag.name += c.toLowerCase()
3070 temporary_buffer += c
3073 tok_cur_tag.name += c
3074 temporary_buffer += c
3077 tok_state = tok_state_rcdata
3078 cur -= 1 # reconsume the input character
3079 return new_character_token '</' + temporary_buffer # fixfull separate these
3081 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3082 tok_state_rawtext_less_than_sign = ->
3083 c = txt.charAt(cur++)
3085 temporary_buffer = ''
3086 tok_state = tok_state_rawtext_end_tag_open
3089 tok_state = tok_state_rawtext
3090 cur -= 1 # reconsume the input character
3091 return new_character_token '<'
3093 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3094 tok_state_rawtext_end_tag_open = ->
3095 c = txt.charAt(cur++)
3097 tok_cur_tag = new_end_tag c.toLowerCase()
3098 temporary_buffer += c
3099 tok_state = tok_state_rawtext_end_tag_name
3102 tok_cur_tag = new_end_tag c
3103 temporary_buffer += c
3104 tok_state = tok_state_rawtext_end_tag_name
3107 tok_state = tok_state_rawtext
3108 cur -= 1 # reconsume the input character
3109 return new_character_token "</" # fixfull separate these
3111 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3112 tok_state_rawtext_end_tag_name = ->
3113 c = txt.charAt(cur++)
3114 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3115 if is_appropriate_end_tag tok_cur_tag
3116 tok_state = tok_state_before_attribute_name
3118 # else fall through to "Anything else"
3120 if is_appropriate_end_tag tok_cur_tag
3121 tok_state = tok_state_self_closing_start_tag
3123 # else fall through to "Anything else"
3125 if is_appropriate_end_tag tok_cur_tag
3126 tok_state = tok_state_data
3128 # else fall through to "Anything else"
3130 tok_cur_tag.name += c.toLowerCase()
3131 temporary_buffer += c
3134 tok_cur_tag.name += c
3135 temporary_buffer += c
3138 tok_state = tok_state_rawtext
3139 cur -= 1 # reconsume the input character
3140 return new_character_token '</' + temporary_buffer # fixfull separate these
3142 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3143 tok_state_script_data_less_than_sign = ->
3144 c = txt.charAt(cur++)
3146 temporary_buffer = ''
3147 tok_state = tok_state_script_data_end_tag_open
3150 tok_state = tok_state_script_data_escape_start
3151 return new_character_token '<!' # fixfull split
3153 tok_state = tok_state_script_data
3154 cur -= 1 # Reconsume
3155 return new_character_token '<'
3157 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3158 tok_state_script_data_end_tag_open = ->
3159 c = txt.charAt(cur++)
3161 tok_cur_tag = new_end_tag c.toLowerCase()
3162 temporary_buffer += c
3163 tok_state = tok_state_script_data_end_tag_name
3166 tok_cur_tag = new_end_tag c
3167 temporary_buffer += c
3168 tok_state = tok_state_script_data_end_tag_name
3171 tok_state = tok_state_script_data
3172 cur -= 1 # Reconsume
3173 return new_character_token '</'
3175 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3176 tok_state_script_data_end_tag_name = ->
3177 c = txt.charAt(cur++)
3178 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3179 if is_appropriate_end_tag tok_cur_tag
3180 tok_state = tok_state_before_attribute_name
3184 if is_appropriate_end_tag tok_cur_tag
3185 tok_state = tok_state_self_closing_start_tag
3189 tok_cur_tag.name += c.toLowerCase()
3190 temporary_buffer += c
3193 tok_cur_tag.name += c
3194 temporary_buffer += c
3197 tok_state = tok_state_script_data
3198 cur -= 1 # Reconsume
3199 return new_character_token "</#{temporary_buffer}" # fixfull split
3201 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3202 tok_state_script_data_escape_start = ->
3203 c = txt.charAt(cur++)
3205 tok_state = tok_state_script_data_escape_start_dash
3206 return new_character_token '-'
3208 tok_state = tok_state_script_data
3209 cur -= 1 # Reconsume
3212 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3213 tok_state_script_data_escape_start_dash = ->
3214 c = txt.charAt(cur++)
3216 tok_state = tok_state_script_data_escaped_dash_dash
3217 return new_character_token '-'
3219 tok_state = tok_state_script_data
3220 cur -= 1 # Reconsume
3223 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3224 tok_state_script_data_escaped = ->
3225 c = txt.charAt(cur++)
3227 tok_state = tok_state_script_data_escaped_dash
3228 return new_character_token '-'
3230 tok_state = tok_state_script_data_escaped_less_than_sign
3234 return new_character_token "\ufffd"
3236 tok_state = tok_state_data
3238 cur -= 1 # Reconsume
3241 return new_character_token c
3243 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3244 tok_state_script_data_escaped_dash = ->
3245 c = txt.charAt(cur++)
3247 tok_state = tok_state_script_data_escaped_dash_dash
3248 return new_character_token '-'
3250 tok_state = tok_state_script_data_escaped_less_than_sign
3254 tok_state = tok_state_script_data_escaped
3255 return new_character_token "\ufffd"
3257 tok_state = tok_state_data
3259 cur -= 1 # Reconsume
3262 tok_state = tok_state_script_data_escaped
3263 return new_character_token c
3265 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3266 tok_state_script_data_escaped_dash_dash = ->
3267 c = txt.charAt(cur++)
3269 return new_character_token '-'
3271 tok_state = tok_state_script_data_escaped_less_than_sign
3274 tok_state = tok_state_script_data
3275 return new_character_token '>'
3278 tok_state = tok_state_script_data_escaped
3279 return new_character_token "\ufffd"
3282 tok_state = tok_state_data
3283 cur -= 1 # Reconsume
3286 tok_state = tok_state_script_data_escaped
3287 return new_character_token c
3289 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3290 tok_state_script_data_escaped_less_than_sign = ->
3291 c = txt.charAt(cur++)
3293 temporary_buffer = ''
3294 tok_state = tok_state_script_data_escaped_end_tag_open
3297 temporary_buffer = c.toLowerCase() # yes, really
3298 tok_state = tok_state_script_data_double_escape_start
3299 return new_character_token "<#{c}" # fixfull split
3301 temporary_buffer = c
3302 tok_state = tok_state_script_data_double_escape_start
3303 return new_character_token "<#{c}" # fixfull split
3305 tok_state = tok_state_script_data_escaped
3306 cur -= 1 # Reconsume
3307 return new_character_token c
3309 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3310 tok_state_script_data_escaped_end_tag_open = ->
3311 c = txt.charAt(cur++)
3313 tok_cur_tag = new_end_tag c.toLowerCase()
3314 temporary_buffer += c
3315 tok_state = tok_state_script_data_escaped_end_tag_name
3318 tok_cur_tag = new_end_tag c
3319 temporary_buffer += c
3320 tok_state = tok_state_script_data_escaped_end_tag_name
3323 tok_state = tok_state_script_data_escaped
3324 cur -= 1 # Reconsume
3325 return new_character_token '</' # fixfull split
3327 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3328 tok_state_script_data_escaped_end_tag_name = ->
3329 c = txt.charAt(cur++)
3330 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3331 if is_appropriate_end_tag tok_cur_tag
3332 tok_state = tok_state_before_attribute_name
3336 if is_appropriate_end_tag tok_cur_tag
3337 tok_state = tok_state_self_closing_start_tag
3341 tok_cur_tag.name += c.toLowerCase()
3342 temporary_buffer += c.toLowerCase()
3345 tok_cur_tag.name += c
3346 temporary_buffer += c.toLowerCase()
3349 tok_state = tok_state_script_data_escaped
3350 cur -= 1 # Reconsume
3351 return new_character_token "</#{temporary_buffer}" # fixfull split
3353 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3354 tok_state_script_data_double_escape_start = ->
3355 c = txt.charAt(cur++)
3356 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3357 if temporary_buffer is 'script'
3358 tok_state = tok_state_script_data_double_escaped
3360 tok_state = tok_state_script_data_escaped
3361 return new_character_token c
3363 temporary_buffer += c.toLowerCase() # yes, really lowercase
3364 return new_character_token c
3366 temporary_buffer += c
3367 return new_character_token c
3369 tok_state = tok_state_script_data_escaped
3370 cur -= 1 # Reconsume
3373 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3374 tok_state_script_data_double_escaped = ->
3375 c = txt.charAt(cur++)
3377 tok_state = tok_state_script_data_double_escaped_dash
3378 return new_character_token '-'
3380 tok_state = tok_state_script_data_double_escaped_less_than_sign
3381 return new_character_token '<'
3384 return new_character_token "\ufffd"
3387 tok_state = tok_state_data
3388 cur -= 1 # Reconsume
3391 return new_character_token c
3393 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3394 tok_state_script_data_double_escaped_dash = ->
3395 c = txt.charAt(cur++)
3397 tok_state = tok_state_script_data_double_escaped_dash_dash
3398 return new_character_token '-'
3400 tok_state = tok_state_script_data_double_escaped_less_than_sign
3401 return new_character_token '<'
3404 tok_state = tok_state_script_data_double_escaped
3405 return new_character_token "\ufffd"
3408 tok_state = tok_state_data
3409 cur -= 1 # Reconsume
3412 tok_state = tok_state_script_data_double_escaped
3413 return new_character_token c
3415 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3416 tok_state_script_data_double_escaped_dash_dash = ->
3417 c = txt.charAt(cur++)
3419 return new_character_token '-'
3421 tok_state = tok_state_script_data_double_escaped_less_than_sign
3422 return new_character_token '<'
3424 tok_state = tok_state_script_data
3425 return new_character_token '>'
3428 tok_state = tok_state_script_data_double_escaped
3429 return new_character_token "\ufffd"
3432 tok_state = tok_state_data
3433 cur -= 1 # Reconsume
3436 tok_state = tok_state_script_data_double_escaped
3437 return new_character_token c
3439 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3440 tok_state_script_data_double_escaped_less_than_sign = ->
3441 c = txt.charAt(cur++)
3443 temporary_buffer = ''
3444 tok_state = tok_state_script_data_double_escape_end
3445 return new_character_token '/'
3447 tok_state = tok_state_script_data_double_escaped
3448 cur -= 1 # Reconsume
3451 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3452 tok_state_script_data_double_escape_end = ->
3453 c = txt.charAt(cur++)
3454 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3455 if temporary_buffer is 'script'
3456 tok_state = tok_state_script_data_escaped
3458 tok_state = tok_state_script_data_double_escaped
3459 return new_character_token c
3461 temporary_buffer += c.toLowerCase() # yes, really lowercase
3462 return new_character_token c
3464 temporary_buffer += c
3465 return new_character_token c
3467 tok_state = tok_state_script_data_double_escaped
3468 cur -= 1 # Reconsume
3471 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3472 tok_state_before_attribute_name = ->
3474 switch c = txt.charAt(cur++)
3475 when "\t", "\n", "\u000c", ' '
3478 tok_state = tok_state_self_closing_start_tag
3481 tok_state = tok_state_data
3487 attr_name = "\ufffd"
3488 when '"', "'", '<', '='
3493 tok_state = tok_state_data
3496 attr_name = c.toLowerCase()
3500 tok_cur_tag.attrs_a.unshift [attr_name, '']
3501 tok_state = tok_state_attribute_name
3504 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3505 tok_state_attribute_name = ->
3506 switch c = txt.charAt(cur++)
3507 when "\t", "\n", "\u000c", ' '
3508 tok_state = tok_state_after_attribute_name
3510 tok_state = tok_state_self_closing_start_tag
3512 tok_state = tok_state_before_attribute_value
3514 tok_state = tok_state_data
3520 tok_cur_tag.attrs_a[0][0] = "\ufffd"
3523 tok_cur_tag.attrs_a[0][0] = c
3526 tok_state = tok_state_data
3529 tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
3531 tok_cur_tag.attrs_a[0][0] += c
3534 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3535 tok_state_after_attribute_name = ->
3536 c = txt.charAt(cur++)
3537 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3540 tok_state = tok_state_self_closing_start_tag
3543 tok_state = tok_state_before_attribute_value
3546 tok_state = tok_state_data
3549 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3550 tok_state = tok_state_attribute_name
3554 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3555 tok_state = tok_state_attribute_name
3559 tok_state = tok_state_data
3560 cur -= 1 # reconsume
3562 if c is '"' or c is "'" or c is '<'
3564 # fall through to Anything else
3566 tok_cur_tag.attrs_a.unshift [c, '']
3567 tok_state = tok_state_attribute_name
3569 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3570 tok_state_before_attribute_value = ->
3571 switch c = txt.charAt(cur++)
3572 when "\t", "\n", "\u000c", ' '
3575 tok_state = tok_state_attribute_value_double_quoted
3577 tok_state = tok_state_attribute_value_unquoted
3580 tok_state = tok_state_attribute_value_single_quoted
3583 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3584 tok_state = tok_state_attribute_value_unquoted
3587 tok_state = tok_state_data
3593 tok_state = tok_state_data
3595 tok_cur_tag.attrs_a[0][1] += c
3596 tok_state = tok_state_attribute_value_unquoted
3599 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3600 tok_state_attribute_value_double_quoted = ->
3601 switch c = txt.charAt(cur++)
3603 tok_state = tok_state_after_attribute_value_quoted
3605 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3608 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3611 tok_state = tok_state_data
3613 tok_cur_tag.attrs_a[0][1] += c
3616 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3617 tok_state_attribute_value_single_quoted = ->
3618 switch c = txt.charAt(cur++)
3620 tok_state = tok_state_after_attribute_value_quoted
3622 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3625 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3628 tok_state = tok_state_data
3630 tok_cur_tag.attrs_a[0][1] += c
3633 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3634 tok_state_attribute_value_unquoted = ->
3635 switch c = txt.charAt(cur++)
3636 when "\t", "\n", "\u000c", ' '
3637 tok_state = tok_state_before_attribute_name
3639 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3641 tok_state = tok_state_data
3646 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3649 tok_state = tok_state_data
3651 # Parse Error if ', <, = or ` (backtick)
3652 tok_cur_tag.attrs_a[0][1] += c
3655 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3656 tok_state_after_attribute_value_quoted = ->
3657 switch c = txt.charAt(cur++)
3658 when "\t", "\n", "\u000c", ' '
3659 tok_state = tok_state_before_attribute_name
3661 tok_state = tok_state_self_closing_start_tag
3663 tok_state = tok_state_data
3669 tok_state = tok_state_data
3672 tok_state = tok_state_before_attribute_name
3673 cur -= 1 # we didn't handle that char
3676 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3677 tok_state_self_closing_start_tag = ->
3678 c = txt.charAt(cur++)
3680 tok_cur_tag.flag 'self-closing'
3681 tok_state = tok_state_data
3685 tok_state = tok_state_data
3686 cur -= 1 # Reconsume
3690 tok_state = tok_state_before_attribute_name
3691 cur -= 1 # Reconsume
3694 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3695 # WARNING: put a comment token in tok_cur_tag before setting this state
3696 tok_state_bogus_comment = ->
3697 next_gt = txt.indexOf '>', cur
3699 val = txt.substr cur
3702 val = txt.substr cur, (next_gt - cur)
3704 val = val.replace "\u0000", "\ufffd"
3705 tok_cur_tag.text += val
3706 tok_state = tok_state_data
3709 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3710 tok_state_markup_declaration_open = ->
3711 if txt.substr(cur, 2) is '--'
3713 tok_cur_tag = new_comment_token ''
3714 tok_state = tok_state_comment_start
3716 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3718 tok_state = tok_state_doctype
3720 acn = adjusted_current_node()
3721 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3723 tok_state = tok_state_cdata_section
3727 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3728 tok_state = tok_state_bogus_comment
3731 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3732 tok_state_comment_start = ->
3733 switch c = txt.charAt(cur++)
3735 tok_state = tok_state_comment_start_dash
3738 return new_character_token "\ufffd"
3741 tok_state = tok_state_data
3745 tok_state = tok_state_data
3746 cur -= 1 # Reconsume
3749 tok_cur_tag.text += c
3752 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3753 tok_state_comment_start_dash = ->
3754 switch c = txt.charAt(cur++)
3756 tok_state = tok_state_comment_end
3759 tok_cur_tag.text += "-\ufffd"
3760 tok_state = tok_state_comment
3763 tok_state = tok_state_data
3767 tok_state = tok_state_data
3768 cur -= 1 # Reconsume
3771 tok_cur_tag.text += "-#{c}"
3772 tok_state = tok_state_comment
3775 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3776 tok_state_comment = ->
3777 switch c = txt.charAt(cur++)
3779 tok_state = tok_state_comment_end_dash
3782 tok_cur_tag.text += "\ufffd"
3785 tok_state = tok_state_data
3786 cur -= 1 # Reconsume
3789 tok_cur_tag.text += c
3792 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3793 tok_state_comment_end_dash = ->
3794 switch c = txt.charAt(cur++)
3796 tok_state = tok_state_comment_end
3799 tok_cur_tag.text += "-\ufffd"
3800 tok_state = tok_state_comment
3803 tok_state = tok_state_data
3804 cur -= 1 # Reconsume
3807 tok_cur_tag.text += "-#{c}"
3808 tok_state = tok_state_comment
3811 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3812 tok_state_comment_end = ->
3813 switch c = txt.charAt(cur++)
3815 tok_state = tok_state_data
3819 tok_cur_tag.text += "--\ufffd"
3820 tok_state = tok_state_comment
3823 tok_state = tok_state_comment_end_bang
3826 tok_cur_tag.text += '-'
3829 tok_state = tok_state_data
3830 cur -= 1 # Reconsume
3834 tok_cur_tag.text += "--#{c}"
3835 tok_state = tok_state_comment
3838 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3839 tok_state_comment_end_bang = ->
3840 switch c = txt.charAt(cur++)
3842 tok_cur_tag.text += "--!#{c}"
3843 tok_state = tok_state_comment_end_dash
3845 tok_state = tok_state_data
3849 tok_cur_tag.text += "--!\ufffd"
3850 tok_state = tok_state_comment
3853 tok_state = tok_state_data
3854 cur -= 1 # Reconsume
3857 tok_cur_tag.text += "--!#{c}"
3858 tok_state = tok_state_comment
3861 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3862 tok_state_doctype = ->
3863 switch c = txt.charAt(cur++)
3864 when "\t", "\u000a", "\u000c", ' '
3865 tok_state = tok_state_before_doctype_name
3868 tok_state = tok_state_data
3869 el = new_doctype_token ''
3870 el.flag 'force-quirks', true
3871 cur -= 1 # Reconsume
3875 tok_state = tok_state_before_doctype_name
3876 cur -= 1 # Reconsume
3879 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3880 tok_state_before_doctype_name = ->
3881 c = txt.charAt(cur++)
3882 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3885 tok_cur_tag = new_doctype_token c.toLowerCase()
3886 tok_state = tok_state_doctype_name
3890 tok_cur_tag = new_doctype_token "\ufffd"
3891 tok_state = tok_state_doctype_name
3895 el = new_doctype_token ''
3896 el.flag 'force-quirks', true
3897 tok_state = tok_state_data
3901 tok_state = tok_state_data
3902 el = new_doctype_token ''
3903 el.flag 'force-quirks', true
3904 cur -= 1 # Reconsume
3907 tok_cur_tag = new_doctype_token c
3908 tok_state = tok_state_doctype_name
3911 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3912 tok_state_doctype_name = ->
3913 c = txt.charAt(cur++)
3914 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3915 tok_state = tok_state_after_doctype_name
3918 tok_state = tok_state_data
3921 tok_cur_tag.name += c.toLowerCase()
3925 tok_cur_tag.name += "\ufffd"
3929 tok_state = tok_state_data
3930 tok_cur_tag.flag 'force-quirks', true
3931 cur -= 1 # Reconsume
3934 tok_cur_tag.name += c
3937 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3938 tok_state_after_doctype_name = ->
3939 c = txt.charAt(cur++)
3940 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3943 tok_state = tok_state_data
3947 tok_state = tok_state_data
3948 tok_cur_tag.flag 'force-quirks', true
3949 cur -= 1 # Reconsume
3952 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3954 tok_state = tok_state_after_doctype_public_keyword
3956 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3958 tok_state = tok_state_after_doctype_system_keyword
3961 tok_cur_tag.flag 'force-quirks', true
3962 tok_state = tok_state_bogus_doctype
3965 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3966 tok_state_after_doctype_public_keyword = ->
3967 c = txt.charAt(cur++)
3968 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3969 tok_state = tok_state_before_doctype_public_identifier
3973 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3974 tok_state = tok_state_doctype_public_identifier_double_quoted
3978 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3979 tok_state = tok_state_doctype_public_identifier_single_quoted
3983 tok_cur_tag.flag 'force-quirks', true
3984 tok_state = tok_state_data
3988 tok_state = tok_state_data
3989 tok_cur_tag.flag 'force-quirks', true
3990 cur -= 1 # Reconsume
3994 tok_cur_tag.flag 'force-quirks', true
3995 tok_state = tok_state_bogus_doctype
3998 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
3999 tok_state_before_doctype_public_identifier = ->
4000 c = txt.charAt(cur++)
4001 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4005 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4006 tok_state = tok_state_doctype_public_identifier_double_quoted
4010 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4011 tok_state = tok_state_doctype_public_identifier_single_quoted
4015 tok_cur_tag.flag 'force-quirks', true
4016 tok_state = tok_state_data
4020 tok_state = tok_state_data
4021 tok_cur_tag.flag 'force-quirks', true
4022 cur -= 1 # Reconsume
4026 tok_cur_tag.flag 'force-quirks', true
4027 tok_state = tok_state_bogus_doctype
4031 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4032 tok_state_doctype_public_identifier_double_quoted = ->
4033 c = txt.charAt(cur++)
4035 tok_state = tok_state_after_doctype_public_identifier
4039 tok_cur_tag.public_identifier += "\ufffd"
4043 tok_cur_tag.flag 'force-quirks', true
4044 tok_state = tok_state_data
4048 tok_state = tok_state_data
4049 tok_cur_tag.flag 'force-quirks', true
4050 cur -= 1 # Reconsume
4053 tok_cur_tag.public_identifier += c
4056 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4057 tok_state_doctype_public_identifier_single_quoted = ->
4058 c = txt.charAt(cur++)
4060 tok_state = tok_state_after_doctype_public_identifier
4064 tok_cur_tag.public_identifier += "\ufffd"
4068 tok_cur_tag.flag 'force-quirks', true
4069 tok_state = tok_state_data
4073 tok_state = tok_state_data
4074 tok_cur_tag.flag 'force-quirks', true
4075 cur -= 1 # Reconsume
4078 tok_cur_tag.public_identifier += c
4081 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4082 tok_state_after_doctype_public_identifier = ->
4083 c = txt.charAt(cur++)
4084 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4085 tok_state = tok_state_between_doctype_public_and_system_identifiers
4088 tok_state = tok_state_data
4092 tok_cur_tag.system_identifier = ''
4093 tok_state = tok_state_doctype_system_identifier_double_quoted
4097 tok_cur_tag.system_identifier = ''
4098 tok_state = tok_state_doctype_system_identifier_single_quoted
4102 tok_state = tok_state_data
4103 tok_cur_tag.flag 'force-quirks', true
4104 cur -= 1 # Reconsume
4108 tok_cur_tag.flag 'force-quirks', true
4109 tok_state = tok_state_bogus_doctype
4112 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4113 tok_state_between_doctype_public_and_system_identifiers = ->
4114 c = txt.charAt(cur++)
4115 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4118 tok_state = tok_state_data
4122 tok_cur_tag.system_identifier = ''
4123 tok_state = tok_state_doctype_system_identifier_double_quoted
4127 tok_cur_tag.system_identifier = ''
4128 tok_state = tok_state_doctype_system_identifier_single_quoted
4132 tok_state = tok_state_data
4133 tok_cur_tag.flag 'force-quirks', true
4134 cur -= 1 # Reconsume
4138 tok_cur_tag.flag 'force-quirks', true
4139 tok_state = tok_state_bogus_doctype
4142 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4143 tok_state_after_doctype_system_keyword = ->
4144 c = txt.charAt(cur++)
4145 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4146 tok_state = tok_state_before_doctype_system_identifier
4150 tok_cur_tag.system_identifier = ''
4151 tok_state = tok_state_doctype_system_identifier_double_quoted
4155 tok_cur_tag.system_identifier = ''
4156 tok_state = tok_state_doctype_system_identifier_single_quoted
4160 tok_cur_tag.flag 'force-quirks', true
4161 tok_state = tok_state_data
4165 tok_state = tok_state_data
4166 tok_cur_tag.flag 'force-quirks', true
4167 cur -= 1 # Reconsume
4171 tok_cur_tag.flag 'force-quirks', true
4172 tok_state = tok_state_bogus_doctype
4175 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4176 tok_state_before_doctype_system_identifier = ->
4177 c = txt.charAt(cur++)
4178 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4181 tok_cur_tag.system_identifier = ''
4182 tok_state = tok_state_doctype_system_identifier_double_quoted
4185 tok_cur_tag.system_identifier = ''
4186 tok_state = tok_state_doctype_system_identifier_single_quoted
4190 tok_cur_tag.flag 'force-quirks', true
4191 tok_state = tok_state_data
4195 tok_state = tok_state_data
4196 tok_cur_tag.flag 'force-quirks', true
4197 cur -= 1 # Reconsume
4201 tok_cur_tag.flag 'force-quirks', true
4202 tok_state = tok_state_bogus_doctype
4205 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4206 tok_state_doctype_system_identifier_double_quoted = ->
4207 c = txt.charAt(cur++)
4209 tok_state = tok_state_after_doctype_system_identifier
4213 tok_cur_tag.system_identifier += "\ufffd"
4217 tok_cur_tag.flag 'force-quirks', true
4218 tok_state = tok_state_data
4222 tok_state = tok_state_data
4223 tok_cur_tag.flag 'force-quirks', true
4224 cur -= 1 # Reconsume
4227 tok_cur_tag.system_identifier += c
4230 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4231 tok_state_doctype_system_identifier_single_quoted = ->
4232 c = txt.charAt(cur++)
4234 tok_state = tok_state_after_doctype_system_identifier
4238 tok_cur_tag.system_identifier += "\ufffd"
4242 tok_cur_tag.flag 'force-quirks', true
4243 tok_state = tok_state_data
4247 tok_state = tok_state_data
4248 tok_cur_tag.flag 'force-quirks', true
4249 cur -= 1 # Reconsume
4252 tok_cur_tag.system_identifier += c
4255 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4256 tok_state_after_doctype_system_identifier = ->
4257 c = txt.charAt(cur++)
4258 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4261 tok_state = tok_state_data
4265 tok_state = tok_state_data
4266 tok_cur_tag.flag 'force-quirks', true
4267 cur -= 1 # Reconsume
4271 # do _not_ tok_cur_tag.flag 'force-quirks', true
4272 tok_state = tok_state_bogus_doctype
4275 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4276 tok_state_bogus_doctype = ->
4277 c = txt.charAt(cur++)
4279 tok_state = tok_state_data
4282 tok_state = tok_state_data
4283 cur -= 1 # Reconsume
4288 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4289 tok_state_cdata_section = ->
4290 tok_state = tok_state_data
4291 next_gt = txt.indexOf ']]>', cur
4293 val = txt.substr cur
4296 val = txt.substr cur, (next_gt - cur)
4298 val = val.replace "\u0000", "\ufffd" # fixfull spec doesn't say this
4299 return new_character_token val # fixfull split
4301 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4302 # Don't set this as a state, just call it
4303 # returns a string (NOT a text node)
4304 parse_character_reference = (allowed_char = null, in_attr = false) ->
4305 if cur >= txt.length
4307 switch c = txt.charAt(cur)
4308 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4309 # explicitly not a parse error
4312 # there has to be "one or more" alnums between & and ; to be a parse error
4315 if cur + 1 >= txt.length
4317 if txt.charAt(cur + 1).toLowerCase() is 'x'
4326 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4330 if txt.charAt(start + i) is ';'
4332 # FIXME This is supposed to generate parse errors for some chars
4333 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4340 if alnum.indexOf(txt.charAt(cur + i)) is -1
4343 # exit early, because parse_error() below needs at least one alnum
4345 if txt.charAt(cur + i) is ';'
4346 i += 1 # include ';' terminator in value
4347 decoded = decode_named_char_ref txt.substr(cur, i)
4354 # no ';' terminator (only legacy char refs)
4356 for i in [2..max] # no prefix matches, so ok to check shortest first
4357 c = legacy_char_refs[txt.substr(cur, i)]
4360 if txt.charAt(cur + i) is '='
4361 # "because some legacy user agents will
4362 # misinterpret the markup in those cases"
4365 if alnum.indexOf(txt.charAt(cur + i)) > -1
4366 # this makes attributes forgiving about url args
4368 # ok, and besides the weird exceptions for attributes...
4369 # return the matching char
4370 cur += i # consume entity chars
4371 parse_error() # because no terminating ";"
4375 return # never reached
4377 # tree constructor initialization
4378 # see comments on TYPE_TAG/etc for the structure of this data
4379 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4381 afe = [] # active formatting elements
4382 template_ins_modes = []
4383 ins_mode = ins_mode_initial
4384 original_ins_mode = ins_mode # TODO check spec
4385 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
4386 flag_frameset_ok = true
4388 flag_foster_parenting = false
4389 form_element_pointer = null
4390 temporary_buffer = null
4391 pending_table_character_tokens = []
4392 head_element_pointer = null
4393 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4394 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4396 # tokenizer initialization
4397 tok_state = tok_state_data
4400 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4405 # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4408 serialize_els = (els, shallow, show_ids) ->
4414 serialized += t.serialize shallow, show_ids
4417 # TODO export TYPE_*
4418 module.exports.parse_html = parse_html
4419 module.exports.debug_log_reset = debug_log_reset
4420 module.exports.debug_log_each = debug_log_each
4421 module.exports.TYPE_TAG = TYPE_TAG
4422 module.exports.TYPE_TEXT = TYPE_TEXT
4423 module.exports.TYPE_COMMENT = TYPE_COMMENT
4424 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4425 module.exports.NS_HTML = NS_HTML
4426 module.exports.NS_MATHML = NS_MATHML
4427 module.exports.NS_SVG = NS_SVG