1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
95 @flags = args.flags ? {}
99 @id = "#{++prev_node_id}"
100 acknowledge_self_closing: ->
102 @token.flag 'did_self_close'
104 @flag 'did_self_close', true
105 flag: (key, value = null) ->
110 serialize: (shallow = false, show_ids = false) -> # for unit tests
115 ret += JSON.stringify @name
130 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
136 ret += c.serialize shallow, show_ids
140 ret += JSON.stringify @text
143 ret += JSON.stringify @text
145 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
148 when TYPE_AAA_BOOKMARK
149 ret += 'aaa_bookmark'
152 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
155 # helpers: (only take args that are normally known when parser creates nodes)
# Build a start-tag token: a Node of type TYPE_START_TAG carrying the
# given tag name. All other Node fields keep the constructor defaults.
new_open_tag = (name) ->
  new Node(TYPE_START_TAG, {name: name})
# Build an end-tag token: a Node of type TYPE_END_TAG carrying the given
# tag name.
new_end_tag = (name) ->
  new Node(TYPE_END_TAG, {name: name})
# Build an element node: a Node of type TYPE_TAG with the given tag name.
new_element = (name) ->
  new Node(TYPE_TAG, {name: name})
# Build a text node: a Node of type TYPE_TEXT whose text is `txt`.
new_text_node = (txt) ->
  new Node(TYPE_TEXT, {text: txt})
# Character tokens are represented exactly like text nodes, so the
# token constructor is simply an alias for the node constructor.
new_character_token = new_text_node
# Build a comment token: a Node of type TYPE_COMMENT whose text is `txt`.
new_comment_token = (txt) ->
  new Node(TYPE_COMMENT, {text: txt})
# Build a doctype token: a Node of type TYPE_DOCTYPE with the given name.
new_doctype_token = (name) ->
  new Node(TYPE_DOCTYPE, {name: name})
170 return new Node TYPE_EOF
172 return new Node TYPE_AFE_MARKER
# Build the placeholder Node (type TYPE_AAA_BOOKMARK) that the adoption
# agency algorithm inserts into the list of active formatting elements.
new_aaa_bookmark = ->
  new Node(TYPE_AAA_BOOKMARK)
# Character classes, stored as plain strings so that membership can be
# tested with indexOf().
digits = "0123456789"
lc_alpha = "abcdefghijklmnopqrstuvwxyz"
uc_alpha = lc_alpha.toUpperCase()
# letters (lower case first, then upper case) followed by the digits
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
# digits followed by the hexadecimal letters in both cases
hex_chars = "#{digits}abcdefABCDEF"
# True only when `str` is exactly one character long and that character
# appears in uc_alpha.
is_uc_alpha = (str) ->
  return false if str.length isnt 1
  return uc_alpha.indexOf(str) >= 0
# True only when `str` is exactly one character long and that character
# appears in lc_alpha.
is_lc_alpha = (str) ->
  return false if str.length isnt 1
  return lc_alpha.indexOf(str) >= 0
187 # some SVG elements have dashes in them
188 tag_name_chars = alnum + "-"
190 # http://www.w3.org/TR/html5/infrastructure.html#space-character
191 space_chars = "\u0009\u000a\u000c\u000d\u0020"
193 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is a single
# character found in space_chars.
is_space_tok = (t) ->
  return false unless t.type is TYPE_TEXT
  return false unless t.text.length is 1
  return space_chars.indexOf(t.text) isnt -1
197 is_input_hidden_tok = (t) ->
198 return unless t.type is TYPE_START_TAG
201 if a[1].toLowerCase() is 'hidden'
206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
212 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists for three element categories: void elements, raw text
# elements and escapable raw text elements.
void_elements = [
  'area', 'base', 'br', 'col', 'embed',
  'hr', 'img', 'input', 'keygen', 'link',
  'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
  'script'
  'style'
]
escapable_raw_text_elements = [
  'textarea'
  'title'
]
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
237 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
256 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262 'determinant', 'diff', 'divergence', 'divide', 'domain',
263 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283 'vectorproduct', 'xor'
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
290 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
291 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
292 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
293 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
294 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
295 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
296 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
297 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
298 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
299 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
300 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
301 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
302 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
303 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
304 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
305 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
306 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
307 wbr:NS_HTML, xmp:NS_HTML,
310 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
311 'annotation-xml':NS_MATHML,
314 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
317 formatting_elements = {
318 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
319 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
323 mathml_text_integration = {
324 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# Report whether `el` is a MathML text integration point.
#
# el: an element Node; its `name` and `namespace` are read.
# Returns true when mathml_text_integration has an entry for el.name
# whose value equals el.namespace, false otherwise.
#
# This must be a comparison (`is`). Using `=` here would overwrite the
# table entry for el.name and return el.namespace, which is truthy for
# every element that has a namespace.
is_mathml_text_integration_point = (el) ->
  return mathml_text_integration[el.name] is el.namespace
328 is_html_integration = (el) -> # DON'T PASS A TOKEN
329 if el.namespace is NS_MATHML and el.name is 'annotation-xml'
330 if el.attrs.encoding?
331 if el.attrs.encoding.toLowerCase() is 'text/html'
333 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
336 if el.namespace is NS_SVG
337 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
342 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
346 foster_parenting_targets = {
# True when special_elements maps the element's name to the element's
# own namespace.
el_is_special = (e) ->
  {name, namespace} = e
  special_elements[name] is namespace
# The "adp" set: address, div and p, all in the HTML namespace.
adp_els =
  address: NS_HTML
  div: NS_HTML
  p: NS_HTML
# True when `el` is listed in special_elements (name mapped to el's
# namespace) and is NOT one of the address/div/p entries in adp_els.
el_is_special_not_adp = (el) ->
  return false unless special_elements[el.name] is el.namespace
  return adp_els[el.name] isnt el.namespace
378 altglyphdef: 'altGlyphDef'
379 altglyphitem: 'altGlyphItem'
380 animatecolor: 'animateColor'
381 animatemotion: 'animateMotion'
382 animatetransform: 'animateTransform'
385 fecolormatrix: 'feColorMatrix'
386 fecomponenttransfer: 'feComponentTransfer'
387 fecomposite: 'feComposite'
388 feconvolvematrix: 'feConvolveMatrix'
389 fediffuselighting: 'feDiffuseLighting'
390 fedisplacementmap: 'feDisplacementMap'
391 fedistantlight: 'feDistantLight'
392 fedropshadow: 'feDropShadow'
398 fegaussianblur: 'feGaussianBlur'
401 femergenode: 'feMergeNode'
402 femorphology: 'feMorphology'
404 fepointlight: 'fePointLight'
405 fespecularlighting: 'feSpecularLighting'
406 fespotlight: 'feSpotLight'
408 feturbulence: 'feTurbulence'
409 foreignobject: 'foreignObject'
411 lineargradient: 'linearGradient'
412 radialgradient: 'radialGradient'
415 svg_attribute_fixes = {
416 attributename: 'attributeName'
417 attributetype: 'attributeType'
418 basefrequency: 'baseFrequency'
419 baseprofile: 'baseProfile'
421 clippathunits: 'clipPathUnits'
422 contentscripttype: 'contentScriptType'
423 contentstyletype: 'contentStyleType'
424 diffuseconstant: 'diffuseConstant'
426 externalresourcesrequired: 'externalResourcesRequired'
427 filterres: 'filterRes'
428 filterunits: 'filterUnits'
430 gradienttransform: 'gradientTransform'
431 gradientunits: 'gradientUnits'
432 kernelmatrix: 'kernelMatrix'
433 kernelunitlength: 'kernelUnitLength'
434 keypoints: 'keyPoints'
435 keysplines: 'keySplines'
437 lengthadjust: 'lengthAdjust'
438 limitingconeangle: 'limitingConeAngle'
439 markerheight: 'markerHeight'
440 markerunits: 'markerUnits'
441 markerwidth: 'markerWidth'
442 maskcontentunits: 'maskContentUnits'
443 maskunits: 'maskUnits'
444 numoctaves: 'numOctaves'
445 pathlength: 'pathLength'
446 patterncontentunits: 'patternContentUnits'
447 patterntransform: 'patternTransform'
448 patternunits: 'patternUnits'
449 pointsatx: 'pointsAtX'
450 pointsaty: 'pointsAtY'
451 pointsatz: 'pointsAtZ'
452 preservealpha: 'preserveAlpha'
453 preserveaspectratio: 'preserveAspectRatio'
454 primitiveunits: 'primitiveUnits'
457 repeatcount: 'repeatCount'
458 repeatdur: 'repeatDur'
459 requiredextensions: 'requiredExtensions'
460 requiredfeatures: 'requiredFeatures'
461 specularconstant: 'specularConstant'
462 specularexponent: 'specularExponent'
463 spreadmethod: 'spreadMethod'
464 startoffset: 'startOffset'
465 stddeviation: 'stdDeviation'
466 stitchtiles: 'stitchTiles'
467 surfacescale: 'surfaceScale'
468 systemlanguage: 'systemLanguage'
469 tablevalues: 'tableValues'
472 textlength: 'textLength'
474 viewtarget: 'viewTarget'
475 xchannelselector: 'xChannelSelector'
476 ychannelselector: 'yChannelSelector'
477 zoomandpan: 'zoomAndPan'
479 adjust_mathml_attributes = (t) ->
481 if a[0] is 'definitionurl'
482 a[0] = 'definitionURL'
484 adjust_svg_attributes = (t) ->
486 if svg_attribute_fixes[a[0]]?
487 a[0] = svg_attribute_fixes[a[0]]
489 adjust_foreign_attributes = (t) ->
493 # decode_named_char_ref()
495 # The list of named character references is _huge_ so ask the browser to decode
496 # for us instead of wasting bandwidth/space on including the table here.
498 # Pass without the "&" but with the ";" examples:
499 # for "&" pass "amp;"
500 # for "′" pass "x2032;"
503 textarea: document.createElement('textarea')
505 # TODO test this in IE8
506 decode_named_char_ref = (txt) ->
508 decoded = g_dncr.cache[txt]
509 return decoded if decoded?
510 g_dncr.textarea.innerHTML = txt
511 decoded = g_dncr.textarea.value
512 return null if decoded is txt
513 return g_dncr.cache[txt] = decoded
515 parse_html = (txt, parse_error_cb = null) ->
516 cur = 0 # index of next char in txt to be parsed
517 # declare doc and tokenizer variables so they're in scope below
519 open_els = null # stack of open elements
520 afe = null # active formatting elements
521 template_ins_modes = null
523 original_ins_mode = null
525 tok_cur_tag = null # partially parsed tag
526 flag_scripting = null
527 flag_frameset_ok = null
529 flag_foster_parenting = null
530 form_element_pointer = null
531 temporary_buffer = null
532 pending_table_character_tokens = null
533 head_element_pointer = null
534 flag_fragment_parsing = null
535 context_element = null
544 console.log "Parse error at character #{cur} of #{txt.length}"
546 afe_push = (new_el) ->
549 if el.name is new_el.name and el.namespace is new_el.namespace
551 continue unless new_el.attrs[k] is v
552 for k, v of new_el.attrs
553 continue unless el.attrs[k] is v
560 afe.unshift new_afe_marker()
562 # the functions below implement the Tree Construction algorithm
563 # http://www.w3.org/TR/html5/syntax.html#tree-construction
565 # But first... the helpers
566 template_tag_is_open = ->
568 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
571 is_in_scope_x = (tag_name, scope, namespace) ->
573 if t.name is tag_name and (namespace is null or namespace is t.namespace)
575 if scope[t.name] is t.namespace
578 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
580 if t.name is tag_name and (namespace is null or namespace is t.namespace)
582 if scope[t.name] is t.namespace
584 if scope2[t.name] is t.namespace
588 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
589 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
590 template: NS_HTML, mi: NS_MATHML,
592 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
593 'annotation-xml': NS_MATHML,
595 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-boundary tables (tag name -> namespace) used by the
# is_in_*_scope helpers below.
button_scopers = {button: NS_HTML}
li_scopers = {ol: NS_HTML, ul: NS_HTML}
table_scopers = {html: NS_HTML, table: NS_HTML, template: NS_HTML}
# Scope test using standard_scopers as the boundary table.
# `namespace` defaults to null and is passed through to is_in_scope_x
# together with `tag_name`; the result of that call is returned.
is_in_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, standard_scopers, namespace)
# Button-scope test: delegates to is_in_scope_x_y with standard_scopers
# plus button_scopers as the two boundary tables and returns its result.
is_in_button_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Table-scope test: delegates to is_in_scope_x with table_scopers as
# the only boundary table and returns its result.
is_in_table_scope = (tag_name, namespace = null) ->
  is_in_scope_x(tag_name, table_scopers, namespace)
# List-item-scope test (aka is_in_list_item_scope): delegates to
# is_in_scope_x_y with standard_scopers plus li_scopers as the two
# boundary tables and returns its result.
is_in_li_scope = (tag_name, namespace = null) ->
  is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
609 is_in_select_scope = (tag_name, namespace = null) ->
611 if t.name is tag_name and (namespace is null or namespace is t.namespace)
613 if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
616 # this checks for a particular element, not by name
617 el_is_in_scope = (el) ->
621 if standard_scopers[t.name] is t.namespace
625 clear_to_table_stopers = {
630 clear_stack_to_table_context = ->
632 if clear_to_table_stopers[open_els[0].name]?
636 clear_to_table_body_stopers = {
643 clear_stack_to_table_body_context = ->
645 if clear_to_table_body_stopers[open_els[0].name]?
649 clear_to_table_row_stopers = {
654 clear_stack_to_table_row_context = ->
656 if clear_to_table_row_stopers[open_els[0].name]?
660 clear_afe_to_marker = ->
662 return unless afe.length > 0 # this happens in fragment case, ?spec error
664 if el.type is TYPE_AFE_MARKER
669 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
671 # 1. Let last be false.
673 # 2. Let node be the last node in the stack of open elements.
675 node = open_els[node_i]
676 # 3. Loop: If node is the first node in the stack of open elements,
677 # then set last to true, and, if the parser was originally created as
678 # part of the HTML fragment parsing algorithm (fragment case) set node
679 # to the context element.
681 if node_i is open_els.length - 1
683 # fixfull (fragment case)
685 # 4. If node is a select element, run these substeps:
686 if node.name is 'select'
687 # 1. If last is true, jump to the step below labeled done.
689 # 2. Let ancestor be node.
692 # 3. Loop: If ancestor is the first node in the stack of
693 # open elements, jump to the step below labeled done.
695 if ancestor_i is open_els.length - 1
697 # 4. Let ancestor be the node before ancestor in the stack
700 ancestor = open_els[ancestor_i]
701 # 5. If ancestor is a template node, jump to the step below
703 if ancestor.name is 'template'
705 # 6. If ancestor is a table node, switch the insertion mode
706 # to "in select in table" and abort these steps.
707 if ancestor.name is 'table'
708 ins_mode = ins_mode_in_select_in_table
710 # 7. Jump back to the step labeled loop.
711 # 8. Done: Switch the insertion mode to "in select" and abort
713 ins_mode = ins_mode_in_select
715 # 5. If node is a td or th element and last is false, then switch
716 # the insertion mode to "in cell" and abort these steps.
717 if (node.name is 'td' or node.name is 'th') and last is false
718 ins_mode = ins_mode_in_cell
720 # 6. If node is a tr element, then switch the insertion mode to "in
721 # row" and abort these steps.
723 ins_mode = ins_mode_in_row
725 # 7. If node is a tbody, thead, or tfoot element, then switch the
726 # insertion mode to "in table body" and abort these steps.
727 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
728 ins_mode = ins_mode_in_table_body
730 # 8. If node is a caption element, then switch the insertion mode
731 # to "in caption" and abort these steps.
732 if node.name is 'caption'
733 ins_mode = ins_mode_in_caption
735 # 9. If node is a colgroup element, then switch the insertion mode
736 # to "in column group" and abort these steps.
737 if node.name is 'colgroup'
738 ins_mode = ins_mode_in_column_group
740 # 10. If node is a table element, then switch the insertion mode to
741 # "in table" and abort these steps.
742 if node.name is 'table'
743 ins_mode = ins_mode_in_table
745 # 11. If node is a template element, then switch the insertion mode
746 # to the current template insertion mode and abort these steps.
747 # fixfull (template insertion mode stack)
749 # 12. If node is a head element and last is true, then switch the
750 # insertion mode to "in body" ("in body"! not "in head"!) and abort
751 # these steps. (fragment case)
752 if node.name is 'head' and last
753 ins_mode = ins_mode_in_body
755 # 13. If node is a head element and last is false, then switch the
756 # insertion mode to "in head" and abort these steps.
757 if node.name is 'head' and last is false
758 ins_mode = ins_mode_in_head
760 # 14. If node is a body element, then switch the insertion mode to
761 # "in body" and abort these steps.
762 if node.name is 'body'
763 ins_mode = ins_mode_in_body
765 # 15. If node is a frameset element, then switch the insertion mode
766 # to "in frameset" and abort these steps. (fragment case)
767 if node.name is 'frameset'
768 ins_mode = ins_mode_in_frameset
770 # 16. If node is an html element, run these substeps:
771 if node.name is 'html'
772 # 1. If the head element pointer is null, switch the insertion
773 # mode to "before head" and abort these steps. (fragment case)
774 if head_element_pointer is null
775 ins_mode = ins_mode_before_head
777 # 2. Otherwise, the head element pointer is not null,
778 # switch the insertion mode to "after head" and abort these
780 ins_mode = ins_mode_after_head
782 # 17. If last is true, then switch the insertion mode to "in body"
783 # and abort these steps. (fragment case)
785 ins_mode = ins_mode_in_body
787 # 18. Let node now be the node before node in the stack of open
790 node = open_els[node_i]
791 # 19. Return to the step labeled loop.
795 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
796 adjusted_current_node = ->
797 if open_els.length is 1 and flag_fragment_parsing
798 return context_element
801 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
802 # this implementation is structured (mostly) as described at the link above.
803 # capitalized comments are the "labels" described at the link above.
805 return if afe.length is 0
806 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
811 if i is afe.length - 1
814 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
819 el = insert_html_element afe[i].token
824 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
825 # adoption agency algorithm
827 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
828 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
829 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
830 adoption_agency = (subject) ->
831 debug_log "adoption_agency()"
832 debug_log "tree: #{serialize_els doc.children, false, true}"
833 debug_log "open_els: #{serialize_els open_els, true, true}"
834 debug_log "afe: #{serialize_els afe, true, true}"
835 if open_els[0].name is subject
838 # remove it from the list of active formatting elements (if found)
843 debug_log "aaa: starting off with subject on top of stack, exiting"
850 # 5. Let formatting element be the last element in the list of
851 # active formatting elements that: is between the end of the list
852 # and the last scope marker in the list, if any, or the start of
853 # the list otherwise, and has the tag name subject.
855 for t, fe_of_afe in afe
856 if t.type is TYPE_AFE_MARKER
861 # If there is no such element, then abort these steps and instead
862 # act as described in the "any other end tag" entry above.
864 debug_log "aaa: fe not found in afe"
865 in_body_any_other_end_tag subject
867 # 6. If formatting element is not in the stack of open elements,
868 # then this is a parse error; remove the element from the list, and
871 for t, fe_of_open_els in open_els
876 debug_log "aaa: fe not found in open_els"
878 # "remove it from the list" must mean afe, since it's not in open_els
879 afe.splice fe_of_afe, 1
881 # 7. If formatting element is in the stack of open elements, but
882 # the element is not in scope, then this is a parse error; abort
884 unless el_is_in_scope fe
885 debug_log "aaa: fe not in scope"
888 # 8. If formatting element is not the current node, this is a parse
889 # error. (But do not abort these steps.)
890 unless open_els[0] is fe
893 # 9. Let furthest block be the topmost node in the stack of open
894 # elements that is lower in the stack than formatting element, and
895 # is an element in the special category. There might not be one.
897 fb_of_open_els = null
904 # and continue, to see if there's one that's more "topmost"
905 # 10. If there is no furthest block, then the UA must first pop all
906 # the nodes from the bottom of the stack of open elements, from the
907 # current node up to and including formatting element, then remove
908 # formatting element from the list of active formatting elements,
909 # and finally abort these steps.
911 debug_log "aaa: no fb"
915 afe.splice fe_of_afe, 1
917 # 11. Let common ancestor be the element immediately above
918 # formatting element in the stack of open elements.
919 ca = open_els[fe_of_open_els + 1] # common ancestor
921 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
922 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
923 bookmark = new_aaa_bookmark()
926 afe.splice i, 0, bookmark
928 node = last_node = fb
932 # 3. Let node be the element immediately above node in the
933 # stack of open elements, or if node is no longer in the stack
934 # of open elements (e.g. because it got removed by this
935 # algorithm), the element that was immediately above node in
936 # the stack of open elements before node was removed.
940 node_next = open_els[i + 1]
942 node = node_next ? node_above
943 debug_log "inner loop #{inner}"
944 debug_log "tree: #{serialize_els doc.children, false, true}"
945 debug_log "open_els: #{serialize_els open_els, true, true}"
946 debug_log "afe: #{serialize_els afe, true, true}"
947 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
948 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
949 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
950 debug_log "node: #{node.serialize true, true}"
951 # TODO make sure node_above gets re-set if/when node is removed from open_els
953 # 4. If node is formatting element, then go to the next step in
954 # the overall algorithm.
958 # 5. If inner loop counter is greater than three and node is in
959 # the list of active formatting elements, then remove node from
960 # the list of active formatting elements.
966 debug_log "max out inner"
971 # 6. If node is not in the list of active formatting elements,
972 # then remove node from the stack of open elements and then go
973 # back to the step labeled inner loop.
975 debug_log "not in afe"
978 node_above = open_els[i + 1]
982 debug_log "the bones"
983 # 7. create an element for the token for which the element node
984 # was created, in the HTML namespace, with common ancestor as
985 # the intended parent; replace the entry for node in the list
986 # of active formatting elements with an entry for the new
987 # element, replace the entry for node in the stack of open
988 # elements with an entry for the new element, and let node be
990 new_node = token_to_element node.token, NS_HTML, ca
994 debug_log "replaced in afe"
998 node_above = open_els[i + 1]
999 open_els[i] = new_node
1000 debug_log "replaced in open_els"
1003 # 8. If last node is furthest block, then move the
1004 # aforementioned bookmark to be immediately after the new node
1005 # in the list of active formatting elements.
1010 debug_log "removed bookmark"
1014 # "after" means lower
1015 afe.splice i, 0, bookmark # "after as <-
1016 debug_log "placed bookmark after node"
1017 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1019 # 9. Insert last node into node, first removing it from its
1020 # previous parent node if any.
1021 if last_node.parent?
1022 debug_log "last_node has parent"
1023 for c, i in last_node.parent.children
1025 debug_log "removing last_node from parent"
1026 last_node.parent.children.splice i, 1
1028 node.children.push last_node
1029 last_node.parent = node
1030 # 10. Let last node be node.
1033 # 11. Return to the step labeled inner loop.
1034 # 14. Insert whatever last node ended up being in the previous step
1035 # at the appropriate place for inserting a node, but using common
1036 # ancestor as the override target.
1038 # In the case where fe is immediately followed by fb:
1039 # * inner loop exits out early (node==fe)
1041 # * last_node is still in the tree (not a duplicate)
1042 if last_node.parent?
1043 debug_log "FEFIRST? last_node has parent"
1044 for c, i in last_node.parent.children
1046 debug_log "removing last_node from parent"
1047 last_node.parent.children.splice i, 1
1050 debug_log "after aaa inner loop"
1051 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1052 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1053 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1054 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1055 debug_log "tree: #{serialize_els doc.children, false, true}"
1060 # can't use the standard insert token thing, because it's already in
1061 # open_els and must stay at its current position in open_els
1062 dest = adjusted_insertion_location ca
1063 dest[0].children.splice dest[1], 0, last_node
1064 last_node.parent = dest[0]
1067 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1068 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1069 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1070 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1071 debug_log "tree: #{serialize_els doc.children, false, true}"
1073 # 15. Create an element for the token for which formatting element
1074 # was created, in the HTML namespace, with furthest block as the
1076 new_element = token_to_element fe.token, NS_HTML, fb
1077 # 16. Take all of the child nodes of furthest block and append them
1078 # to the element created in the last step.
1079 while fb.children.length
1080 t = fb.children.shift()
1081 t.parent = new_element
1082 new_element.children.push t
1083 # 17. Append that new element to furthest block.
1084 new_element.parent = fb
1085 fb.children.push new_element
1086 # 18. Remove formatting element from the list of active formatting
1087 # elements, and insert the new element into the list of active
1088 # formatting elements at the position of the aforementioned
1096 afe[i] = new_element
1098 # 19. Remove formatting element from the stack of open elements,
1099 # and insert the new element into the stack of open elements
1100 # immediately below the position of furthest block in that stack.
1101 for t, i in open_els
1103 open_els.splice i, 1
1105 for t, i in open_els
1107 open_els.splice i, 0, new_element
1109 # 20. Jump back to the step labeled outer loop.
1110 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1111 debug_log "tree: #{serialize_els doc.children, false, true}"
1112 debug_log "open_els: #{serialize_els open_els, true, true}"
1113 debug_log "afe: #{serialize_els afe, true, true}"
1114 debug_log "AAA DONE"
1116 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Implements the spec's "close a p element" steps on open_els (index 0 is the
# current node): first generates implied end tags for everything except 'p',
# then pops entries off the stack.
1117 close_p_element = ->
1118 generate_implied_end_tags 'p' # arg is exception
1119 if open_els[0].name isnt 'p'
# NOTE(review): the body of the check above is not visible in this listing;
# the spec treats "current node is not a p" as a parse error -- confirm.
1121 while open_els.length > 1 # just in case
1122 el = open_els.shift()
# NOTE(review): the loop's exit test is not visible here; presumably it stops
# as soon as the popped element is a 'p' -- confirm.
# Convenience guard called by the "in body" rules before inserting block-level
# elements: acts only when is_in_button_scope 'p' is true.
# NOTE(review): the statement inside the condition is not visible in this
# listing; presumably it calls close_p_element() -- confirm.
1125 close_p_if_in_button_scope = ->
1126 if is_in_button_scope 'p'
1129 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1130 # aka insert_a_character = (t) ->
# Inserts text token t at the adjusted insertion location. dest is the
# [parent_node, index_within_children] pair returned by
# adjusted_insertion_location().
1131 insert_character = (t) ->
1132 dest = adjusted_insertion_location()
1133 # fixfull check for Document node
1135 prev = dest[0].children[dest[1] - 1] # sibling immediately before the insertion point
1136 if prev.type is TYPE_TEXT
# NOTE(review): the body of this branch is not visible in this listing;
# presumably t's text is merged into prev and the function returns -- confirm.
1139 dest[0].children.splice dest[1], 0, t # insert t itself as a child at the insertion index
1142 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher for token t. It inspects the adjusted current
# node (acn) and the token type; the only action visible in this listing is
# the final in_foreign_content call. The bodies of the individual conditions
# are not shown (presumably they hand t to the current insertion mode and
# return -- confirm).
1143 process_token = (t) ->
1144 acn = adjusted_current_node()
1148 if acn.namespace is NS_HTML
1151 if is_mathml_text_integration_point(acn)
# NOTE(review): the spec's condition here is a start tag whose name is
# *neither* mglyph nor malignmark; the branch body is not visible, so the
# polarity of the test below should be confirmed.
1152 if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1155 if t.type is TYPE_TEXT
1158 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1161 if is_html_integration acn
1162 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1165 if t.type is TYPE_EOF
1168 in_foreign_content t
1172 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1173 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes the spec's "appropriate place for inserting a node".
#   override_target: optional node to use instead of the current node
#                    (open_els[0]).
#   returns: [target, target_i] -- the parent node and the index within
#            target.children at which the new node should be spliced in.
# When foster parenting is enabled and the target is one of
# foster_parenting_targets, the location is derived from the last template /
# last table on open_els instead.
1174 adjusted_insertion_location = (override_target = null) ->
1175 # 1. If there was an override target specified, then let target be the
1178 target = override_target
1179 else # Otherwise, let target be the current node.
1180 target = open_els[0]
1181 # 2. Determine the adjusted insertion location using the first matching
1182 # steps from the following list:
1184 # If foster parenting is enabled and target is a table, tbody, tfoot,
1185 # thead, or tr element Foster parenting happens when content is
1186 # misnested in tables.
1187 if flag_foster_parenting and foster_parenting_targets[target.name]
1188 loop # once. this is here so we can ``break`` to "abort these substeps"
1189 # 1. Let last template be the last template element in the
1190 # stack of open elements, if any.
1191 last_template = null
1192 last_template_i = null
1193 for el, i in open_els
1194 if el.name is 'template'
1198 # 2. Let last table be the last table element in the stack of
1199 # open elements, if any.
1202 for el, i in open_els
1203 if el.name is 'table'
1207 # 3. If there is a last template and either there is no last
1208 # table, or there is one, but last template is lower (more
1209 # recently added) than last table in the stack of open
1210 # elements, then: let adjusted insertion location be inside
1211 # last template's template contents, after its last child (if
1212 # any), and abort these substeps.
1213 if last_template and (last_table is null or last_template_i < last_table_i)
1214 target = last_template # fixfull should be its contents
1215 target_i = target.children.length
1217 # 4. If there is no last table, then let adjusted insertion
1218 # location be inside the first element in the stack of open
1219 # elements (the html element), after its last child (if any),
1220 # and abort these substeps. (fragment case)
1221 if last_table is null
1223 target = open_els[open_els.length - 1]
1224 target_i = target.children.length
break # abort these substeps (belongs inside the `if last_table is null` body); without it step 5 dereferences last_table.parent on null
1225 # 5. If last table has a parent element, then let adjusted
1226 # insertion location be inside last table's parent element,
1227 # immediately before last table, and abort these substeps.
1228 if last_table.parent?
1229 for c, i in last_table.parent.children
1231 target = last_table.parent
1235 # 6. Let previous element be the element immediately above last
1236 # table in the stack of open elements.
1238 # huh? how could it not have a parent?
1239 previous_element = open_els[last_table_i + 1] # "above" means a higher index (see the stack notes at the top of the file)
1240 # 7. Let adjusted insertion location be inside previous
1241 # element, after its last child (if any).
1242 target = previous_element
1243 target_i = target.children.length
1244 # Note: These steps are involved in part because it's possible
1245 # for elements, the table element in this case in particular,
1246 # to have been moved by a script around in the DOM, or indeed
1247 # removed from the DOM entirely, after the element was inserted
1249 break # don't really loop
1251 # Otherwise Let adjusted insertion location be inside target, after
1252 # its last child (if any).
1253 target_i = target.children.length
1255 # 3. If the adjusted insertion location is inside a template element,
1256 # let it instead be inside the template element's template contents,
1257 # after its last child (if any).
1258 # fixfull (template)
1260 # 4. Return the adjusted insertion location.
1261 return [target, target_i]
1263 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1264 # aka create_an_element_for_token
# Builds a TYPE_TAG Node for tag token t in the given namespace. The token's
# attributes (pairs whose [0] is the name and [1] the value) are copied into a
# hash, and the node keeps a reference to t (token: t). Callers use the return
# value as the new element.
# NOTE(review): intended_parent is not referenced by any line visible in this
# listing.
1265 token_to_element = (t, namespace, intended_parent) ->
1266 # convert attributes into a hash
1269 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1270 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1272 # TODO 2. If the newly created element has an xmlns attribute in the
1273 # XMLNS namespace whose value is not exactly the same as the element's
1274 # namespace, that is a parse error. Similarly, if the newly created
1275 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1276 # value is not the XLink Namespace, that is a parse error.
1278 # fixfull: the spec says stuff about form pointers and ownerDocument
1282 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for token in the given namespace (via token_to_element)
# and splices it into the children of the adjusted insertion location.
# Callers assign the result (e.g. `el = insert_html_element t`), so the new
# element is evidently returned.
# NOTE(review): the lines that define ail_el / ail_i and whatever bookkeeping
# follows the splice are not visible in this listing; presumably ail_el and
# ail_i are ail[0] and ail[1] -- confirm.
1283 insert_foreign_element = (token, namespace) ->
1284 ail = adjusted_insertion_location()
1287 el = token_to_element token, namespace, ail_el
1288 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1290 ail_el.children.splice ail_i, 0, el
1293 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Thin wrapper: inserts an element for token in the HTML namespace by
# delegating to insert_foreign_element.
1294 insert_html_element = (token) ->
1295 insert_foreign_element token, NS_HTML
1297 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1298 # position should be [node, index_within_children]
# Splices comment token t into position[0].children at index position[1].
# When position is omitted (null) the adjusted insertion location is used.
1299 insert_comment = (t, position = null) ->
1300 position ?= adjusted_insertion_location()
1301 position[0].children.splice position[1], 0, t
1304 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Raw-text variant: inserts an HTML element for t, switches the tokenizer to
# tok_state_rawtext, remembers the current insertion mode in
# original_ins_mode and switches to ins_mode_text.
1305 parse_generic_raw_text = (t) ->
1306 insert_html_element t
1307 tok_state = tok_state_rawtext
1308 original_ins_mode = ins_mode # restored later by ins_mode_text
1309 ins_mode = ins_mode_text
# RCDATA variant of the generic text parsing algorithm: identical to
# parse_generic_raw_text except that the tokenizer is switched to
# tok_state_rcdata.
1310 parse_generic_rcdata_text = (t) ->
1311 insert_html_element t
1312 tok_state = tok_state_rcdata
1313 original_ins_mode = ins_mode # restored later by ins_mode_text
1314 ins_mode = ins_mode_text
1316 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1317 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) has a name listed in
# end_tag_implied and that name is not `except`.
#   except: optional tag name that must NOT be implicitly closed.
# NOTE(review): the loop body is not visible in this listing; presumably it
# pops the current node off open_els -- confirm.
1318 generate_implied_end_tags = (except = null) ->
1319 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1322 # 8.2.5.4 The rules for parsing tokens in HTML content
1323 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1325 # 8.2.5.4.1 The "initial" insertion mode
1326 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handles token t in the "initial" insertion mode. Both assignments visible
# below move the parser on to ins_mode_before_html; the handling of comments
# and the remaining statements of each branch are not visible in this listing.
1327 ins_mode_initial = (t) ->
1330 if t.type is TYPE_COMMENT
1334 if t.type is TYPE_DOCTYPE
1335 # FIXME check identifiers, set quirks, etc
1338 ins_mode = ins_mode_before_html
1341 #fixfull (iframe, quirks)
1342 ins_mode = ins_mode_before_html
1346 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handles token t in the "before html" insertion mode. An explicit <html>
# start tag, or a synthesized one in the "anything else" case, becomes an
# element appended to doc.children, after which the mode switches to
# ins_mode_before_head. Branch bodies for DOCTYPE/comment tokens are not
# visible in this listing.
1347 ins_mode_before_html = (t) ->
1348 if t.type is TYPE_DOCTYPE
1351 if t.type is TYPE_COMMENT
1356 if t.type is TYPE_START_TAG and t.name is 'html'
1357 el = token_to_element t, NS_HTML, doc
1358 doc.children.push el
1359 open_els.unshift(el) # index 0 of open_els is the current node
1360 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1361 ins_mode = ins_mode_before_head
1363 if t.type is TYPE_END_TAG
1364 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1365 # fall through to "anything else"
# "anything else": synthesize an <html> start tag token and create the root element from it
1370 html_tok = new_open_tag 'html'
1371 el = token_to_element html_tok, NS_HTML, doc
1372 doc.children.push el
1374 # ?fixfull browsing context
1375 ins_mode = ins_mode_before_head
1379 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handles token t in the "before head" insertion mode. A <head> start tag is
# inserted and remembered in head_element_pointer; in the "anything else"
# case a <head> start tag is synthesized instead. Either way the mode
# switches to ins_mode_in_head. Branch bodies for comment/DOCTYPE/html tokens
# are not visible in this listing.
1380 ins_mode_before_head = (t) ->
1383 if t.type is TYPE_COMMENT
1386 if t.type is TYPE_DOCTYPE
1389 if t.type is TYPE_START_TAG and t.name is 'html'
1392 if t.type is TYPE_START_TAG and t.name is 'head'
1393 el = insert_html_element t
1394 head_element_pointer = el
1395 ins_mode = ins_mode_in_head
return # the <head> token is fully handled (belongs inside the branch above); without this, control fell through to "anything else" and inserted a second head element
1396 if t.type is TYPE_END_TAG
1397 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1398 # fall through to Anything else below
# "anything else": act as if a <head> start tag had been seen
1403 head_tok = new_open_tag 'head'
1404 el = insert_html_element head_tok
1405 head_element_pointer = el
1406 ins_mode = ins_mode_in_head
1409 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" mode: pops the current node and
# switches to ins_mode_after_head.
# NOTE(review): any statement after the mode switch is not visible in this
# listing (presumably t is reprocessed in the new mode -- confirm).
1410 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1411 open_els.shift() # spec says this will be a 'head' node
1412 ins_mode = ins_mode_after_head
# Handles token t in the "in head" insertion mode: metadata elements (base,
# link, meta, ...), title/style/noscript/script (which switch to text
# parsing), template start/end, the end of head, and an "anything else"
# fallback via ins_mode_in_head_else. Several branch bodies and the
# `return`s between branches are not visible in this listing.
1414 ins_mode_in_head = (t) ->
1415 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1418 if t.type is TYPE_COMMENT
1421 if t.type is TYPE_DOCTYPE
1424 if t.type is TYPE_START_TAG and t.name is 'html'
1427 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1428 el = insert_html_element t
1430 t.acknowledge_self_closing()
1432 if t.type is TYPE_START_TAG and t.name is 'meta'
1433 el = insert_html_element t
1435 t.acknowledge_self_closing()
1436 # fixfull encoding stuff
1438 if t.type is TYPE_START_TAG and t.name is 'title'
1439 parse_generic_rcdata_text t
1441 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1442 parse_generic_raw_text t
1444 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1445 insert_html_element t
1446 ins_mode = ins_mode_in_head_noscript
1448 if t.type is TYPE_START_TAG and t.name is 'script'
1449 ail = adjusted_insertion_location()
1450 el = token_to_element t, NS_HTML, ail[0] # intended parent is the target node, not the whole [target, index] pair
1451 el.flag 'parser-inserted', true
1452 # fixfull fragment case
1453 ail[0].children.splice ail[1], 0, el
1455 tok_state = tok_state_script_data
1456 original_ins_mode = ins_mode # make sure orig... is defined
1457 ins_mode = ins_mode_text
1459 if t.type is TYPE_END_TAG and t.name is 'head'
1460 open_els.shift() # will be a head element... spec says so
1461 ins_mode = ins_mode_after_head
1463 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1464 ins_mode_in_head_else t
1466 if t.type is TYPE_START_TAG and t.name is 'template'
1467 insert_html_element t
1469 flag_frameset_ok = false
1470 ins_mode = ins_mode_in_template
1471 template_ins_modes.unshift ins_mode_in_template
1473 if t.type is TYPE_END_TAG and t.name is 'template'
1474 if template_tag_is_open()
1475 generate_implied_end_tags() # parentheses are required: a bare name only references the function and never calls it
1476 if open_els[0].name isnt 'template'
1479 el = open_els.shift()
1480 if el.name is 'template'
1482 clear_afe_to_marker()
1483 template_ins_modes.shift()
1488 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1491 ins_mode_in_head_else t
1493 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" mode; the only statement
# visible in this listing switches back to ins_mode_in_head.
# NOTE(review): the statements before and after the mode switch are not
# shown (presumably parse error, pop the noscript element, reprocess t --
# confirm).
1494 ins_mode_in_head_noscript_else = (t) ->
1497 ins_mode = ins_mode_in_head
# Handles token t in the "in head noscript" insertion mode. </noscript>
# returns to ins_mode_in_head; </br> and every otherwise-unhandled token go
# through ins_mode_in_head_noscript_else. Most branch bodies are not visible
# in this listing.
1499 ins_mode_in_head_noscript = (t) ->
1500 if t.type is TYPE_DOCTYPE
# Only the <html> start tag gets the special handling here (as in the other
# insertion modes). Matching every start tag made the start-tag branches
# further down (basefont/link/meta/..., head/noscript) unreachable.
1503 if t.type is TYPE_START_TAG and t.name is 'html'
1506 if t.type is TYPE_END_TAG and t.name is 'noscript'
1508 ins_mode = ins_mode_in_head
1510 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1513 if t.type is TYPE_END_TAG and t.name is 'br'
1514 ins_mode_in_head_noscript_else t
1516 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1520 ins_mode_in_head_noscript_else t
1525 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" mode: synthesizes a <body> start
# tag, inserts an element for it and switches to ins_mode_in_body.
# NOTE(review): any statement after the mode switch is not visible in this
# listing (presumably t is reprocessed in the new mode -- confirm).
1526 ins_mode_after_head_else = (t) ->
1527 body_tok = new_open_tag 'body'
1528 insert_html_element body_tok
1529 ins_mode = ins_mode_in_body
# Handles token t in the "after head" insertion mode: <body> and <frameset>
# open their elements and switch modes; head-only elements seen after </head>
# are processed with the head element temporarily pushed back onto open_els;
# everything else goes through ins_mode_after_head_else. Several branch bodies
# are not visible in this listing.
1532 ins_mode_after_head = (t) ->
1536 if t.type is TYPE_COMMENT
1539 if t.type is TYPE_DOCTYPE
1542 if t.type is TYPE_START_TAG and t.name is 'html'
1545 if t.type is TYPE_START_TAG and t.name is 'body'
1546 insert_html_element t
1547 flag_frameset_ok = false
1548 ins_mode = ins_mode_in_body
1550 if t.type is TYPE_START_TAG and t.name is 'frameset'
1551 insert_html_element t
1552 ins_mode = ins_mode_in_frameset
1554 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1556 open_els.unshift head_element_pointer
# Remove the head element from open_els again. `in` iterates array elements
# with their index; the previous `of` iterated key/value pairs, so `el` was
# an index string, the comparison never matched and the warning below fired.
1558 for el, i in open_els
1559 if el is head_element_pointer
1560 open_els.splice i, 1
1562 console.log "warning: 23904 couldn't find head element in open_els"
1564 if t.type is TYPE_END_TAG and t.name is 'template'
1567 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1568 ins_mode_after_head_else t
1570 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1574 ins_mode_after_head_else t
1576 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" steps of the "in body" mode. Walks open_els from the
# current node (index 0) looking for an HTML element called `name`; the walk
# also tests each element against special_elements.
#   name: tag name of the end tag being processed.
# NOTE(review): the statements that pop the stack and the body of the
# special_elements check are not visible in this listing.
1577 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1578 for el, i in open_els
1579 if el.namespace is NS_HTML and el.name is name
1580 generate_implied_end_tags name # arg is exception
# NOTE(review): i was computed before generate_implied_end_tags ran; if that
# call popped entries, i no longer equals el's position -- confirm this test
# (and any index-based popping that follows) still means "el is the current node".
1581 parse_error() unless i is 0
1586 if special_elements[el.name] is el.namespace
# Handles token t in the "in body" insertion mode (spec section 8.2.5.4.7).
# The function is a long sequence of `if` branches, one per token category
# from the spec, ending with the generic "any other start tag" / "any other
# end tag" cases. Many short statements (return, break, parse_error(), loop
# headers) are not visible in this listing.
# Conventions relied on below: open_els[0] is the current node, the root html
# element is the last entry of open_els, and elements store their namespace
# in `.namespace`.
1590 ins_mode_in_body = (t) ->
1591 if t.type is TYPE_TEXT and t.text is "\u0000"
1598 if t.type is TYPE_TEXT
1601 flag_frameset_ok = false
1603 if t.type is TYPE_COMMENT
1606 if t.type is TYPE_DOCTYPE
1609 if t.type is TYPE_START_TAG and t.name is 'html'
1611 return if template_tag_is_open()
1612 root_attrs = open_els[open_els.length - 1].attrs
1614 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1617 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1620 if t.type is TYPE_START_TAG and t.name is 'body'
1622 return if open_els.length < 2
1623 second = open_els[open_els.length - 2]
1624 return unless second.namespace is NS_HTML # elements have `.namespace`; `.ns` was always undefined, so this branch always bailed out
1625 return unless second.name is 'body'
1626 return if template_tag_is_open()
1627 flag_frameset_ok = false # was assigned to a misspelled variable (frameset_ok_flag), leaving the real flag untouched
1629 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1631 if t.type is TYPE_START_TAG and t.name is 'frameset'
1633 return if open_els.length < 2
1634 second_i = open_els.length - 2
1635 second = open_els[second_i]
1636 return unless second.namespace is NS_HTML # see the body branch above: `.ns` does not exist
1637 return unless second.name is 'body'
1638 return if flag_frameset_ok is false # spec: if the frameset-ok flag is "not ok", ignore the token (previously the flag was overwritten here instead of tested)
1640 for el, i in second.parent.children
1642 second.parent.children.splice i, 1
1644 open_els.splice second_i, 1
1645 # pop everything except the "root html element"
1646 while open_els.length > 1
1648 insert_html_element t
1649 ins_mode = ins_mode_in_frameset
1651 if t.type is TYPE_EOF
1653 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1654 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1655 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1658 unless ok_tags[el.name] is el.namespace # the check is about each open element, not about the token
1661 if template_ins_modes.length > 0
1662 ins_mode_in_template t
1666 if t.type is TYPE_END_TAG and t.name is 'body'
1667 unless is_in_scope 'body'
1671 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1672 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1673 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1674 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1678 unless ok_tags[el.name] is el.namespace # the check is about each open element, not about the token
1681 ins_mode = ins_mode_after_body
1683 if t.type is TYPE_END_TAG and t.name is 'html'
1684 unless is_in_scope 'body'
1688 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1689 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1690 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1691 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1695 unless ok_tags[el.name] is el.namespace # the check is about each open element, not about the token
1698 ins_mode = ins_mode_after_body
1701 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1702 close_p_if_in_button_scope()
1703 insert_html_element t
1705 if t.type is TYPE_START_TAG and h_tags[t.name]?
1706 close_p_if_in_button_scope()
1707 if h_tags[open_els[0].name] is open_els[0].namespace # h_tags is keyed by tag name; indexing it with the element object never matched
1710 insert_html_element t
1712 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1713 close_p_if_in_button_scope()
1714 insert_html_element t
1715 # spec: If the next token is a "LF" (U+000A) character token, then
1716 # ignore that token and move on to the next one. (Newlines at the
1717 # start of pre blocks are ignored as an authoring convenience.)
1718 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1720 flag_frameset_ok = false
1722 if t.type is TYPE_START_TAG and t.name is 'form'
1723 unless form_element_pointer is null or template_tag_is_open()
1726 close_p_if_in_button_scope()
1727 el = insert_html_element t
1728 unless template_tag_is_open()
1729 form_element_pointer = el
1731 if t.type is TYPE_START_TAG and t.name is 'li'
1732 flag_frameset_ok = false
1733 for node in open_els
1734 if node.name is 'li' and node.namespace is NS_HTML
1735 generate_implied_end_tags 'li' # arg is exception
1736 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1739 el = open_els.shift()
1740 if el.name is 'li' and el.namespace is NS_HTML
1743 if el_is_special_not_adp node
1745 close_p_if_in_button_scope()
1746 insert_html_element t
1748 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1749 flag_frameset_ok = false
1750 for node in open_els
1751 if node.name is 'dd' and node.namespace is NS_HTML
1752 generate_implied_end_tags 'dd' # arg is exception
1753 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1756 el = open_els.shift()
1757 if el.name is 'dd' and el.namespace is NS_HTML
1760 if node.name is 'dt' and node.namespace is NS_HTML
1761 generate_implied_end_tags 'dt' # arg is exception
1762 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1765 el = open_els.shift()
1766 if el.name is 'dt' and el.namespace is NS_HTML
1769 if el_is_special_not_adp node
1771 close_p_if_in_button_scope()
1772 insert_html_element t
1774 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1775 close_p_if_in_button_scope()
1776 insert_html_element t
1777 tok_state = tok_state_plaintext
1779 if t.type is TYPE_START_TAG and t.name is 'button'
1780 if is_in_scope 'button', NS_HTML
1782 generate_implied_end_tags()
1784 el = open_els.shift()
1785 if el.name is 'button' and el.namespace is NS_HTML
1788 insert_html_element t
1789 flag_frameset_ok = false
1791 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1792 unless is_in_scope t.name, NS_HTML
1795 generate_implied_end_tags()
1796 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1799 el = open_els.shift()
1800 if el.name is t.name and el.namespace is NS_HTML
1803 if t.type is TYPE_END_TAG and t.name is 'form'
1804 unless template_tag_is_open()
1805 node = form_element_pointer
1806 form_element_pointer = null
1807 if node is null or not el_is_in_scope node
1810 generate_implied_end_tags()
1811 if open_els[0] isnt node
1813 for el, i in open_els
1815 open_els.splice i, 1
1818 unless is_in_scope 'form', NS_HTML
1821 generate_implied_end_tags()
1822 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1825 el = open_els.shift()
1826 if el.name is 'form' and el.namespace is NS_HTML
1829 if t.type is TYPE_END_TAG and t.name is 'p'
1830 unless is_in_button_scope 'p', NS_HTML
1832 insert_html_element new_open_tag 'p'
1835 if t.type is TYPE_END_TAG and t.name is 'li'
1836 unless is_in_li_scope 'li', NS_HTML
1839 generate_implied_end_tags 'li' # arg is exception
1840 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1843 el = open_els.shift()
1844 if el.name is 'li' and el.namespace is NS_HTML
1847 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1848 unless is_in_scope t.name, NS_HTML
1851 generate_implied_end_tags t.name # arg is exception
1852 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1855 el = open_els.shift()
1856 if el.name is t.name and el.namespace is NS_HTML
1859 if t.type is TYPE_END_TAG and h_tags[t.name]?
1862 if h_tags[el.name] is el.namespace
1865 if standard_scopers[el.name] is el.namespace
1870 generate_implied_end_tags()
1871 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1874 el = open_els.shift()
1875 if h_tags[el.name] is el.namespace
1879 if t.type is TYPE_START_TAG and t.name is 'a'
1880 # If the list of active formatting elements contains an a element
1881 # between the end of the list and the last marker on the list (or
1882 # the start of the list if there is no marker on the list), then
1883 # this is a parse error; run the adoption agency algorithm for the
1884 # tag name "a", then remove that element from the list of active
1885 # formatting elements and the stack of open elements if the
1886 # adoption agency algorithm didn't already remove it (it might not
1887 # have if the element is not in table scope).
1890 if el.type is TYPE_AFE_MARKER
1892 if el.name is 'a' and el.namespace is NS_HTML
1900 for el, i in open_els
1902 open_els.splice i, 1
1904 el = insert_html_element t
1907 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1909 el = insert_html_element t
1912 if t.type is TYPE_START_TAG and t.name is 'nobr'
1914 el = insert_html_element t
1917 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1918 adoption_agency t.name
1920 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1922 insert_html_element t
1924 flag_frameset_ok = false
1926 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1927 unless is_in_scope t.name, NS_HTML
1930 generate_implied_end_tags()
1931 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1934 el = open_els.shift()
1935 if el.name is t.name and el.namespace is NS_HTML
1937 clear_afe_to_marker()
1939 if t.type is TYPE_START_TAG and t.name is 'table'
1940 close_p_if_in_button_scope() # fixfull quirksmode thing
1941 insert_html_element t
1942 flag_frameset_ok = false
1943 ins_mode = ins_mode_in_table
1945 if t.type is TYPE_END_TAG and t.name is 'br'
1947 t.type = TYPE_START_TAG # spec: treat </br> as a <br> start tag; the previous `is` was a comparison with no effect, so the next branch never saw it
1949 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1951 insert_html_element t
1953 t.acknowledge_self_closing()
1954 flag_frameset_ok = false
1956 if t.type is TYPE_START_TAG and t.name is 'input'
1958 insert_html_element t
1960 t.acknowledge_self_closing()
1961 unless is_input_hidden_tok t
1962 flag_frameset_ok = false
1964 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1965 insert_html_element t
1967 t.acknowledge_self_closing()
1969 if t.type is TYPE_START_TAG and t.name is 'hr'
1970 close_p_if_in_button_scope()
1971 insert_html_element t
1973 t.acknowledge_self_closing()
1974 flag_frameset_ok = false
1976 if t.type is TYPE_START_TAG and t.name is 'image'
1981 if t.type is TYPE_START_TAG and t.name is 'isindex'
1983 if template_tag_is_open() is false and form_element_pointer isnt null
1985 t.acknowledge_self_closing()
1986 flag_frameset_ok = false
1987 close_p_if_in_button_scope()
1988 el = insert_html_element new_open_tag 'form'
1989 unless template_tag_is_open()
1990 form_element_pointer = el
1993 el.attrs['action'] = a[1]
1995 insert_html_element new_open_tag 'hr'
1998 insert_html_element new_open_tag 'label'
1999 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2000 input_el = new_open_tag 'input'
2005 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2006 input_el.attrs_a.push [a[0], a[1]]
2007 input_el.attrs_a.push ['name', 'isindex']
2008 # fixfull this next bit is in english... internationalize?
2009 prompt ?= "This is a searchable index. Enter search keywords: "
2010 insert_character new_character_token prompt # fixfull split
2011 # TODO submit typo "balue" in spec
2012 insert_html_element input_el
2014 # insert_character '' # you can put chars here if prompt attr missing
2016 insert_html_element new_open_tag 'hr'
2019 unless template_tag_is_open()
2020 form_element_pointer = null
2022 if t.type is TYPE_START_TAG and t.name is 'textarea'
2023 insert_html_element t
2024 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2026 tok_state = tok_state_rcdata
2027 original_ins_mode = ins_mode
2028 flag_frameset_ok = false
2029 ins_mode = ins_mode_text
2031 if t.type is TYPE_START_TAG and t.name is 'xmp'
2032 close_p_if_in_button_scope()
2034 flag_frameset_ok = false
2035 parse_generic_raw_text t
2037 if t.type is TYPE_START_TAG and t.name is 'iframe'
2038 flag_frameset_ok = false
2039 parse_generic_raw_text t
2041 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2042 parse_generic_raw_text t
2044 if t.type is TYPE_START_TAG and t.name is 'select'
2046 insert_html_element t
2047 flag_frameset_ok = false
2048 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2049 ins_mode = ins_mode_in_select_in_table
2051 ins_mode = ins_mode_in_select
2053 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2054 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2057 insert_html_element t
2059 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2060 if is_in_scope 'ruby', NS_HTML
2061 generate_implied_end_tags()
2062 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2064 insert_html_element t
2066 if t.type is TYPE_START_TAG and t.name is 'rt'
2067 if is_in_scope 'ruby', NS_HTML
2068 generate_implied_end_tags 'rtc' # arg is exception
2069 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2071 insert_html_element t
2073 if t.type is TYPE_START_TAG and t.name is 'math'
2075 adjust_mathml_attributes t
2076 adjust_foreign_attributes t
2077 insert_foreign_element t, NS_MATHML
2078 if t.flag 'self-closing'
2080 t.acknowledge_self_closing()
2082 if t.type is TYPE_START_TAG and t.name is 'svg'
2084 adjust_svg_attributes t
2085 adjust_foreign_attributes t
2086 insert_foreign_element t, NS_SVG
2087 if t.flag 'self-closing'
2089 t.acknowledge_self_closing()
2091 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2094 if t.type is TYPE_START_TAG # any other start tag
2096 insert_html_element t
2098 if t.type is TYPE_END_TAG # any other end tag
2099 in_body_any_other_end_tag t.name
# "Anything else" handling for the "in table" insertion mode (called from
# ins_mode_in_table). Foster parenting is switched on and then off again
# around a processing step that is not visible in this listing (presumably
# t is handled with the "in body" rules in between -- confirm).
2103 ins_mode_in_table_else = (t) ->
2105 flag_foster_parenting = true # FIXME
2107 flag_foster_parenting = false
2108 can_in_table = { # FIXME do this inline like everywhere else
2116 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2117 ins_mode_text = (t) ->
2118 if t.type is TYPE_TEXT
2121 if t.type is TYPE_EOF
2123 if open_els[0].name is 'script'
2124 open_els[0].flag 'already started', true
2126 ins_mode = original_ins_mode
2129 if t.type is TYPE_END_TAG and t.name is 'script'
2131 ins_mode = original_ins_mode
2132 # fixfull the spec seems to assume that I'm going to run the script
2133 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2135 if t.type is TYPE_END_TAG
2137 ins_mode = original_ins_mode
2139 console.log 'warning: end of ins_mode_text reached'
2141 # the functions below implement the tokenizer states described here:
2142 # http://www.w3.org/TR/html5/syntax.html#tokenization
2144 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2145 ins_mode_in_table = (t) ->
2148 if can_in_table[t.name]
2149 original_ins_mode = ins_mode
2150 ins_mode = ins_mode_in_table_text
2153 ins_mode_in_table_else t
2161 clear_stack_to_table_context()
2163 insert_html_element t
2164 ins_mode = ins_mode_in_caption
2166 clear_stack_to_table_context()
2167 insert_html_element t
2168 ins_mode = ins_mode_in_column_group
2170 clear_stack_to_table_context()
2171 insert_html_element new_open_tag 'colgroup'
2172 ins_mode = ins_mode_in_column_group
2174 when 'tbody', 'tfoot', 'thead'
2175 clear_stack_to_table_context()
2176 insert_html_element t
2177 ins_mode = ins_mode_in_table_body
2178 when 'td', 'th', 'tr'
2179 clear_stack_to_table_context()
2180 insert_html_element new_open_tag 'tbody'
2181 ins_mode = ins_mode_in_table_body
2185 if is_in_table_scope 'table'
2187 el = open_els.shift()
2188 if el.name is 'table'
2192 when 'style', 'script', 'template'
2195 if is_input_hidden_tok t
2196 ins_mode_in_table_else t
2199 el = insert_html_element t
2201 t.acknowledge_self_closing()
2204 if form_element_pointer?
2206 if template_tag_is_open()
2208 form_element_pointer = insert_html_element t
2211 ins_mode_in_table_else t
2215 if is_in_table_scope 'table'
2217 el = open_els.shift()
2218 if el.name is 'table'
2223 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2228 ins_mode_in_table_else t
2232 ins_mode_in_table_else t
2235 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
ins_mode_in_table_text = (t) ->
	# "In table text" insertion mode.
	#
	# Text tokens are buffered in pending_table_character_tokens. The first
	# non-text token flushes the buffer: if every buffered token is
	# whitespace (per is_space_tok) the tokens are inserted normally,
	# otherwise each one is handed to ins_mode_in_table_else. The insertion
	# mode is then restored to original_ins_mode and t is reprocessed there.
	#
	# t: the token to process.
	if t.type is TYPE_TEXT and t.text is "\u0000"
		# huh? I thought the tokenizer didn't emit these
		parse_error()
		return
	if t.type is TYPE_TEXT
		pending_table_character_tokens.push t
		return
	# Anything else
	all_space = true
	for old in pending_table_character_tokens
		unless is_space_tok old
			all_space = false
			break
	if all_space
		for old in pending_table_character_tokens
			insert_character old
	else
		for old in pending_table_character_tokens
			# this used to call ins_mode_table_else, which is not defined;
			# the "anything else" handler is named ins_mode_in_table_else
			ins_mode_in_table_else old
	pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
	ins_mode = original_ins_mode
	ins_mode t
2260 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2261 ins_mode_in_caption = (t) ->
2262 if t.type is TYPE_END_TAG and t.name is 'caption'
2263 if is_in_table_scope 'caption'
2264 generate_implied_end_tags()
2265 if open_els[0].name isnt 'caption'
2268 el = open_els.shift()
2269 if el.name is 'caption'
2271 clear_afe_to_marker()
2272 ins_mode = ins_mode_in_table
2277 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2279 if is_in_table_scope 'caption'
2281 el = open_els.shift()
2282 if el.name is 'caption'
2284 clear_afe_to_marker()
2285 ins_mode = ins_mode_in_table
2287 # else fragment case
2289 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2295 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2296 ins_mode_in_column_group = (t) ->
2300 if t.type is TYPE_COMMENT
2303 if t.type is TYPE_DOCTYPE
2306 if t.type is TYPE_START_TAG and t.name is 'html'
2309 if t.type is TYPE_START_TAG and t.name is 'col'
2310 el = insert_html_element t
2312 t.acknowledge_self_closing()
2314 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2315 if open_els[0].name is 'colgroup'
2317 ins_mode = ins_mode_in_table
2321 if t.type is TYPE_END_TAG and t.name is 'col'
2324 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2327 if t.type is TYPE_EOF
2331 if open_els[0].name isnt 'colgroup'
2335 ins_mode = ins_mode_in_table
2339 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2340 ins_mode_in_table_body = (t) ->
2341 if t.type is TYPE_START_TAG and t.name is 'tr'
2342 clear_stack_to_table_body_context()
2343 insert_html_element t
2344 ins_mode = ins_mode_in_row
2346 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2348 clear_stack_to_table_body_context()
2349 insert_html_element new_open_tag 'tr'
2350 ins_mode = ins_mode_in_row
2353 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2354 unless is_in_table_scope t.name # fixfull check namespace
2357 clear_stack_to_table_body_context()
2359 ins_mode = ins_mode_in_table
2361 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2364 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2367 if table_scopers[el.name]
2372 clear_stack_to_table_body_context()
2374 ins_mode = ins_mode_in_table
2377 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2383 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2384 ins_mode_in_row = (t) ->
2385 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2386 clear_stack_to_table_row_context()
2387 insert_html_element t
2388 ins_mode = ins_mode_in_cell
2391 if t.type is TYPE_END_TAG and t.name is 'tr'
2392 if is_in_table_scope 'tr'
2393 clear_stack_to_table_row_context()
2395 ins_mode = ins_mode_in_table_body
2399 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2400 if is_in_table_scope 'tr'
2401 clear_stack_to_table_row_context()
2403 ins_mode = ins_mode_in_table_body
2408 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2409 if is_in_table_scope t.name # fixfull namespace
2410 if is_in_table_scope 'tr'
2411 clear_stack_to_table_row_context()
2413 ins_mode = ins_mode_in_table_body
2418 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2424 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
close_the_cell = ->
	# Closes the currently open table cell: generates implied end tags,
	# reports a parse error if the current node is not a td/th, pops
	# open_els up to and including the nearest td/th, clears the active
	# formatting elements back to the last marker, and switches to the
	# "in row" insertion mode.
	generate_implied_end_tags()
	# compare the element's *name*; comparing the element object itself to
	# 'th' was always false and raised a spurious parse error for <th> cells
	unless open_els[0].name is 'td' or open_els[0].name is 'th'
		parse_error()
	loop
		el = open_els.shift()
		if el.name is 'td' or el.name is 'th'
			break
	clear_afe_to_marker()
	ins_mode = ins_mode_in_row
2436 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2437 ins_mode_in_cell = (t) ->
2438 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2439 if is_in_table_scope t.name
2440 generate_implied_end_tags()
2441 if open_els[0].name isnt t.name
2444 el = open_els.shift()
2445 if el.name is t.name
2447 clear_afe_to_marker()
2448 ins_mode = ins_mode_in_row
2452 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2455 if el.name is 'td' or el.name is 'th'
2458 if table_scopers[el.name]
2466 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2469 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2470 if is_in_table_scope t.name # fixfull namespace
2479 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2480 ins_mode_in_select = (t) ->
2481 if t.type is TYPE_TEXT and t.text is "\u0000"
2484 if t.type is TYPE_TEXT
2487 if t.type is TYPE_COMMENT
2490 if t.type is TYPE_DOCTYPE
2493 if t.type is TYPE_START_TAG and t.name is 'html'
2496 if t.type is TYPE_START_TAG and t.name is 'option'
2497 if open_els[0].name is 'option'
2499 insert_html_element t
2501 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2502 if open_els[0].name is 'option'
2504 if open_els[0].name is 'optgroup'
2506 insert_html_element t
2508 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2509 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2511 if open_els[0].name is 'optgroup'
2516 if t.type is TYPE_END_TAG and t.name is 'option'
2517 if open_els[0].name is 'option'
2522 if t.type is TYPE_END_TAG and t.name is 'select'
2523 if is_in_select_scope 'select'
2525 el = open_els.shift()
2526 if el.name is 'select'
2532 if t.type is TYPE_START_TAG and t.name is 'select'
2535 el = open_els.shift()
2536 if el.name is 'select'
2539 # spec says that this is the same as </select> but it doesn't say
2540 # to check scope first
2542 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2544 if is_in_select_scope 'select'
2547 el = open_els.shift()
2548 if el.name is 'select'
2553 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2556 if t.type is TYPE_EOF
2563 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2564 ins_mode_in_select_in_table = (t) ->
2565 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2568 el = open_els.shift()
2569 if el.name is 'select'
2574 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2576 unless is_in_table_scope t.name, NS_HTML
2579 el = open_els.shift()
2580 if el.name is 'select'
2586 ins_mode_in_select t
2589 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2590 ins_mode_in_template = (t) ->
2591 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2594 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2597 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2598 template_ins_modes.shift()
2599 template_ins_modes.unshift ins_mode_in_table
2600 ins_mode = ins_mode_in_table
2603 if t.type is TYPE_START_TAG and t.name is 'col'
2604 template_ins_modes.shift()
2605 template_ins_modes.unshift ins_mode_in_column_group
2606 ins_mode = ins_mode_in_column_group
2609 if t.type is TYPE_START_TAG and t.name is 'tr'
2610 template_ins_modes.shift()
2611 template_ins_modes.unshift ins_mode_in_table_body
2612 ins_mode = ins_mode_in_table_body
2615 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2616 template_ins_modes.shift()
2617 template_ins_modes.unshift ins_mode_in_row
2618 ins_mode = ins_mode_in_row
2621 if t.type is TYPE_START_TAG
2622 template_ins_modes.shift()
2623 template_ins_modes.unshift ins_mode_in_body
2624 ins_mode = ins_mode_in_body
2627 if t.type is TYPE_END_TAG
2630 if t.type is TYPE_EOF
2631 unless template_tag_is_open()
2636 el = open_els.shift()
2637 if el.name is 'template' # fixfull check namespace
2639 clear_afe_to_marker()
2640 template_ins_modes.shift()
2644 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2645 ins_mode_after_body = (t) ->
2649 if t.type is TYPE_COMMENT
2650 insert_comment t, [open_els[0], open_els[0].children.length]
2652 if t.type is TYPE_DOCTYPE
2655 if t.type is TYPE_START_TAG and t.name is 'html'
2658 if t.type is TYPE_END_TAG and t.name is 'html'
2659 # fixfull fragment case
2660 ins_mode = ins_mode_after_after_body
2662 if t.type is TYPE_EOF
2667 ins_mode = ins_mode_in_body
2670 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2671 ins_mode_in_frameset = (t) ->
2675 if t.type is TYPE_COMMENT
2678 if t.type is TYPE_DOCTYPE
2681 if t.type is TYPE_START_TAG and t.name is 'html'
2684 if t.type is TYPE_START_TAG and t.name is 'frameset'
2685 insert_html_element t
2687 if t.type is TYPE_END_TAG and t.name is 'frameset'
2688 # TODO ?correct for: "if the current node is the root html element"
2689 if open_els.length is 1
2691 return # fragment case
2693 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2694 ins_mode = ins_mode_after_frameset
2696 if t.type is TYPE_START_TAG and t.name is 'frame'
2697 insert_html_element t
2699 t.acknowledge_self_closing()
2701 if t.type is TYPE_START_TAG and t.name is 'noframes'
2704 if t.type is TYPE_EOF
2705 # TODO ?correct for: "if the current node is not the root html element"
2706 if open_els.length isnt 1
2714 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
ins_mode_after_frameset = (t) ->
	# "After frameset" insertion mode: handles tokens that arrive after the
	# closing </frameset> tag.
	#
	# t: the token to process.
	if is_space_tok t
		insert_character t
		return
	if t.type is TYPE_COMMENT
		insert_comment t
		return
	if t.type is TYPE_DOCTYPE
		parse_error()
		return
	if t.type is TYPE_START_TAG and t.name is 'html'
		ins_mode_in_body t
		return
	if t.type is TYPE_END_TAG and t.name is 'html'
		# this used to assign to "insert_mode", a name nothing else reads,
		# so the insertion mode never actually changed on </html>
		ins_mode = ins_mode_after_after_frameset
		return
	if t.type is TYPE_START_TAG and t.name is 'noframes'
		ins_mode_in_head t
		return
	if t.type is TYPE_EOF
		stop_parsing()
		return
	# Anything else
	parse_error()
	return
2741 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2742 ins_mode_after_after_body = (t) ->
2743 if t.type is TYPE_COMMENT
2744 insert_comment t, [doc, doc.children.length]
2746 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2749 if t.type is TYPE_EOF
2754 ins_mode = ins_mode_in_body
2757 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2758 ins_mode_after_after_frameset = (t) ->
2759 if t.type is TYPE_COMMENT
2760 insert_comment t, [doc, doc.children.length]
2762 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2765 if t.type is TYPE_EOF
2768 if t.type is TYPE_START_TAG and t.name is 'noframes'
2775 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2776 has_color_face_or_size = (t) ->
2778 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2781 in_foreign_content_end_script = ->
in_foreign_content_other_start = (t) ->
	# "Any other start tag" handling for foreign (MathML/SVG) content.
	#
	# Adjusts the token's name/attributes according to the namespace of the
	# adjusted current node, inserts it as a foreign element in that
	# namespace, and then deals with the self-closing flag.
	#
	# t: the start tag token to insert.
	ns = adjusted_current_node().namespace
	switch ns
		when NS_MATHML
			adjust_mathml_attributes t
		when NS_SVG
			# restore the mixed-case spelling of SVG element names
			fixed_name = svg_name_fixes[t.name]
			t.name = fixed_name if fixed_name?
			adjust_svg_attributes t
	adjust_foreign_attributes t
	insert_foreign_element t, ns
	if t.flag 'self-closing'
		if t.name isnt 'script'
			# self-closing element: it never stays on the stack
			open_els.shift()
			t.acknowledge_self_closing()
		else
			t.acknowledge_self_closing()
			in_foreign_content_end_script()
2803 in_foreign_content = (t) ->
2804 if t.type is TYPE_TEXT and t.text is "\u0000"
2806 insert_character new_character_token "\ufffd"
2811 if t.type is TYPE_TEXT
2812 flag_frameset_ok = false
2815 if t.type is TYPE_COMMENT
2818 if t.type is TYPE_DOCTYPE
2821 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2823 if flag_fragment_parsing
2824 in_foreign_content_other_start t
2826 loop # is this safe?
2829 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2833 if t.type is TYPE_START_TAG
2834 in_foreign_content_other_start t
2836 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2837 in_foreign_content_end_script()
2839 if t.type is TYPE_END_TAG
2840 if open_els[0].name.toLowerCase() isnt t.name
2842 for node in open_els
2843 if node is open_els[open_els.length - 1]
2845 if node.name.toLowerCase() is t.name
2847 el = open_els.shift()
2850 if node.namespace is NS_HTML
2852 ins_mode t # explicitly call HTML insertion mode
2855 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2857 switch c = txt.charAt(cur++)
2859 return new_text_node parse_character_reference()
2861 tok_state = tok_state_tag_open
2864 return new_text_node c
2866 return new_eof_token()
2868 return new_text_node c
2871 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2872 # not needed: tok_state_character_reference_in_data = ->
2873 # just call parse_character_reference()
2875 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2876 tok_state_rcdata = ->
2877 switch c = txt.charAt(cur++)
2879 return new_text_node parse_character_reference()
2881 tok_state = tok_state_rcdata_less_than_sign
2884 return new_character_token "\ufffd"
2886 return new_eof_token()
2888 return new_character_token c
2891 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2892 # not needed: tok_state_character_reference_in_rcdata = ->
2893 # just call parse_character_reference()
2895 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2896 tok_state_rawtext = ->
2897 switch c = txt.charAt(cur++)
2899 tok_state = tok_state_rawtext_less_than_sign
2902 return new_character_token "\ufffd"
2904 return new_eof_token()
2906 return new_character_token c
2909 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2910 tok_state_script_data = ->
2911 switch c = txt.charAt(cur++)
2913 tok_state = tok_state_script_data_less_than_sign
2916 return new_character_token "\ufffd"
2918 return new_eof_token()
2920 return new_character_token c
2923 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2924 tok_state_plaintext = ->
2925 switch c = txt.charAt(cur++)
2928 return new_character_token "\ufffd"
2930 return new_eof_token()
2932 return new_character_token c
2936 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2937 tok_state_tag_open = ->
2938 switch c = txt.charAt(cur++)
2940 tok_state = tok_state_markup_declaration_open
2942 tok_state = tok_state_end_tag_open
2945 tok_cur_tag = new_comment_token '?'
2946 tok_state = tok_state_bogus_comment
2949 tok_cur_tag = new_open_tag c
2950 tok_state = tok_state_tag_name
2951 else if is_uc_alpha(c)
2952 tok_cur_tag = new_open_tag c.toLowerCase()
2953 tok_state = tok_state_tag_name
2956 tok_state = tok_state_data
2957 cur -= 1 # we didn't parse/handle the char after <
2958 return new_text_node '<'
2961 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2962 tok_state_end_tag_open = ->
2963 switch c = txt.charAt(cur++)
2966 tok_state = tok_state_data
2969 tok_state = tok_state_data
2970 return new_text_node '</'
2973 tok_cur_tag = new_end_tag c.toLowerCase()
2974 tok_state = tok_state_tag_name
2975 else if is_lc_alpha(c)
2976 tok_cur_tag = new_end_tag c
2977 tok_state = tok_state_tag_name
2980 tok_cur_tag = new_comment_token '/'
2981 tok_state = tok_state_bogus_comment
2984 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2985 tok_state_tag_name = ->
2986 switch c = txt.charAt(cur++)
2987 when "\t", "\n", "\u000c", ' '
2988 tok_state = tok_state_before_attribute_name
2990 tok_state = tok_state_self_closing_start_tag
2992 tok_state = tok_state_data
2998 tok_cur_tag.name += "\ufffd"
3001 tok_state = tok_state_data
3004 tok_cur_tag.name += c.toLowerCase()
3006 tok_cur_tag.name += c
3009 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3010 tok_state_rcdata_less_than_sign = ->
3011 c = txt.charAt(cur++)
3013 temporary_buffer = ''
3014 tok_state = tok_state_rcdata_end_tag_open
3017 tok_state = tok_state_rcdata
3018 cur -= 1 # reconsume the input character
3019 return new_character_token '<'
3021 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3022 tok_state_rcdata_end_tag_open = ->
3023 c = txt.charAt(cur++)
3025 tok_cur_tag = new_end_tag c.toLowerCase()
3026 temporary_buffer += c
3027 tok_state = tok_state_rcdata_end_tag_name
3030 tok_cur_tag = new_end_tag c
3031 temporary_buffer += c
3032 tok_state = tok_state_rcdata_end_tag_name
3035 tok_state = tok_state_rcdata
3036 cur -= 1 # reconsume the input character
3037 return new_character_token "</" # fixfull separate these
3039 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
is_appropriate_end_tag = (t) ->
	# Reports whether t is an "appropriate end tag token".
	#
	# The spec compares against "the tag name of the last start tag to have
	# been emitted from this tokenizer". This is only called from the
	# various "raw" states, which should all have pushed that start tag's
	# element onto open_els, so the current node (open_els[0]) is used
	# instead. TODO: verify this once the script data states are finished.
	#
	# t: the tag token being built by the tokenizer.
	# Returns true when t is an end tag named like the current node.
	stack_dump = serialize_els open_els, true, true
	debug_log "#{t.type}, #{t.name} open_els: #{stack_dump}"
	return false unless t.type is TYPE_END_TAG
	return open_els[0].name is t.name
3049 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3050 tok_state_rcdata_end_tag_name = ->
3051 c = txt.charAt(cur++)
3052 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3053 if is_appropriate_end_tag tok_cur_tag
3054 tok_state = tok_state_before_attribute_name
3056 # else fall through to "Anything else"
3058 if is_appropriate_end_tag tok_cur_tag
3059 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3061 # else fall through to "Anything else"
3063 if is_appropriate_end_tag tok_cur_tag
3064 tok_state = tok_state_data
3066 # else fall through to "Anything else"
3068 tok_cur_tag.name += c.toLowerCase()
3069 temporary_buffer += c
3072 tok_cur_tag.name += c
3073 temporary_buffer += c
3076 tok_state = tok_state_rcdata
3077 cur -= 1 # reconsume the input character
3078 return new_character_token '</' + temporary_buffer # fixfull separate these
3080 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3081 tok_state_rawtext_less_than_sign = ->
3082 c = txt.charAt(cur++)
3084 temporary_buffer = ''
3085 tok_state = tok_state_rawtext_end_tag_open
3088 tok_state = tok_state_rawtext
3089 cur -= 1 # reconsume the input character
3090 return new_character_token '<'
3092 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3093 tok_state_rawtext_end_tag_open = ->
3094 c = txt.charAt(cur++)
3096 tok_cur_tag = new_end_tag c.toLowerCase()
3097 temporary_buffer += c
3098 tok_state = tok_state_rawtext_end_tag_name
3101 tok_cur_tag = new_end_tag c
3102 temporary_buffer += c
3103 tok_state = tok_state_rawtext_end_tag_name
3106 tok_state = tok_state_rawtext
3107 cur -= 1 # reconsume the input character
3108 return new_character_token "</" # fixfull separate these
3110 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3111 tok_state_rawtext_end_tag_name = ->
3112 c = txt.charAt(cur++)
3113 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3114 if is_appropriate_end_tag tok_cur_tag
3115 tok_state = tok_state_before_attribute_name
3117 # else fall through to "Anything else"
3119 if is_appropriate_end_tag tok_cur_tag
3120 tok_state = tok_state_self_closing_start_tag
3122 # else fall through to "Anything else"
3124 if is_appropriate_end_tag tok_cur_tag
3125 tok_state = tok_state_data
3127 # else fall through to "Anything else"
3129 tok_cur_tag.name += c.toLowerCase()
3130 temporary_buffer += c
3133 tok_cur_tag.name += c
3134 temporary_buffer += c
3137 tok_state = tok_state_rawtext
3138 cur -= 1 # reconsume the input character
3139 return new_character_token '</' + temporary_buffer # fixfull separate these
3141 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3142 tok_state_script_data_less_than_sign = ->
3143 c = txt.charAt(cur++)
3145 temporary_buffer = ''
3146 tok_state = tok_state_script_data_end_tag_open
3149 tok_state = tok_state_script_data_escape_start
3150 return new_character_token '<!' # fixfull split
3152 tok_state = tok_state_script_data
3153 cur -= 1 # Reconsume
3154 return new_character_token '<'
3156 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3157 tok_state_script_data_end_tag_open = ->
3158 c = txt.charAt(cur++)
3160 tok_cur_tag = new_end_tag c.toLowerCase()
3161 temporary_buffer += c
3162 tok_state = tok_state_script_data_end_tag_name
3165 tok_cur_tag = new_end_tag c
3166 temporary_buffer += c
3167 tok_state = tok_state_script_data_end_tag_name
3170 tok_state = tok_state_script_data
3171 cur -= 1 # Reconsume
3172 return new_character_token '</'
3174 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3175 tok_state_script_data_end_tag_name = ->
3176 c = txt.charAt(cur++)
3177 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3178 if is_appropriate_end_tag tok_cur_tag
3179 tok_state = tok_state_before_attribute_name
3183 if is_appropriate_end_tag tok_cur_tag
3184 tok_state = tok_state_self_closing_start_tag
3188 tok_cur_tag.name += c.toLowerCase()
3189 temporary_buffer += c
3192 tok_cur_tag.name += c
3193 temporary_buffer += c
3196 tok_state = tok_state_script_data
3197 cur -= 1 # Reconsume
3198 return new_character_token "</#{temporary_buffer}" # fixfull split
3200 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3201 tok_state_script_data_escape_start = ->
3202 c = txt.charAt(cur++)
3204 tok_state = tok_state_script_data_escape_start_dash
3205 return new_character_token '-'
3207 tok_state = tok_state_script_data
3208 cur -= 1 # Reconsume
3211 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3212 tok_state_script_data_escape_start_dash = ->
3213 c = txt.charAt(cur++)
3215 tok_state = tok_state_script_data_escaped_dash_dash
3216 return new_character_token '-'
3218 tok_state = tok_state_script_data
3219 cur -= 1 # Reconsume
3222 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3223 tok_state_script_data_escaped = ->
3224 c = txt.charAt(cur++)
3226 tok_state = tok_state_script_data_escaped_dash
3227 return new_character_token '-'
3229 tok_state = tok_state_script_data_escaped_less_than_sign
3233 return new_character_token "\ufffd"
3235 tok_state = tok_state_data
3237 cur -= 1 # Reconsume
3240 return new_character_token c
3242 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3243 tok_state_script_data_escaped_dash = ->
3244 c = txt.charAt(cur++)
3246 tok_state = tok_state_script_data_escaped_dash_dash
3247 return new_character_token '-'
3249 tok_state = tok_state_script_data_escaped_less_than_sign
3253 tok_state = tok_state_script_data_escaped
3254 return new_character_token "\ufffd"
3256 tok_state = tok_state_data
3258 cur -= 1 # Reconsume
3261 tok_state = tok_state_script_data_escaped
3262 return new_character_token c
3264 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3265 tok_state_script_data_escaped_dash_dash = ->
3266 c = txt.charAt(cur++)
3268 return new_character_token '-'
3270 tok_state = tok_state_script_data_escaped_less_than_sign
3273 tok_state = tok_state_script_data
3274 return new_character_token '>'
3277 tok_state = tok_state_script_data_escaped
3278 return new_character_token "\ufffd"
3281 tok_state = tok_state_data
3282 cur -= 1 # Reconsume
3285 tok_state = tok_state_script_data_escaped
3286 return new_character_token c
3288 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
tok_state_script_data_escaped_less_than_sign = ->
	# Tokenizer state entered after a "<" inside escaped script data.
	#
	# Consumes one character from txt and decides whether this starts an
	# end tag ("</"), a possible nested "<script" (double escape), or is
	# just a literal "<".
	#
	# Returns the character token to emit, or null when nothing is emitted.
	c = txt.charAt(cur++)
	if c is '/'
		temporary_buffer = ''
		tok_state = tok_state_script_data_escaped_end_tag_open
		return null
	if is_uc_alpha(c)
		temporary_buffer = c.toLowerCase() # yes, really
		tok_state = tok_state_script_data_double_escape_start
		return new_character_token "<#{c}" # fixfull split
	if is_lc_alpha(c)
		temporary_buffer = c
		tok_state = tok_state_script_data_double_escape_start
		return new_character_token "<#{c}" # fixfull split
	# Anything else
	tok_state = tok_state_script_data_escaped
	cur -= 1 # Reconsume
	# emit the pending "<" here; c itself is reconsumed (and emitted) by the
	# next state, so returning c would drop the "<" and duplicate c
	return new_character_token '<'
3308 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3309 tok_state_script_data_escaped_end_tag_open = ->
3310 c = txt.charAt(cur++)
3312 tok_cur_tag = new_end_tag c.toLowerCase()
3313 temporary_buffer += c
3314 tok_state = tok_state_script_data_escaped_end_tag_name
3317 tok_cur_tag = new_end_tag c
3318 temporary_buffer += c
3319 tok_state = tok_state_script_data_escaped_end_tag_name
3322 tok_state = tok_state_script_data_escaped
3323 cur -= 1 # Reconsume
3324 return new_character_token '</' # fixfull split
3326 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3327 tok_state_script_data_escaped_end_tag_name = ->
3328 c = txt.charAt(cur++)
3329 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3330 if is_appropriate_end_tag tok_cur_tag
3331 tok_state = tok_state_before_attribute_name
3335 if is_appropriate_end_tag tok_cur_tag
3336 tok_state = tok_state_self_closing_start_tag
3340 tok_cur_tag.name += c.toLowerCase()
3341 temporary_buffer += c.toLowerCase()
3344 tok_cur_tag.name += c
3345 temporary_buffer += c.toLowerCase()
3348 tok_state = tok_state_script_data_escaped
3349 cur -= 1 # Reconsume
3350 return new_character_token "</#{temporary_buffer}" # fixfull split
3352 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3353 tok_state_script_data_double_escape_start = ->
3354 c = txt.charAt(cur++)
3355 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3356 if temporary_buffer is 'script'
3357 tok_state = tok_state_script_data_double_escaped
3359 tok_state = tok_state_script_data_escaped
3360 return new_character_token c
3362 temporary_buffer += c.toLowerCase() # yes, really lowercase
3363 return new_character_token c
3365 temporary_buffer += c
3366 return new_character_token c
3368 tok_state = tok_state_script_data_escaped
3369 cur -= 1 # Reconsume
3372 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible outcomes: emit '-' and move to the double-escaped-dash state; emit
# '<' and move to the double-escaped-less-than-sign state; emit U+FFFD; switch
# to the data state and back cur up by one; or emit the character as-is.
3373 tok_state_script_data_double_escaped = ->
3374 c = txt.charAt(cur++)
3376 tok_state = tok_state_script_data_double_escaped_dash
3377 return new_character_token '-'
3379 tok_state = tok_state_script_data_double_escaped_less_than_sign
3380 return new_character_token '<'
3383 return new_character_token "\ufffd"
3386 tok_state = tok_state_data
3387 cur -= 1 # Reconsume
3390 return new_character_token c
3392 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible outcomes: emit '-' and move to the dash-dash state; emit '<' and
# move to the less-than-sign state; emit U+FFFD or the character itself and
# return to the double-escaped state; or switch to the data state and back
# cur up by one.
3393 tok_state_script_data_double_escaped_dash = ->
3394 c = txt.charAt(cur++)
3396 tok_state = tok_state_script_data_double_escaped_dash_dash
3397 return new_character_token '-'
3399 tok_state = tok_state_script_data_double_escaped_less_than_sign
3400 return new_character_token '<'
3403 tok_state = tok_state_script_data_double_escaped
3404 return new_character_token "\ufffd"
3407 tok_state = tok_state_data
3408 cur -= 1 # Reconsume
3411 tok_state = tok_state_script_data_double_escaped
3412 return new_character_token c
3414 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible outcomes: emit '-' without a state change; emit '<' and move to the
# less-than-sign state; emit '>' and move to the plain script-data state;
# emit U+FFFD or the character itself and return to the double-escaped state;
# or switch to the data state and back cur up by one.
3415 tok_state_script_data_double_escaped_dash_dash = ->
3416 c = txt.charAt(cur++)
3418 return new_character_token '-'
3420 tok_state = tok_state_script_data_double_escaped_less_than_sign
3421 return new_character_token '<'
3423 tok_state = tok_state_script_data
3424 return new_character_token '>'
3427 tok_state = tok_state_script_data_double_escaped
3428 return new_character_token "\ufffd"
3431 tok_state = tok_state_data
3432 cur -= 1 # Reconsume
3435 tok_state = tok_state_script_data_double_escaped
3436 return new_character_token c
3438 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer state: reads the character at cur and advances cur.
# One branch clears temporary_buffer, moves to the double-escape-end state and
# emits '/'. The other returns to the double-escaped state and backs cur up by
# one so the character is consumed again there.
3439 tok_state_script_data_double_escaped_less_than_sign = ->
3440 c = txt.charAt(cur++)
3442 temporary_buffer = ''
3443 tok_state = tok_state_script_data_double_escape_end
3444 return new_character_token '/'
3446 tok_state = tok_state_script_data_double_escaped
3447 cur -= 1 # Reconsume
3450 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer state: reads the character at cur and advances cur.
# On whitespace, '/' or '>' the next state depends on whether temporary_buffer
# equals 'script' (escaped vs. still double-escaped), and the character is
# emitted. Two further branches append the character to temporary_buffer
# (lowercased in one of them) and emit it unchanged. The last visible branch
# returns to the double-escaped state and backs cur up by one.
3451 tok_state_script_data_double_escape_end = ->
3452 c = txt.charAt(cur++)
3453 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3454 if temporary_buffer is 'script'
3455 tok_state = tok_state_script_data_escaped
3457 tok_state = tok_state_script_data_double_escaped
3458 return new_character_token c
3460 temporary_buffer += c.toLowerCase() # yes, really lowercase
3461 return new_character_token c
3463 temporary_buffer += c
3464 return new_character_token c
3466 tok_state = tok_state_script_data_double_escaped
3467 cur -= 1 # Reconsume
3470 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible branches either switch to the self-closing-start-tag or data state,
# or pick the first character of a new attribute name (U+FFFD, or the
# lowercased character) in attr_name.
# A new [attr_name, ''] pair is then unshifted onto tok_cur_tag.attrs_a, so
# attrs_a[0] is the attribute under construction, and the state becomes
# attribute-name.
3471 tok_state_before_attribute_name = ->
3473 switch c = txt.charAt(cur++)
3474 when "\t", "\n", "\u000c", ' '
3477 tok_state = tok_state_self_closing_start_tag
3480 tok_state = tok_state_data
3486 attr_name = "\ufffd"
3487 when '"', "'", '<', '='
3492 tok_state = tok_state_data
3495 attr_name = c.toLowerCase()
3499 tok_cur_tag.attrs_a.unshift [attr_name, '']
3500 tok_state = tok_state_attribute_name
3503 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer state: reads the character at cur and advances cur.
# Whitespace moves to after-attribute-name; other visible branches move to the
# self-closing-start-tag, before-attribute-value or data state.
# Every remaining branch extends the name of the attribute under construction
# (tok_cur_tag.attrs_a[0][0]) with U+FFFD, the raw character or its lowercase
# form. These must all APPEND: assigning would discard the characters
# collected so far (e.g. "AB" would end up named "b").
3504 tok_state_attribute_name = ->
3505 switch c = txt.charAt(cur++)
3506 when "\t", "\n", "\u000c", ' '
3507 tok_state = tok_state_after_attribute_name
3509 tok_state = tok_state_self_closing_start_tag
3511 tok_state = tok_state_before_attribute_value
3513 tok_state = tok_state_data
3519 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3522 tok_cur_tag.attrs_a[0][0] += c
3525 tok_state = tok_state_data
3528 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3530 tok_cur_tag.attrs_a[0][0] += c
3533 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible branches switch to the self-closing-start-tag, before-attribute-value
# or data state (one of the data-state branches backs cur up by one).
# The others start a new attribute by unshifting a [name, ''] pair onto
# tok_cur_tag.attrs_a, where name is the lowercased character, U+FFFD or the
# raw character, and then move to the attribute-name state.
3534 tok_state_after_attribute_name = ->
3535 c = txt.charAt(cur++)
3536 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3539 tok_state = tok_state_self_closing_start_tag
3542 tok_state = tok_state_before_attribute_value
3545 tok_state = tok_state_data
3548 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3549 tok_state = tok_state_attribute_name
3553 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3554 tok_state = tok_state_attribute_name
3558 tok_state = tok_state_data
3559 cur -= 1 # reconsume
3561 if c is '"' or c is "'" or c is '<'
3563 # fall through to Anything else
3565 tok_cur_tag.attrs_a.unshift [c, '']
3566 tok_state = tok_state_attribute_name
3568 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer state: reads the character at cur and advances cur.
# Selects how the value of the current attribute (tok_cur_tag.attrs_a[0][1])
# will be read: the double-quoted, single-quoted or unquoted value state.
# Two branches append U+FFFD or the character itself to the value before
# entering the unquoted state; two others switch to the data state.
3569 tok_state_before_attribute_value = ->
3570 switch c = txt.charAt(cur++)
3571 when "\t", "\n", "\u000c", ' '
3574 tok_state = tok_state_attribute_value_double_quoted
3576 tok_state = tok_state_attribute_value_unquoted
3579 tok_state = tok_state_attribute_value_single_quoted
3582 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3583 tok_state = tok_state_attribute_value_unquoted
3586 tok_state = tok_state_data
3592 tok_state = tok_state_data
3594 tok_cur_tag.attrs_a[0][1] += c
3595 tok_state = tok_state_attribute_value_unquoted
3598 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer state: reads the character at cur and advances cur.
# Appends to the current attribute's value (tok_cur_tag.attrs_a[0][1]): the
# string returned by parse_character_reference (called with '"' as the
# additional allowed character and in_attr = true), U+FFFD, or the character
# itself. Other visible branches move to after-attribute-value-quoted or to
# the data state.
3599 tok_state_attribute_value_double_quoted = ->
3600 switch c = txt.charAt(cur++)
3602 tok_state = tok_state_after_attribute_value_quoted
3604 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3607 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3610 tok_state = tok_state_data
3612 tok_cur_tag.attrs_a[0][1] += c
3615 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer state: reads the character at cur and advances cur.
# Appends to the current attribute's value (tok_cur_tag.attrs_a[0][1]): the
# string returned by parse_character_reference (called with "'" as the
# additional allowed character and in_attr = true), U+FFFD, or the character
# itself. Other visible branches move to after-attribute-value-quoted or to
# the data state.
3616 tok_state_attribute_value_single_quoted = ->
3617 switch c = txt.charAt(cur++)
3619 tok_state = tok_state_after_attribute_value_quoted
3621 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3624 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3627 tok_state = tok_state_data
3629 tok_cur_tag.attrs_a[0][1] += c
3632 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer state: reads the character at cur and advances cur.
# Whitespace ends the value and returns to before-attribute-name. Otherwise
# the value (tok_cur_tag.attrs_a[0][1]) grows by the result of
# parse_character_reference (called with '>' as the additional allowed
# character and in_attr = true), by U+FFFD, or by the character itself. Two
# visible branches switch to the data state.
3633 tok_state_attribute_value_unquoted = ->
3634 switch c = txt.charAt(cur++)
3635 when "\t", "\n", "\u000c", ' '
3636 tok_state = tok_state_before_attribute_name
3638 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3640 tok_state = tok_state_data
3645 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3648 tok_state = tok_state_data
3650 # Parse Error if ", ', <, = or ` (backtick)
3651 tok_cur_tag.attrs_a[0][1] += c
3654 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer state: reads the character at cur and advances cur.
# Whitespace moves to before-attribute-name; other visible branches move to
# the self-closing-start-tag or data state. The final branch also moves to
# before-attribute-name but backs cur up by one, so the unexpected character
# is processed by that state.
3655 tok_state_after_attribute_value_quoted = ->
3656 switch c = txt.charAt(cur++)
3657 when "\t", "\n", "\u000c", ' '
3658 tok_state = tok_state_before_attribute_name
3660 tok_state = tok_state_self_closing_start_tag
3662 tok_state = tok_state_data
3668 tok_state = tok_state_data
3671 tok_state = tok_state_before_attribute_name
3672 cur -= 1 # we didn't handle that char
3675 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer state: reads the character at cur and advances cur.
# One branch marks tok_cur_tag as self-closing and returns to the data state.
# The other two back cur up by one and continue in the data state or the
# before-attribute-name state respectively.
# The flag is set with an explicit `true`, matching every other flag-setting
# call in this file (`flag 'force-quirks', true`).
# NOTE(review): confirm that flag() with a single argument does not set.
3676 tok_state_self_closing_start_tag = ->
3677 c = txt.charAt(cur++)
3679 tok_cur_tag.flag 'self-closing', true
3680 tok_state = tok_state_data
3684 tok_state = tok_state_data
3685 cur -= 1 # Reconsume
3689 tok_state = tok_state_before_attribute_name
3690 cur -= 1 # Reconsume
3693 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3694 # WARNING: put a comment token in tok_cur_tag before setting this state
# Takes the text from cur up to the next '>' (or the rest of txt when no '>'
# is found by indexOf), replaces NUL characters with U+FFFD, appends the
# result to tok_cur_tag.text and returns to the data state.
# The replacement uses a global regex: String.replace with a string pattern
# only replaces the first occurrence, which left later NULs in the comment.
3695 tok_state_bogus_comment = ->
3696 next_gt = txt.indexOf '>', cur
3698 val = txt.substr cur
3701 val = txt.substr cur, (next_gt - cur)
3703 val = val.replace(/\u0000/g, "\ufffd")
3704 tok_cur_tag.text += val
3705 tok_state = tok_state_data
3708 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Looks ahead in txt from cur (without a charAt/cur++ read) to decide what
# follows "<!":
#   '--'                       -> start a new comment token, comment-start state
#   'doctype' (any case)       -> doctype state
#   '[CDATA[' (exact case), only when the adjusted current node exists and is
#   not in the HTML namespace  -> CDATA-section state
#   otherwise                  -> new comment token with text '!', bogus-comment state
3709 tok_state_markup_declaration_open = ->
3710 if txt.substr(cur, 2) is '--'
3712 tok_cur_tag = new_comment_token ''
3713 tok_state = tok_state_comment_start
3715 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3717 tok_state = tok_state_doctype
3719 acn = adjusted_current_node()
3720 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3722 tok_state = tok_state_cdata_section
3726 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3727 tok_state = tok_state_bogus_comment
3730 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible branches move to comment-start-dash, or to the data state (one of
# them backing cur up by one). The remaining two add to the comment token's
# text (tok_cur_tag.text) and continue in the comment state:
#   - U+FFFD replaces a NUL. It belongs to the comment's data, so it is
#     appended to the comment rather than emitted as a character token.
#   - any other character is appended as-is.
# Both switch to the comment state, as tok_state_comment_start_dash does after
# appending; staying in comment-start would treat a later "->" as the end of
# the comment.
3731 tok_state_comment_start = ->
3732 switch c = txt.charAt(cur++)
3734 tok_state = tok_state_comment_start_dash
3737 tok_cur_tag.text += "\ufffd"; tok_state = tok_state_comment
3740 tok_state = tok_state_data
3744 tok_state = tok_state_data
3745 cur -= 1 # Reconsume
3748 tok_cur_tag.text += c; tok_state = tok_state_comment
3751 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible branches move to the comment-end state or to the data state (one of
# them backing cur up by one). The others append '-' followed by U+FFFD or by
# the character to tok_cur_tag.text and continue in the comment state.
3752 tok_state_comment_start_dash = ->
3753 switch c = txt.charAt(cur++)
3755 tok_state = tok_state_comment_end
3758 tok_cur_tag.text += "-\ufffd"
3759 tok_state = tok_state_comment
3762 tok_state = tok_state_data
3766 tok_state = tok_state_data
3767 cur -= 1 # Reconsume
3770 tok_cur_tag.text += "-#{c}"
3771 tok_state = tok_state_comment
3774 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer state: reads the character at cur and advances cur.
# Appends U+FFFD or the character itself to tok_cur_tag.text. Other visible
# branches move to comment-end-dash, or to the data state with cur backed up
# by one.
3775 tok_state_comment = ->
3776 switch c = txt.charAt(cur++)
3778 tok_state = tok_state_comment_end_dash
3781 tok_cur_tag.text += "\ufffd"
3784 tok_state = tok_state_data
3785 cur -= 1 # Reconsume
3788 tok_cur_tag.text += c
3791 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible branches move to the comment-end state, or to the data state with
# cur backed up by one. The others append '-' followed by U+FFFD or by the
# character to tok_cur_tag.text and return to the comment state.
3792 tok_state_comment_end_dash = ->
3793 switch c = txt.charAt(cur++)
3795 tok_state = tok_state_comment_end
3798 tok_cur_tag.text += "-\ufffd"
3799 tok_state = tok_state_comment
3802 tok_state = tok_state_data
3803 cur -= 1 # Reconsume
3806 tok_cur_tag.text += "-#{c}"
3807 tok_state = tok_state_comment
3810 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible branches: switch to the data state (one variant backs cur up by
# one); move to comment-end-bang; append a single '-' to tok_cur_tag.text
# without a state change; or append '--' followed by U+FFFD or by the
# character and return to the comment state.
3811 tok_state_comment_end = ->
3812 switch c = txt.charAt(cur++)
3814 tok_state = tok_state_data
3818 tok_cur_tag.text += "--\ufffd"
3819 tok_state = tok_state_comment
3822 tok_state = tok_state_comment_end_bang
3825 tok_cur_tag.text += '-'
3828 tok_state = tok_state_data
3829 cur -= 1 # Reconsume
3833 tok_cur_tag.text += "--#{c}"
3834 tok_state = tok_state_comment
3837 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer state: reads the character at cur and advances cur.
# Visible branches:
#   - append "--!" to tok_cur_tag.text and move to comment-end-dash. Only
#     "--!" is appended here: comment-end-dash adds the pending '-' itself if
#     the comment continues, so appending the consumed character as well
#     would duplicate it.
#   - switch to the data state (one variant backs cur up by one).
#   - append "--!" followed by U+FFFD or by the character, and return to the
#     comment state.
3838 tok_state_comment_end_bang = ->
3839 switch c = txt.charAt(cur++)
3841 tok_cur_tag.text += "--!"
3842 tok_state = tok_state_comment_end_dash
3844 tok_state = tok_state_data
3848 tok_cur_tag.text += "--!\ufffd"
3849 tok_state = tok_state_comment
3852 tok_state = tok_state_data
3853 cur -= 1 # Reconsume
3856 tok_cur_tag.text += "--!#{c}"
3857 tok_state = tok_state_comment
3860 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer state: reads the character at cur and advances cur.
# Whitespace moves to before-doctype-name. One branch creates an empty-named
# doctype token flagged 'force-quirks', switches to the data state and backs
# cur up by one. The last branch moves to before-doctype-name and backs cur
# up so the character is consumed again there.
3861 tok_state_doctype = ->
3862 switch c = txt.charAt(cur++)
3863 when "\t", "\u000a", "\u000c", ' '
3864 tok_state = tok_state_before_doctype_name
3867 tok_state = tok_state_data
3868 el = new_doctype_token ''
3869 el.flag 'force-quirks', true
3870 cur -= 1 # Reconsume
3874 tok_state = tok_state_before_doctype_name
3875 cur -= 1 # Reconsume
3878 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer state: reads the character at cur and advances cur.
# Starts the doctype token. Three branches store a new token in tok_cur_tag,
# named with the lowercased character, U+FFFD or the raw character, and move
# to doctype-name. Two others create an empty-named token flagged
# 'force-quirks' in el and switch to the data state (one backs cur up by one).
3879 tok_state_before_doctype_name = ->
3880 c = txt.charAt(cur++)
3881 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3884 tok_cur_tag = new_doctype_token c.toLowerCase()
3885 tok_state = tok_state_doctype_name
3889 tok_cur_tag = new_doctype_token "\ufffd"
3890 tok_state = tok_state_doctype_name
3894 el = new_doctype_token ''
3895 el.flag 'force-quirks', true
3896 tok_state = tok_state_data
3900 tok_state = tok_state_data
3901 el = new_doctype_token ''
3902 el.flag 'force-quirks', true
3903 cur -= 1 # Reconsume
3906 tok_cur_tag = new_doctype_token c
3907 tok_state = tok_state_doctype_name
3910 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer state: reads the character at cur and advances cur.
# Whitespace ends the name and moves to after-doctype-name. Otherwise
# tok_cur_tag.name grows by the lowercased character, U+FFFD or the raw
# character. Two branches switch to the data state; one of them also sets
# 'force-quirks' and backs cur up by one.
3911 tok_state_doctype_name = ->
3912 c = txt.charAt(cur++)
3913 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3914 tok_state = tok_state_after_doctype_name
3917 tok_state = tok_state_data
3920 tok_cur_tag.name += c.toLowerCase()
3924 tok_cur_tag.name += "\ufffd"
3928 tok_state = tok_state_data
3929 tok_cur_tag.flag 'force-quirks', true
3930 cur -= 1 # Reconsume
3933 tok_cur_tag.name += c
3936 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer state: reads the character at cur and advances cur.
# Two branches switch to the data state; one of them sets 'force-quirks' and
# backs cur up by one.
# Otherwise the six characters starting at the one just read (cur - 1) are
# compared case-insensitively with 'public' and 'system' to pick the matching
# keyword state. Anything else sets 'force-quirks' and moves to bogus-doctype.
3937 tok_state_after_doctype_name = ->
3938 c = txt.charAt(cur++)
3939 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3942 tok_state = tok_state_data
3946 tok_state = tok_state_data
3947 tok_cur_tag.flag 'force-quirks', true
3948 cur -= 1 # Reconsume
3951 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3953 tok_state = tok_state_after_doctype_public_keyword
3955 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3957 tok_state = tok_state_after_doctype_system_keyword
3960 tok_cur_tag.flag 'force-quirks', true
3961 tok_state = tok_state_bogus_doctype
3964 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer state: reads the character at cur and advances cur.
# Whitespace moves to before-doctype-public-identifier. Two branches reset
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# public identifier state. The rest set 'force-quirks' and go to the data
# state (one backs cur up by one) or to bogus-doctype.
3965 tok_state_after_doctype_public_keyword = ->
3966 c = txt.charAt(cur++)
3967 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3968 tok_state = tok_state_before_doctype_public_identifier
3972 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3973 tok_state = tok_state_doctype_public_identifier_double_quoted
3977 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3978 tok_state = tok_state_doctype_public_identifier_single_quoted
3982 tok_cur_tag.flag 'force-quirks', true
3983 tok_state = tok_state_data
3987 tok_state = tok_state_data
3988 tok_cur_tag.flag 'force-quirks', true
3989 cur -= 1 # Reconsume
3993 tok_cur_tag.flag 'force-quirks', true
3994 tok_state = tok_state_bogus_doctype
3997 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer state: reads the character at cur and advances cur.
# Two branches reset tok_cur_tag.public_identifier to '' and enter the
# double- or single-quoted public identifier state. The rest set
# 'force-quirks' and go to the data state (one backs cur up by one) or to
# bogus-doctype.
3998 tok_state_before_doctype_public_identifier = ->
3999 c = txt.charAt(cur++)
4000 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4004 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4005 tok_state = tok_state_doctype_public_identifier_double_quoted
4009 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4010 tok_state = tok_state_doctype_public_identifier_single_quoted
4014 tok_cur_tag.flag 'force-quirks', true
4015 tok_state = tok_state_data
4019 tok_state = tok_state_data
4020 tok_cur_tag.flag 'force-quirks', true
4021 cur -= 1 # Reconsume
4025 tok_cur_tag.flag 'force-quirks', true
4026 tok_state = tok_state_bogus_doctype
4030 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer state: reads the character at cur and advances cur.
# Grows tok_cur_tag.public_identifier with U+FFFD or the character itself.
# One branch moves to after-doctype-public-identifier. Two branches set
# 'force-quirks' and switch to the data state (one backs cur up by one).
4031 tok_state_doctype_public_identifier_double_quoted = ->
4032 c = txt.charAt(cur++)
4034 tok_state = tok_state_after_doctype_public_identifier
4038 tok_cur_tag.public_identifier += "\ufffd"
4042 tok_cur_tag.flag 'force-quirks', true
4043 tok_state = tok_state_data
4047 tok_state = tok_state_data
4048 tok_cur_tag.flag 'force-quirks', true
4049 cur -= 1 # Reconsume
4052 tok_cur_tag.public_identifier += c
4055 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer state: reads the character at cur and advances cur.
# Grows tok_cur_tag.public_identifier with U+FFFD or the character itself.
# One branch moves to after-doctype-public-identifier. Two branches set
# 'force-quirks' and switch to the data state (one backs cur up by one).
4056 tok_state_doctype_public_identifier_single_quoted = ->
4057 c = txt.charAt(cur++)
4059 tok_state = tok_state_after_doctype_public_identifier
4063 tok_cur_tag.public_identifier += "\ufffd"
4067 tok_cur_tag.flag 'force-quirks', true
4068 tok_state = tok_state_data
4072 tok_state = tok_state_data
4073 tok_cur_tag.flag 'force-quirks', true
4074 cur -= 1 # Reconsume
4077 tok_cur_tag.public_identifier += c
4080 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer state: reads the character at cur and advances cur.
# Whitespace moves to between-doctype-public-and-system-identifiers. Two
# branches reset tok_cur_tag.system_identifier to '' and enter the double- or
# single-quoted system identifier state. Others switch to the data state (one
# sets 'force-quirks' and backs cur up by one) or set 'force-quirks' and move
# to bogus-doctype.
4081 tok_state_after_doctype_public_identifier = ->
4082 c = txt.charAt(cur++)
4083 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4084 tok_state = tok_state_between_doctype_public_and_system_identifiers
4087 tok_state = tok_state_data
4091 tok_cur_tag.system_identifier = ''
4092 tok_state = tok_state_doctype_system_identifier_double_quoted
4096 tok_cur_tag.system_identifier = ''
4097 tok_state = tok_state_doctype_system_identifier_single_quoted
4101 tok_state = tok_state_data
4102 tok_cur_tag.flag 'force-quirks', true
4103 cur -= 1 # Reconsume
4107 tok_cur_tag.flag 'force-quirks', true
4108 tok_state = tok_state_bogus_doctype
4111 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer state: reads the character at cur and advances cur.
# Two branches reset tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state. Others switch to the data
# state (one sets 'force-quirks' and backs cur up by one) or set
# 'force-quirks' and move to bogus-doctype.
4112 tok_state_between_doctype_public_and_system_identifiers = ->
4113 c = txt.charAt(cur++)
4114 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4117 tok_state = tok_state_data
4121 tok_cur_tag.system_identifier = ''
4122 tok_state = tok_state_doctype_system_identifier_double_quoted
4126 tok_cur_tag.system_identifier = ''
4127 tok_state = tok_state_doctype_system_identifier_single_quoted
4131 tok_state = tok_state_data
4132 tok_cur_tag.flag 'force-quirks', true
4133 cur -= 1 # Reconsume
4137 tok_cur_tag.flag 'force-quirks', true
4138 tok_state = tok_state_bogus_doctype
4141 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer state: reads the character at cur and advances cur.
# Whitespace moves to before-doctype-system-identifier. Two branches reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# system identifier state. The rest set 'force-quirks' and go to the data
# state (one backs cur up by one) or to bogus-doctype.
4142 tok_state_after_doctype_system_keyword = ->
4143 c = txt.charAt(cur++)
4144 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4145 tok_state = tok_state_before_doctype_system_identifier
4149 tok_cur_tag.system_identifier = ''
4150 tok_state = tok_state_doctype_system_identifier_double_quoted
4154 tok_cur_tag.system_identifier = ''
4155 tok_state = tok_state_doctype_system_identifier_single_quoted
4159 tok_cur_tag.flag 'force-quirks', true
4160 tok_state = tok_state_data
4164 tok_state = tok_state_data
4165 tok_cur_tag.flag 'force-quirks', true
4166 cur -= 1 # Reconsume
4170 tok_cur_tag.flag 'force-quirks', true
4171 tok_state = tok_state_bogus_doctype
4174 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer state: reads the character at cur and advances cur.
# Two branches reset tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state. The rest set
# 'force-quirks' and go to the data state (one backs cur up by one) or to
# bogus-doctype.
4175 tok_state_before_doctype_system_identifier = ->
4176 c = txt.charAt(cur++)
4177 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4180 tok_cur_tag.system_identifier = ''
4181 tok_state = tok_state_doctype_system_identifier_double_quoted
4184 tok_cur_tag.system_identifier = ''
4185 tok_state = tok_state_doctype_system_identifier_single_quoted
4189 tok_cur_tag.flag 'force-quirks', true
4190 tok_state = tok_state_data
4194 tok_state = tok_state_data
4195 tok_cur_tag.flag 'force-quirks', true
4196 cur -= 1 # Reconsume
4200 tok_cur_tag.flag 'force-quirks', true
4201 tok_state = tok_state_bogus_doctype
4204 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer state: reads the character at cur and advances cur.
# Grows tok_cur_tag.system_identifier with U+FFFD or the character itself.
# One branch moves to after-doctype-system-identifier. Two branches set
# 'force-quirks' and switch to the data state (one backs cur up by one).
4205 tok_state_doctype_system_identifier_double_quoted = ->
4206 c = txt.charAt(cur++)
4208 tok_state = tok_state_after_doctype_system_identifier
4212 tok_cur_tag.system_identifier += "\ufffd"
4216 tok_cur_tag.flag 'force-quirks', true
4217 tok_state = tok_state_data
4221 tok_state = tok_state_data
4222 tok_cur_tag.flag 'force-quirks', true
4223 cur -= 1 # Reconsume
4226 tok_cur_tag.system_identifier += c
4229 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer state: reads the character at cur and advances cur.
# Grows tok_cur_tag.system_identifier with U+FFFD or the character itself.
# One branch moves to after-doctype-system-identifier. Two branches set
# 'force-quirks' and switch to the data state (one backs cur up by one).
4230 tok_state_doctype_system_identifier_single_quoted = ->
4231 c = txt.charAt(cur++)
4233 tok_state = tok_state_after_doctype_system_identifier
4237 tok_cur_tag.system_identifier += "\ufffd"
4241 tok_cur_tag.flag 'force-quirks', true
4242 tok_state = tok_state_data
4246 tok_state = tok_state_data
4247 tok_cur_tag.flag 'force-quirks', true
4248 cur -= 1 # Reconsume
4251 tok_cur_tag.system_identifier += c
4254 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer state: reads the character at cur and advances cur.
# Two branches switch to the data state; one of them sets 'force-quirks' and
# backs cur up by one. The last branch moves to bogus-doctype and, unlike the
# other doctype states, deliberately leaves 'force-quirks' unset.
4255 tok_state_after_doctype_system_identifier = ->
4256 c = txt.charAt(cur++)
4257 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4260 tok_state = tok_state_data
4264 tok_state = tok_state_data
4265 tok_cur_tag.flag 'force-quirks', true
4266 cur -= 1 # Reconsume
4270 # do _not_ tok_cur_tag.flag 'force-quirks', true
4271 tok_state = tok_state_bogus_doctype
4274 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer state: reads the character at cur and advances cur.
# Both visible branches return to the data state; the second one also backs
# cur up by one so the character is consumed again there.
4275 tok_state_bogus_doctype = ->
4276 c = txt.charAt(cur++)
4278 tok_state = tok_state_data
4281 tok_state = tok_state_data
4282 cur -= 1 # Reconsume
4287 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Switches back to the data state, then takes the text from cur up to the
# next ']]>' (or the rest of txt when indexOf finds none) and returns it as a
# single character token, with NUL characters replaced by U+FFFD.
# The replacement uses a global regex: String.replace with a string pattern
# only replaces the first occurrence, so later NULs used to pass through.
4288 tok_state_cdata_section = ->
4289 tok_state = tok_state_data
4290 next_gt = txt.indexOf ']]>', cur
4292 val = txt.substr cur
4295 val = txt.substr cur, (next_gt - cur)
4297 val = val.replace(/\u0000/g, "\ufffd") # fixfull spec doesn't say this
4298 return new_character_token val # fixfull split
4300 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4301 # Don't set this as a state, just call it
4302 # returns a string (NOT a text node)
#
# Parameters:
#   allowed_char - extra character that, like whitespace, '<', '&' and end of
#                  input, means "not a character reference" (default null).
#                  The attribute-value states pass their closing delimiter.
#   in_attr      - true when called from an attribute value (default false).
#                  NOTE(review): presumably enables the '=' / alphanumeric
#                  exceptions for unterminated legacy references -- confirm.
#
# Works directly on txt at cur and advances cur past what it consumes.
# NOTE(review): presumably cur points just past the '&' on entry -- confirm.
# Handles numeric references (an 'x' after the first character selects hex),
# named references terminated by ';' (via decode_named_char_ref), and
# unterminated legacy references looked up in legacy_char_refs.
4303 parse_character_reference = (allowed_char = null, in_attr = false) ->
4304 if cur >= txt.length
4306 switch c = txt.charAt(cur)
4307 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4308 # explicitly not a parse error
4311 # there has to be "one or more" alnums between & and ; to be a parse error
4314 if cur + 1 >= txt.length
4316 if txt.charAt(cur + 1).toLowerCase() is 'x'
4325 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4329 if txt.charAt(start + i) is ';'
4331 # FIXME This is supposed to generate parse errors for some chars
4332 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4339 if alnum.indexOf(txt.charAt(cur + i)) is -1
4342 # exit early, because parse_error() below needs at least one alnum
4344 if txt.charAt(cur + i) is ';'
4345 i += 1 # include ';' terminator in value
4346 decoded = decode_named_char_ref txt.substr(cur, i)
4353 # no ';' terminator (only legacy char refs)
4355 for i in [2..max] # no prefix matches, so ok to check shortest first
4356 c = legacy_char_refs[txt.substr(cur, i)]
4359 if txt.charAt(cur + i) is '='
4360 # "because some legacy user agents will
4361 # misinterpret the markup in those cases"
4364 if alnum.indexOf(txt.charAt(cur + i)) > -1
4365 # this makes attributes forgiving about url args
4367 # ok, and besides the weird exceptions for attributes...
4368 # return the matching char
4369 cur += i # consume entity chars
4370 parse_error() # because no terminating ";"
4374 return # never reached
4376 # tree constructor initialization
4377 # see comments on TYPE_TAG/etc for the structure of this data
# doc is a synthetic <html> element node in the HTML namespace.
4378 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4380 afe = [] # active formatting elements
4381 template_ins_modes = []
4382 ins_mode = ins_mode_initial
4383 original_ins_mode = ins_mode # TODO check spec
4384 flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
4385 flag_frameset_ok = true
4387 flag_foster_parenting = false
4388 form_element_pointer = null
4389 temporary_buffer = null
4390 pending_table_character_tokens = []
4391 head_element_pointer = null
4392 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4393 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4395 # tokenizer initialization
# Tokenizing starts in the data state.
4396 tok_state = tok_state_data
4399 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4404 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Serializes nodes to a string: the visible line appends the result of
# t.serialize(shallow, show_ids) to the `serialized` accumulator.
# NOTE(review): presumably t iterates over els and `serialized` is returned --
# confirm.
4407 serialize_els = (els, shallow, show_ids) ->
4413 serialized += t.serialize shallow, show_ids
4416 # Public API (CommonJS exports): the parser entry point, the debug-log
# helpers, and the node-type (TYPE_*) and namespace (NS_*) constants.
4417 module.exports.parse_html = parse_html
4418 module.exports.debug_log_reset = debug_log_reset
4419 module.exports.debug_log_each = debug_log_each
4420 module.exports.TYPE_TAG = TYPE_TAG
4421 module.exports.TYPE_TEXT = TYPE_TEXT
4422 module.exports.TYPE_COMMENT = TYPE_COMMENT
4423 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4424 module.exports.NS_HTML = NS_HTML
4425 module.exports.NS_MATHML = NS_MATHML
4426 module.exports.NS_SVG = NS_SVG