1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
2 # Copyright 2015 Jason Woofenden
4 # This program is free software: you can redistribute it and/or modify it under
5 # the terms of the GNU Affero General Public License as published by the Free
6 # Software Foundation, either version 3 of the License, or (at your option) any
9 # This program is distributed in the hope that it will be useful, but WITHOUT
10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
11 # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more
14 # You should have received a copy of the GNU Affero General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
18 # This file implements a parser for html snippets, meant to be used by a
19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
20 # or <body> tags, nor does it produce the top level "document" node in the dom
21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
22 # indicate places where additional code is needed for full HTML document
25 # Instead, the data structure produced by this parser is an array of Nodes.
30 # the spec uses many different words to indicate which ends of lists/stacks
31 # they are talking about (and relative movement within the lists/stacks). This
32 # section explains. I'm implementing "lists" (afe and open_els) the same way
35 # stacks grow downward (current element is index=0)
37 # example: open_els = [a, b, c, d, e, f, g]
39 # "grows downwards" means it's visualized like this: (index: el, names)
41 # 6: g "start of the list", "topmost", "first"
43 # 4: e "previous" (to d), "above", "before"
44 # 3: d (previous/next are relative to this element)
45 # 2: c "next", "after", "lower", "below"
47 # 0: a "end of the list", "current node", "bottommost", "last"
51 # note: to get this to run outside a browser, you'll have to write a native
52 # implementation of decode_named_char_ref()
53 unless module?.exports?
55 module = exports: window.wheic
57 # Each node is an object of the Node class. Here are the Node types:
58 TYPE_TAG = 0 # name, {attributes}, [children]
59 TYPE_TEXT = 1 # "text"
62 # the following types are emitted by the tokenizer, but shouldn't end up in the tree:
63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
64 TYPE_END_TAG = 5 # name
66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
79 debug_log_each = (cb) ->
80 for str in g_debug_log
# Initialise a node of the given type. Each field is taken from the matching
# key of `args` when present, otherwise it gets the default shown on its line.
85 constructor: (type, args = {}) ->
86 @type = type # one of the TYPE_* constants above
87 @name = args.name ? '' # tag name
88 @text = args.text ? '' # contents for text/comment nodes
89 @attrs = args.attrs ? {}
# NOTE(review): this reads `args.attr_k` although the field being filled is
# `attrs_a`; every other field here reads the key with its own name. Looks
# like a key mismatch (caller-supplied `attrs_a` would be ignored) -- confirm.
90 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
91 @children = args.children ? []
92 @namespace = args.namespace ? NS_HTML
93 @parent = args.parent ? null
94 @token = args.token ? null
95 @flags = args.flags ? {}
# id is a string made from the pre-incremented prev_node_id counter
99 @id = "#{++prev_node_id}"
100 acknowledge_self_closing: ->
102 @token.flag 'did_self_close'
104 @flag 'did_self_close', true
105 flag: (key, value = null) ->
110 serialize: (shallow = false, show_ids = false) -> # for unit tests
115 ret += JSON.stringify @name
130 ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
136 ret += c.serialize shallow, show_ids
140 ret += JSON.stringify @text
143 ret += JSON.stringify @text
145 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
148 when TYPE_AAA_BOOKMARK
149 ret += 'aaa_bookmark'
152 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
155 # helpers: (only take args that are normally known when parser creates nodes)
# Create a TYPE_START_TAG node carrying the given tag name.
new_open_tag = (name) ->
	new Node TYPE_START_TAG, {name}
# Create a TYPE_END_TAG node carrying the given tag name.
new_end_tag = (name) ->
	new Node TYPE_END_TAG, {name}
# Create a TYPE_TAG (element) node with the given tag name.
new_element = (name) ->
	new Node TYPE_TAG, {name}
# Create a TYPE_TEXT node whose text is `txt`.
new_text_node = (txt) ->
	new Node TYPE_TEXT, {text: txt}
# Character tokens are built by the very same factory.
new_character_token = new_text_node
# Create a TYPE_COMMENT node whose text is `txt`.
new_comment_token = (txt) ->
	new Node TYPE_COMMENT, {text: txt}
# Create a TYPE_DOCTYPE node with the given name.
new_doctype_token = (name) ->
	new Node TYPE_DOCTYPE, {name}
170 return new Node TYPE_EOF
172 return new Node TYPE_AFE_MARKER
# Create a TYPE_AAA_BOOKMARK node (the bookmark type listed with the
# adoption agency algorithm link above).
new_aaa_bookmark = -> new Node TYPE_AAA_BOOKMARK
# Basic ASCII character classes, kept as plain strings so membership is an
# indexOf() lookup.
lc_alpha = 'abcdefghijklmnopqrstuvwxyz'
uc_alpha = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
digits = '0123456789'
alnum = "#{lc_alpha}#{uc_alpha}#{digits}"
hex_chars = "#{digits}abcdefABCDEF"

# True when `str` has length 1 and occurs in uc_alpha.
is_uc_alpha = (str) ->
	return false unless str.length is 1
	uc_alpha.indexOf(str) isnt -1

# True when `str` has length 1 and occurs in lc_alpha.
is_lc_alpha = (str) ->
	return false unless str.length is 1
	lc_alpha.indexOf(str) isnt -1

# some SVG elements have dashes in them
tag_name_chars = "#{alnum}-"

# http://www.w3.org/TR/html5/infrastructure.html#space-character
space_chars = "\u0009\u000a\u000c\u000d\u0020"
193 return txt.length is 1 and space_chars.indexOf(txt) > -1
# True when token `t` is a TYPE_TEXT token whose text is exactly one
# character from space_chars.
is_space_tok = (t) ->
	return false unless t.type is TYPE_TEXT
	txt = t.text
	txt.length is 1 and space_chars.indexOf(txt) isnt -1
197 is_input_hidden_tok = (t) ->
198 return false unless t.type is TYPE_START_TAG
201 if a[1].toLowerCase() is 'hidden'
206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
209 # These are the character references that don't need a terminating semicolon
210 # min length: 2, max: 6, none are a prefix of any other.
212 Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
213 aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
214 aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
215 Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
216 curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
217 ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
218 euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
219 Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
220 igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
221 lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
222 Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
223 Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
224 Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
225 pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
226 shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
227 times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
228 ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
# Tag-name lists, grouped as their variable names indicate.
void_elements = [
	'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input'
	'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr'
]
raw_text_elements = [
	'script'
	'style'
]
escapable_raw_text_elements = [
	'textarea'
	'title'
]
235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
237 'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
238 'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
239 'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
240 'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
241 'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
242 'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
243 'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
244 'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
245 'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
246 'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
247 'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
248 'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
249 'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
250 'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
256 'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
257 'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
258 'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
259 'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
260 'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
261 'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
262 'determinant', 'diff', 'divergence', 'divide', 'domain',
263 'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
264 'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
265 'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
266 'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
267 'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
268 'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
269 'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
270 'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
271 'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
272 'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
273 'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
274 'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
275 'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
276 'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
277 'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
278 'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
279 'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
280 'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
281 'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
282 'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
283 'vectorproduct', 'xor'
285 # foreign_elements = [svg_elements..., mathml_elements...]
286 #normal_elements = All other allowed HTML elements are normal elements.
290 address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
291 aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
292 blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
293 caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
294 details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
295 embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
296 footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
297 h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
298 header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
299 img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
300 listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
301 noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
302 ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
303 script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
304 style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
305 template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
306 thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
307 wbr:NS_HTML, xmp:NS_HTML,
310 mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
311 'annotation-xml':NS_MATHML,
314 foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
317 formatting_elements = {
318 a: true, b: true, big: true, code: true, em: true, font: true, i: true,
319 nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
323 mathml_text_integration = {
324 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
# True when the namespace recorded for el.name in mathml_text_integration
# is the same as el.namespace.
is_mathml_text_integration_point = (el) ->
	expected_ns = mathml_text_integration[el.name]
	expected_ns is el.namespace
328 is_html_integration = (el) -> # DON'T PASS A TOKEN
329 if el.namespace is NS_MATHML and el.name is 'annotation-xml'
330 if el.attrs.encoding?
331 if el.attrs.encoding.toLowerCase() is 'text/html'
333 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
336 if el.namespace is NS_SVG
337 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
342 h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
346 foster_parenting_targets = {
# True when the namespace recorded for e.name in special_elements is the
# same as e.namespace.
el_is_special = (e) ->
	expected_ns = special_elements[e.name]
	expected_ns is e.namespace
# The three HTML elements excluded by el_is_special_not_adp.
adp_els =
	address: NS_HTML
	div: NS_HTML
	p: NS_HTML
# True when el matches an entry of special_elements (name and namespace)
# and does not match an entry of adp_els.
el_is_special_not_adp = (el) ->
	return false unless special_elements[el.name] is el.namespace
	adp_els[el.name] isnt el.namespace
378 altglyphdef: 'altGlyphDef'
379 altglyphitem: 'altGlyphItem'
380 animatecolor: 'animateColor'
381 animatemotion: 'animateMotion'
382 animatetransform: 'animateTransform'
385 fecolormatrix: 'feColorMatrix'
386 fecomponenttransfer: 'feComponentTransfer'
387 fecomposite: 'feComposite'
388 feconvolvematrix: 'feConvolveMatrix'
389 fediffuselighting: 'feDiffuseLighting'
390 fedisplacementmap: 'feDisplacementMap'
391 fedistantlight: 'feDistantLight'
392 fedropshadow: 'feDropShadow'
398 fegaussianblur: 'feGaussianBlur'
401 femergenode: 'feMergeNode'
402 femorphology: 'feMorphology'
404 fepointlight: 'fePointLight'
405 fespecularlighting: 'feSpecularLighting'
406 fespotlight: 'feSpotLight'
408 feturbulence: 'feTurbulence'
409 foreignobject: 'foreignObject'
411 lineargradient: 'linearGradient'
412 radialgradient: 'radialGradient'
415 svg_attribute_fixes = {
416 attributename: 'attributeName'
417 attributetype: 'attributeType'
418 basefrequency: 'baseFrequency'
419 baseprofile: 'baseProfile'
421 clippathunits: 'clipPathUnits'
422 contentscripttype: 'contentScriptType'
423 contentstyletype: 'contentStyleType'
424 diffuseconstant: 'diffuseConstant'
426 externalresourcesrequired: 'externalResourcesRequired'
427 filterres: 'filterRes'
428 filterunits: 'filterUnits'
430 gradienttransform: 'gradientTransform'
431 gradientunits: 'gradientUnits'
432 kernelmatrix: 'kernelMatrix'
433 kernelunitlength: 'kernelUnitLength'
434 keypoints: 'keyPoints'
435 keysplines: 'keySplines'
437 lengthadjust: 'lengthAdjust'
438 limitingconeangle: 'limitingConeAngle'
439 markerheight: 'markerHeight'
440 markerunits: 'markerUnits'
441 markerwidth: 'markerWidth'
442 maskcontentunits: 'maskContentUnits'
443 maskunits: 'maskUnits'
444 numoctaves: 'numOctaves'
445 pathlength: 'pathLength'
446 patterncontentunits: 'patternContentUnits'
447 patterntransform: 'patternTransform'
448 patternunits: 'patternUnits'
449 pointsatx: 'pointsAtX'
450 pointsaty: 'pointsAtY'
451 pointsatz: 'pointsAtZ'
452 preservealpha: 'preserveAlpha'
453 preserveaspectratio: 'preserveAspectRatio'
454 primitiveunits: 'primitiveUnits'
457 repeatcount: 'repeatCount'
458 repeatdur: 'repeatDur'
459 requiredextensions: 'requiredExtensions'
460 requiredfeatures: 'requiredFeatures'
461 specularconstant: 'specularConstant'
462 specularexponent: 'specularExponent'
463 spreadmethod: 'spreadMethod'
464 startoffset: 'startOffset'
465 stddeviation: 'stdDeviation'
466 stitchtiles: 'stitchTiles'
467 surfacescale: 'surfaceScale'
468 systemlanguage: 'systemLanguage'
469 tablevalues: 'tableValues'
472 textlength: 'textLength'
474 viewtarget: 'viewTarget'
475 xchannelselector: 'xChannelSelector'
476 ychannelselector: 'yChannelSelector'
477 zoomandpan: 'zoomAndPan'
479 adjust_mathml_attributes = (t) ->
481 if a[0] is 'definitionurl'
482 a[0] = 'definitionURL'
484 adjust_svg_attributes = (t) ->
486 if svg_attribute_fixes[a[0]]?
487 a[0] = svg_attribute_fixes[a[0]]
489 adjust_foreign_attributes = (t) ->
493 # decode_named_char_ref()
495 # The list of named character references is _huge_ so ask the browser to decode
496 # for us instead of wasting bandwidth/space on including the table here.
498 # Pass without the "&" but with the ";" examples:
499 # for "&" pass "amp;"
500 # for "′" pass "x2032;"
503 textarea: document.createElement('textarea')
505 # TODO test this in IE8
506 decode_named_char_ref = (txt) ->
508 decoded = g_dncr.cache[txt]
509 return decoded if decoded?
510 g_dncr.textarea.innerHTML = txt
511 decoded = g_dncr.textarea.value
512 return null if decoded is txt
513 return g_dncr.cache[txt] = decoded
515 parse_html = (args) ->
517 cur = null # index of next char in txt to be parsed
518 # declare doc and tokenizer variables so they're in scope below
520 open_els = null # stack of open elements
521 afe = null # active formatting elements
522 template_ins_modes = null
524 original_ins_mode = null
526 tok_cur_tag = null # partially parsed tag
527 flag_scripting = null
528 flag_frameset_ok = null
530 flag_foster_parenting = null
531 form_element_pointer = null
532 temporary_buffer = null
533 pending_table_character_tokens = null
534 head_element_pointer = null
535 flag_fragment_parsing = null
536 context_element = null
545 console.log "Parse error at character #{cur} of #{txt.length}"
547 afe_push = (new_el) ->
550 if el.name is new_el.name and el.namespace is new_el.namespace
552 continue unless new_el.attrs[k] is v
553 for k, v of new_el.attrs
554 continue unless el.attrs[k] is v
561 afe.unshift new_afe_marker()
563 # the functions below implement the Tree Construction algorithm
564 # http://www.w3.org/TR/html5/syntax.html#tree-construction
566 # But first... the helpers
567 template_tag_is_open = ->
569 if t.name is 'template' # maybe should also check: and t.namespace is 'html'
572 is_in_scope_x = (tag_name, scope, namespace) ->
574 if t.name is tag_name and (namespace is null or namespace is t.namespace)
576 if scope[t.name] is t.namespace
579 is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
581 if t.name is tag_name and (namespace is null or namespace is t.namespace)
583 if scope[t.name] is t.namespace
585 if scope2[t.name] is t.namespace
589 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
590 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
591 template: NS_HTML, mi: NS_MATHML,
593 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
594 'annotation-xml': NS_MATHML,
596 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
# Extra scope-boundary tables (element name -> namespace) consumed by the
# is_in_*_scope helpers.
button_scopers =
	button: NS_HTML
li_scopers =
	ol: NS_HTML
	ul: NS_HTML
table_scopers =
	html: NS_HTML
	table: NS_HTML
	template: NS_HTML
# Scope check bounded by standard_scopers; delegates to is_in_scope_x.
# `namespace` of null is passed through unchanged.
is_in_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, standard_scopers, namespace)
# Scope check bounded by standard_scopers plus button_scopers; delegates to
# is_in_scope_x_y.
is_in_button_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, button_scopers, namespace)
# Scope check bounded by table_scopers only; delegates to is_in_scope_x.
is_in_table_scope = (tag_name, namespace = null) ->
	is_in_scope_x(tag_name, table_scopers, namespace)
# aka is_in_list_item_scope
# Scope check bounded by standard_scopers plus li_scopers; delegates to
# is_in_scope_x_y.
is_in_li_scope = (tag_name, namespace = null) ->
	is_in_scope_x_y(tag_name, standard_scopers, li_scopers, namespace)
# Select-scope counterpart of the is_in_*_scope helpers: matches on tag name
# (and on namespace when one is given), with its own stop condition below.
610 is_in_select_scope = (tag_name, namespace = null) ->
612 if t.name is tag_name and (namespace is null or namespace is t.namespace)
# NOTE(review): nodes store their namespace in `namespace` (the other scope
# helpers compare `t.namespace`); nothing visible assigns `ns`, so
# `t.ns isnt NS_HTML` is presumably always true and this test reduces to the
# two name checks. If the HTML-namespace check is intended, note that simply
# renaming to `t.namespace` while keeping `and` would change which elements
# stop the search -- confirm against the spec's "in select scope" definition.
614 if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
617 # this checks for a particular element, not by name
618 el_is_in_scope = (el) ->
622 if standard_scopers[t.name] is t.namespace
626 clear_to_table_stopers = {
631 clear_stack_to_table_context = ->
633 if clear_to_table_stopers[open_els[0].name]?
637 clear_to_table_body_stopers = {
644 clear_stack_to_table_body_context = ->
646 if clear_to_table_body_stopers[open_els[0].name]?
650 clear_to_table_row_stopers = {
655 clear_stack_to_table_row_context = ->
657 if clear_to_table_row_stopers[open_els[0].name]?
661 clear_afe_to_marker = ->
663 return unless afe.length > 0 # this happens in fragment case, ?spec error
665 if el.type is TYPE_AFE_MARKER
670 # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
672 # 1. Let last be false.
674 # 2. Let node be the last node in the stack of open elements.
676 node = open_els[node_i]
677 # 3. Loop: If node is the first node in the stack of open elements,
678 # then set last to true, and, if the parser was originally created as
679 # part of the HTML fragment parsing algorithm (fragment case) set node
680 # to the context element.
682 if node_i is open_els.length - 1
684 # fixfull (fragment case)
686 # 4. If node is a select element, run these substeps:
687 if node.name is 'select'
688 # 1. If last is true, jump to the step below labeled done.
690 # 2. Let ancestor be node.
693 # 3. Loop: If ancestor is the first node in the stack of
694 # open elements, jump to the step below labeled done.
696 if ancestor_i is open_els.length - 1
698 # 4. Let ancestor be the node before ancestor in the stack
701 ancestor = open_els[ancestor_i]
702 # 5. If ancestor is a template node, jump to the step below
704 if ancestor.name is 'template'
706 # 6. If ancestor is a table node, switch the insertion mode
707 # to "in select in table" and abort these steps.
708 if ancestor.name is 'table'
709 ins_mode = ins_mode_in_select_in_table
711 # 7. Jump back to the step labeled loop.
712 # 8. Done: Switch the insertion mode to "in select" and abort
714 ins_mode = ins_mode_in_select
716 # 5. If node is a td or th element and last is false, then switch
717 # the insertion mode to "in cell" and abort these steps.
718 if (node.name is 'td' or node.name is 'th') and last is false
719 ins_mode = ins_mode_in_cell
721 # 6. If node is a tr element, then switch the insertion mode to "in
722 # row" and abort these steps.
724 ins_mode = ins_mode_in_row
726 # 7. If node is a tbody, thead, or tfoot element, then switch the
727 # insertion mode to "in table body" and abort these steps.
728 if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
729 ins_mode = ins_mode_in_table_body
731 # 8. If node is a caption element, then switch the insertion mode
732 # to "in caption" and abort these steps.
733 if node.name is 'caption'
734 ins_mode = ins_mode_in_caption
736 # 9. If node is a colgroup element, then switch the insertion mode
737 # to "in column group" and abort these steps.
738 if node.name is 'colgroup'
739 ins_mode = ins_mode_in_column_group
741 # 10. If node is a table element, then switch the insertion mode to
742 # "in table" and abort these steps.
743 if node.name is 'table'
744 ins_mode = ins_mode_in_table
746 # 11. If node is a template element, then switch the insertion mode
747 # to the current template insertion mode and abort these steps.
748 # fixfull (template insertion mode stack)
750 # 12. If node is a head element and last is true, then switch the
751 # insertion mode to "in body" ("in body"! not "in head"!) and abort
752 # these steps. (fragment case)
753 if node.name is 'head' and last
754 ins_mode = ins_mode_in_body
756 # 13. If node is a head element and last is false, then switch the
757 # insertion mode to "in head" and abort these steps.
758 if node.name is 'head' and last is false
759 ins_mode = ins_mode_in_head
761 # 14. If node is a body element, then switch the insertion mode to
762 # "in body" and abort these steps.
763 if node.name is 'body'
764 ins_mode = ins_mode_in_body
766 # 15. If node is a frameset element, then switch the insertion mode
767 # to "in frameset" and abort these steps. (fragment case)
768 if node.name is 'frameset'
769 ins_mode = ins_mode_in_frameset
771 # 16. If node is an html element, run these substeps:
772 if node.name is 'html'
773 # 1. If the head element pointer is null, switch the insertion
774 # mode to "before head" and abort these steps. (fragment case)
775 if head_element_pointer is null
776 ins_mode = ins_mode_before_head
778 # 2. Otherwise, the head element pointer is not null,
779 # switch the insertion mode to "after head" and abort these
781 ins_mode = ins_mode_after_head
783 # 17. If last is true, then switch the insertion mode to "in body"
784 # and abort these steps. (fragment case)
786 ins_mode = ins_mode_in_body
788 # 18. Let node now be the node before node in the stack of open
791 node = open_els[node_i]
792 # 19. Return to the step labeled loop.
796 # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
797 adjusted_current_node = ->
798 if open_els.length is 1 and flag_fragment_parsing
799 return context_element
802 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
803 # this implementation is structured (mostly) as described at the link above.
804 # capitalized comments are the "labels" described at the link above.
806 return if afe.length is 0
807 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
812 if i is afe.length - 1
815 if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
820 el = insert_html_element afe[i].token
825 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
826 # adoption agency algorithm
828 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
829 # http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
830 # http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
831 adoption_agency = (subject) ->
832 debug_log "adoption_agency()"
833 debug_log "tree: #{serialize_els doc.children, false, true}"
834 debug_log "open_els: #{serialize_els open_els, true, true}"
835 debug_log "afe: #{serialize_els afe, true, true}"
836 if open_els[0].name is subject
839 # remove it from the list of active formatting elements (if found)
844 debug_log "aaa: starting off with subject on top of stack, exiting"
851 # 5. Let formatting element be the last element in the list of
852 # active formatting elements that: is between the end of the list
853 # and the last scope marker in the list, if any, or the start of
854 # the list otherwise, and has the tag name subject.
856 for t, fe_of_afe in afe
857 if t.type is TYPE_AFE_MARKER
862 # If there is no such element, then abort these steps and instead
863 # act as described in the "any other end tag" entry above.
865 debug_log "aaa: fe not found in afe"
866 in_body_any_other_end_tag subject
868 # 6. If formatting element is not in the stack of open elements,
869 # then this is a parse error; remove the element from the list, and
872 for t, fe_of_open_els in open_els
877 debug_log "aaa: fe not found in open_els"
879 # "remove it from the list" must mean afe, since it's not in open_els
880 afe.splice fe_of_afe, 1
882 # 7. If formatting element is in the stack of open elements, but
883 # the element is not in scope, then this is a parse error; abort
885 unless el_is_in_scope fe
886 debug_log "aaa: fe not in scope"
889 # 8. If formatting element is not the current node, this is a parse
890 # error. (But do not abort these steps.)
891 unless open_els[0] is fe
894 # 9. Let furthest block be the topmost node in the stack of open
895 # elements that is lower in the stack than formatting element, and
896 # is an element in the special category. There might not be one.
898 fb_of_open_els = null
905 # and continue, to see if there's one that's more "topmost"
906 # 10. If there is no furthest block, then the UA must first pop all
907 # the nodes from the bottom of the stack of open elements, from the
908 # current node up to and including formatting element, then remove
909 # formatting element from the list of active formatting elements,
910 # and finally abort these steps.
912 debug_log "aaa: no fb"
916 afe.splice fe_of_afe, 1
918 # 11. Let common ancestor be the element immediately above
919 # formatting element in the stack of open elements.
920 ca = open_els[fe_of_open_els + 1] # common ancestor
922 node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
923 # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
924 bookmark = new_aaa_bookmark()
927 afe.splice i, 0, bookmark
929 node = last_node = fb
933 # 3. Let node be the element immediately above node in the
934 # stack of open elements, or if node is no longer in the stack
935 # of open elements (e.g. because it got removed by this
936 # algorithm), the element that was immediately above node in
937 # the stack of open elements before node was removed.
941 node_next = open_els[i + 1]
943 node = node_next ? node_above
944 debug_log "inner loop #{inner}"
945 debug_log "tree: #{serialize_els doc.children, false, true}"
946 debug_log "open_els: #{serialize_els open_els, true, true}"
947 debug_log "afe: #{serialize_els afe, true, true}"
948 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
949 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
950 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
951 debug_log "node: #{node.serialize true, true}"
952 # TODO make sure node_above gets re-set if/when node is removed from open_els
954 # 4. If node is formatting element, then go to the next step in
955 # the overall algorithm.
959 # 5. If inner loop counter is greater than three and node is in
960 # the list of active formatting elements, then remove node from
961 # the list of active formatting elements.
967 debug_log "max out inner"
972 # 6. If node is not in the list of active formatting elements,
973 # then remove node from the stack of open elements and then go
974 # back to the step labeled inner loop.
976 debug_log "not in afe"
979 node_above = open_els[i + 1]
983 debug_log "the bones"
984 # 7. create an element for the token for which the element node
985 # was created, in the HTML namespace, with common ancestor as
986 # the intended parent; replace the entry for node in the list
987 # of active formatting elements with an entry for the new
988 # element, replace the entry for node in the stack of open
989 # elements with an entry for the new element, and let node be
991 new_node = token_to_element node.token, NS_HTML, ca
995 debug_log "replaced in afe"
999 node_above = open_els[i + 1]
1000 open_els[i] = new_node
1001 debug_log "replaced in open_els"
1004 # 8. If last node is furthest block, then move the
1005 # aforementioned bookmark to be immediately after the new node
1006 # in the list of active formatting elements.
1011 debug_log "removed bookmark"
1015 # "after" means lower
1016 afe.splice i, 0, bookmark # "after as <-
1017 debug_log "placed bookmark after node"
1018 debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1020 # 9. Insert last node into node, first removing it from its
1021 # previous parent node if any.
1022 if last_node.parent?
1023 debug_log "last_node has parent"
1024 for c, i in last_node.parent.children
1026 debug_log "removing last_node from parent"
1027 last_node.parent.children.splice i, 1
1029 node.children.push last_node
1030 last_node.parent = node
1031 # 10. Let last node be node.
1034 # 11. Return to the step labeled inner loop.
1035 # 14. Insert whatever last node ended up being in the previous step
1036 # at the appropriate place for inserting a node, but using common
1037 # ancestor as the override target.
1039 # In the case where fe is immediately followed by fb:
1040 # * inner loop exits out early (node==fe)
1042 # * last_node is still in the tree (not a duplicate)
1043 if last_node.parent?
1044 debug_log "FEFIRST? last_node has parent"
1045 for c, i in last_node.parent.children
1047 debug_log "removing last_node from parent"
1048 last_node.parent.children.splice i, 1
1051 debug_log "after aaa inner loop"
1052 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1053 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1054 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1055 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1056 debug_log "tree: #{serialize_els doc.children, false, true}"
1061 # can't use the standard insert-token routine, because last_node is already
1062 # in open_els and must stay at its current position in open_els
1063 dest = adjusted_insertion_location ca
1064 dest[0].children.splice dest[1], 0, last_node
1065 last_node.parent = dest[0]
1068 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1069 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1070 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1071 debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1072 debug_log "tree: #{serialize_els doc.children, false, true}"
1074 # 15. Create an element for the token for which formatting element
1075 # was created, in the HTML namespace, with furthest block as the
1077 new_element = token_to_element fe.token, NS_HTML, fb
1078 # 16. Take all of the child nodes of furthest block and append them
1079 # to the element created in the last step.
1080 while fb.children.length
1081 t = fb.children.shift()
1082 t.parent = new_element
1083 new_element.children.push t
1084 # 17. Append that new element to furthest block.
1085 new_element.parent = fb
1086 fb.children.push new_element
1087 # 18. Remove formatting element from the list of active formatting
1088 # elements, and insert the new element into the list of active
1089 # formatting elements at the position of the aforementioned
1097 afe[i] = new_element
1099 # 19. Remove formatting element from the stack of open elements,
1100 # and insert the new element into the stack of open elements
1101 # immediately below the position of furthest block in that stack.
1102 for t, i in open_els
1104 open_els.splice i, 1
1106 for t, i in open_els
1108 open_els.splice i, 0, new_element
1110 # 20. Jump back to the step labeled outer loop.
1111 debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1112 debug_log "tree: #{serialize_els doc.children, false, true}"
1113 debug_log "open_els: #{serialize_els open_els, true, true}"
1114 debug_log "afe: #{serialize_els afe, true, true}"
1115 debug_log "AAA DONE"
1117 # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
# Closes the current <p>: generates implied end tags (sparing 'p' itself),
# then pops entries off the front of open_els (index 0 is the current node).
1118 close_p_element = ->
1119 generate_implied_end_tags 'p' # arg is exception
# the current node is expected to be the 'p' at this point
1120 if open_els[0].name isnt 'p'
# the length guard keeps the last remaining entry on the stack
1122 while open_els.length > 1 # just in case
1123 el = open_els.shift()
# Tests whether a 'p' element is in button scope before acting on it.
# NOTE(review): presumably calls close_p_element() when the test passes -- confirm.
1126 close_p_if_in_button_scope = ->
1127 if is_in_button_scope 'p'
1130 # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1131 # aka insert_a_character = (t) ->
# Inserts character token t at the adjusted insertion location.
#   t: the text token; it is spliced into the parent's children when it is
#      not absorbed by a preceding text node
1132 insert_character = (t) ->
# dest is a [parent node, child index] pair
1133 dest = adjusted_insertion_location()
1134 # fixfull check for Document node
# prev is the sibling immediately before the insertion point
# NOTE(review): prev is undefined when dest[1] is 0 -- confirm a guard
# precedes the .type access below
1136 prev = dest[0].children[dest[1] - 1]
# NOTE(review): a preceding text node is presumably extended rather than
# followed by a second text node -- confirm
1137 if prev.type is TYPE_TEXT
1140 dest[0].children.splice dest[1], 0, t
1143 # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
# Tree-construction dispatcher: inspects the adjusted current node (acn) and
# the token t to choose between the HTML insertion-mode rules and the
# foreign-content rules (in_foreign_content).
1144 process_token = (t) ->
1145 acn = adjusted_current_node()
# adjusted current node is an HTML element
1149 if acn.namespace is NS_HTML
1152 if is_mathml_text_integration_point(acn)
# NOTE(review): the spec sends start tags that are NEITHER mglyph NOR
# malignmark to the HTML rules; confirm this condition and its branch body
# implement that (the test here is not negated)
1153 if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1156 if t.type is TYPE_TEXT
# <svg> start tag directly inside a MathML annotation-xml element
1159 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1162 if is_html_integration acn
1163 if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1166 if t.type is TYPE_EOF
# none of the cases above applied: use the foreign-content rules
1169 in_foreign_content t
1173 # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1174 # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
# Computes where the next node should be inserted.
#   override_target: optional node to use instead of the current node
#                    (open_els[0]) as the starting target
# Returns a two-element array [target, target_i]: the parent node and the
# index within target.children at which to insert.
1175 adjusted_insertion_location = (override_target = null) ->
1176 # 1. If there was an override target specified, then let target be the
1179 target = override_target
1180 else # Otherwise, let target be the current node.
1181 target = open_els[0]
1182 # 2. Determine the adjusted insertion location using the first matching
1183 # steps from the following list:
1185 # If foster parenting is enabled and target is a table, tbody, tfoot,
1186 # thead, or tr element Foster parenting happens when content is
1187 # misnested in tables.
1188 if flag_foster_parenting and foster_parenting_targets[target.name]
1189 loop # once. this is here so we can ``break`` to "abort these substeps"
1190 # 1. Let last template be the last template element in the
1191 # stack of open elements, if any.
1192 last_template = null
1193 last_template_i = null
# open_els is scanned from index 0 (the current node) upward
1194 for el, i in open_els
1195 if el.name is 'template' and el.namespace is NS_HTML
1199 # 2. Let last table be the last table element in the stack of
1200 # open elements, if any.
1203 for el, i in open_els
1204 if el.name is 'table' and el.namespace is NS_HTML
1208 # 3. If there is a last template and either there is no last
1209 # table, or there is one, but last template is lower (more
1210 # recently added) than last table in the stack of open
1211 # elements, then: let adjusted insertion location be inside
1212 # last template's template contents, after its last child (if
1213 # any), and abort these substeps.
# a smaller index means more recently added (index 0 is the current node)
1214 if last_template and (last_table is null or last_template_i < last_table_i)
1215 target = last_template # fixfull should be its contents
1216 target_i = target.children.length
1218 # 4. If there is no last table, then let adjusted insertion
1219 # location be inside the first element in the stack of open
1220 # elements (the html element), after its last child (if any),
1221 # and abort these substeps. (fragment case)
1222 if last_table is null
# the last array slot is the "first"/topmost element of the stack
1224 target = open_els[open_els.length - 1]
1225 target_i = target.children.length
1227 # 5. If last table has a parent element, then let adjusted
1228 # insertion location be inside last table's parent element,
1229 # immediately before last table, and abort these substeps.
1230 if last_table.parent?
1231 for c, i in last_table.parent.children
1233 target = last_table.parent
1237 # 6. Let previous element be the element immediately above last
1238 # table in the stack of open elements.
1240 # huh? how could it not have a parent?
# "above" means the next higher index in open_els
1241 previous_element = open_els[last_table_i + 1]
1242 # 7. Let adjusted insertion location be inside previous
1243 # element, after its last child (if any).
1244 target = previous_element
1245 target_i = target.children.length
1246 # Note: These steps are involved in part because it's possible
1247 # for elements, the table element in this case in particular,
1248 # to have been moved by a script around in the DOM, or indeed
1249 # removed from the DOM entirely, after the element was inserted
1251 break # don't really loop
1253 # Otherwise Let adjusted insertion location be inside target, after
1254 # its last child (if any).
1255 target_i = target.children.length
1257 # 3. If the adjusted insertion location is inside a template element,
1258 # let it instead be inside the template element's template contents,
1259 # after its last child (if any).
1260 # fixfull (template)
1262 # 4. Return the adjusted insertion location.
1263 return [target, target_i]
1265 # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1266 # aka create_an_element_for_token
# Builds a TYPE_TAG Node from tag token t.
#   t:               the tag token; its name is copied and the token itself
#                    is kept on the node as `token`
#   namespace:       namespace stored on the new node
#   intended_parent: NOTE(review): not referenced in the statements visible
#                    here -- confirm whether it is used at all
1267 token_to_element = (t, namespace, intended_parent) ->
1268 # convert attributes into a hash
# each attribute `a` is a [name, value] pair; a later duplicate overwrites
# an earlier one
1271 attrs[a[0]] = a[1] # TODO check what to do with duplicate attrs
1272 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1274 # TODO 2. If the newly created element has an xmlns attribute in the
1275 # XMLNS namespace whose value is not exactly the same as the element's
1276 # namespace, that is a parse error. Similarly, if the newly created
1277 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1278 # value is not the XLink Namespace, that is a parse error.
1280 # fixfull: the spec says stuff about form pointers and ownerDocument
1284 # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
# Creates an element for `token` in `namespace` and splices it into the tree
# at the adjusted insertion location.
# NOTE(review): callers use the return value as the new element (e.g.
# `el = insert_html_element t`); the return statement is not visible here.
1285 insert_foreign_element = (token, namespace) ->
1286 ail = adjusted_insertion_location()
# NOTE(review): ail_el / ail_i are presumably ail[0] / ail[1] -- confirm
1289 el = token_to_element token, namespace, ail_el
1290 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1292 ail_el.children.splice ail_i, 0, el
1295 # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
# Convenience wrapper: inserts an element for `token` in the HTML namespace
# and yields whatever insert_foreign_element returns.
1296 insert_html_element = (token) ->
1297 insert_foreign_element token, NS_HTML
1299 # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1300 # position should be [node, index_within_children]
# Inserts comment token t into the tree.
#   t:        the comment token, spliced in as-is
#   position: optional [node, index] pair; defaults to the adjusted
#             insertion location
1301 insert_comment = (t, position = null) ->
1302 position ?= adjusted_insertion_location()
1303 position[0].children.splice position[1], 0, t
1306 # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
# Inserts an HTML element for t, switches the tokenizer to the RAWTEXT state
# and the tree builder to the "text" insertion mode.
1307 parse_generic_raw_text = (t) ->
1308 insert_html_element t
1309 tok_state = tok_state_rawtext
# remember the current mode so ins_mode_text can switch back to it
1310 original_ins_mode = ins_mode
1311 ins_mode = ins_mode_text
# RCDATA counterpart of parse_generic_raw_text: inserts an HTML element for
# t, switches the tokenizer to the RCDATA state and the tree builder to the
# "text" insertion mode.
1312 parse_generic_rcdata_text = (t) ->
1313 insert_html_element t
1314 tok_state = tok_state_rcdata
# remember the current mode so ins_mode_text can switch back to it
1315 original_ins_mode = ins_mode
1316 ins_mode = ins_mode_text
1318 # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1319 # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
# Loops while the current node (open_els[0]) has a name listed in
# end_tag_implied and that name is not `except`.
#   except: optional tag name that stops the loop when it is the current node
# NOTE(review): the loop body presumably pops the current node -- confirm.
1320 generate_implied_end_tags = (except = null) ->
1321 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1324 # 8.2.5.4 The rules for parsing tokens in HTML content
1325 # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1327 # 8.2.5.4.1 The "initial" insertion mode
1328 # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
# Handles token t in the "initial" insertion mode; both the doctype branch
# and the final branch switch to ins_mode_before_html.
1329 ins_mode_initial = (t) ->
1332 if t.type is TYPE_COMMENT
1336 if t.type is TYPE_DOCTYPE
1337 # FIXME check identifiers, set quirks, etc
1340 ins_mode = ins_mode_before_html
1343 #fixfull (iframe, quirks)
# anything else: move on to "before html"
1344 ins_mode = ins_mode_before_html
1348 # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
# Handles token t in the "before html" insertion mode: an explicit <html>
# start tag becomes the root element; otherwise an <html> element is
# synthesized. Both paths switch to ins_mode_before_head.
1349 ins_mode_before_html = (t) ->
1350 if t.type is TYPE_DOCTYPE
1353 if t.type is TYPE_COMMENT
1358 if t.type is TYPE_START_TAG and t.name is 'html'
1359 el = token_to_element t, NS_HTML, doc
1360 doc.children.push el
# the root element becomes the current node
1361 open_els.unshift(el)
1362 # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1363 ins_mode = ins_mode_before_head
1365 if t.type is TYPE_END_TAG
1366 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1367 # fall through to "anything else"
# anything else: build the <html> element from a synthesized start tag
1372 html_tok = new_open_tag 'html'
1373 el = token_to_element html_tok, NS_HTML, doc
1374 doc.children.push el
1376 # ?fixfull browsing context
1377 ins_mode = ins_mode_before_head
1381 # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
# Handles token t in the "before head" insertion mode: an explicit <head>
# start tag is inserted; otherwise a <head> is synthesized. Both paths
# record the element in head_element_pointer and switch to ins_mode_in_head.
1382 ins_mode_before_head = (t) ->
1385 if t.type is TYPE_COMMENT
1388 if t.type is TYPE_DOCTYPE
1391 if t.type is TYPE_START_TAG and t.name is 'html'
1394 if t.type is TYPE_START_TAG and t.name is 'head'
1395 el = insert_html_element t
1396 head_element_pointer = el
1397 ins_mode = ins_mode_in_head
1399 if t.type is TYPE_END_TAG
1400 if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1401 # fall through to Anything else below
# anything else: insert a <head> built from a synthesized start tag
1406 head_tok = new_open_tag 'head'
1407 el = insert_html_element head_tok
1408 head_element_pointer = el
1409 ins_mode = ins_mode_in_head
1412 # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
# "Anything else" branch of the "in head" mode: pops the current node and
# switches to ins_mode_after_head.
# NOTE(review): t is presumably reprocessed in the new mode afterwards -- confirm.
1413 ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1414 open_els.shift() # spec says this will be a 'head' node
1415 ins_mode = ins_mode_after_head
# Handles token t in the "in head" insertion mode. Branches are keyed on the
# token type and tag name; tokens not claimed by a specific branch end up in
# ins_mode_in_head_else.
#   t: the token to process
1417 ins_mode_in_head = (t) ->
# a single whitespace character (tab, LF, FF or space)
1418 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1421 if t.type is TYPE_COMMENT
1424 if t.type is TYPE_DOCTYPE
1427 if t.type is TYPE_START_TAG and t.name is 'html'
1430 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1431 el = insert_html_element t
1433 t.acknowledge_self_closing()
1435 if t.type is TYPE_START_TAG and t.name is 'meta'
1436 el = insert_html_element t
1438 t.acknowledge_self_closing()
1439 # fixfull encoding stuff
1441 if t.type is TYPE_START_TAG and t.name is 'title'
1442 parse_generic_rcdata_text t
1444 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1445 parse_generic_raw_text t
1447 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1448 insert_html_element t
1449 ins_mode = ins_mode_in_head_noscript
1451 if t.type is TYPE_START_TAG and t.name is 'script'
1452 ail = adjusted_insertion_location()
# ail is a [node, index] pair; the intended parent is the node itself
1453 el = token_to_element t, NS_HTML, ail[0]
1454 el.flag 'parser-inserted', true
1455 # fixfull fragment case
1456 ail[0].children.splice ail[1], 0, el
1458 tok_state = tok_state_script_data
1459 original_ins_mode = ins_mode # make sure orig... is defined
1460 ins_mode = ins_mode_text
1462 if t.type is TYPE_END_TAG and t.name is 'head'
1463 open_els.shift() # will be a head element... spec says so
1464 ins_mode = ins_mode_after_head
1466 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1467 ins_mode_in_head_else t
1469 if t.type is TYPE_START_TAG and t.name is 'template'
1470 insert_html_element t
1472 flag_frameset_ok = false
1473 ins_mode = ins_mode_in_template
1474 template_ins_modes.unshift ins_mode_in_template
1476 if t.type is TYPE_END_TAG and t.name is 'template'
1477 if template_tag_is_open()
# the parentheses are required: a bare name only references the function
# and would leave the implied end tags ungenerated
1478 generate_implied_end_tags()
1479 if open_els[0].name isnt 'template'
# pop elements until a template element has been popped
1482 el = open_els.shift()
1483 if el.name is 'template'
1485 clear_afe_to_marker()
1486 template_ins_modes.shift()
# a stray <head> start tag or any other end tag
1491 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
# anything else
1494 ins_mode_in_head_else t
1496 # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
# "Anything else" branch of the "in head noscript" mode; switches back to
# ins_mode_in_head.
1497 ins_mode_in_head_noscript_else = (t) ->
1500 ins_mode = ins_mode_in_head
# Handles token t in the "in head noscript" insertion mode.
1502 ins_mode_in_head_noscript = (t) ->
1503 if t.type is TYPE_DOCTYPE
1506 if t.type is TYPE_START_TAG and t.name is 'html'
# </noscript> returns to the "in head" mode
1509 if t.type is TYPE_END_TAG and t.name is 'noscript'
1511 ins_mode = ins_mode_in_head
# whitespace, comments and these start tags share one branch
1513 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1516 if t.type is TYPE_END_TAG and t.name is 'br'
1517 ins_mode_in_head_noscript_else t
# a stray <head>/<noscript> start tag or any other end tag
1519 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
# anything else
1523 ins_mode_in_head_noscript_else t
1528 # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
# "Anything else" branch of the "after head" mode: inserts a <body> built
# from a synthesized start tag and switches to ins_mode_in_body.
1529 ins_mode_after_head_else = (t) ->
1530 body_tok = new_open_tag 'body'
1531 insert_html_element body_tok
1532 ins_mode = ins_mode_in_body
# Handles token t in the "after head" insertion mode.
#   t: the token to process
1535 ins_mode_after_head = (t) ->
1539 if t.type is TYPE_COMMENT
1542 if t.type is TYPE_DOCTYPE
1545 if t.type is TYPE_START_TAG and t.name is 'html'
1548 if t.type is TYPE_START_TAG and t.name is 'body'
1549 insert_html_element t
1550 flag_frameset_ok = false
1551 ins_mode = ins_mode_in_body
1553 if t.type is TYPE_START_TAG and t.name is 'frameset'
1554 insert_html_element t
1555 ins_mode = ins_mode_in_frameset
# head-only content after </head>: the head element is temporarily put
# back on the stack of open elements
1557 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1559 open_els.unshift head_element_pointer
# then the head element is removed again, wherever it ended up.
# `in` walks the array's elements (el) with their indexes (i); `of` would
# walk the keys instead, so the identity test below could never match
1561 for el, i in open_els
1562 if el is head_element_pointer
1563 open_els.splice i, 1
1565 console.log "warning: 23904 couldn't find head element in open_els"
1567 if t.type is TYPE_END_TAG and t.name is 'template'
1570 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1571 ins_mode_after_head_else t
# a stray <head> start tag or any other end tag
1573 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
# anything else
1577 ins_mode_after_head_else t
1579 # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
# "Any other end tag" handling for the "in body" mode.
#   name: tag name of the end tag being processed
# Walks open_els from the current node (index 0) upward looking for an HTML
# element called `name`.
1580 in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1581 for el, i in open_els
1582 if el.namespace is NS_HTML and el.name is name
1583 generate_implied_end_tags name # arg is exception
# i is the index at which the match was found; anything other than 0 means
# the match was not the current node when the scan started
1584 parse_error() unless i is 0
# special_elements maps element names to the namespace in which they are special
1589 if special_elements[el.name] is el.namespace
# Handles token t in the "in body" insertion mode. Branches are keyed on the
# token type and tag name and follow the order of the spec section; the last
# two branches catch any other start tag and any other end tag.
#   t: the token to process
# Reminder: open_els[0] is the current node, and the last slot of open_els
# is the root html element.
1593 ins_mode_in_body = (t) ->
1594 if t.type is TYPE_TEXT and t.text is "\u0000"
1601 if t.type is TYPE_TEXT
1604 flag_frameset_ok = false
1606 if t.type is TYPE_COMMENT
1609 if t.type is TYPE_DOCTYPE
1612 if t.type is TYPE_START_TAG and t.name is 'html'
1614 return if template_tag_is_open()
# attributes on a repeated <html> tag are merged into the root element,
# without overwriting attributes it already has
1615 root_attrs = open_els[open_els.length - 1].attrs
1617 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1620 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1623 if t.type is TYPE_START_TAG and t.name is 'body'
1625 return if open_els.length < 2
# "second element on the stack": the one just below the root html element
1626 second = open_els[open_els.length - 2]
# nodes store their namespace in `namespace`
1627 return unless second.namespace is NS_HTML
1628 return unless second.name is 'body'
1629 return if template_tag_is_open()
1630 flag_frameset_ok = false
# merge attributes into the existing body, without overwriting
1632 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1634 if t.type is TYPE_START_TAG and t.name is 'frameset'
1636 return if open_els.length < 2
1637 second_i = open_els.length - 2
1638 second = open_els[second_i]
1639 return unless second.namespace is NS_HTML
1640 return unless second.name is 'body'
# spec: if the frameset-ok flag is "not ok", ignore the token; without this
# test a <frameset> would replace a body that already has content
1641 return unless flag_frameset_ok
# detach the body element from its parent
1643 for el, i in second.parent.children
1645 second.parent.children.splice i, 1
1647 open_els.splice second_i, 1
1648 # pop everything except the "root html element"
1649 while open_els.length > 1
1651 insert_html_element t
1652 ins_mode = ins_mode_in_frameset
1654 if t.type is TYPE_EOF
1656 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1657 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1658 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
# the check is about each open element, so look up the element's own name
1661 unless ok_tags[el.name] is el.namespace
1664 if template_ins_modes.length > 0
1665 ins_mode_in_template t
1669 if t.type is TYPE_END_TAG and t.name is 'body'
1670 unless is_in_scope 'body'
1674 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1675 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1676 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1677 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
# the check is about each open element, so look up the element's own name
1681 unless ok_tags[el.name] is el.namespace
1684 ins_mode = ins_mode_after_body
1686 if t.type is TYPE_END_TAG and t.name is 'html'
1687 unless is_in_scope 'body'
1691 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1692 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1693 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1694 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
# the check is about each open element, so look up the element's own name
1698 unless ok_tags[el.name] is el.namespace
1701 ins_mode = ins_mode_after_body
1704 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1705 close_p_if_in_button_scope()
1706 insert_html_element t
1708 if t.type is TYPE_START_TAG and h_tags[t.name]?
1709 close_p_if_in_button_scope()
# is the current node itself a heading? h_tags is keyed by tag name and
# maps to the namespace, so index it with the node's name
1710 if h_tags[open_els[0].name] is open_els[0].namespace
1713 insert_html_element t
1715 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1716 close_p_if_in_button_scope()
1717 insert_html_element t
1718 # spec: If the next token is a "LF" (U+000A) character token, then
1719 # ignore that token and move on to the next one. (Newlines at the
1720 # start of pre blocks are ignored as an authoring convenience.)
1721 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1723 flag_frameset_ok = false
1725 if t.type is TYPE_START_TAG and t.name is 'form'
# i.e. when a form is already open and no template is open
1726 unless form_element_pointer is null or template_tag_is_open()
1729 close_p_if_in_button_scope()
1730 el = insert_html_element t
1731 unless template_tag_is_open()
1732 form_element_pointer = el
1734 if t.type is TYPE_START_TAG and t.name is 'li'
1735 flag_frameset_ok = false
# scan from the current node upward for an open li
1736 for node in open_els
1737 if node.name is 'li' and node.namespace is NS_HTML
1738 generate_implied_end_tags 'li' # arg is exception
1739 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1742 el = open_els.shift()
1743 if el.name is 'li' and el.namespace is NS_HTML
1746 if el_is_special_not_adp node
1748 close_p_if_in_button_scope()
1749 insert_html_element t
1751 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1752 flag_frameset_ok = false
# scan from the current node upward for an open dd or dt
1753 for node in open_els
1754 if node.name is 'dd' and node.namespace is NS_HTML
1755 generate_implied_end_tags 'dd' # arg is exception
1756 if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1759 el = open_els.shift()
1760 if el.name is 'dd' and el.namespace is NS_HTML
1763 if node.name is 'dt' and node.namespace is NS_HTML
1764 generate_implied_end_tags 'dt' # arg is exception
1765 if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1768 el = open_els.shift()
1769 if el.name is 'dt' and el.namespace is NS_HTML
1772 if el_is_special_not_adp node
1774 close_p_if_in_button_scope()
1775 insert_html_element t
1777 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1778 close_p_if_in_button_scope()
1779 insert_html_element t
1780 tok_state = tok_state_plaintext
1782 if t.type is TYPE_START_TAG and t.name is 'button'
1783 if is_in_scope 'button', NS_HTML
1785 generate_implied_end_tags()
1787 el = open_els.shift()
1788 if el.name is 'button' and el.namespace is NS_HTML
1791 insert_html_element t
1792 flag_frameset_ok = false
1794 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1795 unless is_in_scope t.name, NS_HTML
1798 generate_implied_end_tags()
1799 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1802 el = open_els.shift()
1803 if el.name is t.name and el.namespace is NS_HTML
1806 if t.type is TYPE_END_TAG and t.name is 'form'
1807 unless template_tag_is_open()
1808 node = form_element_pointer
1809 form_element_pointer = null
1810 if node is null or not el_is_in_scope node
1813 generate_implied_end_tags()
1814 if open_els[0] isnt node
# remove node from the stack wherever it sits
1816 for el, i in open_els
1818 open_els.splice i, 1
1821 unless is_in_scope 'form', NS_HTML
1824 generate_implied_end_tags()
1825 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1828 el = open_els.shift()
1829 if el.name is 'form' and el.namespace is NS_HTML
1832 if t.type is TYPE_END_TAG and t.name is 'p'
1833 unless is_in_button_scope 'p', NS_HTML
# no p to close: insert one built from a synthesized start tag
1835 insert_html_element new_open_tag 'p'
1838 if t.type is TYPE_END_TAG and t.name is 'li'
1839 unless is_in_li_scope 'li', NS_HTML
1842 generate_implied_end_tags 'li' # arg is exception
1843 if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1846 el = open_els.shift()
1847 if el.name is 'li' and el.namespace is NS_HTML
1850 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1851 unless is_in_scope t.name, NS_HTML
1854 generate_implied_end_tags t.name # arg is exception
1855 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1858 el = open_els.shift()
1859 if el.name is t.name and el.namespace is NS_HTML
1862 if t.type is TYPE_END_TAG and h_tags[t.name]?
1865 if h_tags[el.name] is el.namespace
1868 if standard_scopers[el.name] is el.namespace
1873 generate_implied_end_tags()
1874 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
# pop until any heading element has been popped
1877 el = open_els.shift()
1878 if h_tags[el.name] is el.namespace
1882 if t.type is TYPE_START_TAG and t.name is 'a'
1883 # If the list of active formatting elements contains an a element
1884 # between the end of the list and the last marker on the list (or
1885 # the start of the list if there is no marker on the list), then
1886 # this is a parse error; run the adoption agency algorithm for the
1887 # tag name "a", then remove that element from the list of active
1888 # formatting elements and the stack of open elements if the
1889 # adoption agency algorithm didn't already remove it (it might not
1890 # have if the element is not in table scope).
1893 if el.type is TYPE_AFE_MARKER
1895 if el.name is 'a' and el.namespace is NS_HTML
1903 for el, i in open_els
1905 open_els.splice i, 1
1907 el = insert_html_element t
1910 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1912 el = insert_html_element t
1915 if t.type is TYPE_START_TAG and t.name is 'nobr'
1917 el = insert_html_element t
1920 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1921 adoption_agency t.name
1923 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1925 insert_html_element t
1927 flag_frameset_ok = false
1929 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1930 unless is_in_scope t.name, NS_HTML
1933 generate_implied_end_tags()
1934 if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1937 el = open_els.shift()
1938 if el.name is t.name and el.namespace is NS_HTML
1940 clear_afe_to_marker()
1942 if t.type is TYPE_START_TAG and t.name is 'table'
1943 close_p_if_in_button_scope() # fixfull quirksmode thing
1944 insert_html_element t
1945 flag_frameset_ok = false
1946 ins_mode = ins_mode_in_table
1948 if t.type is TYPE_END_TAG and t.name is 'br'
# spec: treat </br> as if it were a <br> start tag; retyping the token
# lets the next branch handle it
1950 t.type = TYPE_START_TAG
1952 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1954 insert_html_element t
1956 t.acknowledge_self_closing()
1957 flag_frameset_ok = false
1959 if t.type is TYPE_START_TAG and t.name is 'input'
1961 insert_html_element t
1963 t.acknowledge_self_closing()
# hidden inputs leave the frameset-ok flag alone
1964 unless is_input_hidden_tok t
1965 flag_frameset_ok = false
1967 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1968 insert_html_element t
1970 t.acknowledge_self_closing()
1972 if t.type is TYPE_START_TAG and t.name is 'hr'
1973 close_p_if_in_button_scope()
1974 insert_html_element t
1976 t.acknowledge_self_closing()
1977 flag_frameset_ok = false
1979 if t.type is TYPE_START_TAG and t.name is 'image'
1984 if t.type is TYPE_START_TAG and t.name is 'isindex'
1986 if template_tag_is_open() is false and form_element_pointer isnt null
1988 t.acknowledge_self_closing()
1989 flag_frameset_ok = false
1990 close_p_if_in_button_scope()
# isindex expands to form > hr, label (prompt text + input), hr
1991 el = insert_html_element new_open_tag 'form'
1992 unless template_tag_is_open()
1993 form_element_pointer = el
1996 el.attrs['action'] = a[1]
1998 insert_html_element new_open_tag 'hr'
2001 insert_html_element new_open_tag 'label'
2002 # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2003 input_el = new_open_tag 'input'
# every attribute except name/action/prompt is copied onto the input
2008 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2009 input_el.attrs_a.push [a[0], a[1]]
2010 input_el.attrs_a.push ['name', 'isindex']
2011 # fixfull this next bit is in english... internationalize?
2012 prompt ?= "This is a searchable index. Enter search keywords: "
2013 insert_character new_character_token prompt # fixfull split
2014 # TODO submit typo "balue" in spec
2015 insert_html_element input_el
2017 # insert_character '' # you can put chars here if prompt attr missing
2019 insert_html_element new_open_tag 'hr'
2022 unless template_tag_is_open()
2023 form_element_pointer = null
2025 if t.type is TYPE_START_TAG and t.name is 'textarea'
2026 insert_html_element t
2027 if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2029 tok_state = tok_state_rcdata
2030 original_ins_mode = ins_mode
2031 flag_frameset_ok = false
2032 ins_mode = ins_mode_text
2034 if t.type is TYPE_START_TAG and t.name is 'xmp'
2035 close_p_if_in_button_scope()
2037 flag_frameset_ok = false
2038 parse_generic_raw_text t
2040 if t.type is TYPE_START_TAG and t.name is 'iframe'
2041 flag_frameset_ok = false
2042 parse_generic_raw_text t
2044 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2045 parse_generic_raw_text t
2047 if t.type is TYPE_START_TAG and t.name is 'select'
2049 insert_html_element t
2050 flag_frameset_ok = false
# a select opened from one of the table-related modes gets its own mode
2051 if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2052 ins_mode = ins_mode_in_select_in_table
2054 ins_mode = ins_mode_in_select
2056 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2057 if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2060 insert_html_element t
2062 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2063 if is_in_scope 'ruby', NS_HTML
2064 generate_implied_end_tags()
2065 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2067 insert_html_element t
2069 if t.type is TYPE_START_TAG and t.name is 'rt'
2070 if is_in_scope 'ruby', NS_HTML
2071 generate_implied_end_tags 'rtc' # arg is exception
2072 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2074 insert_html_element t
2076 if t.type is TYPE_START_TAG and t.name is 'math'
2078 adjust_mathml_attributes t
2079 adjust_foreign_attributes t
2080 insert_foreign_element t, NS_MATHML
2081 if t.flag 'self-closing'
2083 t.acknowledge_self_closing()
2085 if t.type is TYPE_START_TAG and t.name is 'svg'
2087 adjust_svg_attributes t
2088 adjust_foreign_attributes t
2089 insert_foreign_element t, NS_SVG
2090 if t.flag 'self-closing'
2092 t.acknowledge_self_closing()
2094 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2097 if t.type is TYPE_START_TAG # any other start tag
2099 insert_html_element t
2101 if t.type is TYPE_END_TAG # any other end tag
2102 in_body_any_other_end_tag t.name
2106 # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
# Handles token t in the "text" insertion mode (entered by the raw-text /
# RCDATA / script paths). Every exit branch visible here restores the mode
# saved in original_ins_mode.
2107 ins_mode_text = (t) ->
2108 if t.type is TYPE_TEXT
2111 if t.type is TYPE_EOF
# an unterminated script element is flagged as already started
2113 if open_els[0].name is 'script'
2114 open_els[0].flag 'already started', true
2116 ins_mode = original_ins_mode
2119 if t.type is TYPE_END_TAG and t.name is 'script'
2121 ins_mode = original_ins_mode
2122 # fixfull the spec seems to assume that I'm going to run the script
2123 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
# any other end tag
2125 if t.type is TYPE_END_TAG
2127 ins_mode = original_ins_mode
# no branch above claimed the token
2129 console.log 'warning: end of ins_mode_text reached'
2131 # the functions below implement the tokenizer states described here:
2132 # http://www.w3.org/TR/html5/syntax.html#tokenization
2134 # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2135 ins_mode_in_table_else = (t) ->
2137 flag_foster_parenting = true
2139 flag_foster_parenting = false
2141 can_in_table = { # FIXME do this inline like everywhere else
2148 ins_mode_in_table = (t) ->
2151 if can_in_table[t.name]
2152 original_ins_mode = ins_mode
2153 ins_mode = ins_mode_in_table_text
2156 ins_mode_in_table_else t
2164 clear_stack_to_table_context()
2166 insert_html_element t
2167 ins_mode = ins_mode_in_caption
2169 clear_stack_to_table_context()
2170 insert_html_element t
2171 ins_mode = ins_mode_in_column_group
2173 clear_stack_to_table_context()
2174 insert_html_element new_open_tag 'colgroup'
2175 ins_mode = ins_mode_in_column_group
2177 when 'tbody', 'tfoot', 'thead'
2178 clear_stack_to_table_context()
2179 insert_html_element t
2180 ins_mode = ins_mode_in_table_body
2181 when 'td', 'th', 'tr'
2182 clear_stack_to_table_context()
2183 insert_html_element new_open_tag 'tbody'
2184 ins_mode = ins_mode_in_table_body
2188 if is_in_table_scope 'table'
2190 el = open_els.shift()
2191 if el.name is 'table'
2195 when 'style', 'script', 'template'
2198 unless is_input_hidden_tok t
2199 ins_mode_in_table_else t
2202 el = insert_html_element t
2204 t.acknowledge_self_closing()
2207 if form_element_pointer?
2209 if template_tag_is_open()
2211 form_element_pointer = insert_html_element t
2214 ins_mode_in_table_else t
2218 if is_in_table_scope 'table'
2220 el = open_els.shift()
2221 if el.name is 'table'
2226 when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2231 ins_mode_in_table_else t
2235 ins_mode_in_table_else t
2238 # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
# Insertion mode "in table text": handles one token `t` while character data
# inside a table is being collected.
# NOTE(review): this excerpt has lines elided (see the gaps in the embedded
# line numbers), so the branch structure between the statements below is not
# fully visible here.
2239 ins_mode_in_table_text = (t) ->
2240 if t.type is TYPE_TEXT and t.text is "\u0000"
2241 # huh? I thought the tokenizer didn't emit these
2244 if t.type is TYPE_TEXT
# text tokens are queued rather than inserted immediately
2245 pending_table_character_tokens.push t
# scan the queued tokens with is_space_tok
2249 for old in pending_table_character_tokens
2250 unless is_space_tok old
2254 for old in pending_table_character_tokens
2255 insert_character old
2257 for old in pending_table_character_tokens
# fixed: this used to call `ins_mode_table_else`, which is not defined; the
# in-table fallback helper is named ins_mode_in_table_else
2258 ins_mode_in_table_else old
2259 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
# hand control back to the mode saved in original_ins_mode
2260 ins_mode = original_ins_mode
2263 # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2264 ins_mode_in_caption = (t) ->
2265 if t.type is TYPE_END_TAG and t.name is 'caption'
2266 if is_in_table_scope 'caption'
2267 generate_implied_end_tags()
2268 if open_els[0].name isnt 'caption'
2271 el = open_els.shift()
2272 if el.name is 'caption'
2274 clear_afe_to_marker()
2275 ins_mode = ins_mode_in_table
2280 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2282 if is_in_table_scope 'caption'
2284 el = open_els.shift()
2285 if el.name is 'caption'
2287 clear_afe_to_marker()
2288 ins_mode = ins_mode_in_table
2290 # else fragment case
2292 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2298 # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2299 ins_mode_in_column_group = (t) ->
2303 if t.type is TYPE_COMMENT
2306 if t.type is TYPE_DOCTYPE
2309 if t.type is TYPE_START_TAG and t.name is 'html'
2312 if t.type is TYPE_START_TAG and t.name is 'col'
2313 el = insert_html_element t
2315 t.acknowledge_self_closing()
2317 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2318 if open_els[0].name is 'colgroup'
2320 ins_mode = ins_mode_in_table
2324 if t.type is TYPE_END_TAG and t.name is 'col'
2327 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2330 if t.type is TYPE_EOF
2334 if open_els[0].name isnt 'colgroup'
2338 ins_mode = ins_mode_in_table
2342 # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2343 ins_mode_in_table_body = (t) ->
2344 if t.type is TYPE_START_TAG and t.name is 'tr'
2345 clear_stack_to_table_body_context()
2346 insert_html_element t
2347 ins_mode = ins_mode_in_row
2349 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2351 clear_stack_to_table_body_context()
2352 insert_html_element new_open_tag 'tr'
2353 ins_mode = ins_mode_in_row
2356 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2357 unless is_in_table_scope t.name # fixfull check namespace
2360 clear_stack_to_table_body_context()
2362 ins_mode = ins_mode_in_table
2364 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2367 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2370 if table_scopers[el.name]
2375 clear_stack_to_table_body_context()
2377 ins_mode = ins_mode_in_table
2380 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2386 # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2387 ins_mode_in_row = (t) ->
2388 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2389 clear_stack_to_table_row_context()
2390 insert_html_element t
2391 ins_mode = ins_mode_in_cell
2394 if t.type is TYPE_END_TAG and t.name is 'tr'
2395 if is_in_table_scope 'tr'
2396 clear_stack_to_table_row_context()
2398 ins_mode = ins_mode_in_table_body
2402 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2403 if is_in_table_scope 'tr'
2404 clear_stack_to_table_row_context()
2406 ins_mode = ins_mode_in_table_body
2411 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2412 if is_in_table_scope t.name # fixfull namespace
2413 if is_in_table_scope 'tr'
2414 clear_stack_to_table_row_context()
2416 ins_mode = ins_mode_in_table_body
2421 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2427 # http://www.w3.org/TR/html5/syntax.html#close-the-cell
# Body of the "close the cell" steps (the definition header line is elided in
# this excerpt): generates implied end tags, shifts elements off open_els,
# clears the active formatting elements up to the last marker and switches to
# the "in row" insertion mode.
2429 generate_implied_end_tags()
# fixed: the second operand compared the element object itself to 'th'
# (never equal), so the check misfired when the current node was a <th>
2430 unless open_els[0].name is 'td' or open_els[0].name is 'th'
2433 el = open_els.shift()
2434 if el.name is 'td' or el.name is 'th'
2436 clear_afe_to_marker()
2437 ins_mode = ins_mode_in_row
2439 # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2440 ins_mode_in_cell = (t) ->
2441 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2442 if is_in_table_scope t.name
2443 generate_implied_end_tags()
2444 if open_els[0].name isnt t.name
2447 el = open_els.shift()
2448 if el.name is t.name
2450 clear_afe_to_marker()
2451 ins_mode = ins_mode_in_row
2455 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2458 if el.name is 'td' or el.name is 'th'
2461 if table_scopers[el.name]
2469 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2472 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2473 if is_in_table_scope t.name # fixfull namespace
2482 # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2483 ins_mode_in_select = (t) ->
2484 if t.type is TYPE_TEXT and t.text is "\u0000"
2487 if t.type is TYPE_TEXT
2490 if t.type is TYPE_COMMENT
2493 if t.type is TYPE_DOCTYPE
2496 if t.type is TYPE_START_TAG and t.name is 'html'
2499 if t.type is TYPE_START_TAG and t.name is 'option'
2500 if open_els[0].name is 'option'
2502 insert_html_element t
2504 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2505 if open_els[0].name is 'option'
2507 if open_els[0].name is 'optgroup'
2509 insert_html_element t
2511 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2512 if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2514 if open_els[0].name is 'optgroup'
2519 if t.type is TYPE_END_TAG and t.name is 'option'
2520 if open_els[0].name is 'option'
2525 if t.type is TYPE_END_TAG and t.name is 'select'
2526 if is_in_select_scope 'select'
2528 el = open_els.shift()
2529 if el.name is 'select'
2535 if t.type is TYPE_START_TAG and t.name is 'select'
2538 el = open_els.shift()
2539 if el.name is 'select'
2542 # spec says that this is the same as </select> but it doesn't say
2543 # to check scope first
2545 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2547 if is_in_select_scope 'select'
2550 el = open_els.shift()
2551 if el.name is 'select'
2556 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2559 if t.type is TYPE_EOF
2566 # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2567 ins_mode_in_select_in_table = (t) ->
2568 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2571 el = open_els.shift()
2572 if el.name is 'select'
2577 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2579 unless is_in_table_scope t.name, NS_HTML
2582 el = open_els.shift()
2583 if el.name is 'select'
2589 ins_mode_in_select t
2592 # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2593 ins_mode_in_template = (t) ->
2594 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2597 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2600 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2601 template_ins_modes.shift()
2602 template_ins_modes.unshift ins_mode_in_table
2603 ins_mode = ins_mode_in_table
2606 if t.type is TYPE_START_TAG and t.name is 'col'
2607 template_ins_modes.shift()
2608 template_ins_modes.unshift ins_mode_in_column_group
2609 ins_mode = ins_mode_in_column_group
2612 if t.type is TYPE_START_TAG and t.name is 'tr'
2613 template_ins_modes.shift()
2614 template_ins_modes.unshift ins_mode_in_table_body
2615 ins_mode = ins_mode_in_table_body
2618 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2619 template_ins_modes.shift()
2620 template_ins_modes.unshift ins_mode_in_row
2621 ins_mode = ins_mode_in_row
2624 if t.type is TYPE_START_TAG
2625 template_ins_modes.shift()
2626 template_ins_modes.unshift ins_mode_in_body
2627 ins_mode = ins_mode_in_body
2630 if t.type is TYPE_END_TAG
2633 if t.type is TYPE_EOF
2634 unless template_tag_is_open()
2639 el = open_els.shift()
2640 if el.name is 'template' # fixfull check namespace
2642 clear_afe_to_marker()
2643 template_ins_modes.shift()
2647 # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2648 ins_mode_after_body = (t) ->
2652 if t.type is TYPE_COMMENT
2653 insert_comment t, [open_els[0], open_els[0].children.length]
2655 if t.type is TYPE_DOCTYPE
2658 if t.type is TYPE_START_TAG and t.name is 'html'
2661 if t.type is TYPE_END_TAG and t.name is 'html'
2662 # fixfull fragment case
2663 ins_mode = ins_mode_after_after_body
2665 if t.type is TYPE_EOF
2670 ins_mode = ins_mode_in_body
2673 # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2674 ins_mode_in_frameset = (t) ->
2678 if t.type is TYPE_COMMENT
2681 if t.type is TYPE_DOCTYPE
2684 if t.type is TYPE_START_TAG and t.name is 'html'
2687 if t.type is TYPE_START_TAG and t.name is 'frameset'
2688 insert_html_element t
2690 if t.type is TYPE_END_TAG and t.name is 'frameset'
2691 # TODO ?correct for: "if the current node is the root html element"
2692 if open_els.length is 1
2694 return # fragment case
2696 if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2697 ins_mode = ins_mode_after_frameset
2699 if t.type is TYPE_START_TAG and t.name is 'frame'
2700 insert_html_element t
2702 t.acknowledge_self_closing()
2704 if t.type is TYPE_START_TAG and t.name is 'noframes'
2707 if t.type is TYPE_EOF
2708 # TODO ?correct for: "if the current node is not the root html element"
2709 if open_els.length isnt 1
2717 # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
# Insertion mode "after frameset": dispatches on the type/name of token `t`.
# NOTE(review): the handler bodies for most branches are elided in this
# excerpt; only the branch conditions are visible.
2718 ins_mode_after_frameset = (t) ->
2722 if t.type is TYPE_COMMENT
2725 if t.type is TYPE_DOCTYPE
2728 if t.type is TYPE_START_TAG and t.name is 'html'
2731 if t.type is TYPE_END_TAG and t.name is 'html'
# fixed: this assigned to `insert_mode`, a name used nowhere else in view, so
# the parser state variable ins_mode was never updated and </html> did not
# move the parser to the "after after frameset" mode
2732 ins_mode = ins_mode_after_after_frameset
2734 if t.type is TYPE_START_TAG and t.name is 'noframes'
2737 if t.type is TYPE_EOF
2744 # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2745 ins_mode_after_after_body = (t) ->
2746 if t.type is TYPE_COMMENT
2747 insert_comment t, [doc, doc.children.length]
2749 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2752 if t.type is TYPE_EOF
2757 ins_mode = ins_mode_in_body
2760 # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2761 ins_mode_after_after_frameset = (t) ->
2762 if t.type is TYPE_COMMENT
2763 insert_comment t, [doc, doc.children.length]
2765 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2768 if t.type is TYPE_EOF
2771 if t.type is TYPE_START_TAG and t.name is 'noframes'
2778 # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2779 has_color_face_or_size = (t) ->
2781 if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2784 in_foreign_content_end_script = ->
2788 in_foreign_content_other_start = (t) ->
2789 acn = adjusted_current_node()
2790 if acn.namespace is NS_MATHML
2791 adjust_mathml_attributes t
2792 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2793 t.name = svg_name_fixes[t.name]
2794 if acn.namespace is NS_SVG
2795 adjust_svg_attributes t
2796 adjust_foreign_attributes t
2797 insert_foreign_element t, acn.namespace
2798 if t.flag 'self-closing'
2799 if t.name is 'script'
2800 t.acknowledge_self_closing()
2801 in_foreign_content_end_script()
2804 t.acknowledge_self_closing()
2806 in_foreign_content = (t) ->
2807 if t.type is TYPE_TEXT and t.text is "\u0000"
2809 insert_character new_character_token "\ufffd"
2814 if t.type is TYPE_TEXT
2815 flag_frameset_ok = false
2818 if t.type is TYPE_COMMENT
2821 if t.type is TYPE_DOCTYPE
2824 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2826 if flag_fragment_parsing
2827 in_foreign_content_other_start t
2829 loop # is this safe?
2832 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2836 if t.type is TYPE_START_TAG
2837 in_foreign_content_other_start t
2839 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2840 in_foreign_content_end_script()
2842 if t.type is TYPE_END_TAG
2843 if open_els[0].name.toLowerCase() isnt t.name
2845 for node in open_els
2846 if node is open_els[open_els.length - 1]
2848 if node.name.toLowerCase() is t.name
2850 el = open_els.shift()
2853 if node.namespace is NS_HTML
2855 ins_mode t # explicitly call HTML insertion mode
2858 # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2860 switch c = txt.charAt(cur++)
2862 return new_text_node parse_character_reference()
2864 tok_state = tok_state_tag_open
2867 return new_text_node c
2869 return new_eof_token()
2871 return new_text_node c
2874 # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2875 # not needed: tok_state_character_reference_in_data = ->
2876 # just call parse_character_reference()
2878 # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2879 tok_state_rcdata = ->
2880 switch c = txt.charAt(cur++)
2882 return new_text_node parse_character_reference()
2884 tok_state = tok_state_rcdata_less_than_sign
2887 return new_character_token "\ufffd"
2889 return new_eof_token()
2891 return new_character_token c
2894 # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2895 # not needed: tok_state_character_reference_in_rcdata = ->
2896 # just call parse_character_reference()
2898 # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2899 tok_state_rawtext = ->
2900 switch c = txt.charAt(cur++)
2902 tok_state = tok_state_rawtext_less_than_sign
2905 return new_character_token "\ufffd"
2907 return new_eof_token()
2909 return new_character_token c
2912 # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2913 tok_state_script_data = ->
2914 switch c = txt.charAt(cur++)
2916 tok_state = tok_state_script_data_less_than_sign
2919 return new_character_token "\ufffd"
2921 return new_eof_token()
2923 return new_character_token c
2926 # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2927 tok_state_plaintext = ->
2928 switch c = txt.charAt(cur++)
2931 return new_character_token "\ufffd"
2933 return new_eof_token()
2935 return new_character_token c
2939 # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2940 tok_state_tag_open = ->
2941 switch c = txt.charAt(cur++)
2943 tok_state = tok_state_markup_declaration_open
2945 tok_state = tok_state_end_tag_open
2948 tok_cur_tag = new_comment_token '?'
2949 tok_state = tok_state_bogus_comment
2952 tok_cur_tag = new_open_tag c
2953 tok_state = tok_state_tag_name
2954 else if is_uc_alpha(c)
2955 tok_cur_tag = new_open_tag c.toLowerCase()
2956 tok_state = tok_state_tag_name
2959 tok_state = tok_state_data
2960 cur -= 1 # we didn't parse/handle the char after <
2961 return new_text_node '<'
2964 # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2965 tok_state_end_tag_open = ->
2966 switch c = txt.charAt(cur++)
2969 tok_state = tok_state_data
2972 tok_state = tok_state_data
2973 return new_text_node '</'
2976 tok_cur_tag = new_end_tag c.toLowerCase()
2977 tok_state = tok_state_tag_name
2978 else if is_lc_alpha(c)
2979 tok_cur_tag = new_end_tag c
2980 tok_state = tok_state_tag_name
2983 tok_cur_tag = new_comment_token '/'
2984 tok_state = tok_state_bogus_comment
2987 # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2988 tok_state_tag_name = ->
2989 switch c = txt.charAt(cur++)
2990 when "\t", "\n", "\u000c", ' '
2991 tok_state = tok_state_before_attribute_name
2993 tok_state = tok_state_self_closing_start_tag
2995 tok_state = tok_state_data
3001 tok_cur_tag.name += "\ufffd"
3004 tok_state = tok_state_data
3007 tok_cur_tag.name += c.toLowerCase()
3009 tok_cur_tag.name += c
3012 # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3013 tok_state_rcdata_less_than_sign = ->
3014 c = txt.charAt(cur++)
3016 temporary_buffer = ''
3017 tok_state = tok_state_rcdata_end_tag_open
3020 tok_state = tok_state_rcdata
3021 cur -= 1 # reconsume the input character
3022 return new_character_token '<'
3024 # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3025 tok_state_rcdata_end_tag_open = ->
3026 c = txt.charAt(cur++)
3028 tok_cur_tag = new_end_tag c.toLowerCase()
3029 temporary_buffer += c
3030 tok_state = tok_state_rcdata_end_tag_name
3033 tok_cur_tag = new_end_tag c
3034 temporary_buffer += c
3035 tok_state = tok_state_rcdata_end_tag_name
3038 tok_state = tok_state_rcdata
3039 cur -= 1 # reconsume the input character
3040 return new_character_token "</" # fixfull separate these
3042 # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3043 is_appropriate_end_tag = (t) ->
3044 # spec says to check against "the tag name of the last start tag to
3045 # have been emitted from this tokenizer", but this is only called from
3046 # the various "raw" states, so it's hopefully ok to assume that
3047 # open_els[0].name will work instead TODO: verify this after the script
3048 # data states are implemented
3049 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3050 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3052 # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3053 tok_state_rcdata_end_tag_name = ->
3054 c = txt.charAt(cur++)
3055 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3056 if is_appropriate_end_tag tok_cur_tag
3057 tok_state = tok_state_before_attribute_name
3059 # else fall through to "Anything else"
3061 if is_appropriate_end_tag tok_cur_tag
3062 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3064 # else fall through to "Anything else"
3066 if is_appropriate_end_tag tok_cur_tag
3067 tok_state = tok_state_data
3069 # else fall through to "Anything else"
3071 tok_cur_tag.name += c.toLowerCase()
3072 temporary_buffer += c
3075 tok_cur_tag.name += c
3076 temporary_buffer += c
3079 tok_state = tok_state_rcdata
3080 cur -= 1 # reconsume the input character
3081 return new_character_token '</' + temporary_buffer # fixfull separate these
3083 # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3084 tok_state_rawtext_less_than_sign = ->
3085 c = txt.charAt(cur++)
3087 temporary_buffer = ''
3088 tok_state = tok_state_rawtext_end_tag_open
3091 tok_state = tok_state_rawtext
3092 cur -= 1 # reconsume the input character
3093 return new_character_token '<'
3095 # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3096 tok_state_rawtext_end_tag_open = ->
3097 c = txt.charAt(cur++)
3099 tok_cur_tag = new_end_tag c.toLowerCase()
3100 temporary_buffer += c
3101 tok_state = tok_state_rawtext_end_tag_name
3104 tok_cur_tag = new_end_tag c
3105 temporary_buffer += c
3106 tok_state = tok_state_rawtext_end_tag_name
3109 tok_state = tok_state_rawtext
3110 cur -= 1 # reconsume the input character
3111 return new_character_token "</" # fixfull separate these
3113 # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3114 tok_state_rawtext_end_tag_name = ->
3115 c = txt.charAt(cur++)
3116 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3117 if is_appropriate_end_tag tok_cur_tag
3118 tok_state = tok_state_before_attribute_name
3120 # else fall through to "Anything else"
3122 if is_appropriate_end_tag tok_cur_tag
3123 tok_state = tok_state_self_closing_start_tag
3125 # else fall through to "Anything else"
3127 if is_appropriate_end_tag tok_cur_tag
3128 tok_state = tok_state_data
3130 # else fall through to "Anything else"
3132 tok_cur_tag.name += c.toLowerCase()
3133 temporary_buffer += c
3136 tok_cur_tag.name += c
3137 temporary_buffer += c
3140 tok_state = tok_state_rawtext
3141 cur -= 1 # reconsume the input character
3142 return new_character_token '</' + temporary_buffer # fixfull separate these
3144 # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3145 tok_state_script_data_less_than_sign = ->
3146 c = txt.charAt(cur++)
3148 temporary_buffer = ''
3149 tok_state = tok_state_script_data_end_tag_open
3152 tok_state = tok_state_script_data_escape_start
3153 return new_character_token '<!' # fixfull split
3155 tok_state = tok_state_script_data
3156 cur -= 1 # Reconsume
3157 return new_character_token '<'
3159 # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3160 tok_state_script_data_end_tag_open = ->
3161 c = txt.charAt(cur++)
3163 tok_cur_tag = new_end_tag c.toLowerCase()
3164 temporary_buffer += c
3165 tok_state = tok_state_script_data_end_tag_name
3168 tok_cur_tag = new_end_tag c
3169 temporary_buffer += c
3170 tok_state = tok_state_script_data_end_tag_name
3173 tok_state = tok_state_script_data
3174 cur -= 1 # Reconsume
3175 return new_character_token '</'
3177 # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3178 tok_state_script_data_end_tag_name = ->
3179 c = txt.charAt(cur++)
3180 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3181 if is_appropriate_end_tag tok_cur_tag
3182 tok_state = tok_state_before_attribute_name
3186 if is_appropriate_end_tag tok_cur_tag
3187 tok_state = tok_state_self_closing_start_tag
3191 if is_appropriate_end_tag tok_cur_tag
3192 tok_state = tok_state_data
3196 tok_cur_tag.name += c.toLowerCase()
3197 temporary_buffer += c
3200 tok_cur_tag.name += c
3201 temporary_buffer += c
3204 tok_state = tok_state_script_data
3205 cur -= 1 # Reconsume
3206 return new_character_token "</#{temporary_buffer}" # fixfull split
3208 # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3209 tok_state_script_data_escape_start = ->
3210 c = txt.charAt(cur++)
3212 tok_state = tok_state_script_data_escape_start_dash
3213 return new_character_token '-'
3215 tok_state = tok_state_script_data
3216 cur -= 1 # Reconsume
3219 # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3220 tok_state_script_data_escape_start_dash = ->
3221 c = txt.charAt(cur++)
3223 tok_state = tok_state_script_data_escaped_dash_dash
3224 return new_character_token '-'
3226 tok_state = tok_state_script_data
3227 cur -= 1 # Reconsume
3230 # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3231 tok_state_script_data_escaped = ->
3232 c = txt.charAt(cur++)
3234 tok_state = tok_state_script_data_escaped_dash
3235 return new_character_token '-'
3237 tok_state = tok_state_script_data_escaped_less_than_sign
3241 return new_character_token "\ufffd"
3243 tok_state = tok_state_data
3245 cur -= 1 # Reconsume
3248 return new_character_token c
3250 # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3251 tok_state_script_data_escaped_dash = ->
3252 c = txt.charAt(cur++)
3254 tok_state = tok_state_script_data_escaped_dash_dash
3255 return new_character_token '-'
3257 tok_state = tok_state_script_data_escaped_less_than_sign
3261 tok_state = tok_state_script_data_escaped
3262 return new_character_token "\ufffd"
3264 tok_state = tok_state_data
3266 cur -= 1 # Reconsume
3269 tok_state = tok_state_script_data_escaped
3270 return new_character_token c
3272 # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3273 tok_state_script_data_escaped_dash_dash = ->
3274 c = txt.charAt(cur++)
3276 return new_character_token '-'
3278 tok_state = tok_state_script_data_escaped_less_than_sign
3281 tok_state = tok_state_script_data
3282 return new_character_token '>'
3285 tok_state = tok_state_script_data_escaped
3286 return new_character_token "\ufffd"
3289 tok_state = tok_state_data
3290 cur -= 1 # Reconsume
3293 tok_state = tok_state_script_data_escaped
3294 return new_character_token c
3296 # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
# Tokenizer state "script data escaped less-than sign": entered after a '<'
# was seen in the script data escaped state. Consumes one character and
# either moves on to an end-tag / double-escape state or falls back to the
# script data escaped state.
# NOTE(review): the conditions guarding each group of statements below are
# elided in this excerpt.
3297 tok_state_script_data_escaped_less_than_sign = ->
3298 c = txt.charAt(cur++)
3300 temporary_buffer = ''
3301 tok_state = tok_state_script_data_escaped_end_tag_open
3304 temporary_buffer = c.toLowerCase() # yes, really
3305 tok_state = tok_state_script_data_double_escape_start
3306 return new_character_token "<#{c}" # fixfull split
3308 temporary_buffer = c
3309 tok_state = tok_state_script_data_double_escape_start
3310 return new_character_token "<#{c}" # fixfull split
3312 tok_state = tok_state_script_data_escaped
3313 cur -= 1 # Reconsume
# fixed: emit the pending '<' here. `c` has just been pushed back by the
# `cur -= 1` above and will be produced by the next state, so returning `c`
# dropped the '<' and emitted the following character twice.
3314 return new_character_token '<'
3316 # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3317 tok_state_script_data_escaped_end_tag_open = ->
3318 c = txt.charAt(cur++)
3320 tok_cur_tag = new_end_tag c.toLowerCase()
3321 temporary_buffer += c
3322 tok_state = tok_state_script_data_escaped_end_tag_name
3325 tok_cur_tag = new_end_tag c
3326 temporary_buffer += c
3327 tok_state = tok_state_script_data_escaped_end_tag_name
3330 tok_state = tok_state_script_data_escaped
3331 cur -= 1 # Reconsume
3332 return new_character_token '</' # fixfull split
3334 # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
# Tokenizer state "script data escaped end tag name": accumulates the name of
# a candidate end tag inside escaped script data. temporary_buffer keeps the
# consumed characters so they can be re-emitted as text ("</" + buffer) when
# the candidate turns out not to be an appropriate end tag.
# NOTE(review): several guard/else lines are elided in this excerpt.
3335 tok_state_script_data_escaped_end_tag_name = ->
3336 c = txt.charAt(cur++)
3337 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3338 if is_appropriate_end_tag tok_cur_tag
3339 tok_state = tok_state_before_attribute_name
3343 if is_appropriate_end_tag tok_cur_tag
3344 tok_state = tok_state_self_closing_start_tag
3348 if is_appropriate_end_tag tok_cur_tag
3349 tok_state = tok_state_data
3353 tok_cur_tag.name += c.toLowerCase()
# fixed: append the character as typed (this was c.toLowerCase()). The buffer
# is emitted verbatim as text below, so lowercasing it rewrote the input; the
# other *_end_tag_name states in this file also append plain `c`.
3354 temporary_buffer += c
3357 tok_cur_tag.name += c
# same fix as above, kept consistent with the sibling states
3358 temporary_buffer += c
3361 tok_state = tok_state_script_data_escaped
3362 cur -= 1 # Reconsume
3363 return new_character_token "</#{temporary_buffer}" # fixfull split
3365 # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3366 tok_state_script_data_double_escape_start = ->
3367 c = txt.charAt(cur++)
3368 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3369 if temporary_buffer is 'script'
3370 tok_state = tok_state_script_data_double_escaped
3372 tok_state = tok_state_script_data_escaped
3373 return new_character_token c
3375 temporary_buffer += c.toLowerCase() # yes, really lowercase
3376 return new_character_token c
3378 temporary_buffer += c
3379 return new_character_token c
3381 tok_state = tok_state_script_data_escaped
3382 cur -= 1 # Reconsume
3385 # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
# Tokenizer handler for the "script data double escaped" state.
# Reads one character from txt (advancing cur). Visible outcomes, in order:
# emit "-" and move to the dash state; emit "<" and move to the less-than-sign
# state; emit U+FFFD; switch to the data state and hand the character back;
# emit the character unchanged.
# NOTE(review): the tests selecting each outcome are not visible here.
3386 tok_state_script_data_double_escaped = ->
3387 c = txt.charAt(cur++)
3389 tok_state = tok_state_script_data_double_escaped_dash
3390 return new_character_token '-'
3392 tok_state = tok_state_script_data_double_escaped_less_than_sign
3393 return new_character_token '<'
3396 return new_character_token "\ufffd"
3399 tok_state = tok_state_data
3400 cur -= 1 # Reconsume
3403 return new_character_token c
3405 # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
# Tokenizer handler for the "script data double escaped dash" state.
# Reads one character from txt (advancing cur). Visible outcomes: emit "-" and
# move to the dash-dash state; emit "<" and move to the less-than-sign state;
# emit U+FFFD or the character itself and return to the double-escaped state;
# or switch to the data state and hand the character back.
# NOTE(review): the tests selecting each outcome are not visible here.
3406 tok_state_script_data_double_escaped_dash = ->
3407 c = txt.charAt(cur++)
3409 tok_state = tok_state_script_data_double_escaped_dash_dash
3410 return new_character_token '-'
3412 tok_state = tok_state_script_data_double_escaped_less_than_sign
3413 return new_character_token '<'
3416 tok_state = tok_state_script_data_double_escaped
3417 return new_character_token "\ufffd"
3420 tok_state = tok_state_data
3421 cur -= 1 # Reconsume
3424 tok_state = tok_state_script_data_double_escaped
3425 return new_character_token c
3427 # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
# Tokenizer handler for the "script data double escaped dash dash" state.
# Reads one character from txt (advancing cur). Visible outcomes: emit "-"
# without changing state; emit "<" and move to the less-than-sign state; emit
# ">" and return to plain script data; emit U+FFFD or the character itself and
# return to the double-escaped state; or switch to the data state and hand the
# character back.
# NOTE(review): the tests selecting each outcome are not visible here.
3428 tok_state_script_data_double_escaped_dash_dash = ->
3429 c = txt.charAt(cur++)
3431 return new_character_token '-'
3433 tok_state = tok_state_script_data_double_escaped_less_than_sign
3434 return new_character_token '<'
3436 tok_state = tok_state_script_data
3437 return new_character_token '>'
3440 tok_state = tok_state_script_data_double_escaped
3441 return new_character_token "\ufffd"
3444 tok_state = tok_state_data
3445 cur -= 1 # Reconsume
3448 tok_state = tok_state_script_data_double_escaped
3449 return new_character_token c
3451 # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
# Tokenizer handler for the "script data double escaped less-than sign" state.
# Reads one character from txt (advancing cur). One branch clears
# temporary_buffer, moves to the double-escape-end state and emits "/"; the
# other returns to the double-escaped state and hands the character back.
# NOTE(review): the test choosing between them is not visible here; the emitted
# "/" suggests it is `c is '/'`.
3452 tok_state_script_data_double_escaped_less_than_sign = ->
3453 c = txt.charAt(cur++)
# Start collecting a fresh tag name for the double-escape-end state.
3455 temporary_buffer = ''
3456 tok_state = tok_state_script_data_double_escape_end
3457 return new_character_token '/'
3459 tok_state = tok_state_script_data_double_escaped
3460 cur -= 1 # Reconsume
3463 # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
# Tokenizer handler for the "script data double escape end" state.
# Reads one character from txt (advancing cur). Whitespace, "/" or ">" ends the
# name collected in temporary_buffer: 'script' leads back to the escaped
# state, the other visible assignment stays in the double-escaped state. The
# two following branches append to temporary_buffer. Every consuming branch
# returns the character as a token. The last two lines return to the
# double-escaped state and hand the character back.
# NOTE(review): the `else` between the two state assignments and the tests
# guarding the letter branches are not visible in this excerpt.
3464 tok_state_script_data_double_escape_end = ->
3465 c = txt.charAt(cur++)
3466 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3467 if temporary_buffer is 'script'
3468 tok_state = tok_state_script_data_escaped
3470 tok_state = tok_state_script_data_double_escaped
3471 return new_character_token c
# The buffer is case-folded so the 'script' comparison above matches any
# casing; the emitted token keeps the original character.
3473 temporary_buffer += c.toLowerCase() # yes, really lowercase
3474 return new_character_token c
3476 temporary_buffer += c
3477 return new_character_token c
3479 tok_state = tok_state_script_data_double_escaped
3480 cur -= 1 # Reconsume
3483 # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
# Tokenizer handler for the "before attribute name" state.
# Reads one character from txt (advancing cur). The visible branches switch to
# the self-closing-start-tag or data state, or choose the first character of a
# new attribute name (U+FFFD, or the character lower-cased). The closing two
# lines push a [name, value] pair onto the FRONT of tok_cur_tag.attrs_a and
# move to the attribute-name state.
# NOTE(review): most `when` guards, and whatever the whitespace and quote
# branches do, are not visible in this excerpt.
3484 tok_state_before_attribute_name = ->
3486 switch c = txt.charAt(cur++)
3487 when "\t", "\n", "\u000c", ' '
3490 tok_state = tok_state_self_closing_start_tag
3493 tok_state = tok_state_data
3499 attr_name = "\ufffd"
3500 when '"', "'", '<', '='
3505 tok_state = tok_state_data
3508 attr_name = c.toLowerCase()
# unshift, not push: the attribute under construction is always attrs_a[0].
3512 tok_cur_tag.attrs_a.unshift [attr_name, '']
3513 tok_state = tok_state_attribute_name
3516 # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
# Tokenizer handler for the "attribute name" state.
# Reads one character from txt (advancing cur). Whitespace moves to the
# after-attribute-name state. Other visible branches switch to the
# self-closing-start-tag, before-attribute-value or data state, or append to
# the name of the attribute under construction, tok_cur_tag.attrs_a[0][0]
# (U+FFFD, the character as-is, or the character lower-cased).
# NOTE(review): the `when` guards for all but the whitespace branch are not
# visible in this excerpt.
3517 tok_state_attribute_name = ->
3518 switch c = txt.charAt(cur++)
3519 when "\t", "\n", "\u000c", ' '
3520 tok_state = tok_state_after_attribute_name
3522 tok_state = tok_state_self_closing_start_tag
3524 tok_state = tok_state_before_attribute_value
3526 tok_state = tok_state_data
3532 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3535 tok_cur_tag.attrs_a[0][0] += c
3538 tok_state = tok_state_data
3541 tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3543 tok_cur_tag.attrs_a[0][0] += c
3546 # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
# Tokenizer handler for the "after attribute name" state.
# Reads one character from txt (advancing cur). Visible branches switch to the
# self-closing-start-tag, before-attribute-value or data state (once handing
# the character back), or start a NEW attribute by unshifting a [name, '']
# pair onto tok_cur_tag.attrs_a and moving to the attribute-name state. The
# new name starts with the character lower-cased, U+FFFD, or the character
# itself.
# NOTE(review): apart from the whitespace test and the quote/"<" test, the
# conditions selecting each branch are not visible in this excerpt.
3547 tok_state_after_attribute_name = ->
3548 c = txt.charAt(cur++)
3549 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3552 tok_state = tok_state_self_closing_start_tag
3555 tok_state = tok_state_before_attribute_value
3558 tok_state = tok_state_data
3561 tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3562 tok_state = tok_state_attribute_name
3566 tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3567 tok_state = tok_state_attribute_name
3571 tok_state = tok_state_data
3572 cur -= 1 # reconsume
3574 if c is '"' or c is "'" or c is '<'
3576 # fall through to Anything else
3578 tok_cur_tag.attrs_a.unshift [c, '']
3579 tok_state = tok_state_attribute_name
3581 # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
# Tokenizer handler for the "before attribute value" state.
# Reads one character from txt (advancing cur) and picks how the value of the
# attribute under construction (tok_cur_tag.attrs_a[0][1]) will be read:
# double-quoted, single-quoted or unquoted. Two branches seed the value
# (with U+FFFD, or with the character itself) before moving to the unquoted
# state; two others switch to the data state.
# NOTE(review): the `when` guards for all but the whitespace branch are not
# visible in this excerpt.
3582 tok_state_before_attribute_value = ->
3583 switch c = txt.charAt(cur++)
3584 when "\t", "\n", "\u000c", ' '
3587 tok_state = tok_state_attribute_value_double_quoted
3589 tok_state = tok_state_attribute_value_unquoted
3592 tok_state = tok_state_attribute_value_single_quoted
3595 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3596 tok_state = tok_state_attribute_value_unquoted
3599 tok_state = tok_state_data
3605 tok_state = tok_state_data
3607 tok_cur_tag.attrs_a[0][1] += c
3608 tok_state = tok_state_attribute_value_unquoted
3611 # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
# Tokenizer handler for the "attribute value (double-quoted)" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# after-attribute-value (quoted) state; append a decoded character reference,
# U+FFFD, or the character itself to tok_cur_tag.attrs_a[0][1]; or switch to
# the data state.
# NOTE(review): the `when` guards are not visible in this excerpt.
3612 tok_state_attribute_value_double_quoted = ->
3613 switch c = txt.charAt(cur++)
3615 tok_state = tok_state_after_attribute_value_quoted
# '"' is passed as allowed_char and `true` as in_attr.
3617 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3620 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3623 tok_state = tok_state_data
3625 tok_cur_tag.attrs_a[0][1] += c
3628 # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
# Tokenizer handler for the "attribute value (single-quoted)" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# after-attribute-value (quoted) state; append a decoded character reference,
# U+FFFD, or the character itself to tok_cur_tag.attrs_a[0][1]; or switch to
# the data state.
# NOTE(review): the `when` guards are not visible in this excerpt.
3629 tok_state_attribute_value_single_quoted = ->
3630 switch c = txt.charAt(cur++)
3632 tok_state = tok_state_after_attribute_value_quoted
# "'" is passed as allowed_char and `true` as in_attr.
3634 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3637 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3640 tok_state = tok_state_data
3642 tok_cur_tag.attrs_a[0][1] += c
3645 # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
# Tokenizer handler for the "attribute value (unquoted)" state.
# Reads one character from txt (advancing cur). Whitespace ends the value and
# returns to the before-attribute-name state. Other visible branches append a
# decoded character reference, U+FFFD, or the character itself to
# tok_cur_tag.attrs_a[0][1], or switch to the data state.
# NOTE(review): the `when` guards for all but the whitespace branch are not
# visible in this excerpt.
3646 tok_state_attribute_value_unquoted = ->
3647 switch c = txt.charAt(cur++)
3648 when "\t", "\n", "\u000c", ' '
3649 tok_state = tok_state_before_attribute_name
# '>' is passed as allowed_char and `true` as in_attr.
3651 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3653 tok_state = tok_state_data
3658 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3661 tok_state = tok_state_data
3663 # Parse Error if ', <, = or ` (backtick)
3664 tok_cur_tag.attrs_a[0][1] += c
3667 # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
# Tokenizer handler for the "after attribute value (quoted)" state.
# Reads one character from txt (advancing cur). Whitespace moves to the
# before-attribute-name state. Other visible branches move to the
# self-closing-start-tag or data state. The final branch moves to
# before-attribute-name and hands the character back so that state sees it.
# NOTE(review): the `when` guards for all but the whitespace branch are not
# visible in this excerpt.
3668 tok_state_after_attribute_value_quoted = ->
3669 switch c = txt.charAt(cur++)
3670 when "\t", "\n", "\u000c", ' '
3671 tok_state = tok_state_before_attribute_name
3673 tok_state = tok_state_self_closing_start_tag
3675 tok_state = tok_state_data
3681 tok_state = tok_state_data
3684 tok_state = tok_state_before_attribute_name
3685 cur -= 1 # we didn't handle that char
3688 # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
# Tokenizer handler for the "self-closing start tag" state.
# Reads one character from txt (advancing cur). One branch marks tok_cur_tag
# as self-closing and switches to the data state. The other two hand the
# character back and switch to the data state or the before-attribute-name
# state.
# NOTE(review): the tests selecting each branch are not visible here.
3689 tok_state_self_closing_start_tag = ->
3690 c = txt.charAt(cur++)
# NOTE(review): flag is called with one argument here, while every other call
# site in this part of the file passes `true` as a second argument. Confirm
# that the one-argument form sets the flag rather than reading it.
3692 tok_cur_tag.flag 'self-closing'
3693 tok_state = tok_state_data
3697 tok_state = tok_state_data
3698 cur -= 1 # Reconsume
3702 tok_state = tok_state_before_attribute_name
3703 cur -= 1 # Reconsume
3706 # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3707 # WARNING: put a comment token in tok_cur_tag before setting this state
# Tokenizer handler for the "bogus comment" state.
# Unlike the per-character states, this searches txt for the next ">" starting
# at cur and takes either the rest of the input or the text up to that ">" as
# the comment body. NULs are replaced with U+FFFD, the text is appended to the
# comment token already in tok_cur_tag, and the data state resumes.
# NOTE(review): the test on next_gt and the updates to cur are not visible in
# this excerpt.
3708 tok_state_bogus_comment = ->
3709 next_gt = txt.indexOf '>', cur
3711 val = txt.substr cur
3714 val = txt.substr cur, (next_gt - cur)
# Global RegExp: replace() with a plain string pattern would swap only the
# first NUL and leave later ones in the comment text.
3716 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3717 tok_cur_tag.text += val
3718 tok_state = tok_state_data
3721 # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
# Tokenizer handler for the "markup declaration open" state.
# Looks ahead in txt from cur without reading a single character first:
#   "--"       -> a fresh empty comment token, then the comment-start state
#   "doctype"  -> (compared case-insensitively) the doctype state
#   "[CDATA["  -> (case-sensitive, and only when the adjusted current node
#                 exists and is outside the HTML namespace) the CDATA state
# The last two lines create an empty comment token and enter the bogus-comment
# state.
# NOTE(review): the lines that advance cur past each matched keyword are not
# visible in this excerpt.
3722 tok_state_markup_declaration_open = ->
3723 if txt.substr(cur, 2) is '--'
3725 tok_cur_tag = new_comment_token ''
3726 tok_state = tok_state_comment_start
3728 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3730 tok_state = tok_state_doctype
3732 acn = adjusted_current_node()
3733 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3735 tok_state = tok_state_cdata_section
3739 tok_cur_tag = new_comment_token ''
3740 tok_state = tok_state_bogus_comment
3743 # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
# Tokenizer handler for the "comment start" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# comment-start-dash state; append U+FFFD to the comment text and move to the
# comment state; switch to the data state (once handing the character back);
# or append the character to the comment text and move to the comment state.
# NOTE(review): the `when` guards are not visible in this excerpt.
3744 tok_state_comment_start = ->
3745 switch c = txt.charAt(cur++)
3747 tok_state = tok_state_comment_start_dash
# The replacement character belongs to the comment being built, as in the
# other comment states. It must not be emitted as a stand-alone character
# token, which would put it in the document and drop it from the comment.
3750 tok_cur_tag.text += "\ufffd"
3751 tok_state = tok_state_comment
3754 tok_state = tok_state_data
3758 tok_state = tok_state_data
3759 cur -= 1 # Reconsume
3762 tok_cur_tag.text += c
3763 tok_state = tok_state_comment
3766 # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
# Tokenizer handler for the "comment start dash" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# comment-end state; append "-" plus U+FFFD, or "-" plus the character, to the
# comment text and move to the comment state; or switch to the data state
# (once handing the character back). The "-" restores the dash that led into
# this state.
# NOTE(review): the `when` guards are not visible in this excerpt.
3767 tok_state_comment_start_dash = ->
3768 switch c = txt.charAt(cur++)
3770 tok_state = tok_state_comment_end
3773 tok_cur_tag.text += "-\ufffd"
3774 tok_state = tok_state_comment
3777 tok_state = tok_state_data
3781 tok_state = tok_state_data
3782 cur -= 1 # Reconsume
3785 tok_cur_tag.text += "-#{c}"
3786 tok_state = tok_state_comment
3789 # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
# Tokenizer handler for the "comment" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# comment-end-dash state; append U+FFFD or the character itself to the comment
# text in tok_cur_tag; or switch to the data state and hand the character back.
# NOTE(review): the `when` guards are not visible in this excerpt.
3790 tok_state_comment = ->
3791 switch c = txt.charAt(cur++)
3793 tok_state = tok_state_comment_end_dash
3796 tok_cur_tag.text += "\ufffd"
3799 tok_state = tok_state_data
3800 cur -= 1 # Reconsume
3803 tok_cur_tag.text += c
3806 # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
# Tokenizer handler for the "comment end dash" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# comment-end state; append "-" plus U+FFFD, or "-" plus the character, to the
# comment text and return to the comment state; or switch to the data state
# and hand the character back. The "-" is the dash that led into this state.
# NOTE(review): the `when` guards are not visible in this excerpt.
3807 tok_state_comment_end_dash = ->
3808 switch c = txt.charAt(cur++)
3810 tok_state = tok_state_comment_end
3813 tok_cur_tag.text += "-\ufffd"
3814 tok_state = tok_state_comment
3817 tok_state = tok_state_data
3818 cur -= 1 # Reconsume
3821 tok_cur_tag.text += "-#{c}"
3822 tok_state = tok_state_comment
3825 # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
# Tokenizer handler for the "comment end" state.
# Reads one character from txt (advancing cur). Visible branches: switch to the
# data state (once handing the character back); append "--" plus U+FFFD, or
# "--" plus the character, to the comment text and return to the comment
# state; move to the comment-end-bang state; or append a single "-" and stay
# in this state.
# NOTE(review): the `when` guards are not visible in this excerpt.
3826 tok_state_comment_end = ->
3827 switch c = txt.charAt(cur++)
3829 tok_state = tok_state_data
3833 tok_cur_tag.text += "--\ufffd"
3834 tok_state = tok_state_comment
3837 tok_state = tok_state_comment_end_bang
3840 tok_cur_tag.text += '-'
3843 tok_state = tok_state_data
3844 cur -= 1 # Reconsume
3848 tok_cur_tag.text += "--#{c}"
3849 tok_state = tok_state_comment
3852 # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
# Tokenizer handler for the "comment end bang" state (reached after "--!").
# Reads one character from txt (advancing cur). Visible branches: append "--!"
# and move to the comment-end-dash state; switch to the data state (once
# handing the character back); or append "--!" plus U+FFFD, or "--!" plus the
# character, and return to the comment state.
# NOTE(review): the `when` guards are not visible in this excerpt.
3853 tok_state_comment_end_bang = ->
3854 switch c = txt.charAt(cur++)
# Append only "--!" here. The character that leads to the comment-end-dash
# state is handled by that state, which adds the "-" itself if the comment
# goes on. Appending c as well would put an extra "-" in the comment text.
3856 tok_cur_tag.text += "--!"
3857 tok_state = tok_state_comment_end_dash
3859 tok_state = tok_state_data
3863 tok_cur_tag.text += "--!\ufffd"
3864 tok_state = tok_state_comment
3867 tok_state = tok_state_data
3868 cur -= 1 # Reconsume
3871 tok_cur_tag.text += "--!#{c}"
3872 tok_state = tok_state_comment
3875 # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
# Tokenizer handler for the "DOCTYPE" state.
# Reads one character from txt (advancing cur). Whitespace moves to the
# before-doctype-name state. One branch switches to the data state, builds an
# empty doctype token with force-quirks set in the local `el`, and hands the
# character back. The last branch moves to before-doctype-name and hands the
# character back.
# NOTE(review): the remaining `when` guards, and what is done with `el`, are
# not visible in this excerpt.
3876 tok_state_doctype = ->
3877 switch c = txt.charAt(cur++)
3878 when "\t", "\u000a", "\u000c", ' '
3879 tok_state = tok_state_before_doctype_name
3882 tok_state = tok_state_data
3883 el = new_doctype_token ''
3884 el.flag 'force-quirks', true
3885 cur -= 1 # Reconsume
3889 tok_state = tok_state_before_doctype_name
3890 cur -= 1 # Reconsume
3893 # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
# Tokenizer handler for the "before DOCTYPE name" state.
# Reads one character from txt (advancing cur). Three branches start a new
# doctype token in tok_cur_tag (named with the character lower-cased, with
# U+FFFD, or with the character as-is) and move to the doctype-name state.
# Two branches build an empty doctype token with force-quirks set in the local
# `el` and switch to the data state, one of them handing the character back.
# NOTE(review): apart from the whitespace test, the conditions selecting each
# branch, and what is done with `el`, are not visible in this excerpt.
3894 tok_state_before_doctype_name = ->
3895 c = txt.charAt(cur++)
3896 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3899 tok_cur_tag = new_doctype_token c.toLowerCase()
3900 tok_state = tok_state_doctype_name
3904 tok_cur_tag = new_doctype_token "\ufffd"
3905 tok_state = tok_state_doctype_name
3909 el = new_doctype_token ''
3910 el.flag 'force-quirks', true
3911 tok_state = tok_state_data
3915 tok_state = tok_state_data
3916 el = new_doctype_token ''
3917 el.flag 'force-quirks', true
3918 cur -= 1 # Reconsume
3921 tok_cur_tag = new_doctype_token c
3922 tok_state = tok_state_doctype_name
3925 # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
# Tokenizer handler for the "DOCTYPE name" state.
# Reads one character from txt (advancing cur). Whitespace ends the name and
# moves to the after-doctype-name state. Other visible branches switch to the
# data state; append to tok_cur_tag.name (the character lower-cased, U+FFFD,
# or the character as-is); or set force-quirks, switch to the data state and
# hand the character back.
# NOTE(review): apart from the whitespace test, the conditions selecting each
# branch are not visible in this excerpt.
3926 tok_state_doctype_name = ->
3927 c = txt.charAt(cur++)
3928 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3929 tok_state = tok_state_after_doctype_name
3932 tok_state = tok_state_data
3935 tok_cur_tag.name += c.toLowerCase()
3939 tok_cur_tag.name += "\ufffd"
3943 tok_state = tok_state_data
3944 tok_cur_tag.flag 'force-quirks', true
3945 cur -= 1 # Reconsume
3948 tok_cur_tag.name += c
3951 # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
# Tokenizer handler for the "after DOCTYPE name" state.
# Reads one character from txt (advancing cur). Visible branches: switch to
# the data state (once with force-quirks set and the character handed back);
# recognise the keywords "public" / "system" case-insensitively and move to
# the matching after-keyword state; or set force-quirks and move to the
# bogus-doctype state.
# NOTE(review): apart from the whitespace and keyword tests, the conditions
# selecting each branch, and the lines that advance cur past a matched
# keyword, are not visible in this excerpt.
3952 tok_state_after_doctype_name = ->
3953 c = txt.charAt(cur++)
3954 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3957 tok_state = tok_state_data
3961 tok_state = tok_state_data
3962 tok_cur_tag.flag 'force-quirks', true
3963 cur -= 1 # Reconsume
# cur - 1 because c has already been consumed: the six-character window
# starts at c itself.
3966 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3968 tok_state = tok_state_after_doctype_public_keyword
3970 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3972 tok_state = tok_state_after_doctype_system_keyword
3975 tok_cur_tag.flag 'force-quirks', true
3976 tok_state = tok_state_bogus_doctype
3979 # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
# Tokenizer handler for the "after DOCTYPE public keyword" state.
# Reads one character from txt (advancing cur). Whitespace moves to the
# before-doctype-public-identifier state. Two branches reset
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# identifier state. The remaining branches set force-quirks and either switch
# to the data state (once handing the character back) or move to the
# bogus-doctype state.
# NOTE(review): apart from the whitespace test, the conditions selecting each
# branch are not visible in this excerpt.
3980 tok_state_after_doctype_public_keyword = ->
3981 c = txt.charAt(cur++)
3982 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3983 tok_state = tok_state_before_doctype_public_identifier
3987 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3988 tok_state = tok_state_doctype_public_identifier_double_quoted
3992 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3993 tok_state = tok_state_doctype_public_identifier_single_quoted
3997 tok_cur_tag.flag 'force-quirks', true
3998 tok_state = tok_state_data
4002 tok_state = tok_state_data
4003 tok_cur_tag.flag 'force-quirks', true
4004 cur -= 1 # Reconsume
4008 tok_cur_tag.flag 'force-quirks', true
4009 tok_state = tok_state_bogus_doctype
4012 # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
# Tokenizer handler for the "before DOCTYPE public identifier" state.
# Reads one character from txt (advancing cur). Two branches reset
# tok_cur_tag.public_identifier to '' and enter the double- or single-quoted
# identifier state. The remaining branches set force-quirks and either switch
# to the data state (once handing the character back) or move to the
# bogus-doctype state.
# NOTE(review): apart from the whitespace test (whose body is also missing),
# the conditions selecting each branch are not visible in this excerpt.
4013 tok_state_before_doctype_public_identifier = ->
4014 c = txt.charAt(cur++)
4015 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4019 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4020 tok_state = tok_state_doctype_public_identifier_double_quoted
4024 tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4025 tok_state = tok_state_doctype_public_identifier_single_quoted
4029 tok_cur_tag.flag 'force-quirks', true
4030 tok_state = tok_state_data
4034 tok_state = tok_state_data
4035 tok_cur_tag.flag 'force-quirks', true
4036 cur -= 1 # Reconsume
4040 tok_cur_tag.flag 'force-quirks', true
4041 tok_state = tok_state_bogus_doctype
4045 # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
# Tokenizer handler for the "DOCTYPE public identifier (double-quoted)" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# after-doctype-public-identifier state; append U+FFFD or the character itself
# to tok_cur_tag.public_identifier; or set force-quirks and switch to the data
# state (once handing the character back).
# NOTE(review): the tests selecting each branch are not visible here.
4046 tok_state_doctype_public_identifier_double_quoted = ->
4047 c = txt.charAt(cur++)
4049 tok_state = tok_state_after_doctype_public_identifier
4053 tok_cur_tag.public_identifier += "\ufffd"
4057 tok_cur_tag.flag 'force-quirks', true
4058 tok_state = tok_state_data
4062 tok_state = tok_state_data
4063 tok_cur_tag.flag 'force-quirks', true
4064 cur -= 1 # Reconsume
4067 tok_cur_tag.public_identifier += c
4070 # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
# Tokenizer handler for the "DOCTYPE public identifier (single-quoted)" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# after-doctype-public-identifier state; append U+FFFD or the character itself
# to tok_cur_tag.public_identifier; or set force-quirks and switch to the data
# state (once handing the character back).
# NOTE(review): the tests selecting each branch are not visible here.
4071 tok_state_doctype_public_identifier_single_quoted = ->
4072 c = txt.charAt(cur++)
4074 tok_state = tok_state_after_doctype_public_identifier
4078 tok_cur_tag.public_identifier += "\ufffd"
4082 tok_cur_tag.flag 'force-quirks', true
4083 tok_state = tok_state_data
4087 tok_state = tok_state_data
4088 tok_cur_tag.flag 'force-quirks', true
4089 cur -= 1 # Reconsume
4092 tok_cur_tag.public_identifier += c
4095 # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
# Tokenizer handler for the "after DOCTYPE public identifier" state.
# Reads one character from txt (advancing cur). Whitespace moves to the
# between-public-and-system-identifiers state. Other visible branches: switch
# to the data state; reset tok_cur_tag.system_identifier to '' and enter the
# double- or single-quoted system identifier state; set force-quirks and
# switch to the data state with the character handed back; or set force-quirks
# and move to the bogus-doctype state.
# NOTE(review): apart from the whitespace test, the conditions selecting each
# branch are not visible in this excerpt.
4096 tok_state_after_doctype_public_identifier = ->
4097 c = txt.charAt(cur++)
4098 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4099 tok_state = tok_state_between_doctype_public_and_system_identifiers
4102 tok_state = tok_state_data
4106 tok_cur_tag.system_identifier = ''
4107 tok_state = tok_state_doctype_system_identifier_double_quoted
4111 tok_cur_tag.system_identifier = ''
4112 tok_state = tok_state_doctype_system_identifier_single_quoted
4116 tok_state = tok_state_data
4117 tok_cur_tag.flag 'force-quirks', true
4118 cur -= 1 # Reconsume
4122 tok_cur_tag.flag 'force-quirks', true
4123 tok_state = tok_state_bogus_doctype
4126 # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
# Tokenizer handler for the "between DOCTYPE public and system identifiers"
# state. Reads one character from txt (advancing cur). Visible branches:
# switch to the data state; reset tok_cur_tag.system_identifier to '' and
# enter the double- or single-quoted system identifier state; set force-quirks
# and switch to the data state with the character handed back; or set
# force-quirks and move to the bogus-doctype state.
# NOTE(review): apart from the whitespace test (whose body is also missing),
# the conditions selecting each branch are not visible in this excerpt.
4127 tok_state_between_doctype_public_and_system_identifiers = ->
4128 c = txt.charAt(cur++)
4129 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4132 tok_state = tok_state_data
4136 tok_cur_tag.system_identifier = ''
4137 tok_state = tok_state_doctype_system_identifier_double_quoted
4141 tok_cur_tag.system_identifier = ''
4142 tok_state = tok_state_doctype_system_identifier_single_quoted
4146 tok_state = tok_state_data
4147 tok_cur_tag.flag 'force-quirks', true
4148 cur -= 1 # Reconsume
4152 tok_cur_tag.flag 'force-quirks', true
4153 tok_state = tok_state_bogus_doctype
4156 # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
# Tokenizer handler for the "after DOCTYPE system keyword" state.
# Reads one character from txt (advancing cur). Whitespace moves to the
# before-doctype-system-identifier state. Two branches reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# identifier state. The remaining branches set force-quirks and either switch
# to the data state (once handing the character back) or move to the
# bogus-doctype state.
# NOTE(review): apart from the whitespace test, the conditions selecting each
# branch are not visible in this excerpt.
4157 tok_state_after_doctype_system_keyword = ->
4158 c = txt.charAt(cur++)
4159 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4160 tok_state = tok_state_before_doctype_system_identifier
4164 tok_cur_tag.system_identifier = ''
4165 tok_state = tok_state_doctype_system_identifier_double_quoted
4169 tok_cur_tag.system_identifier = ''
4170 tok_state = tok_state_doctype_system_identifier_single_quoted
4174 tok_cur_tag.flag 'force-quirks', true
4175 tok_state = tok_state_data
4179 tok_state = tok_state_data
4180 tok_cur_tag.flag 'force-quirks', true
4181 cur -= 1 # Reconsume
4185 tok_cur_tag.flag 'force-quirks', true
4186 tok_state = tok_state_bogus_doctype
4189 # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
# Tokenizer handler for the "before DOCTYPE system identifier" state.
# Reads one character from txt (advancing cur). Two branches reset
# tok_cur_tag.system_identifier to '' and enter the double- or single-quoted
# identifier state. The remaining branches set force-quirks and either switch
# to the data state (once handing the character back) or move to the
# bogus-doctype state.
# NOTE(review): apart from the whitespace test (whose body is also missing),
# the conditions selecting each branch are not visible in this excerpt.
4190 tok_state_before_doctype_system_identifier = ->
4191 c = txt.charAt(cur++)
4192 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4195 tok_cur_tag.system_identifier = ''
4196 tok_state = tok_state_doctype_system_identifier_double_quoted
4199 tok_cur_tag.system_identifier = ''
4200 tok_state = tok_state_doctype_system_identifier_single_quoted
4204 tok_cur_tag.flag 'force-quirks', true
4205 tok_state = tok_state_data
4209 tok_state = tok_state_data
4210 tok_cur_tag.flag 'force-quirks', true
4211 cur -= 1 # Reconsume
4215 tok_cur_tag.flag 'force-quirks', true
4216 tok_state = tok_state_bogus_doctype
4219 # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
# Tokenizer handler for the "DOCTYPE system identifier (double-quoted)" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# after-doctype-system-identifier state; append U+FFFD or the character itself
# to tok_cur_tag.system_identifier; or set force-quirks and switch to the data
# state (once handing the character back).
# NOTE(review): the tests selecting each branch are not visible here.
4220 tok_state_doctype_system_identifier_double_quoted = ->
4221 c = txt.charAt(cur++)
4223 tok_state = tok_state_after_doctype_system_identifier
4227 tok_cur_tag.system_identifier += "\ufffd"
4231 tok_cur_tag.flag 'force-quirks', true
4232 tok_state = tok_state_data
4236 tok_state = tok_state_data
4237 tok_cur_tag.flag 'force-quirks', true
4238 cur -= 1 # Reconsume
4241 tok_cur_tag.system_identifier += c
4244 # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
# Tokenizer handler for the "DOCTYPE system identifier (single-quoted)" state.
# Reads one character from txt (advancing cur). Visible branches: move to the
# after-doctype-system-identifier state; append U+FFFD or the character itself
# to tok_cur_tag.system_identifier; or set force-quirks and switch to the data
# state (once handing the character back).
# NOTE(review): the tests selecting each branch are not visible here.
4245 tok_state_doctype_system_identifier_single_quoted = ->
4246 c = txt.charAt(cur++)
4248 tok_state = tok_state_after_doctype_system_identifier
4252 tok_cur_tag.system_identifier += "\ufffd"
4256 tok_cur_tag.flag 'force-quirks', true
4257 tok_state = tok_state_data
4261 tok_state = tok_state_data
4262 tok_cur_tag.flag 'force-quirks', true
4263 cur -= 1 # Reconsume
4266 tok_cur_tag.system_identifier += c
4269 # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
# Tokenizer handler for the "after DOCTYPE system identifier" state.
# Reads one character from txt (advancing cur). Visible branches: switch to
# the data state; set force-quirks, switch to the data state and hand the
# character back; or move to the bogus-doctype state WITHOUT setting
# force-quirks (the existing comment marks this as intentional).
# NOTE(review): apart from the whitespace test (whose body is also missing),
# the conditions selecting each branch are not visible in this excerpt.
4270 tok_state_after_doctype_system_identifier = ->
4271 c = txt.charAt(cur++)
4272 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4275 tok_state = tok_state_data
4279 tok_state = tok_state_data
4280 tok_cur_tag.flag 'force-quirks', true
4281 cur -= 1 # Reconsume
4285 # do _not_ tok_cur_tag.flag 'force-quirks', true
4286 tok_state = tok_state_bogus_doctype
4289 # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
# Tokenizer handler for the "bogus DOCTYPE" state.
# Reads one character from txt (advancing cur). Both visible branches switch
# to the data state; the second also hands the character back.
# NOTE(review): the tests selecting each branch, and any handling of other
# characters, are not visible in this excerpt.
4290 tok_state_bogus_doctype = ->
4291 c = txt.charAt(cur++)
4293 tok_state = tok_state_data
4296 tok_state = tok_state_data
4297 cur -= 1 # Reconsume
4302 # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
# Tokenizer handler for the "CDATA section" state.
# Switches straight back to the data state, then takes either the rest of txt
# or the text up to the next "]]>" (searched from cur) as the section body.
# NULs become U+FFFD, CRLF and lone CR become LF, and the result is returned as
# one character token.
# NOTE(review): the test on next_gt and the updates to cur are not visible in
# this excerpt. Despite its name, next_gt holds the index of "]]>".
4303 tok_state_cdata_section = ->
4304 tok_state = tok_state_data
4305 next_gt = txt.indexOf ']]>', cur
4307 val = txt.substr cur
4310 val = txt.substr cur, (next_gt - cur)
# Global RegExps so that every occurrence is replaced, not just the first.
4312 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
# CRLF must be collapsed before lone CR, or each CRLF would become two LFs.
4313 val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4314 val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4315 return new_character_token val # fixfull split
4317 # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4318 # Don't set this as a state, just call it
4319 # returns a string (NOT a text node)
# Consume a character reference at txt[cur] and decode it.
#
# allowed_char: an extra character that, like whitespace, "<", "&" and end of
#   input, means "this is not a character reference" (see the `when` list).
# in_attr: not referenced on any visible line; presumably it gates the
#   attribute-only exceptions near the end. TODO confirm.
#
# Three forms are visible below: numeric references (optional "x" after the
# first character, digits taken from `charset`), named references ending in
# ";", and legacy named references without ";", looked up in legacy_char_refs.
# NOTE(review): most of the body (return statements, `else` lines and the
# setup of start, i, prefix, charset, alnum and max) is not visible in this
# excerpt.
4320 parse_character_reference = (allowed_char = null, in_attr = false) ->
4321 if cur >= txt.length
# Peek only: cur is not advanced until a reference is recognised.
4323 switch c = txt.charAt(cur)
4324 when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4325 # explicitly not a parse error
4328 # there has to be "one or more" alnums between & and ; to be a parse error
4331 if cur + 1 >= txt.length
4333 if txt.charAt(cur + 1).toLowerCase() is 'x'
# Scan forward while the characters belong to the current digit set.
4342 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4346 if txt.charAt(start + i) is ';'
4348 # FIXME This is supposed to generate parse errors for some chars
4349 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4356 if alnum.indexOf(txt.charAt(cur + i)) is -1
4359 # exit early, because parse_error() below needs at least one alnum
4361 if txt.charAt(cur + i) is ';'
4362 i += 1 # include ';' terminator in value
4363 decoded = decode_named_char_ref txt.substr(cur, i)
4370 # no ';' terminator (only legacy char refs)
4372 for i in [2..max] # no prefix matches, so ok to check shortest first
4373 c = legacy_char_refs[txt.substr(cur, i)]
4376 if txt.charAt(cur + i) is '='
4377 # "because some legacy user agents will
4378 # misinterpret the markup in those cases"
4381 if alnum.indexOf(txt.charAt(cur + i)) > -1
4382 # this makes attributes forgiving about url args
4384 # ok, and besides the weird exceptions for attributes...
4385 # return the matching char
4386 cur += i # consume entity chars
4387 parse_error() # because no terminating ";"
4391 return # never reached
4393 # tree constructor initialization
4394 # see comments on TYPE_TAG/etc for the structure of this data
# Initial parser state: a root <html> element in the HTML namespace, empty
# bookkeeping lists, the starting insertion mode and the parser flags.
4397 doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4399 afe = [] # active formatting elements
4400 template_ins_modes = []
4401 ins_mode = ins_mode_initial
4402 original_ins_mode = ins_mode # TODO check spec
# Existential operator: scripting defaults to true when args.scripting is
# null or undefined, so an explicit `false` is respected.
4403 flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4404 flag_frameset_ok = true
4406 flag_foster_parenting = false
4407 form_element_pointer = null
4408 temporary_buffer = null
4409 pending_table_character_tokens = []
4410 head_element_pointer = null
4411 flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4412 context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4414 # tokenizer initialization
4415 tok_state = tok_state_data
# NOTE(review): this looks like a leftover debugging hook. Any caller passing
# args.name "one_that_breaks #1" gets a thrown string instead of a parse.
# Confirm nothing depends on it before removing.
4417 if args.name is "one_that_breaks #1"
4418 throw "hi" # console.log "hi"
4420 # http://www.w3.org/TR/html5/syntax.html#tree-construction
4425 # fixfull parse error if has self-closing flag, but it wasn't acknowledged
# Builds one string from `els` by appending each element's own
# serialize(shallow, show_ids) output to `serialized`.
# NOTE(review): only the signature and the append line are visible in this
# excerpt; the loop, any separators and the return are not shown.
4428 serialize_els = (els, shallow, show_ids) ->
4434 serialized += t.serialize shallow, show_ids
4437 # TODO export TYPE_*
# Exported names: parse_html, the two debug-log helpers, the TYPE_* constants
# and the NS_* namespace constants.
4438 module.exports.parse_html = parse_html
4439 module.exports.debug_log_reset = debug_log_reset
4440 module.exports.debug_log_each = debug_log_each
4441 module.exports.TYPE_TAG = TYPE_TAG
4442 module.exports.TYPE_TEXT = TYPE_TEXT
4443 module.exports.TYPE_COMMENT = TYPE_COMMENT
4444 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4445 module.exports.NS_HTML = NS_HTML
4446 module.exports.NS_MATHML = NS_MATHML
4447 module.exports.NS_SVG = NS_SVG