parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
  22 # indicate places where additional code is needed for full HTML document
  23 # parsing.
  24 #
  25 # Instead, the data structure produced by this parser is an array of Nodes.
  26
  27
  28 # stacks/lists
  29 #
  30 # the spec uses a many different words do indicate which ends of lists/stacks
  31 # they are talking about (and relative movement within the lists/stacks). This
  32 # section splains. I'm implementing "lists" (afe and open_els) the same way
  33 # (both as stacks)
  34 #
  35 # stacks grow downward (current element is index=0)
  36 #
  37 # example: open_els = [a, b, c, d, e, f, g]
  38 #
  39 # "grows downwards" means it's visualized like this: (index: el, names)
  40 #
  41 #   6: g "start of the list", "topmost", "first"
  42 #   5: f
  43 #   4: e "previous" (to d), "above", "before"
  44 #   3: d   (previous/next are relative to this element)
  45 #   2: c "next", "after", "lower", "below"
  46 #   1: b
  47 #   0: a "end of the list", "current node", "bottommost", "last"
  48
  49
  50 # browser
  51 # note: to get this to run outside a browser, you'll have to write a native
  52 # implementation of decode_named_char_ref()
  53 unless module?.exports?
  54         window.wheic = {}
  55         module = exports: window.wheic
  56
  57 # Each node is an obect of the Node class. Here are the Node types:
  58 TYPE_TAG = 0 # name, {attributes}, [children]
  59 TYPE_TEXT = 1 # "text"
  60 TYPE_COMMENT = 2
  61 TYPE_DOCTYPE = 3
  62 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  64 TYPE_END_TAG = 5 # name
  65 TYPE_EOF = 6
  66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  68
  69 # namespace constants
  70 NS_HTML = 1
  71 NS_MATHML = 2
  72 NS_SVG = 3
  73
  74 g_debug_log = []
  75 debug_log_reset = ->
  76         g_debug_log = []
  77 debug_log = (str) ->
  78         g_debug_log.push str
  79 debug_log_each = (cb) ->
  80         for str in g_debug_log
  81                 cb str
  82
  83 prev_node_id = 0
  84 class Node
  85         constructor: (type, args = {}) ->
  86                 @type = type # one of the TYPE_* constants above
  87                 @name = args.name ? '' # tag name
  88                 @text = args.text ? '' # contents for text/comment nodes
  89                 @attrs = args.attrs ? {}
  90                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
  91                 @children = args.children ? []
  92                 @namespace = args.namespace ? NS_HTML
  93                 @parent = args.parent ? null
  94                 @token = args.token ? null
  95                 @flags = args.flags ? {}
  96                 if args.id?
  97                         @id = "#{args.id}+"
  98                 else
  99                         @id = "#{++prev_node_id}"
 100         acknowledge_self_closing: ->
 101                 if @token?
 102                         @token.flag 'did_self_close'
 103                 else
 104                         @flag 'did_self_close', true
 105         flag: (key, value = null) ->
 106                 if value?
 107                         @flags[key] = value
 108                 else
 109                         return @flags[key]
 110         serialize: (shallow = false, show_ids = false) -> # for unit tests
 111                 ret = ''
 112                 switch @type
 113                         when TYPE_TAG
 114                                 ret += 'tag:'
 115                                 ret += JSON.stringify @name
 116                                 ret += ','
 117                                 if show_ids
 118                                         ret += "##{@id},"
 119                                 if shallow
 120                                         break
 121                                 attr_keys = []
 122                                 for k of @attrs
 123                                         attr_keys.push k
 124                                 attr_keys.sort()
 125                                 ret += '{'
 126                                 sep = ''
 127                                 for k in attr_keys
 128                                         ret += sep
 129                                         sep = ','
 130                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
 131                                 ret += '},['
 132                                 sep = ''
 133                                 for c in @children
 134                                         ret += sep
 135                                         sep = ','
 136                                         ret += c.serialize shallow, show_ids
 137                                 ret += ']'
 138                         when TYPE_TEXT
 139                                 ret += 'text:'
 140                                 ret += JSON.stringify @text
 141                         when TYPE_COMMENT
 142                                 ret += 'comment:'
 143                                 ret += JSON.stringify @text
 144                         when TYPE_DOCTYPE
 145                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
 146                         when TYPE_AFE_MARKER
 147                                 ret += 'marker'
 148                         when TYPE_AAA_BOOKMARK
 149                                 ret += 'aaa_bookmark'
 150                         else
 151                                 ret += 'unknown:'
 152                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 153                 return ret
 154
 155 # helpers: (only take args that are normally known when parser creates nodes)
 156 new_open_tag = (name) ->
 157         return new Node TYPE_START_TAG, name: name
 158 new_end_tag = (name) ->
 159         return new Node TYPE_END_TAG, name: name
 160 new_element = (name) ->
 161         return new Node TYPE_TAG, name: name
 162 new_text_node = (txt) ->
 163         return new Node TYPE_TEXT, text: txt
 164 new_character_token = new_text_node
 165 new_comment_token = (txt) ->
 166         return new Node TYPE_COMMENT, text: txt
 167 new_doctype_token = (name) ->
 168         return new Node TYPE_DOCTYPE, name: name
 169 new_eof_token = ->
 170         return new Node TYPE_EOF
 171 new_afe_marker = ->
 172         return new Node TYPE_AFE_MARKER
 173 new_aaa_bookmark = ->
 174         return new Node TYPE_AAA_BOOKMARK
 175
 176 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 177 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 178 digits = "0123456789"
 179 alnum = lc_alpha + uc_alpha + digits
 180 hex_chars = digits + "abcdefABCDEF"
 181
 182 is_uc_alpha = (str) ->
 183         return str.length is 1 and uc_alpha.indexOf(str) > -1
 184 is_lc_alpha = (str) ->
 185         return str.length is 1 and lc_alpha.indexOf(str) > -1
 186
 187 # some SVG elements have dashes in them
 188 tag_name_chars = alnum + "-"
 189
 190 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 191 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 192 is_space = (txt) ->
 193         return txt.length is 1 and space_chars.indexOf(txt) > -1
 194 is_space_tok = (t) ->
 195         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 196
 197 is_input_hidden_tok = (t) ->
 198         return unless t.type is TYPE_START_TAG
 199         for a of t.attrs_a
 200                 if a[0] is 'type'
 201                         if a[1].toLowerCase() is 'hidden'
 202                                 return true
 203                         return false
 204         return false
 205
 206 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 207 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 208
 209 # These are the character references that don't need a terminating semicolon
 210 # min length: 2, max: 6, none are a prefix of any other.
 211 legacy_char_refs = {
 212         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 213         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 214         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 215         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 216         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 217         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 218         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 219         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 220         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 221         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 222         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 223         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 224         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 225         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 226         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 227         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 228         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 229         yen: '¥', yuml: 'ÿ'
 230 }
 231
 232 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 233 raw_text_elements = ['script', 'style']
 234 escapable_raw_text_elements = ['textarea', 'title']
 235 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 236 svg_elements = [
 237         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 238         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 239         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 240         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 241         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 242         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 243         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 244         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 245         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 246         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 247         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 248         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 249         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 250         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 251         'view', 'vkern'
 252 ]
 253
 254 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 255 mathml_elements = [
 256         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 257         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 258         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 259         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 260         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 261         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 262         'determinant', 'diff', 'divergence', 'divide', 'domain',
 263         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 264         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 265         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 266         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 267         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 268         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 269         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 270         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 271         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 272         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 273         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 274         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 275         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 276         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 277         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 278         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 279         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 280         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 281         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 282         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 283         'vectorproduct', 'xor'
 284 ]
 285 # foreign_elements = [svg_elements..., mathml_elements...]
 286 #normal_elements = All other allowed HTML elements are normal elements.
 287
 288 special_elements = {
 289         # HTML:
 290         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 291         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 292         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 293         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 294         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 295         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 296         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 297         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 298         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 299         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 300         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
 301         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
 302         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
 303         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
 304         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
 305         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
 306         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
 307         wbr:NS_HTML, xmp:NS_HTML,
 308
 309         # MathML:
 310         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 311         'annotation-xml':NS_MATHML,
 312
 313         # SVG:
 314         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 315 }
 316
 317 formatting_elements = {
 318          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 319          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 320          u: true
 321 }
 322
 323 mathml_text_integration = {
 324         mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
 325 }
 326 is_mathml_text_integration_point = (el) ->
 327         return mathml_text_integration[el.name] = el.namespace
 328 is_html_integration = (el) -> # DON'T PASS A TOKEN
 329         if el.namespace is NS_MATHML and el.name is 'annotation-xml'
 330                 if el.attrs.encoding?
 331                         if el.attrs.encoding.toLowerCase() is 'text/html'
 332                                 return true
 333                         if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
 334                                 return true
 335                 return false
 336         if el.namespace is NS_SVG
 337                 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
 338                         return true
 339         return false
 340
 341 h_tags = {
 342         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
 343 }
 344
 345 # FIXME namespacify
 346 foster_parenting_targets = {
 347         table: true
 348         tbody: true
 349         tfoot: true
 350         thead: true
 351         tr: true
 352 }
 353
 354 # FIXME namespacify
 355 # all html I presume
 356 end_tag_implied = {
 357         dd: true
 358         dt: true
 359         li: true
 360         option: true
 361         optgroup: true
 362         p: true
 363         rb: true
 364         rp: true
 365         rt: true
 366         rtc: true
 367 }
 368
 369 el_is_special = (e) ->
 370         return special_elements[e.name] is e.namespace
 371
 372 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
 373 el_is_special_not_adp = (el) ->
 374         return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
 375
 376 svg_name_fixes = {
 377         altglyph: 'altGlyph'
 378         altglyphdef: 'altGlyphDef'
 379         altglyphitem: 'altGlyphItem'
 380         animatecolor: 'animateColor'
 381         animatemotion: 'animateMotion'
 382         animatetransform: 'animateTransform'
 383         clippath: 'clipPath'
 384         feblend: 'feBlend'
 385         fecolormatrix: 'feColorMatrix'
 386         fecomponenttransfer: 'feComponentTransfer'
 387         fecomposite: 'feComposite'
 388         feconvolvematrix: 'feConvolveMatrix'
 389         fediffuselighting: 'feDiffuseLighting'
 390         fedisplacementmap: 'feDisplacementMap'
 391         fedistantlight: 'feDistantLight'
 392         fedropshadow: 'feDropShadow'
 393         feflood: 'feFlood'
 394         fefunca: 'feFuncA'
 395         fefuncb: 'feFuncB'
 396         fefuncg: 'feFuncG'
 397         fefuncr: 'feFuncR'
 398         fegaussianblur: 'feGaussianBlur'
 399         feimage: 'feImage'
 400         femerge: 'feMerge'
 401         femergenode: 'feMergeNode'
 402         femorphology: 'feMorphology'
 403         feoffset: 'feOffset'
 404         fepointlight: 'fePointLight'
 405         fespecularlighting: 'feSpecularLighting'
 406         fespotlight: 'feSpotLight'
 407         fetile: 'feTile'
 408         feturbulence: 'feTurbulence'
 409         foreignobject: 'foreignObject'
 410         glyphref: 'glyphRef'
 411         lineargradient: 'linearGradient'
 412         radialgradient: 'radialGradient'
 413         textpath: 'textPath'
 414 }
 415 svg_attribute_fixes = {
 416         attributename: 'attributeName'
 417         attributetype: 'attributeType'
 418         basefrequency: 'baseFrequency'
 419         baseprofile: 'baseProfile'
 420         calcmode: 'calcMode'
 421         clippathunits: 'clipPathUnits'
 422         contentscripttype: 'contentScriptType'
 423         contentstyletype: 'contentStyleType'
 424         diffuseconstant: 'diffuseConstant'
 425         edgemode: 'edgeMode'
 426         externalresourcesrequired: 'externalResourcesRequired'
 427         filterres: 'filterRes'
 428         filterunits: 'filterUnits'
 429         glyphref: 'glyphRef'
 430         gradienttransform: 'gradientTransform'
 431         gradientunits: 'gradientUnits'
 432         kernelmatrix: 'kernelMatrix'
 433         kernelunitlength: 'kernelUnitLength'
 434         keypoints: 'keyPoints'
 435         keysplines: 'keySplines'
 436         keytimes: 'keyTimes'
 437         lengthadjust: 'lengthAdjust'
 438         limitingconeangle: 'limitingConeAngle'
 439         markerheight: 'markerHeight'
 440         markerunits: 'markerUnits'
 441         markerwidth: 'markerWidth'
 442         maskcontentunits: 'maskContentUnits'
 443         maskunits: 'maskUnits'
 444         numoctaves: 'numOctaves'
 445         pathlength: 'pathLength'
 446         patterncontentunits: 'patternContentUnits'
 447         patterntransform: 'patternTransform'
 448         patternunits: 'patternUnits'
 449         pointsatx: 'pointsAtX'
 450         pointsaty: 'pointsAtY'
 451         pointsatz: 'pointsAtZ'
 452         preservealpha: 'preserveAlpha'
 453         preserveaspectratio: 'preserveAspectRatio'
 454         primitiveunits: 'primitiveUnits'
 455         refx: 'refX'
 456         refy: 'refY'
 457         repeatcount: 'repeatCount'
 458         repeatdur: 'repeatDur'
 459         requiredextensions: 'requiredExtensions'
 460         requiredfeatures: 'requiredFeatures'
 461         specularconstant: 'specularConstant'
 462         specularexponent: 'specularExponent'
 463         spreadmethod: 'spreadMethod'
 464         startoffset: 'startOffset'
 465         stddeviation: 'stdDeviation'
 466         stitchtiles: 'stitchTiles'
 467         surfacescale: 'surfaceScale'
 468         systemlanguage: 'systemLanguage'
 469         tablevalues: 'tableValues'
 470         targetx: 'targetX'
 471         targety: 'targetY'
 472         textlength: 'textLength'
 473         viewbox: 'viewBox'
 474         viewtarget: 'viewTarget'
 475         xchannelselector: 'xChannelSelector'
 476         ychannelselector: 'yChannelSelector'
 477         zoomandpan: 'zoomAndPan'
 478 }
 479 adjust_mathml_attributes = (t) ->
 480         for a in t.attrs_a
 481                 if a[0] is 'definitionurl'
 482                         a[0] = 'definitionURL'
 483         return
 484 adjust_svg_attributes = (t) ->
 485         for a in t.attrs_a
 486                 if svg_attribute_fixes[a[0]]?
 487                         a[0] = svg_attribute_fixes[a[0]]
 488         return
 489 adjust_foreign_attributes = (t) ->
 490         # fixfull
 491         return
 492
 493 # decode_named_char_ref()
 494 #
 495 # The list of named character references is _huge_ so ask the browser to decode
 496 # for us instead of wasting bandwidth/space on including the table here.
 497 #
 498 # Pass without the "&" but with the ";" examples:
 499 #    for "&amp" pass "amp;"
 500 #    for "&#x2032" pass "x2032;"
 501 g_dncr = {
 502         cache: {}
 503         textarea: document.createElement('textarea')
 504 }
 505 # TODO test this in IE8
 506 decode_named_char_ref = (txt) ->
 507         txt = "&#{txt}"
 508         decoded = g_dncr.cache[txt]
 509         return decoded if decoded?
 510         g_dncr.textarea.innerHTML = txt
 511         decoded = g_dncr.textarea.value
 512         return null if decoded is txt
 513         return g_dncr.cache[txt] = decoded
 514
 515 parse_html = (args) ->
 516         txt = null
 517         cur = null # index of next char in txt to be parsed
 518         # declare doc and tokenizer variables so they're in scope below
 519         doc = null
 520         open_els = null # stack of open elements
 521         afe = null # active formatting elements
 522         template_ins_modes = null
 523         ins_mode = null
 524         original_ins_mode = null
 525         tok_state = null
 526         tok_cur_tag = null # partially parsed tag
 527         flag_scripting = null
 528         flag_frameset_ok = null
 529         flag_parsing = null
 530         flag_foster_parenting = null
 531         form_element_pointer = null
 532         temporary_buffer = null
 533         pending_table_character_tokens = null
 534         head_element_pointer = null
 535         flag_fragment_parsing = null
 536         context_element = null
 537
 538         stop_parsing = ->
 539                 flag_parsing = false
 540
 541         parse_error = ->
 542                 if args.error_cb?
 543                         args.error_cb cur
 544                 else
 545                         console.log "Parse error at character #{cur} of #{txt.length}"
 546
 547         afe_push = (new_el) ->
 548                 matches = 0
 549                 for el, i in afe
 550                         if el.name is new_el.name and el.namespace is new_el.namespace
 551                                 for k, v of el.attrs
 552                                         continue unless new_el.attrs[k] is v
 553                                 for k, v of new_el.attrs
 554                                         continue unless el.attrs[k] is v
 555                                 matches += 1
 556                                 if matches is 3
 557                                         afe.splice i, 1
 558                                         break
 559                 afe.unshift new_el
 560         afe_push_marker = ->
 561                 afe.unshift new_afe_marker()
 562
 563         # the functions below impliment the Tree Contstruction algorithm
 564         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 565
 566         # But first... the helpers
 567         template_tag_is_open = ->
 568                 for t in open_els
 569                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
 570                                 return true
 571                 return false
 572         is_in_scope_x = (tag_name, scope, namespace) ->
 573                 for t in open_els
 574                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 575                                 return true
 576                         if scope[t.name] is t.namespace
 577                                 return false
 578                 return false
 579         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 580                 for t in open_els
 581                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 582                                 return true
 583                         if scope[t.name] is t.namespace
 584                                 return false
 585                         if scope2[t.name] is t.namespace
 586                                 return false
 587                 return false
 588         standard_scopers = {
 589                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
 590                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
 591                 template: NS_HTML, mi: NS_MATHML,
 592
 593                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
 594                 'annotation-xml': NS_MATHML,
 595
 596                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
 597         }
 598         button_scopers = button: NS_HTML
 599         li_scopers = ol: NS_HTML, ul: NS_HTML
 600         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 601         is_in_scope = (tag_name, namespace = null) ->
 602                 return is_in_scope_x tag_name, standard_scopers, namespace
 603         is_in_button_scope = (tag_name, namespace = null) ->
 604                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 605         is_in_table_scope = (tag_name, namespace = null) ->
 606                 return is_in_scope_x tag_name, table_scopers, namespace
 607         # aka is_in_list_item_scope
 608         is_in_li_scope = (tag_name, namespace = null) ->
 609                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
 610         is_in_select_scope = (tag_name, namespace = null) ->
 611                 for t in open_els
 612                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 613                                 return true
 614                         if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
 615                                 return false
 616                 return false
 617         # this checks for a particular element, not by name
 618         el_is_in_scope = (el) ->
 619                 for t in open_els
 620                         if t is el
 621                                 return true
 622                         if standard_scopers[t.name] is t.namespace
 623                                 return false
 624                 return false
 625
 626         clear_to_table_stopers = {
 627                 'table': true
 628                 'template': true
 629                 'html': true
 630         }
 631         clear_stack_to_table_context = ->
 632                 loop
 633                         if clear_to_table_stopers[open_els[0].name]?
 634                                 break
 635                         open_els.shift()
 636                 return
 637         clear_to_table_body_stopers = {
 638                 'tbody': true
 639                 'tfoot': true
 640                 'thead': true
 641                 'template': true
 642                 'html': true
 643         }
 644         clear_stack_to_table_body_context = ->
 645                 loop
 646                         if clear_to_table_body_stopers[open_els[0].name]?
 647                                 break
 648                         open_els.shift()
 649                 return
 650         clear_to_table_row_stopers = {
 651                 'tr': true
 652                 'template': true
 653                 'html': true
 654         }
 655         clear_stack_to_table_row_context = ->
 656                 loop
 657                         if clear_to_table_row_stopers[open_els[0].name]?
 658                                 break
 659                         open_els.shift()
 660                 return
 661         clear_afe_to_marker = ->
 662                 loop
 663                         return unless afe.length > 0 # this happens in fragment case, ?spec error
 664                         el = afe.shift()
 665                         if el.type is TYPE_AFE_MARKER
 666                                 return
 667                 return
 668
 669         # 8.2.3.1 ...
 670         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
 671         reset_ins_mode = ->
 672                 # 1. Let last be false.
 673                 last = false
 674                 # 2. Let node be the last node in the stack of open elements.
 675                 node_i = 0
 676                 node = open_els[node_i]
 677                 # 3. Loop: If node is the first node in the stack of open elements,
 678                 # then set last to true, and, if the parser was originally created as
 679                 # part of the HTML fragment parsing algorithm (fragment case) set node
 680                 # to the context element.
 681                 loop
 682                         if node_i is open_els.length - 1
 683                                 last = true
 684                                 # fixfull (fragment case)
 685
 686                         # 4. If node is a select element, run these substeps:
 687                         if node.name is 'select'
 688                                 # 1. If last is true, jump to the step below labeled done.
 689                                 unless last
 690                                         # 2. Let ancestor be node.
 691                                         ancestor_i = node_i
 692                                         ancestor = node
 693                                         # 3. Loop: If ancestor is the first node in the stack of
 694                                         # open elements, jump to the step below labeled done.
 695                                         loop
 696                                                 if ancestor_i is open_els.length - 1
 697                                                         break
 698                                                 # 4. Let ancestor be the node before ancestor in the stack
 699                                                 # of open elements.
 700                                                 ancestor_i += 1
 701                                                 ancestor = open_els[ancestor_i]
 702                                                 # 5. If ancestor is a template node, jump to the step below
 703                                                 # labeled done.
 704                                                 if ancestor.name is 'template'
 705                                                         break
 706                                                 # 6. If ancestor is a table node, switch the insertion mode
 707                                                 # to "in select in table" and abort these steps.
 708                                                 if ancestor.name is 'table'
 709                                                         ins_mode = ins_mode_in_select_in_table
 710                                                         return
 711                                                 # 7. Jump back to the step labeled loop.
 712                                 # 8. Done: Switch the insertion mode to "in select" and abort
 713                                 # these steps.
 714                                 ins_mode = ins_mode_in_select
 715                                 return
 716                         # 5. If node is a td or th element and last is false, then switch
 717                         # the insertion mode to "in cell" and abort these steps.
 718                         if (node.name is 'td' or node.name is 'th') and last is false
 719                                 ins_mode = ins_mode_in_cell
 720                                 return
 721                         # 6. If node is a tr element, then switch the insertion mode to "in
 722                         # row" and abort these steps.
 723                         if node.name is 'tr'
 724                                 ins_mode = ins_mode_in_row
 725                                 return
 726                         # 7. If node is a tbody, thead, or tfoot element, then switch the
 727                         # insertion mode to "in table body" and abort these steps.
 728                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
 729                                 ins_mode = ins_mode_in_table_body
 730                                 return
 731                         # 8. If node is a caption element, then switch the insertion mode
 732                         # to "in caption" and abort these steps.
 733                         if node.name is 'caption'
 734                                 ins_mode = ins_mode_in_caption
 735                                 return
 736                         # 9. If node is a colgroup element, then switch the insertion mode
 737                         # to "in column group" and abort these steps.
 738                         if node.name is 'colgroup'
 739                                 ins_mode = ins_mode_in_column_group
 740                                 return
 741                         # 10. If node is a table element, then switch the insertion mode to
 742                         # "in table" and abort these steps.
 743                         if node.name is 'table'
 744                                 ins_mode = ins_mode_in_table
 745                                 return
 746                         # 11. If node is a template element, then switch the insertion mode
 747                         # to the current template insertion mode and abort these steps.
 748                         # fixfull (template insertion mode stack)
 749
 750                         # 12. If node is a head element and last is true, then switch the
 751                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
 752                         # these steps. (fragment case)
 753                         if node.name is 'head' and last
 754                                 ins_mode = ins_mode_in_body
 755                                 return
 756                         # 13. If node is a head element and last is false, then switch the
 757                         # insertion mode to "in head" and abort these steps.
 758                         if node.name is 'head' and last is false
 759                                 ins_mode = ins_mode_in_head
 760                                 return
 761                         # 14. If node is a body element, then switch the insertion mode to
 762                         # "in body" and abort these steps.
 763                         if node.name is 'body'
 764                                 ins_mode = ins_mode_in_body
 765                                 return
 766                         # 15. If node is a frameset element, then switch the insertion mode
 767                         # to "in frameset" and abort these steps. (fragment case)
 768                         if node.name is 'frameset'
 769                                 ins_mode = ins_mode_in_frameset
 770                                 return
 771                         # 16. If node is an html element, run these substeps:
 772                         if node.name is 'html'
 773                                 # 1. If the head element pointer is null, switch the insertion
 774                                 # mode to "before head" and abort these steps. (fragment case)
 775                                 if head_element_pointer is null
 776                                         ins_mode = ins_mode_before_head
 777                                 else
 778                                         # 2. Otherwise, the head element pointer is not null,
 779                                         # switch the insertion mode to "after head" and abort these
 780                                         # steps.
 781                                         ins_mode = ins_mode_after_head
 782                                 return
 783                         # 17. If last is true, then switch the insertion mode to "in body"
 784                         # and abort these steps. (fragment case)
 785                         if last
 786                                 ins_mode = ins_mode_in_body
 787                                 return
 788                         # 18. Let node now be the node before node in the stack of open
 789                         # elements.
 790                         node_i += 1
 791                         node = open_els[node_i]
 792                         # 19. Return to the step labeled loop.
 793
 794         # 8.2.3.2
 795
 796         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
 797         adjusted_current_node = ->
 798                 if open_els.length is 1 and flag_fragment_parsing
 799                         return context_element
 800                 return open_els[0]
 801
 802         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 803         # this implementation is structured (mostly) as described at the link above.
 804         # capitalized comments are the "labels" described at the link above.
 805         reconstruct_afe = ->
 806                 return if afe.length is 0
 807                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
 808                         return
 809                 # Rewind
 810                 i = 0
 811                 loop
 812                         if i is afe.length - 1
 813                                 break
 814                         i += 1
 815                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
 816                                 i -= 1 # Advance
 817                                 break
 818                 # Create
 819                 loop
 820                         el = insert_html_element afe[i].token
 821                         afe[i] = el
 822                         break if i is 0
 823                         i -= 1 # Advance
 824
 825         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 826         # adoption agency algorithm
 827         # overview here:
 828         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 829         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 830         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
 831         adoption_agency = (subject) ->
 832                 debug_log "adoption_agency()"
 833                 debug_log "tree: #{serialize_els doc.children, false, true}"
 834                 debug_log "open_els: #{serialize_els open_els, true, true}"
 835                 debug_log "afe: #{serialize_els afe, true, true}"
 836                 if open_els[0].name is subject
 837                         el = open_els[0]
 838                         open_els.shift()
 839                         # remove it from the list of active formatting elements (if found)
 840                         for t, i in afe
 841                                 if t is el
 842                                         afe.splice i, 1
 843                                         break
 844                         debug_log "aaa: starting off with subject on top of stack, exiting"
 845                         return
 846                 outer = 0
 847                 loop
 848                         if outer >= 8
 849                                 return
 850                         outer += 1
 851                         # 5. Let formatting element be the last element in the list of
 852                         # active formatting elements that: is between the end of the list
 853                         # and the last scope marker in the list, if any, or the start of
 854                         # the list otherwise, and  has the tag name subject.
 855                         fe = null
 856                         for t, fe_of_afe in afe
 857                                 if t.type is TYPE_AFE_MARKER
 858                                         break
 859                                 if t.name is subject
 860                                         fe = t
 861                                         break
 862                         # If there is no such element, then abort these steps and instead
 863                         # act as described in the "any other end tag" entry above.
 864                         if fe is null
 865                                 debug_log "aaa: fe not found in afe"
 866                                 in_body_any_other_end_tag subject
 867                                 return
 868                         # 6. If formatting element is not in the stack of open elements,
 869                         # then this is a parse error; remove the element from the list, and
 870                         # abort these steps.
 871                         in_open_els = false
 872                         for t, fe_of_open_els in open_els
 873                                 if t is fe
 874                                         in_open_els = true
 875                                         break
 876                         unless in_open_els
 877                                 debug_log "aaa: fe not found in open_els"
 878                                 parse_error()
 879                                 # "remove it from the list" must mean afe, since it's not in open_els
 880                                 afe.splice fe_of_afe, 1
 881                                 return
 882                         # 7. If formatting element is in the stack of open elements, but
 883                         # the element is not in scope, then this is a parse error; abort
 884                         # these steps.
 885                         unless el_is_in_scope fe
 886                                 debug_log "aaa: fe not in scope"
 887                                 parse_error()
 888                                 return
 889                         # 8. If formatting element is not the current node, this is a parse
 890                         # error. (But do not abort these steps.)
 891                         unless open_els[0] is fe
 892                                 parse_error()
 893                                 # continue
 894                         # 9. Let furthest block be the topmost node in the stack of open
 895                         # elements that is lower in the stack than formatting element, and
 896                         # is an element in the special category. There might not be one.
 897                         fb = null
 898                         fb_of_open_els = null
 899                         for t, i in open_els
 900                                 if t is fe
 901                                         break
 902                                 if el_is_special t
 903                                         fb = t
 904                                         fb_of_open_els = i
 905                                         # and continue, to see if there's one that's more "topmost"
 906                         # 10. If there is no furthest block, then the UA must first pop all
 907                         # the nodes from the bottom of the stack of open elements, from the
 908                         # current node up to and including formatting element, then remove
 909                         # formatting element from the list of active formatting elements,
 910                         # and finally abort these steps.
 911                         if fb is null
 912                                 debug_log "aaa: no fb"
 913                                 loop
 914                                         t = open_els.shift()
 915                                         if t is fe
 916                                                 afe.splice fe_of_afe, 1
 917                                                 return
 918                         # 11. Let common ancestor be the element immediately above
 919                         # formatting element in the stack of open elements.
 920                         ca = open_els[fe_of_open_els + 1] # common ancestor
 921
 922                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
 923                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
 924                         bookmark = new_aaa_bookmark()
 925                         for t, i in afe
 926                                 if t is fe
 927                                         afe.splice i, 0, bookmark
 928                                         break
 929                         node = last_node = fb
 930                         inner = 0
 931                         loop
 932                                 inner += 1
 933                                 # 3. Let node be the element immediately above node in the
 934                                 # stack of open elements, or if node is no longer in the stack
 935                                 # of open elements (e.g. because it got removed by this
 936                                 # algorithm), the element that was immediately above node in
 937                                 # the stack of open elements before node was removed.
 938                                 node_next = null
 939                                 for t, i in open_els
 940                                         if t is node
 941                                                 node_next = open_els[i + 1]
 942                                                 break
 943                                 node = node_next ? node_above
 944                                 debug_log "inner loop #{inner}"
 945                                 debug_log "tree: #{serialize_els doc.children, false, true}"
 946                                 debug_log "open_els: #{serialize_els open_els, true, true}"
 947                                 debug_log "afe: #{serialize_els afe, true, true}"
 948                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 949                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 950                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 951                                 debug_log "node: #{node.serialize true, true}"
 952                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
 953
 954                                 # 4. If node is formatting element, then go to the next step in
 955                                 # the overall algorithm.
 956                                 if node is fe
 957                                         break
 958                                 debug_log "the meat"
 959                                 # 5. If inner loop counter is greater than three and node is in
 960                                 # the list of active formatting elements, then remove node from
 961                                 # the list of active formatting elements.
 962                                 node_in_afe = false
 963                                 for t, i in afe
 964                                         if t is node
 965                                                 if inner > 3
 966                                                         afe.splice i, 1
 967                                                         debug_log "max out inner"
 968                                                 else
 969                                                         node_in_afe = true
 970                                                         debug_log "in afe"
 971                                                 break
 972                                 # 6. If node is not in the list of active formatting elements,
 973                                 # then remove node from the stack of open elements and then go
 974                                 # back to the step labeled inner loop.
 975                                 unless node_in_afe
 976                                         debug_log "not in afe"
 977                                         for t, i in open_els
 978                                                 if t is node
 979                                                         node_above = open_els[i + 1]
 980                                                         open_els.splice i, 1
 981                                                         break
 982                                         continue
 983                                 debug_log "the bones"
 984                                 # 7. create an element for the token for which the element node
 985                                 # was created, in the HTML namespace, with common ancestor as
 986                                 # the intended parent; replace the entry for node in the list
 987                                 # of active formatting elements with an entry for the new
 988                                 # element, replace the entry for node in the stack of open
 989                                 # elements with an entry for the new element, and let node be
 990                                 # the new element.
 991                                 new_node = token_to_element node.token, NS_HTML, ca
 992                                 for t, i in afe
 993                                         if t is node
 994                                                 afe[i] = new_node
 995                                                 debug_log "replaced in afe"
 996                                                 break
 997                                 for t, i in open_els
 998                                         if t is node
 999                                                 node_above = open_els[i + 1]
1000                                                 open_els[i] = new_node
1001                                                 debug_log "replaced in open_els"
1002                                                 break
1003                                 node = new_node
1004                                 # 8. If last node is furthest block, then move the
1005                                 # aforementioned bookmark to be immediately after the new node
1006                                 # in the list of active formatting elements.
1007                                 if last_node is fb
1008                                         for t, i in afe
1009                                                 if t is bookmark
1010                                                         afe.splice i, 1
1011                                                         debug_log "removed bookmark"
1012                                                         break
1013                                         for t, i in afe
1014                                                 if t is node
1015                                                         # "after" means lower
1016                                                         afe.splice i, 0, bookmark # "after as <-
1017                                                         debug_log "placed bookmark after node"
1018                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1019                                                         break
1020                                 # 9. Insert last node into node, first removing it from its
1021                                 # previous parent node if any.
1022                                 if last_node.parent?
1023                                         debug_log "last_node has parent"
1024                                         for c, i in last_node.parent.children
1025                                                 if c is last_node
1026                                                         debug_log "removing last_node from parent"
1027                                                         last_node.parent.children.splice i, 1
1028                                                         break
1029                                 node.children.push last_node
1030                                 last_node.parent = node
1031                                 # 10. Let last node be node.
1032                                 last_node = node
1033                                 debug_log "at last"
1034                                 # 11. Return to the step labeled inner loop.
1035                         # 14. Insert whatever last node ended up being in the previous step
1036                         # at the appropriate place for inserting a node, but using common
1037                         # ancestor as the override target.
1038
1039                         # In the case where fe is immediately followed by fb:
1040                         #   * inner loop exits out early (node==fe)
1041                         #   * last_node is fb
1042                         #   * last_node is still in the tree (not a duplicate)
1043                         if last_node.parent?
1044                                 debug_log "FEFIRST? last_node has parent"
1045                                 for c, i in last_node.parent.children
1046                                         if c is last_node
1047                                                 debug_log "removing last_node from parent"
1048                                                 last_node.parent.children.splice i, 1
1049                                                 break
1050
1051                         debug_log "after aaa inner loop"
1052                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1053                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1054                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1055                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1056                         debug_log "tree: #{serialize_els doc.children, false, true}"
1057
1058                         debug_log "insert"
1059
1060
1061                         # can't use standard insert token thing, because it's already in
1062                         # open_els and must stay at it's current position in open_els
1063                         dest = adjusted_insertion_location ca
1064                         dest[0].children.splice dest[1], 0, last_node
1065                         last_node.parent = dest[0]
1066
1067
1068                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1069                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1070                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1071                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1072                         debug_log "tree: #{serialize_els doc.children, false, true}"
1073
1074                         # 15. Create an element for the token for which formatting element
1075                         # was created, in the HTML namespace, with furthest block as the
1076                         # intended parent.
1077                         new_element = token_to_element fe.token, NS_HTML, fb
1078                         # 16. Take all of the child nodes of furthest block and append them
1079                         # to the element created in the last step.
1080                         while fb.children.length
1081                                 t = fb.children.shift()
1082                                 t.parent = new_element
1083                                 new_element.children.push t
1084                         # 17. Append that new element to furthest block.
1085                         new_element.parent = fb
1086                         fb.children.push new_element
1087                         # 18. Remove formatting element from the list of active formatting
1088                         # elements, and insert the new element into the list of active
1089                         # formatting elements at the position of the aforementioned
1090                         # bookmark.
1091                         for t, i in afe
1092                                 if t is fe
1093                                         afe.splice i, 1
1094                                         break
1095                         for t, i in afe
1096                                 if t is bookmark
1097                                         afe[i] = new_element
1098                                         break
1099                         # 19. Remove formatting element from the stack of open elements,
1100                         # and insert the new element into the stack of open elements
1101                         # immediately below the position of furthest block in that stack.
1102                         for t, i in open_els
1103                                 if t is fe
1104                                         open_els.splice i, 1
1105                                         break
1106                         for t, i in open_els
1107                                 if t is fb
1108                                         open_els.splice i, 0, new_element
1109                                         break
1110                         # 20. Jump back to the step labeled outer loop.
1111                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1112                         debug_log "tree: #{serialize_els doc.children, false, true}"
1113                         debug_log "open_els: #{serialize_els open_els, true, true}"
1114                         debug_log "afe: #{serialize_els afe, true, true}"
1115                 debug_log "AAA DONE"
1116
1117         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1118         close_p_element = ->
1119                 generate_implied_end_tags 'p' # arg is exception
1120                 if open_els[0].name isnt 'p'
1121                         parse_error()
1122                 while open_els.length > 1 # just in case
1123                         el = open_els.shift()
1124                         if el.name is 'p'
1125                                 return
1126         close_p_if_in_button_scope = ->
1127                 if is_in_button_scope 'p'
1128                         close_p_element()
1129
1130         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1131         # aka insert_a_character = (t) ->
1132         insert_character = (t) ->
1133                 dest = adjusted_insertion_location()
1134                 # fixfull check for Document node
1135                 if dest[1] > 0
1136                         prev = dest[0].children[dest[1] - 1]
1137                         if prev.type is TYPE_TEXT
1138                                 prev.text += t.text
1139                                 return
1140                 dest[0].children.splice dest[1], 0, t
1141
1142
1143         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1144         process_token = (t) ->
1145                 acn = adjusted_current_node()
1146                 unless acn?
1147                         ins_mode t
1148                         return
1149                 if acn.namespace is NS_HTML
1150                         ins_mode t
1151                         return
1152                 if is_mathml_text_integration_point(acn)
1153                         if t.type is TYPE_START_TAG and (t.name is 'mglyph' or t.name is 'malignmark')
1154                                 ins_mode t
1155                                 return
1156                         if t.type is TYPE_TEXT
1157                                 ins_mode t
1158                                 return
1159                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1160                         ins_mode t
1161                         return
1162                 if is_html_integration acn
1163                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1164                                 ins_mode t
1165                                 return
1166                 if t.type is TYPE_EOF
1167                         ins_mode t
1168                         return
1169                 in_foreign_content t
1170                 return
1171
1172         # 8.2.5.1
1173         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1174         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1175         adjusted_insertion_location = (override_target = null) ->
1176                 # 1. If there was an override target specified, then let target be the
1177                 # override target.
1178                 if override_target?
1179                         target = override_target
1180                 else # Otherwise, let target be the current node.
1181                         target = open_els[0]
1182                 # 2. Determine the adjusted insertion location using the first matching
1183                 # steps from the following list:
1184                 #
1185                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1186                 # thead, or tr element Foster parenting happens when content is
1187                 # misnested in tables.
1188                 if flag_foster_parenting and foster_parenting_targets[target.name]
1189                         loop # once. this is here so we can ``break`` to "abort these substeps"
1190                                 # 1. Let last template be the last template element in the
1191                                 # stack of open elements, if any.
1192                                 last_template = null
1193                                 last_template_i = null
1194                                 for el, i in open_els
1195                                         if el.name is 'template' and el.namespace is NS_HTML
1196                                                 last_template = el
1197                                                 last_template_i = i
1198                                                 break
1199                                 # 2. Let last table be the last table element in the stack of
1200                                 # open elements, if any.
1201                                 last_table = null
1202                                 last_table_i
1203                                 for el, i in open_els
1204                                         if el.name is 'table' and el.namespace is NS_HTML
1205                                                 last_table = el
1206                                                 last_table_i = i
1207                                                 break
1208                                 # 3. If there is a last template and either there is no last
1209                                 # table, or there is one, but last template is lower (more
1210                                 # recently added) than last table in the stack of open
1211                                 # elements, then: let adjusted insertion location be inside
1212                                 # last template's template contents, after its last child (if
1213                                 # any), and abort these substeps.
1214                                 if last_template and (last_table is null or last_template_i < last_table_i)
1215                                         target = last_template # fixfull should be it's contents
1216                                         target_i = target.children.length
1217                                         break
1218                                 # 4. If there is no last table, then let adjusted insertion
1219                                 # location be inside the first element in the stack of open
1220                                 # elements (the html element), after its last child (if any),
1221                                 # and abort these substeps. (fragment case)
1222                                 if last_table is null
1223                                         # this is odd
1224                                         target = open_els[open_els.length - 1]
1225                                         target_i = target.children.length
1226                                         break
1227                                 # 5. If last table has a parent element, then let adjusted
1228                                 # insertion location be inside last table's parent element,
1229                                 # immediately before last table, and abort these substeps.
1230                                 if last_table.parent?
1231                                         for c, i in last_table.parent.children
1232                                                 if c is last_table
1233                                                         target = last_table.parent
1234                                                         target_i = i
1235                                                         break
1236                                         break
1237                                 # 6. Let previous element be the element immediately above last
1238                                 # table in the stack of open elements.
1239                                 #
1240                                 # huh? how could it not have a parent?
1241                                 previous_element = open_els[last_table_i + 1]
1242                                 # 7. Let adjusted insertion location be inside previous
1243                                 # element, after its last child (if any).
1244                                 target = previous_element
1245                                 target_i = target.children.length
1246                                 # Note: These steps are involved in part because it's possible
1247                                 # for elements, the table element in this case in particular,
1248                                 # to have been moved by a script around in the DOM, or indeed
1249                                 # removed from the DOM entirely, after the element was inserted
1250                                 # by the parser.
1251                                 break # don't really loop
1252                 else
1253                         # Otherwise Let adjusted insertion location be inside target, after
1254                         # its last child (if any).
1255                         target_i = target.children.length
1256
1257                 # 3. If the adjusted insertion location is inside a template element,
1258                 # let it instead be inside the template element's template contents,
1259                 # after its last child (if any).
1260                 # fixfull (template)
1261
1262                 # 4. Return the adjusted insertion location.
1263                 return [target, target_i]
1264
1265         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1266         # aka create_an_element_for_token
1267         token_to_element = (t, namespace, intended_parent) ->
1268                 # convert attributes into a hash
1269                 attrs = {}
1270                 for a in t.attrs_a
1271                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1272                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1273
1274                 # TODO 2. If the newly created element has an xmlns attribute in the
1275                 # XMLNS namespace whose value is not exactly the same as the element's
1276                 # namespace, that is a parse error. Similarly, if the newly created
1277                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1278                 # value is not the XLink Namespace, that is a parse error.
1279
1280                 # fixfull: the spec says stuff about form pointers and ownerDocument
1281
1282                 return el
1283
1284         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1285         insert_foreign_element = (token, namespace) ->
1286                 ail = adjusted_insertion_location()
1287                 ail_el = ail[0]
1288                 ail_i = ail[1]
1289                 el = token_to_element token, namespace, ail_el
1290                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1291                 el.parent = ail_el
1292                 ail_el.children.splice ail_i, 0, el
1293                 open_els.unshift el
1294                 return el
1295         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1296         insert_html_element = (token) ->
1297                 insert_foreign_element token, NS_HTML
1298
1299         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1300         # position should be [node, index_within_children]
1301         insert_comment = (t, position = null) ->
1302                 position ?= adjusted_insertion_location()
1303                 position[0].children.splice position[1], 0, t
1304
1305         # 8.2.5.2
1306         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1307         parse_generic_raw_text = (t) ->
1308                 insert_html_element t
1309                 tok_state = tok_state_rawtext
1310                 original_ins_mode = ins_mode
1311                 ins_mode = ins_mode_text
1312         parse_generic_rcdata_text = (t) ->
1313                 insert_html_element t
1314                 tok_state = tok_state_rcdata
1315                 original_ins_mode = ins_mode
1316                 ins_mode = ins_mode_text
1317
1318         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1319         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1320         generate_implied_end_tags = (except = null) ->
1321                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1322                         open_els.shift()
1323
1324         # 8.2.5.4 The rules for parsing tokens in HTML content
1325         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1326
1327         # 8.2.5.4.1 The "initial" insertion mode
1328         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1329         ins_mode_initial = (t) ->
1330                 if is_space_tok t
1331                         return
1332                 if t.type is TYPE_COMMENT
1333                         # ?fixfull
1334                         doc.children.push t
1335                         return
1336                 if t.type is TYPE_DOCTYPE
1337                         # FIXME check identifiers, set quirks, etc
1338                         # fixfull
1339                         doc.children.push t
1340                         ins_mode = ins_mode_before_html
1341                         return
1342                 # Anything else
1343                 #fixfull (iframe, quirks)
1344                 ins_mode = ins_mode_before_html
1345                 process_token t
1346                 return
1347
1348         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1349         ins_mode_before_html = (t) ->
1350                 if t.type is TYPE_DOCTYPE
1351                         parse_error()
1352                         return
1353                 if t.type is TYPE_COMMENT
1354                         doc.children.push t
1355                         return
1356                 if is_space_tok t
1357                         return
1358                 if t.type is TYPE_START_TAG and t.name is 'html'
1359                         el = token_to_element t, NS_HTML, doc
1360                         doc.children.push el
1361                         open_els.unshift(el)
1362                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1363                         ins_mode = ins_mode_before_head
1364                         return
1365                 if t.type is TYPE_END_TAG
1366                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1367                                 # fall through to "anything else"
1368                         else
1369                                 parse_error()
1370                                 return
1371                 # Anything else
1372                 html_tok = new_open_tag 'html'
1373                 el = token_to_element html_tok, NS_HTML, doc
1374                 doc.children.push el
1375                 open_els.unshift el
1376                 # ?fixfull browsing context
1377                 ins_mode = ins_mode_before_head
1378                 process_token t
1379                 return
1380
1381         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1382         ins_mode_before_head = (t) ->
1383                 if is_space_tok t
1384                         return
1385                 if t.type is TYPE_COMMENT
1386                         insert_comment t
1387                         return
1388                 if t.type is TYPE_DOCTYPE
1389                         parse_error()
1390                         return
1391                 if t.type is TYPE_START_TAG and t.name is 'html'
1392                         ins_mode_in_body t
1393                         return
1394                 if t.type is TYPE_START_TAG and t.name is 'head'
1395                         el = insert_html_element t
1396                         head_element_pointer = el
1397                         ins_mode = ins_mode_in_head
1398                 if t.type is TYPE_END_TAG
1399                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1400                                 # fall through to Anything else below
1401                         else
1402                                 parse_error()
1403                                 return
1404                 # Anything else
1405                 head_tok = new_open_tag 'head'
1406                 el = insert_html_element head_tok
1407                 head_element_pointer = el
1408                 ins_mode = ins_mode_in_head
1409                 process_token t
1410
1411         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1412         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1413                 open_els.shift() # spec says this will be a 'head' node
1414                 ins_mode = ins_mode_after_head
1415                 process_token t
1416         ins_mode_in_head = (t) ->
1417                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1418                         insert_character t
1419                         return
1420                 if t.type is TYPE_COMMENT
1421                         insert_comment t
1422                         return
1423                 if t.type is TYPE_DOCTYPE
1424                         parse_error()
1425                         return
1426                 if t.type is TYPE_START_TAG and t.name is 'html'
1427                         ins_mode_in_body t
1428                         return
1429                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1430                         el = insert_html_element t
1431                         open_els.shift()
1432                         t.acknowledge_self_closing()
1433                         return
1434                 if t.type is TYPE_START_TAG and t.name is 'meta'
1435                         el = insert_html_element t
1436                         open_els.shift()
1437                         t.acknowledge_self_closing()
1438                         # fixfull encoding stuff
1439                         return
1440                 if t.type is TYPE_START_TAG and t.name is 'title'
1441                         parse_generic_rcdata_text t
1442                         return
1443                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1444                         parse_generic_raw_text t
1445                         return
1446                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1447                         insert_html_element t
1448                         ins_mode = ins_mode_in_head_noscript
1449                         return
1450                 if t.type is TYPE_START_TAG and t.name is 'script'
1451                         ail = adjusted_insertion_location()
1452                         el = token_to_element t, NS_HTML, ail
1453                         el.flag 'parser-inserted', true
1454                         # fixfull frament case
1455                         ail[0].children.splice ail[1], 0, el
1456                         open_els.unshift el
1457                         tok_state = tok_state_script_data
1458                         original_ins_mode = ins_mode # make sure orig... is defined
1459                         ins_mode = ins_mode_text
1460                         return
1461                 if t.type is TYPE_END_TAG and t.name is 'head'
1462                         open_els.shift() # will be a head element... spec says so
1463                         ins_mode = ins_mode_after_head
1464                         return
1465                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1466                         ins_mode_in_head_else t
1467                         return
1468                 if t.type is TYPE_START_TAG and t.name is 'template'
1469                         insert_html_element t
1470                         afe_push_marker()
1471                         flag_frameset_ok = false
1472                         ins_mode = ins_mode_in_template
1473                         template_ins_modes.unshift ins_mode_in_template
1474                         return
1475                 if t.type is TYPE_END_TAG and t.name is 'template'
1476                         if template_tag_is_open()
1477                                 generate_implied_end_tags
1478                                 if open_els[0].name isnt 'template'
1479                                         parse_error()
1480                                 loop
1481                                         el = open_els.shift()
1482                                         if el.name is 'template'
1483                                                 break
1484                                 clear_afe_to_marker()
1485                                 template_ins_modes.shift()
1486                                 reset_ins_mode()
1487                         else
1488                                 parse_error()
1489                         return
1490                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1491                         parse_error()
1492                         return
1493                 ins_mode_in_head_else t
1494
1495         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1496         ins_mode_in_head_noscript_else = (t) ->
1497                 parse_error()
1498                 open_els.shift()
1499                 ins_mode = ins_mode_in_head
1500                 process_token t
1501         ins_mode_in_head_noscript = (t) ->
1502                 if t.type is TYPE_DOCTYPE
1503                         parse_error()
1504                         return
1505                 if t.type is TYPE_START_TAG
1506                         ins_mode_in_body t
1507                         return
1508                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1509                         open_els.shift()
1510                         ins_mode = ins_mode_in_head
1511                         return
1512                 if (t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\u000a" or t.text is "\u000c" or t.text is "\u000d" or t.text is ' ')) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1513                         ins_mode_in_head t
1514                         return
1515                 if t.type is TYPE_END_TAG and t.name is 'br'
1516                         ins_mode_in_head_noscript_else t
1517                         return
1518                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1519                         parse_error()
1520                         return
1521                 # Anything else
1522                 ins_mode_in_head_noscript_else t
1523                 return
1524
1525
1526
1527         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1528         ins_mode_after_head_else = (t) ->
1529                 body_tok = new_open_tag 'body'
1530                 insert_html_element body_tok
1531                 ins_mode = ins_mode_in_body
1532                 process_token t
1533                 return
1534         ins_mode_after_head = (t) ->
1535                 if is_space_tok t
1536                         insert_character t
1537                         return
1538                 if t.type is TYPE_COMMENT
1539                         insert_comment t
1540                         return
1541                 if t.type is TYPE_DOCTYPE
1542                         parse_error()
1543                         return
1544                 if t.type is TYPE_START_TAG and t.name is 'html'
1545                         ins_mode_in_body t
1546                         return
1547                 if t.type is TYPE_START_TAG and t.name is 'body'
1548                         insert_html_element t
1549                         flag_frameset_ok = false
1550                         ins_mode = ins_mode_in_body
1551                         return
1552                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1553                         insert_html_element t
1554                         ins_mode = ins_mode_in_frameset
1555                         return
1556                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1557                         parse_error()
1558                         open_els.unshift head_element_pointer
1559                         ins_mode_in_head t
1560                         for el, i of open_els
1561                                 if el is head_element_pointer
1562                                         open_els.splice i, 1
1563                                         return
1564                         console.log "warning: 23904 couldn't find head element in open_els"
1565                         return
1566                 if t.type is TYPE_END_TAG and t.name is 'template'
1567                         ins_mode_in_head t
1568                         return
1569                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1570                         ins_mode_after_head_else t
1571                         return
1572                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1573                         parse_error()
1574                         return
1575                 # Anything else
1576                 ins_mode_after_head_else t
1577
1578         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1579         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1580                 for el, i in open_els
1581                         if el.namespace is NS_HTML and el.name is name
1582                                 generate_implied_end_tags name # arg is exception
1583                                 parse_error() unless i is 0
1584                                 while i >= 0
1585                                         open_els.shift()
1586                                         i -= 1
1587                                 return
1588                         if special_elements[el.name] is el.namespace
1589                                 parse_error()
1590                                 return
1591                 return
1592         ins_mode_in_body = (t) ->
1593                 if t.type is TYPE_TEXT and t.text is "\u0000"
1594                         parse_error()
1595                         return
1596                 if is_space_tok t
1597                         reconstruct_afe()
1598                         insert_character t
1599                         return
1600                 if t.type is TYPE_TEXT
1601                         reconstruct_afe()
1602                         insert_character t
1603                         flag_frameset_ok = false
1604                         return
1605                 if t.type is TYPE_COMMENT
1606                         insert_comment t
1607                         return
1608                 if t.type is TYPE_DOCTYPE
1609                         parse_error()
1610                         return
1611                 if t.type is TYPE_START_TAG and t.name is 'html'
1612                         parse_error()
1613                         return if template_tag_is_open()
1614                         root_attrs = open_els[open_els.length - 1].attrs
1615                         for a of t.attrs_a
1616                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1617                         return
1618
1619                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1620                         ins_mode_in_head t
1621                         return
1622                 if t.type is TYPE_START_TAG and t.name is 'body'
1623                         parse_error()
1624                         return if open_els.length < 2
1625                         second = open_els[open_els.length - 2]
1626                         return unless second.ns is NS_HTML
1627                         return unless second.name is 'body'
1628                         return if template_tag_is_open()
1629                         frameset_ok_flag = false
1630                         for a of t.attrs_a
1631                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1632                         return
1633                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1634                         parse_error()
1635                         return if open_els.length < 2
1636                         second_i = open_els.length - 2
1637                         second = open_els[second_i]
1638                         return unless second.ns is NS_HTML
1639                         return unless second.name is 'body'
1640                         flag_frameset_ok = false
1641                         if second.parent?
1642                                 for el, i in second.parent.children
1643                                         if el is second
1644                                                 second.parent.children.splice i, 1
1645                                                 break
1646                         open_els.splice second_i, 1
1647                         # pop everything except the "root html element"
1648                         while open_els.length > 1
1649                                 open_els.shift()
1650                         insert_html_element t
1651                         ins_mode = ins_mode_in_frameset
1652                         return
1653                 if t.type is TYPE_EOF
1654                         ok_tags = {
1655                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1656                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1657                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1658                         }
1659                         for el in open_els
1660                                 unless ok_tags[t.name] is el.namespace
1661                                         parse_error()
1662                                         break
1663                         if template_ins_modes.length > 0
1664                                 ins_mode_in_template t
1665                         else
1666                                 stop_parsing()
1667                         return
1668                 if t.type is TYPE_END_TAG and t.name is 'body'
1669                         unless is_in_scope 'body'
1670                                 parse_error()
1671                                 return
1672                         ok_tags = {
1673                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1674                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1675                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1676                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1677                                 html:NS_HTML
1678                         }
1679                         for el in open_els
1680                                 unless ok_tags[t.name] is el.namespace
1681                                         parse_error()
1682                                         break
1683                         ins_mode = ins_mode_after_body
1684                         return
1685                 if t.type is TYPE_END_TAG and t.name is 'html'
1686                         unless is_in_scope 'body'
1687                                 parse_error()
1688                                 return
1689                         ok_tags = {
1690                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1691                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1692                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1693                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1694                                 html:NS_HTML
1695                         }
1696                         for el in open_els
1697                                 unless ok_tags[t.name] is el.namespace
1698                                         parse_error()
1699                                         break
1700                         ins_mode = ins_mode_after_body
1701                         process_token t
1702                         return
1703                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1704                         close_p_if_in_button_scope()
1705                         insert_html_element t
1706                         return
1707                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1708                         close_p_if_in_button_scope()
1709                         if h_tags[open_els[0]] is NS_HTML
1710                                 parse_error()
1711                                 open_els.shift()
1712                         insert_html_element t
1713                         return
1714                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1715                         close_p_if_in_button_scope()
1716                         insert_html_element t
1717                         # spec: If the next token is a "LF" (U+000A) character token, then
1718                         # ignore that token and move on to the next one. (Newlines at the
1719                         # start of pre blocks are ignored as an authoring convenience.)
1720                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1721                                 cur += 1
1722                         flag_frameset_ok = false
1723                         return
1724                 if t.type is TYPE_START_TAG and t.name is 'form'
1725                         unless form_element_pointer is null or template_tag_is_open()
1726                                 parse_error()
1727                                 return
1728                         close_p_if_in_button_scope()
1729                         el = insert_html_element t
1730                         unless template_tag_is_open()
1731                                 form_element_pointer = el
1732                         return
1733                 if t.type is TYPE_START_TAG and t.name is 'li'
1734                         flag_frameset_ok = false
1735                         for node in open_els
1736                                 if node.name is 'li' and node.namespace is NS_HTML
1737                                         generate_implied_end_tags 'li' # arg is exception
1738                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1739                                                 parse_error()
1740                                         loop
1741                                                 el = open_els.shift()
1742                                                 if el.name is 'li' and el.namespace is NS_HTML
1743                                                         break
1744                                         break
1745                                 if el_is_special_not_adp node
1746                                                 break
1747                         close_p_if_in_button_scope()
1748                         insert_html_element t
1749                         return
1750                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1751                         flag_frameset_ok = false
1752                         for node in open_els
1753                                 if node.name is 'dd' and node.namespace is NS_HTML
1754                                         generate_implied_end_tags 'dd' # arg is exception
1755                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1756                                                 parse_error()
1757                                         loop
1758                                                 el = open_els.shift()
1759                                                 if el.name is 'dd' and el.namespace is NS_HTML
1760                                                         break
1761                                         break
1762                                 if node.name is 'dt' and node.namespace is NS_HTML
1763                                         generate_implied_end_tags 'dt' # arg is exception
1764                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1765                                                 parse_error()
1766                                         loop
1767                                                 el = open_els.shift()
1768                                                 if el.name is 'dt' and el.namespace is NS_HTML
1769                                                         break
1770                                         break
1771                                 if el_is_special_not_adp node
1772                                         break
1773                         close_p_if_in_button_scope()
1774                         insert_html_element t
1775                         return
1776                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1777                         close_p_if_in_button_scope()
1778                         insert_html_element t
1779                         tok_state = tok_state_plaintext
1780                         return
1781                 if t.type is TYPE_START_TAG and t.name is 'button'
1782                         if is_in_scope 'button', NS_HTML
1783                                 parse_error()
1784                                 generate_implied_end_tags()
1785                                 loop
1786                                         el = open_els.shift()
1787                                         if el.name is 'button' and el.namespace is NS_HTML
1788                                                 break
1789                         reconstruct_afe()
1790                         insert_html_element t
1791                         flag_frameset_ok = false
1792                         return
1793                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1794                         unless is_in_scope t.name, NS_HTML
1795                                 parse_error()
1796                                 return
1797                         generate_implied_end_tags()
1798                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1799                                 parse_error()
1800                         loop
1801                                 el = open_els.shift()
1802                                 if el.name is t.name and el.namespace is NS_HTML
1803                                         return
1804                         return
1805                 if t.type is TYPE_END_TAG and t.name is 'form'
1806                         unless template_tag_is_open()
1807                                 node = form_element_pointer
1808                                 form_element_pointer = null
1809                                 if node is null or not el_is_in_scope node
1810                                         parse_error()
1811                                         return
1812                                 generate_implied_end_tags()
1813                                 if open_els[0] isnt node
1814                                         parse_error()
1815                                 for el, i in open_els
1816                                         if el is node
1817                                                 open_els.splice i, 1
1818                                                 break
1819                         else
1820                                 unless is_in_scope 'form', NS_HTML
1821                                         parse_error()
1822                                         return
1823                                 generate_implied_end_tags()
1824                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
1825                                         parse_error()
1826                                 loop
1827                                         el = open_els.shift()
1828                                         if el.name is 'form' and el.namespace is NS_HTML
1829                                                 break
1830                         return
1831                 if t.type is TYPE_END_TAG and t.name is 'p'
1832                         unless is_in_button_scope 'p', NS_HTML
1833                                 parse_error()
1834                                 insert_html_element new_open_tag 'p'
1835                         close_p_element()
1836                         return
1837                 if t.type is TYPE_END_TAG and t.name is 'li'
1838                         unless is_in_li_scope 'li', NS_HTML
1839                                 parse_error()
1840                                 return
1841                         generate_implied_end_tags 'li' # arg is exception
1842                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1843                                 parse_error()
1844                         loop
1845                                 el = open_els.shift()
1846                                 if el.name is 'li' and el.namespace is NS_HTML
1847                                         break
1848                         return
1849                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
1850                         unless is_in_scope t.name, NS_HTML
1851                                 parse_error()
1852                                 return
1853                         generate_implied_end_tags t.name # arg is exception
1854                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1855                                 parse_error()
1856                         loop
1857                                 el = open_els.shift()
1858                                 if el.name is t.name and el.namespace is NS_HTML
1859                                         break
1860                         return
1861                 if t.type is TYPE_END_TAG and h_tags[t.name]?
1862                         h_in_scope = false
1863                         for el in open_els
1864                                 if h_tags[el.name] is el.namespace
1865                                         h_in_scope = true
1866                                         break
1867                                 if standard_scopers[el.name] is el.namespace
1868                                         break
1869                         unless h_in_scope
1870                                 parse_error()
1871                                 return
1872                         generate_implied_end_tags()
1873                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1874                                 parse_error()
1875                         loop
1876                                 el = open_els.shift()
1877                                 if h_tags[el.name] is el.namespace
1878                                         break
1879                         return
1880                 # deep breath!
1881                 if t.type is TYPE_START_TAG and t.name is 'a'
1882                         # If the list of active formatting elements contains an a element
1883                         # between the end of the list and the last marker on the list (or
1884                         # the start of the list if there is no marker on the list), then
1885                         # this is a parse error; run the adoption agency algorithm for the
1886                         # tag name "a", then remove that element from the list of active
1887                         # formatting elements and the stack of open elements if the
1888                         # adoption agency algorithm didn't already remove it (it might not
1889                         # have if the element is not in table scope).
1890                         found = false
1891                         for el in afe
1892                                 if el.type is TYPE_AFE_MARKER
1893                                         break
1894                                 if el.name is 'a' and el.namespace is NS_HTML
1895                                         found = el
1896                         if found?
1897                                 parse_error()
1898                                 adoption_agency 'a'
1899                                 for el, i in afe
1900                                         if el is found
1901                                                 afe.splice i, 1
1902                                 for el, i in open_els
1903                                         if el is found
1904                                                 open_els.splice i, 1
1905                         reconstruct_afe()
1906                         el = insert_html_element t
1907                         afe_push el
1908                         return
1909                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1910                         reconstruct_afe()
1911                         el = insert_html_element t
1912                         afe_push el
1913                         return
1914                 if t.type is TYPE_START_TAG and t.name is 'nobr'
1915                         reconstruct_afe()
1916                         el = insert_html_element t
1917                         afe_push el
1918                         return
1919                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
1920                         adoption_agency t.name
1921                         return
1922                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1923                         reconstruct_afe()
1924                         insert_html_element t
1925                         afe_push_marker()
1926                         flag_frameset_ok = false
1927                         return
1928                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
1929                         unless is_in_scope t.name, NS_HTML
1930                                 parse_error()
1931                                 return
1932                         generate_implied_end_tags()
1933                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
1934                                 parse_error()
1935                         loop
1936                                 el = open_els.shift()
1937                                 if el.name is t.name and el.namespace is NS_HTML
1938                                         break
1939                         clear_afe_to_marker()
1940                         return
1941                 if t.type is TYPE_START_TAG and t.name is 'table'
1942                         close_p_if_in_button_scope() # fixfull quirksmode thing
1943                         insert_html_element t
1944                         flag_frameset_ok = false
1945                         ins_mode = ins_mode_in_table
1946                         return
1947                 if t.type is TYPE_END_TAG and t.name is 'br'
1948                         parse_error()
1949                         t.type is TYPE_START_TAG
1950                         # fall through
1951                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
1952                         reconstruct_afe()
1953                         insert_html_element t
1954                         open_els.shift()
1955                         t.acknowledge_self_closing()
1956                         flag_frameset_ok = false
1957                         return
1958                 if t.type is TYPE_START_TAG and t.name is 'input'
1959                         reconstruct_afe()
1960                         insert_html_element t
1961                         open_els.shift()
1962                         t.acknowledge_self_closing()
1963                         unless is_input_hidden_tok t
1964                                 flag_frameset_ok = false
1965                         return
1966                 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
1967                         insert_html_element t
1968                         open_els.shift()
1969                         t.acknowledge_self_closing()
1970                         return
1971                 if t.type is TYPE_START_TAG and t.name is 'hr'
1972                         close_p_if_in_button_scope()
1973                         insert_html_element t
1974                         open_els.shift()
1975                         t.acknowledge_self_closing()
1976                         flag_frameset_ok = false
1977                         return
1978                 if t.type is TYPE_START_TAG and t.name is 'image'
1979                         parse_error()
1980                         t.name = 'img'
1981                         process_token t
1982                         return
1983                 if t.type is TYPE_START_TAG and t.name is 'isindex'
1984                         parse_error()
1985                         if template_tag_is_open() is false and form_element_pointer isnt null
1986                                 return
1987                         t.acknowledge_self_closing()
1988                         flag_frameset_ok = false
1989                         close_p_if_in_button_scope()
1990                         el = insert_html_element new_open_tag 'form'
1991                         unless template_tag_is_open()
1992                                 form_element_pointer = el
1993                         for a in t.attrs_a
1994                                 if a[0] is 'action'
1995                                         el.attrs['action'] = a[1]
1996                                         break
1997                         insert_html_element new_open_tag 'hr'
1998                         open_els.shift()
1999                         reconstruct_afe()
2000                         insert_html_element new_open_tag 'label'
2001                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2002                         input_el = new_open_tag 'input'
2003                         prompt = null
2004                         for a in t.attrs_a
2005                                 if a[0] is 'prompt'
2006                                         prompt = a[1]
2007                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2008                                         input_el.attrs_a.push [a[0], a[1]]
2009                         input_el.attrs_a.push ['name', 'isindex']
2010                         # fixfull this next bit is in english... internationalize?
2011                         prompt ?= "This is a searchable index. Enter search keywords: "
2012                         insert_character new_character_token prompt # fixfull split
2013                         # TODO submit typo "balue" in spec
2014                         insert_html_element input_el
2015                         open_els.shift()
2016                         # insert_character '' # you can put chars here if promt attr missing
2017                         open_els.shift()
2018                         insert_html_element new_open_tag 'hr'
2019                         open_els.shift()
2020                         open_els.shift()
2021                         unless template_tag_is_open()
2022                                 form_element_pointer = null
2023                         return
2024                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2025                         insert_html_element t
2026                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2027                                 cur += 1
2028                         tok_state = tok_state_rcdata
2029                         original_ins_mode = ins_mode
2030                         flag_frameset_ok = false
2031                         ins_mode = ins_mode_text
2032                         return
2033                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2034                         close_p_if_in_button_scope()
2035                         reconstruct_afe()
2036                         flag_frameset_ok = false
2037                         parse_generic_raw_text t
2038                         return
2039                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2040                         flag_frameset_ok = false
2041                         parse_generic_raw_text t
2042                         return
2043                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2044                         parse_generic_raw_text t
2045                         return
2046                 if t.type is TYPE_START_TAG and t.name is 'select'
2047                         reconstruct_afe()
2048                         insert_html_element t
2049                         flag_frameset_ok = false
2050                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2051                                 ins_mode = ins_mode_in_select_in_table
2052                         else
2053                                 ins_mode = ins_mode_in_select
2054                         return
2055                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2056                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2057                                 open_els.shift()
2058                         reconstruct_afe()
2059                         insert_html_element t
2060                         return
2061                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2062                         if is_in_scope 'ruby', NS_HTML
2063                                 generate_implied_end_tags()
2064                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2065                                         parse_error()
2066                         insert_html_element t
2067                         return
2068                 if t.type is TYPE_START_TAG and t.name is 'rt'
2069                         if is_in_scope 'ruby', NS_HTML
2070                                 generate_implied_end_tags 'rtc' # arg is exception
2071                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2072                                         parse_error()
2073                         insert_html_element t
2074                         return
2075                 if t.type is TYPE_START_TAG and t.name is 'math'
2076                         reconstruct_afe()
2077                         adjust_mathml_attributes t
2078                         adjust_foreign_attributes t
2079                         insert_foreign_element t, NS_MATHML
2080                         if t.flag 'self-closing'
2081                                 open_els.shift()
2082                                 t.acknowledge_self_closing()
2083                         return
2084                 if t.type is TYPE_START_TAG and t.name is 'svg'
2085                         reconstruct_afe()
2086                         adjust_svg_attributes t
2087                         adjust_foreign_attributes t
2088                         insert_foreign_element t, NS_SVG
2089                         if t.flag 'self-closing'
2090                                 open_els.shift()
2091                                 t.acknowledge_self_closing()
2092                         return
2093                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2094                         parse_error()
2095                         return
2096                 if t.type is TYPE_START_TAG # any other start tag
2097                         reconstruct_afe()
2098                         insert_html_element t
2099                         return
2100                 if t.type is TYPE_END_TAG # any other end tag
2101                         in_body_any_other_end_tag t.name
2102                         return
2103                 return
2104
2105         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2106         ins_mode_text = (t) ->
2107                 if t.type is TYPE_TEXT
2108                         insert_character t
2109                         return
2110                 if t.type is TYPE_EOF
2111                         parse_error()
2112                         if open_els[0].name is 'script'
2113                                 open_els[0].flag 'already started', true
2114                         open_els.shift()
2115                         ins_mode = original_ins_mode
2116                         process_token t
2117                         return
2118                 if t.type is TYPE_END_TAG and t.name is 'script'
2119                         open_els.shift()
2120                         ins_mode = original_ins_mode
2121                         # fixfull the spec seems to assume that I'm going to run the script
2122                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2123                         return
2124                 if t.type is TYPE_END_TAG
2125                         open_els.shift()
2126                         ins_mode = original_ins_mode
2127                         return
2128                 console.log 'warning: end of ins_mode_text reached'
2129
2130         # the functions below implement the tokenizer stats described here:
2131         # http://www.w3.org/TR/html5/syntax.html#tokenization
2132
2133         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2134         ins_mode_in_table_else = (t) ->
2135                 parse_error()
2136                 flag_foster_parenting = true
2137                 ins_mode_in_body t
2138                 flag_foster_parenting = false
2139                 return
2140         can_in_table = { # FIXME do this inline like everywhere else
2141                 'table': true
2142                 'tbody': true
2143                 'tfoot': true
2144                 'thead': true
2145                 'tr': true
2146         }
2147         ins_mode_in_table = (t) ->
2148                 switch t.type
2149                         when TYPE_TEXT
2150                                 if can_in_table[t.name]
2151                                         original_ins_mode = ins_mode
2152                                         ins_mode = ins_mode_in_table_text
2153                                         process_token t
2154                                 else
2155                                         ins_mode_in_table_else t
2156                         when TYPE_COMMENT
2157                                 insert_comment t
2158                         when TYPE_DOCTYPE
2159                                 parse_error()
2160                         when TYPE_START_TAG
2161                                 switch t.name
2162                                         when 'caption'
2163                                                 clear_stack_to_table_context()
2164                                                 afe_push_marker()
2165                                                 insert_html_element t
2166                                                 ins_mode = ins_mode_in_caption
2167                                         when 'colgroup'
2168                                                 clear_stack_to_table_context()
2169                                                 insert_html_element t
2170                                                 ins_mode = ins_mode_in_column_group
2171                                         when 'col'
2172                                                 clear_stack_to_table_context()
2173                                                 insert_html_element new_open_tag 'colgroup'
2174                                                 ins_mode = ins_mode_in_column_group
2175                                                 process_token t
2176                                         when 'tbody', 'tfoot', 'thead'
2177                                                 clear_stack_to_table_context()
2178                                                 insert_html_element t
2179                                                 ins_mode = ins_mode_in_table_body
2180                                         when 'td', 'th', 'tr'
2181                                                 clear_stack_to_table_context()
2182                                                 insert_html_element new_open_tag 'tbody'
2183                                                 ins_mode = ins_mode_in_table_body
2184                                                 process_token t
2185                                         when 'table'
2186                                                 parse_error()
2187                                                 if is_in_table_scope 'table'
2188                                                         loop
2189                                                                 el = open_els.shift()
2190                                                                 if el.name is 'table'
2191                                                                         break
2192                                                         reset_ins_mode()
2193                                                         process_token t
2194                                         when 'style', 'script', 'template'
2195                                                 ins_mode_in_head t
2196                                         when 'input'
2197                                                 unless is_input_hidden_tok t
2198                                                         ins_mode_in_table_else t
2199                                                 else
2200                                                         parse_error()
2201                                                         el = insert_html_element t
2202                                                         open_els.shift()
2203                                                         t.acknowledge_self_closing()
2204                                         when 'form'
2205                                                 parse_error()
2206                                                 if form_element_pointer?
2207                                                         return
2208                                                 if template_tag_is_open()
2209                                                         return
2210                                                 form_element_pointer = insert_html_element t
2211                                                 open_els.shift()
2212                                         else
2213                                                 ins_mode_in_table_else t
2214                         when TYPE_END_TAG
2215                                 switch t.name
2216                                         when 'table'
2217                                                 if is_in_table_scope 'table'
2218                                                         loop
2219                                                                 el = open_els.shift()
2220                                                                 if el.name is 'table'
2221                                                                         break
2222                                                         reset_ins_mode()
2223                                                 else
2224                                                         parse_error
2225                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2226                                                 parse_error()
2227                                         when 'template'
2228                                                 ins_mode_in_head t
2229                                         else
2230                                                 ins_mode_in_table_else t
2231                         when TYPE_EOF
2232                                 ins_mode_in_body t
2233                         else
2234                                 ins_mode_in_table_else t
2235
2236
2237         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2238         ins_mode_in_table_text = (t) ->
2239                 if t.type is TYPE_TEXT and t.text is "\u0000"
2240                         # huh? I thought the tokenizer didn't emit these
2241                         parse_error()
2242                         return
2243                 if t.type is TYPE_TEXT
2244                         pending_table_character_tokens.push t
2245                         return
2246                 # Anything else
2247                 all_space = true
2248                 for old in pending_table_character_tokens
2249                         unless is_space_tok old
2250                                 all_space = false
2251                                 break
2252                 if all_space
2253                         for old in pending_table_character_tokens
2254                                 insert_character old
2255                 else
2256                         for old in pending_table_character_tokens
2257                                 ins_mode_table_else old
2258                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
2259                 ins_mode = original_ins_mode
2260                 process_token t
2261
2262         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2263         ins_mode_in_caption = (t) ->
2264                 if t.type is TYPE_END_TAG and t.name is 'caption'
2265                         if is_in_table_scope 'caption'
2266                                 generate_implied_end_tags()
2267                                 if open_els[0].name isnt 'caption'
2268                                         parse_error()
2269                                 loop
2270                                         el = open_els.shift()
2271                                         if el.name is 'caption'
2272                                                 break
2273                                 clear_afe_to_marker()
2274                                 ins_mode = ins_mode_in_table
2275                         else
2276                                 parse_error()
2277                                 # fragment case
2278                         return
2279                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2280                         parse_error()
2281                         if is_in_table_scope 'caption'
2282                                 loop
2283                                         el = open_els.shift()
2284                                         if el.name is 'caption'
2285                                                 break
2286                                 clear_afe_to_marker()
2287                                 ins_mode = ins_mode_in_table
2288                                 process_token t
2289                         # else fragment case
2290                         return
2291                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2292                         parse_error()
2293                         return
2294                 # Anything else
2295                 ins_mode_in_body t
2296
2297         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2298         ins_mode_in_column_group = (t) ->
2299                 if is_space_tok t
2300                         insert_character t
2301                         return
2302                 if t.type is TYPE_COMMENT
2303                         insert_comment t
2304                         return
2305                 if t.type is TYPE_DOCTYPE
2306                         parse_error()
2307                         return
2308                 if t.type is TYPE_START_TAG and t.name is 'html'
2309                         ins_mode_in_body t
2310                         return
2311                 if t.type is TYPE_START_TAG and t.name is 'col'
2312                         el = insert_html_element t
2313                         open_els.shift()
2314                         t.acknowledge_self_closing()
2315                         return
2316                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2317                         if open_els[0].name is 'colgroup'
2318                                 open_els.shift()
2319                                 ins_mode = ins_mode_in_table
2320                         else
2321                                 parse_error()
2322                         return
2323                 if t.type is TYPE_END_TAG and t.name is 'col'
2324                         parse_error()
2325                         return
2326                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2327                         ins_mode_in_head t
2328                         return
2329                 if t.type is TYPE_EOF
2330                         ins_mode_in_body t
2331                         return
2332                 # Anything else
2333                 if open_els[0].name isnt 'colgroup'
2334                         parse_error()
2335                         return
2336                 open_els.shift()
2337                 ins_mode = ins_mode_in_table
2338                 process_token t
2339                 return
2340
2341         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2342         ins_mode_in_table_body = (t) ->
2343                 if t.type is TYPE_START_TAG and t.name is 'tr'
2344                         clear_stack_to_table_body_context()
2345                         insert_html_element t
2346                         ins_mode = ins_mode_in_row
2347                         return
2348                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2349                         parse_error()
2350                         clear_stack_to_table_body_context()
2351                         insert_html_element new_open_tag 'tr'
2352                         ins_mode = ins_mode_in_row
2353                         process_token t
2354                         return
2355                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2356                         unless is_in_table_scope t.name # fixfull check namespace
2357                                 parse_error()
2358                                 return
2359                         clear_stack_to_table_body_context()
2360                         open_els.shift()
2361                         ins_mode = ins_mode_in_table
2362                         return
2363                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2364                         has = false
2365                         for el in open_els
2366                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
2367                                         has = true
2368                                         break
2369                                 if table_scopers[el.name]
2370                                         break
2371                         if !has
2372                                 parse_error()
2373                                 return
2374                         clear_stack_to_table_body_context()
2375                         open_els.shift()
2376                         ins_mode = ins_mode_in_table
2377                         process_token t
2378                         return
2379                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2380                         parse_error()
2381                         return
2382                 # Anything else
2383                 ins_mode_in_table t
2384
2385         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2386         ins_mode_in_row = (t) ->
2387                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2388                         clear_stack_to_table_row_context()
2389                         insert_html_element t
2390                         ins_mode = ins_mode_in_cell
2391                         afe_push_marker()
2392                         return
2393                 if t.type is TYPE_END_TAG and t.name is 'tr'
2394                         if is_in_table_scope 'tr'
2395                                 clear_stack_to_table_row_context()
2396                                 open_els.shift()
2397                                 ins_mode = ins_mode_in_table_body
2398                         else
2399                                 parse_error()
2400                         return
2401                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2402                         if is_in_table_scope 'tr'
2403                                 clear_stack_to_table_row_context()
2404                                 open_els.shift()
2405                                 ins_mode = ins_mode_in_table_body
2406                                 process_token t
2407                         else
2408                                 parse_error()
2409                         return
2410                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2411                         if is_in_table_scope t.name # fixfull namespace
2412                                 if is_in_table_scope 'tr'
2413                                         clear_stack_to_table_row_context()
2414                                         open_els.shift()
2415                                         ins_mode = ins_mode_in_table_body
2416                                         process_token t
2417                         else
2418                                 parse_error()
2419                         return
2420                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2421                         parse_error()
2422                         return
2423                 # Anything else
2424                 ins_mode_in_table t
2425
2426         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2427         close_the_cell = ->
2428                 generate_implied_end_tags()
2429                 unless open_els[0].name is 'td' or open_els[0] is 'th'
2430                         parse_error()
2431                 loop
2432                         el = open_els.shift()
2433                         if el.name is 'td' or el.name is 'th'
2434                                 break
2435                 clear_afe_to_marker()
2436                 ins_mode = ins_mode_in_row
2437
2438         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2439         ins_mode_in_cell = (t) ->
2440                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2441                         if is_in_table_scope t.name
2442                                 generate_implied_end_tags()
2443                                 if open_els[0].name isnt t.name
2444                                         parse_error
2445                                 loop
2446                                         el = open_els.shift()
2447                                         if el.name is t.name
2448                                                 break
2449                                 clear_afe_to_marker()
2450                                 ins_mode = ins_mode_in_row
2451                         else
2452                                 parse_error()
2453                         return
2454                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2455                         has = false
2456                         for el in open_els
2457                                 if el.name is 'td' or el.name is 'th'
2458                                         has = true
2459                                         break
2460                                 if table_scopers[el.name]
2461                                         break
2462                         if !has
2463                                 parse_error()
2464                                 return
2465                         close_the_cell()
2466                         process_token t
2467                         return
2468                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2469                         parse_error()
2470                         return
2471                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2472                         if is_in_table_scope t.name # fixfull namespace
2473                                 close_the_cell()
2474                                 process_token t
2475                         else
2476                                 parse_error()
2477                         return
2478                 # Anything Else
2479                 ins_mode_in_body t
2480
2481         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2482         ins_mode_in_select = (t) ->
2483                 if t.type is TYPE_TEXT and t.text is "\u0000"
2484                         parse_error()
2485                         return
2486                 if t.type is TYPE_TEXT
2487                         insert_character t
2488                         return
2489                 if t.type is TYPE_COMMENT
2490                         insert_comment t
2491                         return
2492                 if t.type is TYPE_DOCTYPE
2493                         parse_error()
2494                         return
2495                 if t.type is TYPE_START_TAG and t.name is 'html'
2496                         ins_mode_in_body t
2497                         return
2498                 if t.type is TYPE_START_TAG and t.name is 'option'
2499                         if open_els[0].name is 'option'
2500                                 open_els.shift()
2501                         insert_html_element t
2502                         return
2503                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2504                         if open_els[0].name is 'option'
2505                                 open_els.shift()
2506                         if open_els[0].name is 'optgroup'
2507                                 open_els.shift()
2508                         insert_html_element t
2509                         return
2510                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2511                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
2512                                 open_els.shift()
2513                         if open_els[0].name is 'optgroup'
2514                                 open_els.shift()
2515                         else
2516                                 parse_error()
2517                         return
2518                 if t.type is TYPE_END_TAG and t.name is 'option'
2519                         if open_els[0].name is 'option'
2520                                 open_els.shift()
2521                         else
2522                                 parse_error()
2523                         return
2524                 if t.type is TYPE_END_TAG and t.name is 'select'
2525                         if is_in_select_scope 'select'
2526                                 loop
2527                                         el = open_els.shift()
2528                                         if el.name is 'select'
2529                                                 break
2530                                 reset_ins_mode()
2531                         else
2532                                 parse_error()
2533                         return
2534                 if t.type is TYPE_START_TAG and t.name is 'select'
2535                         parse_error()
2536                         loop
2537                                 el = open_els.shift()
2538                                 if el.name is 'select'
2539                                         break
2540                         reset_ins_mode()
2541                         # spec says that this is the same as </select> but it doesn't say
2542                         # to check scope first
2543                         return
2544                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2545                         parse_error()
2546                         if is_in_select_scope 'select'
2547                                 return
2548                         loop
2549                                 el = open_els.shift()
2550                                 if el.name is 'select'
2551                                         break
2552                         reset_ins_mode()
2553                         process_token t
2554                         return
2555                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2556                         ins_mode_in_head t
2557                         return
2558                 if t.type is TYPE_EOF
2559                         ins_mode_in_body t
2560                         return
2561                 # Anything else
2562                 parse_error()
2563                 return
2564
2565         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2566         ins_mode_in_select_in_table = (t) ->
2567                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2568                         parse_error()
2569                         loop
2570                                 el = open_els.shift()
2571                                 if el.name is 'select'
2572                                         break
2573                         reset_ins_mode()
2574                         process_token t
2575                         return
2576                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2577                         parse_error()
2578                         unless is_in_table_scope t.name, NS_HTML
2579                                 return
2580                         loop
2581                                 el = open_els.shift()
2582                                 if el.name is 'select'
2583                                         break
2584                         reset_ins_mode()
2585                         process_token t
2586                         return
2587                 # Anything else
2588                 ins_mode_in_select t
2589                 return
2590
2591         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2592         ins_mode_in_template = (t) ->
2593                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2594                         ins_mode_in_body t
2595                         return
2596                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2597                         ins_mode_in_head t
2598                         return
2599                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2600                         template_ins_modes.shift()
2601                         template_ins_modes.unshift ins_mode_in_table
2602                         ins_mode = ins_mode_in_table
2603                         process_token t
2604                         return
2605                 if t.type is TYPE_START_TAG and t.name is 'col'
2606                         template_ins_modes.shift()
2607                         template_ins_modes.unshift ins_mode_in_column_group
2608                         ins_mode = ins_mode_in_column_group
2609                         process_token t
2610                         return
2611                 if t.type is TYPE_START_TAG and t.name is 'tr'
2612                         template_ins_modes.shift()
2613                         template_ins_modes.unshift ins_mode_in_table_body
2614                         ins_mode = ins_mode_in_table_body
2615                         process_token t
2616                         return
2617                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2618                         template_ins_modes.shift()
2619                         template_ins_modes.unshift ins_mode_in_row
2620                         ins_mode = ins_mode_in_row
2621                         process_token t
2622                         return
2623                 if t.type is TYPE_START_TAG
2624                         template_ins_modes.shift()
2625                         template_ins_modes.unshift ins_mode_in_body
2626                         ins_mode = ins_mode_in_body
2627                         process_token t
2628                         return
2629                 if t.type is TYPE_END_TAG
2630                         parse_error()
2631                         return
2632                 if t.type is TYPE_EOF
2633                         unless template_tag_is_open()
2634                                 stop_parsing()
2635                                 return
2636                         parse_error()
2637                         loop
2638                                 el = open_els.shift()
2639                                 if el.name is 'template' # fixfull check namespace
2640                                         break
2641                         clear_afe_to_marker()
2642                         template_ins_modes.shift()
2643                         reset_ins_mode()
2644                         process_token t
2645
2646         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2647         ins_mode_after_body = (t) ->
2648                 if is_space_tok t
2649                         ins_mode_in_body t
2650                         return
2651                 if t.type is TYPE_COMMENT
2652                         insert_comment t, [open_els[0], open_els[0].children.length]
2653                         return
2654                 if t.type is TYPE_DOCTYPE
2655                         parse_error()
2656                         return
2657                 if t.type is TYPE_START_TAG and t.name is 'html'
2658                         ins_mode_in_body t
2659                         return
2660                 if t.type is TYPE_END_TAG and t.name is 'html'
2661                         # fixfull fragment case
2662                         ins_mode = ins_mode_after_after_body
2663                         return
2664                 if t.type is TYPE_EOF
2665                         stop_parsing()
2666                         return
2667                 # Anything ELse
2668                 parse_error()
2669                 ins_mode = ins_mode_in_body
2670                 process_token t
2671
2672         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2673         ins_mode_in_frameset = (t) ->
2674                 if is_space_tok t
2675                         insert_character t
2676                         return
2677                 if t.type is TYPE_COMMENT
2678                         insert_comment t
2679                         return
2680                 if t.type is TYPE_DOCTYPE
2681                         parse_error()
2682                         return
2683                 if t.type is TYPE_START_TAG and t.name is 'html'
2684                         ins_mode_in_body t
2685                         return
2686                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2687                         insert_html_element t
2688                         return
2689                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2690                         # TODO ?correct for: "if the current node is the root html element"
2691                         if open_els.length is 1
2692                                 parse_error()
2693                                 return # fragment case
2694                         open_els.shift()
2695                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2696                                 ins_mode = ins_mode_after_frameset
2697                         return
2698                 if t.type is TYPE_START_TAG and t.name is 'frame'
2699                         insert_html_element t
2700                         open_els.shift()
2701                         t.acknowledge_self_closing()
2702                         return
2703                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2704                         ins_mode_in_head t
2705                         return
2706                 if t.type is TYPE_EOF
2707                         # TODO ?correct for: "if the current node is not the root html element"
2708                         if open_els.length isnt 1
2709                                 parse_error()
2710                         stop_parsing()
2711                         return
2712                 # Anything else
2713                 parse_error()
2714                 return
2715
2716         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2717         ins_mode_after_frameset = (t) ->
2718                 if is_space_tok t
2719                         insert_character t
2720                         return
2721                 if t.type is TYPE_COMMENT
2722                         insert_comment t
2723                         return
2724                 if t.type is TYPE_DOCTYPE
2725                         parse_error()
2726                         return
2727                 if t.type is TYPE_START_TAG and t.name is 'html'
2728                         ins_mode_in_body t
2729                         return
2730                 if t.type is TYPE_END_TAG and t.name is 'html'
2731                         insert_mode = ins_mode_after_after_frameset
2732                         return
2733                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2734                         ins_mode_in_head t
2735                         return
2736                 if t.type is TYPE_EOF
2737                         stop_parsing()
2738                         return
2739                 # Anything else
2740                 parse_error()
2741                 return
2742
2743         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2744         ins_mode_after_after_body = (t) ->
2745                 if t.type is TYPE_COMMENT
2746                         insert_comment t, [doc, doc.children.length]
2747                         return
2748                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2749                         ins_mode_in_body t
2750                         return
2751                 if t.type is TYPE_EOF
2752                         stop_parsing()
2753                         return
2754                 # Anything else
2755                 parse_error()
2756                 ins_mode = ins_mode_in_body
2757                 return
2758
2759         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2760         ins_mode_after_after_frameset = (t) ->
2761                 if t.type is TYPE_COMMENT
2762                         insert_comment t, [doc, doc.children.length]
2763                         return
2764                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2765                         ins_mode_in_body t
2766                         return
2767                 if t.type is TYPE_EOF
2768                         stop_parsing()
2769                         return
2770                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2771                         ins_mode_in_head t
2772                         return
2773                 # Anything else
2774                 parse_error()
2775                 return
2776
2777         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
2778         has_color_face_or_size = (t) ->
2779                 for a in t.attrs_a
2780                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
2781                                 return true
2782                 return false
2783         in_foreign_content_end_script = ->
2784                 open_els.shift()
2785                 # fixfull
2786                 return
2787         in_foreign_content_other_start = (t) ->
2788                 acn = adjusted_current_node()
2789                 if acn.namespace is NS_MATHML
2790                         adjust_mathml_attributes t
2791                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
2792                         t.name = svg_name_fixes[t.name]
2793                 if acn.namespace is NS_SVG
2794                         adjust_svg_attributes t
2795                 adjust_foreign_attributes t
2796                 insert_foreign_element t, acn.namespace
2797                 if t.flag 'self-closing'
2798                         if t.name is 'script'
2799                                 t.acknowledge_self_closing()
2800                                 in_foreign_content_end_script()
2801                         else
2802                                 open_els.shift()
2803                                 t.acknowledge_self_closing()
2804                 return
2805         in_foreign_content = (t) ->
2806                 if t.type is TYPE_TEXT and t.text is "\u0000"
2807                         parse_error()
2808                         insert_character new_character_token "\ufffd"
2809                         return
2810                 if is_space_tok t
2811                         insert_character t
2812                         return
2813                 if t.type is TYPE_TEXT
2814                         flag_frameset_ok = false
2815                         insert_character t
2816                         return
2817                 if t.type is TYPE_COMMENT
2818                         insert_comment t
2819                         return
2820                 if t.type is TYPE_DOCTYPE
2821                         parse_error()
2822                         return
2823                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
2824                         parse_error()
2825                         if flag_fragment_parsing
2826                                 in_foreign_content_other_start t
2827                                 return
2828                         loop # is this safe?
2829                                 open_els.shift()
2830                                 cn = open_els[0]
2831                                 if is_mathml_text_integration_point(cn) or is_html_integration(cn) or cn.namespace is NS_HTML
2832                                         break
2833                         process_token t
2834                         return
2835                 if t.type is TYPE_START_TAG
2836                         in_foreign_content_other_start t
2837                         return
2838                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
2839                         in_foreign_content_end_script()
2840                         return
2841                 if t.type is TYPE_END_TAG
2842                         if open_els[0].name.toLowerCase() isnt t.name
2843                                 parse_error()
2844                         for node in open_els
2845                                 if node is open_els[open_els.length - 1]
2846                                         return
2847                                 if node.name.toLowerCase() is t.name
2848                                         loop
2849                                                 el = open_els.shift()
2850                                                 if el is node
2851                                                         return
2852                                 if node.namespace is NS_HTML
2853                                         break
2854                         ins_mode t # explicitly call HTML insertion mode
2855
2856
2857         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2858         tok_state_data = ->
2859                 switch c = txt.charAt(cur++)
2860                         when '&'
2861                                 return new_text_node parse_character_reference()
2862                         when '<'
2863                                 tok_state = tok_state_tag_open
2864                         when "\u0000"
2865                                 parse_error()
2866                                 return new_text_node c
2867                         when '' # EOF
2868                                 return new_eof_token()
2869                         else
2870                                 return new_text_node c
2871                 return null
2872
2873         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2874         # not needed: tok_state_character_reference_in_data = ->
2875         # just call parse_character_reference()
2876
2877         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2878         tok_state_rcdata = ->
2879                 switch c = txt.charAt(cur++)
2880                         when '&'
2881                                 return new_text_node parse_character_reference()
2882                         when '<'
2883                                 tok_state = tok_state_rcdata_less_than_sign
2884                         when "\u0000"
2885                                 parse_error()
2886                                 return new_character_token "\ufffd"
2887                         when '' # EOF
2888                                 return new_eof_token()
2889                         else
2890                                 return new_character_token c
2891                 return null
2892
2893         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2894         # not needed: tok_state_character_reference_in_rcdata = ->
2895         # just call parse_character_reference()
2896
2897         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2898         tok_state_rawtext = ->
2899                 switch c = txt.charAt(cur++)
2900                         when '<'
2901                                 tok_state = tok_state_rawtext_less_than_sign
2902                         when "\u0000"
2903                                 parse_error()
2904                                 return new_character_token "\ufffd"
2905                         when '' # EOF
2906                                 return new_eof_token()
2907                         else
2908                                 return new_character_token c
2909                 return null
2910
2911         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2912         tok_state_script_data = ->
2913                 switch c = txt.charAt(cur++)
2914                         when '<'
2915                                 tok_state = tok_state_script_data_less_than_sign
2916                         when "\u0000"
2917                                 parse_error()
2918                                 return new_character_token "\ufffd"
2919                         when '' # EOF
2920                                 return new_eof_token()
2921                         else
2922                                 return new_character_token c
2923                 return null
2924
2925         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2926         tok_state_plaintext = ->
2927                 switch c = txt.charAt(cur++)
2928                         when "\u0000"
2929                                 parse_error()
2930                                 return new_character_token "\ufffd"
2931                         when '' # EOF
2932                                 return new_eof_token()
2933                         else
2934                                 return new_character_token c
2935                 return null
2936
2937
2938         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2939         tok_state_tag_open = ->
2940                 switch c = txt.charAt(cur++)
2941                         when '!'
2942                                 tok_state = tok_state_markup_declaration_open
2943                         when '/'
2944                                 tok_state = tok_state_end_tag_open
2945                         when '?'
2946                                 parse_error()
2947                                 tok_cur_tag = new_comment_token '?'
2948                                 tok_state = tok_state_bogus_comment
2949                         else
2950                                 if is_lc_alpha(c)
2951                                         tok_cur_tag = new_open_tag c
2952                                         tok_state = tok_state_tag_name
2953                                 else if is_uc_alpha(c)
2954                                         tok_cur_tag = new_open_tag c.toLowerCase()
2955                                         tok_state = tok_state_tag_name
2956                                 else
2957                                         parse_error()
2958                                         tok_state = tok_state_data
2959                                         cur -= 1 # we didn't parse/handle the char after <
2960                                         return new_text_node '<'
2961                 return null
2962
2963         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2964         tok_state_end_tag_open = ->
2965                 switch c = txt.charAt(cur++)
2966                         when '>'
2967                                 parse_error()
2968                                 tok_state = tok_state_data
2969                         when '' # EOF
2970                                 parse_error()
2971                                 tok_state = tok_state_data
2972                                 return new_text_node '</'
2973                         else
2974                                 if is_uc_alpha(c)
2975                                         tok_cur_tag = new_end_tag c.toLowerCase()
2976                                         tok_state = tok_state_tag_name
2977                                 else if is_lc_alpha(c)
2978                                         tok_cur_tag = new_end_tag c
2979                                         tok_state = tok_state_tag_name
2980                                 else
2981                                         parse_error()
2982                                         tok_cur_tag = new_comment_token '/'
2983                                         tok_state = tok_state_bogus_comment
2984                 return null
2985
2986         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2987         tok_state_tag_name = ->
2988                 switch c = txt.charAt(cur++)
2989                         when "\t", "\n", "\u000c", ' '
2990                                 tok_state = tok_state_before_attribute_name
2991                         when '/'
2992                                 tok_state = tok_state_self_closing_start_tag
2993                         when '>'
2994                                 tok_state = tok_state_data
2995                                 tmp = tok_cur_tag
2996                                 tok_cur_tag = null
2997                                 return tmp
2998                         when "\u0000"
2999                                 parse_error()
3000                                 tok_cur_tag.name += "\ufffd"
3001                         when '' # EOF
3002                                 parse_error()
3003                                 tok_state = tok_state_data
3004                         else
3005                                 if is_uc_alpha(c)
3006                                         tok_cur_tag.name += c.toLowerCase()
3007                                 else
3008                                         tok_cur_tag.name += c
3009                 return null
3010
3011         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3012         tok_state_rcdata_less_than_sign = ->
3013                 c = txt.charAt(cur++)
3014                 if c is '/'
3015                         temporary_buffer = ''
3016                         tok_state = tok_state_rcdata_end_tag_open
3017                         return null
3018                 # Anything else
3019                 tok_state = tok_state_rcdata
3020                 cur -= 1 # reconsume the input character
3021                 return new_character_token '<'
3022
3023         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3024         tok_state_rcdata_end_tag_open = ->
3025                 c = txt.charAt(cur++)
3026                 if is_uc_alpha(c)
3027                         tok_cur_tag = new_end_tag c.toLowerCase()
3028                         temporary_buffer += c
3029                         tok_state = tok_state_rcdata_end_tag_name
3030                         return null
3031                 if is_lc_alpha(c)
3032                         tok_cur_tag = new_end_tag c
3033                         temporary_buffer += c
3034                         tok_state = tok_state_rcdata_end_tag_name
3035                         return null
3036                 # Anything else
3037                 tok_state = tok_state_rcdata
3038                 cur -= 1 # reconsume the input character
3039                 return new_character_token "</" # fixfull separate these
3040
3041         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3042         is_appropriate_end_tag = (t) ->
3043                 # spec says to check against "the tag name of the last start tag to
3044                 # have been emitted from this tokenizer", but this is only called from
3045                 # the various "raw" states, so it's hopefully ok to assume that
3046                 # open_els[0].name will work instead TODO: verify this after the script
3047                 # data states are implemented
3048                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3049                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3050
3051         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3052         tok_state_rcdata_end_tag_name = ->
3053                 c = txt.charAt(cur++)
3054                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3055                         if is_appropriate_end_tag tok_cur_tag
3056                                 tok_state = tok_state_before_attribute_name
3057                                 return
3058                         # else fall through to "Anything else"
3059                 if c is '/'
3060                         if is_appropriate_end_tag tok_cur_tag
3061                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3062                                 return
3063                         # else fall through to "Anything else"
3064                 if c is '>'
3065                         if is_appropriate_end_tag tok_cur_tag
3066                                 tok_state = tok_state_data
3067                                 return tok_cur_tag
3068                         # else fall through to "Anything else"
3069                 if is_uc_alpha(c)
3070                         tok_cur_tag.name += c.toLowerCase()
3071                         temporary_buffer += c
3072                         return null
3073                 if is_lc_alpha(c)
3074                         tok_cur_tag.name += c
3075                         temporary_buffer += c
3076                         return null
3077                 # Anything else
3078                 tok_state = tok_state_rcdata
3079                 cur -= 1 # reconsume the input character
3080                 return new_character_token '</' + temporary_buffer # fixfull separate these
3081
3082         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3083         tok_state_rawtext_less_than_sign = ->
3084                 c = txt.charAt(cur++)
3085                 if c is '/'
3086                         temporary_buffer = ''
3087                         tok_state = tok_state_rawtext_end_tag_open
3088                         return null
3089                 # Anything else
3090                 tok_state = tok_state_rawtext
3091                 cur -= 1 # reconsume the input character
3092                 return new_character_token '<'
3093
3094         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3095         tok_state_rawtext_end_tag_open = ->
3096                 c = txt.charAt(cur++)
3097                 if is_uc_alpha(c)
3098                         tok_cur_tag = new_end_tag c.toLowerCase()
3099                         temporary_buffer += c
3100                         tok_state = tok_state_rawtext_end_tag_name
3101                         return null
3102                 if is_lc_alpha(c)
3103                         tok_cur_tag = new_end_tag c
3104                         temporary_buffer += c
3105                         tok_state = tok_state_rawtext_end_tag_name
3106                         return null
3107                 # Anything else
3108                 tok_state = tok_state_rawtext
3109                 cur -= 1 # reconsume the input character
3110                 return new_character_token "</" # fixfull separate these
3111
3112         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3113         tok_state_rawtext_end_tag_name = ->
3114                 c = txt.charAt(cur++)
3115                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3116                         if is_appropriate_end_tag tok_cur_tag
3117                                 tok_state = tok_state_before_attribute_name
3118                                 return
3119                         # else fall through to "Anything else"
3120                 if c is '/'
3121                         if is_appropriate_end_tag tok_cur_tag
3122                                 tok_state = tok_state_self_closing_start_tag
3123                                 return
3124                         # else fall through to "Anything else"
3125                 if c is '>'
3126                         if is_appropriate_end_tag tok_cur_tag
3127                                 tok_state = tok_state_data
3128                                 return tok_cur_tag
3129                         # else fall through to "Anything else"
3130                 if is_uc_alpha(c)
3131                         tok_cur_tag.name += c.toLowerCase()
3132                         temporary_buffer += c
3133                         return null
3134                 if is_lc_alpha(c)
3135                         tok_cur_tag.name += c
3136                         temporary_buffer += c
3137                         return null
3138                 # Anything else
3139                 tok_state = tok_state_rawtext
3140                 cur -= 1 # reconsume the input character
3141                 return new_character_token '</' + temporary_buffer # fixfull separate these
3142
3143         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3144         tok_state_script_data_less_than_sign = ->
3145                 c = txt.charAt(cur++)
3146                 if c is '/'
3147                         temporary_buffer = ''
3148                         tok_state = tok_state_script_data_end_tag_open
3149                         return
3150                 if c is '!'
3151                         tok_state = tok_state_script_data_escape_start
3152                         return new_character_token '<!' # fixfull split
3153                 # Anything else
3154                 tok_state = tok_state_script_data
3155                 cur -= 1 # Reconsume
3156                 return new_character_token '<'
3157
3158         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3159         tok_state_script_data_end_tag_open = ->
3160                 c = txt.charAt(cur++)
3161                 if is_uc_alpha(c)
3162                         tok_cur_tag = new_end_tag c.toLowerCase()
3163                         temporary_buffer += c
3164                         tok_state = tok_state_script_data_end_tag_name
3165                         return
3166                 if is_lc_alpha(c)
3167                         tok_cur_tag = new_end_tag c
3168                         temporary_buffer += c
3169                         tok_state = tok_state_script_data_end_tag_name
3170                         return
3171                 # Anything else
3172                 tok_state = tok_state_script_data
3173                 cur -= 1 # Reconsume
3174                 return new_character_token '</'
3175
3176         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3177         tok_state_script_data_end_tag_name = ->
3178                 c = txt.charAt(cur++)
3179                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3180                         if is_appropriate_end_tag tok_cur_tag
3181                                 tok_state = tok_state_before_attribute_name
3182                                 return
3183                         # fall through
3184                 if c is '/'
3185                         if is_appropriate_end_tag tok_cur_tag
3186                                 tok_state = tok_state_self_closing_start_tag
3187                                 return
3188                         # fall through
3189                 if c is '>'
3190                         if is_appropriate_end_tag tok_cur_tag
3191                                 tok_state = tok_state_data
3192                                 return tok_cur_tag
3193                         # fall through
3194                 if is_uc_alpha(c)
3195                         tok_cur_tag.name += c.toLowerCase()
3196                         temporary_buffer += c
3197                         return
3198                 if is_lc_alpha(c)
3199                         tok_cur_tag.name += c
3200                         temporary_buffer += c
3201                         return
3202                 # Anything else
3203                 tok_state = tok_state_script_data
3204                 cur -= 1 # Reconsume
3205                 return new_character_token "</#{temporary_buffer}" # fixfull split
3206
3207         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3208         tok_state_script_data_escape_start = ->
3209                 c = txt.charAt(cur++)
3210                 if c is '-'
3211                         tok_state = tok_state_script_data_escape_start_dash
3212                         return new_character_token '-'
3213                 # Anything else
3214                 tok_state = tok_state_script_data
3215                 cur -= 1 # Reconsume
3216                 return
3217
3218         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3219         tok_state_script_data_escape_start_dash = ->
3220                 c = txt.charAt(cur++)
3221                 if c is '-'
3222                         tok_state = tok_state_script_data_escaped_dash_dash
3223                         return new_character_token '-'
3224                 # Anything else
3225                 tok_state = tok_state_script_data
3226                 cur -= 1 # Reconsume
3227                 return
3228
3229         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3230         tok_state_script_data_escaped = ->
3231                 c = txt.charAt(cur++)
3232                 if c is '-'
3233                         tok_state = tok_state_script_data_escaped_dash
3234                         return new_character_token '-'
3235                 if c is '<'
3236                         tok_state = tok_state_script_data_escaped_less_than_sign
3237                         return
3238                 if c is "\u0000"
3239                         parse_error()
3240                         return new_character_token "\ufffd"
3241                 if c is '' # EOF
3242                         tok_state = tok_state_data
3243                         parse_error()
3244                         cur -= 1 # Reconsume
3245                         return
3246                 # Anything else
3247                 return new_character_token c
3248
3249         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3250         tok_state_script_data_escaped_dash = ->
3251                 c = txt.charAt(cur++)
3252                 if c is '-'
3253                         tok_state = tok_state_script_data_escaped_dash_dash
3254                         return new_character_token '-'
3255                 if c is '<'
3256                         tok_state = tok_state_script_data_escaped_less_than_sign
3257                         return
3258                 if c is "\u0000"
3259                         parse_error()
3260                         tok_state = tok_state_script_data_escaped
3261                         return new_character_token "\ufffd"
3262                 if c is '' # EOF
3263                         tok_state = tok_state_data
3264                         parse_error()
3265                         cur -= 1 # Reconsume
3266                         return
3267                 # Anything else
3268                 tok_state = tok_state_script_data_escaped
3269                 return new_character_token c
3270
3271         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3272         tok_state_script_data_escaped_dash_dash = ->
3273                 c = txt.charAt(cur++)
3274                 if c is '-'
3275                         return new_character_token '-'
3276                 if c is '<'
3277                         tok_state = tok_state_script_data_escaped_less_than_sign
3278                         return
3279                 if c is '>'
3280                         tok_state = tok_state_script_data
3281                         return new_character_token '>'
3282                 if c is "\u0000"
3283                         parse_error()
3284                         tok_state = tok_state_script_data_escaped
3285                         return new_character_token "\ufffd"
3286                 if c is '' # EOF
3287                         parse_error()
3288                         tok_state = tok_state_data
3289                         cur -= 1 # Reconsume
3290                         return
3291                 # Anything else
3292                 tok_state = tok_state_script_data_escaped
3293                 return new_character_token c
3294
3295         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3296         tok_state_script_data_escaped_less_than_sign = ->
3297                 c = txt.charAt(cur++)
3298                 if c is '/'
3299                         temporary_buffer = ''
3300                         tok_state = tok_state_script_data_escaped_end_tag_open
3301                         return
3302                 if is_uc_alpha(c)
3303                         temporary_buffer = c.toLowerCase() # yes, really
3304                         tok_state = tok_state_script_data_double_escape_start
3305                         return new_character_token "<#{c}" # fixfull split
3306                 if is_lc_alpha(c)
3307                         temporary_buffer = c
3308                         tok_state = tok_state_script_data_double_escape_start
3309                         return new_character_token "<#{c}" # fixfull split
3310                 # Anything else
3311                 tok_state = tok_state_script_data_escaped
3312                 cur -= 1 # Reconsume
3313                 return new_character_token c
3314
3315         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3316         tok_state_script_data_escaped_end_tag_open = ->
3317                 c = txt.charAt(cur++)
3318                 if is_uc_alpha(c)
3319                         tok_cur_tag = new_end_tag c.toLowerCase()
3320                         temporary_buffer += c
3321                         tok_state = tok_state_script_data_escaped_end_tag_name
3322                         return
3323                 if is_lc_alpha(c)
3324                         tok_cur_tag = new_end_tag c
3325                         temporary_buffer += c
3326                         tok_state = tok_state_script_data_escaped_end_tag_name
3327                         return
3328                 # Anything else
3329                 tok_state = tok_state_script_data_escaped
3330                 cur -= 1 # Reconsume
3331                 return new_character_token '</' # fixfull split
3332
3333         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3334         tok_state_script_data_escaped_end_tag_name = ->
3335                 c = txt.charAt(cur++)
3336                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3337                         if is_appropriate_end_tag tok_cur_tag
3338                                 tok_state = tok_state_before_attribute_name
3339                                 return
3340                         # fall through
3341                 if c is '/'
3342                         if is_appropriate_end_tag tok_cur_tag
3343                                 tok_state = tok_state_self_closing_start_tag
3344                                 return
3345                         # fall through
3346                 if c is '>'
3347                         if is_appropriate_end_tag tok_cur_tag
3348                                 tok_state = tok_state_data
3349                                 return tok_cur_tag
3350                         # fall through
3351                 if is_uc_alpha(c)
3352                         tok_cur_tag.name += c.toLowerCase()
3353                         temporary_buffer += c.toLowerCase()
3354                         return
3355                 if is_lc_alpha(c)
3356                         tok_cur_tag.name += c
3357                         temporary_buffer += c.toLowerCase()
3358                         return
3359                 # Anything else
3360                 tok_state = tok_state_script_data_escaped
3361                 cur -= 1 # Reconsume
3362                 return new_character_token "</#{temporary_buffer}" # fixfull split
3363
3364         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3365         tok_state_script_data_double_escape_start = ->
3366                 c = txt.charAt(cur++)
3367                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3368                         if temporary_buffer is 'script'
3369                                 tok_state = tok_state_script_data_double_escaped
3370                         else
3371                                 tok_state = tok_state_script_data_escaped
3372                         return new_character_token c
3373                 if is_uc_alpha(c)
3374                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3375                         return new_character_token c
3376                 if is_lc_alpha(c)
3377                         temporary_buffer += c
3378                         return new_character_token c
3379                 # Anything else
3380                 tok_state = tok_state_script_data_escaped
3381                 cur -= 1 # Reconsume
3382                 return
3383
3384         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3385         tok_state_script_data_double_escaped = ->
3386                 c = txt.charAt(cur++)
3387                 if c is '-'
3388                         tok_state = tok_state_script_data_double_escaped_dash
3389                         return new_character_token '-'
3390                 if c is '<'
3391                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3392                         return new_character_token '<'
3393                 if c is "\u0000"
3394                         parse_error()
3395                         return new_character_token "\ufffd"
3396                 if c is '' # EOF
3397                         parse_error()
3398                         tok_state = tok_state_data
3399                         cur -= 1 # Reconsume
3400                         return
3401                 # Anything else
3402                 return new_character_token c
3403
3404         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3405         tok_state_script_data_double_escaped_dash = ->
3406                 c = txt.charAt(cur++)
3407                 if c is '-'
3408                         tok_state = tok_state_script_data_double_escaped_dash_dash
3409                         return new_character_token '-'
3410                 if c is '<'
3411                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3412                         return new_character_token '<'
3413                 if c is "\u0000"
3414                         parse_error()
3415                         tok_state = tok_state_script_data_double_escaped
3416                         return new_character_token "\ufffd"
3417                 if c is '' # EOF
3418                         parse_error()
3419                         tok_state = tok_state_data
3420                         cur -= 1 # Reconsume
3421                         return
3422                 # Anything else
3423                 tok_state = tok_state_script_data_double_escaped
3424                 return new_character_token c
3425
3426         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3427         tok_state_script_data_double_escaped_dash_dash = ->
3428                 c = txt.charAt(cur++)
3429                 if c is '-'
3430                         return new_character_token '-'
3431                 if c is '<'
3432                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3433                         return new_character_token '<'
3434                 if c is '>'
3435                         tok_state = tok_state_script_data
3436                         return new_character_token '>'
3437                 if c is "\u0000"
3438                         parse_error()
3439                         tok_state = tok_state_script_data_double_escaped
3440                         return new_character_token "\ufffd"
3441                 if c is '' # EOF
3442                         parse_error()
3443                         tok_state = tok_state_data
3444                         cur -= 1 # Reconsume
3445                         return
3446                 # Anything else
3447                 tok_state = tok_state_script_data_double_escaped
3448                 return new_character_token c
3449
3450         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3451         tok_state_script_data_double_escaped_less_than_sign = ->
3452                 c = txt.charAt(cur++)
3453                 if c is '/'
3454                         temporary_buffer = ''
3455                         tok_state = tok_state_script_data_double_escape_end
3456                         return new_character_token '/'
3457                 # Anything else
3458                 tok_state = tok_state_script_data_double_escaped
3459                 cur -= 1 # Reconsume
3460                 return
3461
3462         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3463         tok_state_script_data_double_escape_end = ->
3464                 c = txt.charAt(cur++)
3465                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3466                         if temporary_buffer is 'script'
3467                                 tok_state = tok_state_script_data_escaped
3468                         else
3469                                 tok_state = tok_state_script_data_double_escaped
3470                         return new_character_token c
3471                 if is_uc_alpha(c)
3472                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3473                         return new_character_token c
3474                 if is_lc_alpha(c)
3475                         temporary_buffer += c
3476                         return new_character_token c
3477                 # Anything else
3478                 tok_state = tok_state_script_data_double_escaped
3479                 cur -= 1 # Reconsume
3480                 return
3481
3482         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3483         tok_state_before_attribute_name = ->
3484                 attr_name = null
3485                 switch c = txt.charAt(cur++)
3486                         when "\t", "\n", "\u000c", ' '
3487                                 return null
3488                         when '/'
3489                                 tok_state = tok_state_self_closing_start_tag
3490                                 return null
3491                         when '>'
3492                                 tok_state = tok_state_data
3493                                 tmp = tok_cur_tag
3494                                 tok_cur_tag = null
3495                                 return tmp
3496                         when "\u0000"
3497                                 parse_error()
3498                                 attr_name = "\ufffd"
3499                         when '"', "'", '<', '='
3500                                 parse_error()
3501                                 attr_name = c
3502                         when '' # EOF
3503                                 parse_error()
3504                                 tok_state = tok_state_data
3505                         else
3506                                 if is_uc_alpha(c)
3507                                         attr_name = c.toLowerCase()
3508                                 else
3509                                         attr_name = c
3510                 if attr_name?
3511                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3512                         tok_state = tok_state_attribute_name
3513                 return null
3514
3515         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3516         tok_state_attribute_name = ->
3517                 switch c = txt.charAt(cur++)
3518                         when "\t", "\n", "\u000c", ' '
3519                                 tok_state = tok_state_after_attribute_name
3520                         when '/'
3521                                 tok_state = tok_state_self_closing_start_tag
3522                         when '='
3523                                 tok_state = tok_state_before_attribute_value
3524                         when '>'
3525                                 tok_state = tok_state_data
3526                                 tmp = tok_cur_tag
3527                                 tok_cur_tag = null
3528                                 return tmp
3529                         when "\u0000"
3530                                 parse_error()
3531                                 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3532                         when '"', "'", '<'
3533                                 parse_error()
3534                                 tok_cur_tag.attrs_a[0][0] += c
3535                         when '' # EOF
3536                                 parse_error()
3537                                 tok_state = tok_state_data
3538                         else
3539                                 if is_uc_alpha(c)
3540                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3541                                 else
3542                                         tok_cur_tag.attrs_a[0][0] += c
3543                 return null
3544
3545         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3546         tok_state_after_attribute_name = ->
3547                 c = txt.charAt(cur++)
3548                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3549                         return
3550                 if c is '/'
3551                         tok_state = tok_state_self_closing_start_tag
3552                         return
3553                 if c is '='
3554                         tok_state = tok_state_before_attribute_value
3555                         return
3556                 if c is '>'
3557                         tok_state = tok_state_data
3558                         return
3559                 if is_uc_alpha(c)
3560                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3561                         tok_state = tok_state_attribute_name
3562                         return
3563                 if c is "\u0000"
3564                         parse_error()
3565                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3566                         tok_state = tok_state_attribute_name
3567                         return
3568                 if c is '' # EOF
3569                         parse_error()
3570                         tok_state = tok_state_data
3571                         cur -= 1 # reconsume
3572                         return
3573                 if c is '"' or c is "'" or c is '<'
3574                         parse_error()
3575                         # fall through to Anything else
3576                 # Anything else
3577                 tok_cur_tag.attrs_a.unshift [c, '']
3578                 tok_state = tok_state_attribute_name
3579
3580         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3581         tok_state_before_attribute_value = ->
3582                 switch c = txt.charAt(cur++)
3583                         when "\t", "\n", "\u000c", ' '
3584                                 return null
3585                         when '"'
3586                                 tok_state = tok_state_attribute_value_double_quoted
3587                         when '&'
3588                                 tok_state = tok_state_attribute_value_unquoted
3589                                 cur -= 1
3590                         when "'"
3591                                 tok_state = tok_state_attribute_value_single_quoted
3592                         when "\u0000"
3593                                 # Parse error
3594                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3595                                 tok_state = tok_state_attribute_value_unquoted
3596                         when '>'
3597                                 # Parse error
3598                                 tok_state = tok_state_data
3599                                 tmp = tok_cur_tag
3600                                 tok_cur_tag = null
3601                                 return tmp
3602                         when '' # EOF
3603                                 parse_error()
3604                                 tok_state = tok_state_data
3605                         else
3606                                 tok_cur_tag.attrs_a[0][1] += c
3607                                 tok_state = tok_state_attribute_value_unquoted
3608                 return null
3609
3610         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3611         tok_state_attribute_value_double_quoted = ->
3612                 switch c = txt.charAt(cur++)
3613                         when '"'
3614                                 tok_state = tok_state_after_attribute_value_quoted
3615                         when '&'
3616                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3617                         when "\u0000"
3618                                 # Parse error
3619                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3620                         when '' # EOF
3621                                 parse_error()
3622                                 tok_state = tok_state_data
3623                         else
3624                                 tok_cur_tag.attrs_a[0][1] += c
3625                 return null
3626
3627         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3628         tok_state_attribute_value_single_quoted = ->
3629                 switch c = txt.charAt(cur++)
3630                         when "'"
3631                                 tok_state = tok_state_after_attribute_value_quoted
3632                         when '&'
3633                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3634                         when "\u0000"
3635                                 # Parse error
3636                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3637                         when '' # EOF
3638                                 parse_error()
3639                                 tok_state = tok_state_data
3640                         else
3641                                 tok_cur_tag.attrs_a[0][1] += c
3642                 return null
3643
3644         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3645         tok_state_attribute_value_unquoted = ->
3646                 switch c = txt.charAt(cur++)
3647                         when "\t", "\n", "\u000c", ' '
3648                                 tok_state = tok_state_before_attribute_name
3649                         when '&'
3650                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3651                         when '>'
3652                                 tok_state = tok_state_data
3653                                 tmp = tok_cur_tag
3654                                 tok_cur_tag = null
3655                                 return tmp
3656                         when "\u0000"
3657                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3658                         when '' # EOF
3659                                 parse_error()
3660                                 tok_state = tok_state_data
3661                         else
3662                                 # Parse Error if ', <, = or ` (backtick)
3663                                 tok_cur_tag.attrs_a[0][1] += c
3664                 return null
3665
3666         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3667         tok_state_after_attribute_value_quoted = ->
3668                 switch c = txt.charAt(cur++)
3669                         when "\t", "\n", "\u000c", ' '
3670                                 tok_state = tok_state_before_attribute_name
3671                         when '/'
3672                                 tok_state = tok_state_self_closing_start_tag
3673                         when '>'
3674                                 tok_state = tok_state_data
3675                                 tmp = tok_cur_tag
3676                                 tok_cur_tag = null
3677                                 return tmp
3678                         when '' # EOF
3679                                 parse_error()
3680                                 tok_state = tok_state_data
3681                         else
3682                                 # Parse Error
3683                                 tok_state = tok_state_before_attribute_name
3684                                 cur -= 1 # we didn't handle that char
3685                 return null
3686
3687         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3688         tok_state_self_closing_start_tag = ->
3689                 c = txt.charAt(cur++)
3690                 if c is '>'
3691                         tok_cur_tag.flag 'self-closing'
3692                         tok_state = tok_state_data
3693                         return tok_cur_tag
3694                 if c is ''
3695                         parse_error()
3696                         tok_state = tok_state_data
3697                         cur -= 1 # Reconsume
3698                         return
3699                 # Anything else
3700                 parse_error()
3701                 tok_state = tok_state_before_attribute_name
3702                 cur -= 1 # Reconsume
3703                 return
3704
3705         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3706         # WARNING: put a comment token in tok_cur_tag before setting this state
3707         tok_state_bogus_comment = ->
3708                 next_gt = txt.indexOf '>', cur
3709                 if next_gt is -1
3710                         val = txt.substr cur
3711                         cur = txt.length
3712                 else
3713                         val = txt.substr cur, (next_gt - cur)
3714                         cur = next_gt + 1
3715                 val = val.replace "\u0000", "\ufffd"
3716                 tok_cur_tag.text += val
3717                 tok_state = tok_state_data
3718                 return tok_cur_tag
3719
3720         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3721         tok_state_markup_declaration_open = ->
3722                 if txt.substr(cur, 2) is '--'
3723                         cur += 2
3724                         tok_cur_tag = new_comment_token ''
3725                         tok_state = tok_state_comment_start
3726                         return
3727                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3728                         cur += 7
3729                         tok_state = tok_state_doctype
3730                         return
3731                 acn = adjusted_current_node()
3732                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3733                         cur += 7
3734                         tok_state = tok_state_cdata_section
3735                         return
3736                 # Otherwise
3737                 parse_error()
3738                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3739                 tok_state = tok_state_bogus_comment
3740                 return
3741
3742         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3743         tok_state_comment_start = ->
3744                 switch c = txt.charAt(cur++)
3745                         when '-'
3746                                 tok_state = tok_state_comment_start_dash
3747                         when "\u0000"
3748                                 parse_error()
3749                                 return new_character_token "\ufffd"
3750                         when '>'
3751                                 parse_error()
3752                                 tok_state = tok_state_data
3753                                 return tok_cur_tag
3754                         when '' # EOF
3755                                 parse_error()
3756                                 tok_state = tok_state_data
3757                                 cur -= 1 # Reconsume
3758                                 return tok_cur_tag
3759                         else
3760                                 tok_cur_tag.text += c
3761                 return null
3762
3763         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3764         tok_state_comment_start_dash = ->
3765                 switch c = txt.charAt(cur++)
3766                         when '-'
3767                                 tok_state = tok_state_comment_end
3768                         when "\u0000"
3769                                 parse_error()
3770                                 tok_cur_tag.text += "-\ufffd"
3771                                 tok_state = tok_state_comment
3772                         when '>'
3773                                 parse_error()
3774                                 tok_state = tok_state_data
3775                                 return tok_cur_tag
3776                         when '' # EOF
3777                                 parse_error()
3778                                 tok_state = tok_state_data
3779                                 cur -= 1 # Reconsume
3780                                 return tok_cur_tag
3781                         else
3782                                 tok_cur_tag.text += "-#{c}"
3783                                 tok_state = tok_state_comment
3784                 return null
3785
3786         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3787         tok_state_comment = ->
3788                 switch c = txt.charAt(cur++)
3789                         when '-'
3790                                 tok_state = tok_state_comment_end_dash
3791                         when "\u0000"
3792                                 parse_error()
3793                                 tok_cur_tag.text += "\ufffd"
3794                         when '' # EOF
3795                                 parse_error()
3796                                 tok_state = tok_state_data
3797                                 cur -= 1 # Reconsume
3798                                 return tok_cur_tag
3799                         else
3800                                 tok_cur_tag.text += c
3801                 return null
3802
3803         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3804         tok_state_comment_end_dash = ->
3805                 switch c = txt.charAt(cur++)
3806                         when '-'
3807                                 tok_state = tok_state_comment_end
3808                         when "\u0000"
3809                                 parse_error()
3810                                 tok_cur_tag.text += "-\ufffd"
3811                                 tok_state = tok_state_comment
3812                         when '' # EOF
3813                                 parse_error()
3814                                 tok_state = tok_state_data
3815                                 cur -= 1 # Reconsume
3816                                 return tok_cur_tag
3817                         else
3818                                 tok_cur_tag.text += "-#{c}"
3819                                 tok_state = tok_state_comment
3820                 return null
3821
3822         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3823         tok_state_comment_end = ->
3824                 switch c = txt.charAt(cur++)
3825                         when '>'
3826                                 tok_state = tok_state_data
3827                                 return tok_cur_tag
3828                         when "\u0000"
3829                                 parse_error()
3830                                 tok_cur_tag.text += "--\ufffd"
3831                                 tok_state = tok_state_comment
3832                         when '!'
3833                                 parse_error()
3834                                 tok_state = tok_state_comment_end_bang
3835                         when '-'
3836                                 parse_error()
3837                                 tok_cur_tag.text += '-'
3838                         when '' # EOF
3839                                 parse_error()
3840                                 tok_state = tok_state_data
3841                                 cur -= 1 # Reconsume
3842                                 return tok_cur_tag
3843                         else
3844                                 parse_error()
3845                                 tok_cur_tag.text += "--#{c}"
3846                                 tok_state = tok_state_comment
3847                 return null
3848
3849         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3850         tok_state_comment_end_bang = ->
3851                 switch c = txt.charAt(cur++)
3852                         when '-'
3853                                 tok_cur_tag.text += "--!#{c}"
3854                                 tok_state = tok_state_comment_end_dash
3855                         when '>'
3856                                 tok_state = tok_state_data
3857                                 return tok_cur_tag
3858                         when "\u0000"
3859                                 parse_error()
3860                                 tok_cur_tag.text += "--!\ufffd"
3861                                 tok_state = tok_state_comment
3862                         when '' # EOF
3863                                 parse_error()
3864                                 tok_state = tok_state_data
3865                                 cur -= 1 # Reconsume
3866                                 return tok_cur_tag
3867                         else
3868                                 tok_cur_tag.text += "--!#{c}"
3869                                 tok_state = tok_state_comment
3870                 return null
3871
3872         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3873         tok_state_doctype = ->
3874                 switch c = txt.charAt(cur++)
3875                         when "\t", "\u000a", "\u000c", ' '
3876                                 tok_state = tok_state_before_doctype_name
3877                         when '' # EOF
3878                                 parse_error()
3879                                 tok_state = tok_state_data
3880                                 el = new_doctype_token ''
3881                                 el.flag 'force-quirks', true
3882                                 cur -= 1 # Reconsume
3883                                 return el
3884                         else
3885                                 parse_error()
3886                                 tok_state = tok_state_before_doctype_name
3887                                 cur -= 1 # Reconsume
3888                 return null
3889
3890         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3891         tok_state_before_doctype_name = ->
3892                 c = txt.charAt(cur++)
3893                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3894                         return
3895                 if is_uc_alpha(c)
3896                         tok_cur_tag = new_doctype_token c.toLowerCase()
3897                         tok_state = tok_state_doctype_name
3898                         return
3899                 if c is "\u0000"
3900                         parse_error()
3901                         tok_cur_tag = new_doctype_token "\ufffd"
3902                         tok_state = tok_state_doctype_name
3903                         return
3904                 if c is '>'
3905                         parse_error()
3906                         el = new_doctype_token ''
3907                         el.flag 'force-quirks', true
3908                         tok_state = tok_state_data
3909                         return el
3910                 if c is '' # EOF
3911                         parse_error()
3912                         tok_state = tok_state_data
3913                         el = new_doctype_token ''
3914                         el.flag 'force-quirks', true
3915                         cur -= 1 # Reconsume
3916                         return el
3917                 # Anything else
3918                 tok_cur_tag = new_doctype_token c
3919                 tok_state = tok_state_doctype_name
3920                 return null
3921
3922         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3923         tok_state_doctype_name = ->
3924                 c = txt.charAt(cur++)
3925                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3926                         tok_state = tok_state_after_doctype_name
3927                         return
3928                 if c is '>'
3929                         tok_state = tok_state_data
3930                         return tok_cur_tag
3931                 if is_uc_alpha(c)
3932                         tok_cur_tag.name += c.toLowerCase()
3933                         return
3934                 if c is "\u0000"
3935                         parse_error()
3936                         tok_cur_tag.name += "\ufffd"
3937                         return
3938                 if c is '' # EOF
3939                         parse_error()
3940                         tok_state = tok_state_data
3941                         tok_cur_tag.flag 'force-quirks', true
3942                         cur -= 1 # Reconsume
3943                         return tok_cur_tag
3944                 # Anything else
3945                 tok_cur_tag.name += c
3946                 return null
3947
3948         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3949         tok_state_after_doctype_name = ->
3950                 c = txt.charAt(cur++)
3951                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3952                         return
3953                 if c is '>'
3954                         tok_state = tok_state_data
3955                         return tok_cur_tag
3956                 if c is '' # EOF
3957                         parse_error()
3958                         tok_state = tok_state_data
3959                         tok_cur_tag.flag 'force-quirks', true
3960                         cur -= 1 # Reconsume
3961                         return tok_cur_tag
3962                 # Anything else
3963                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3964                         cur += 5
3965                         tok_state = tok_state_after_doctype_public_keyword
3966                         return
3967                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3968                         cur += 5
3969                         tok_state = tok_state_after_doctype_system_keyword
3970                         return
3971                 parse_error()
3972                 tok_cur_tag.flag 'force-quirks', true
3973                 tok_state = tok_state_bogus_doctype
3974                 return null
3975
3976         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3977         tok_state_after_doctype_public_keyword = ->
3978                 c = txt.charAt(cur++)
3979                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3980                         tok_state = tok_state_before_doctype_public_identifier
3981                         return
3982                 if c is '"'
3983                         parse_error()
3984                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3985                         tok_state = tok_state_doctype_public_identifier_double_quoted
3986                         return
3987                 if c is "'"
3988                         parse_error()
3989                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3990                         tok_state = tok_state_doctype_public_identifier_single_quoted
3991                         return
3992                 if c is '>'
3993                         parse_error()
3994                         tok_cur_tag.flag 'force-quirks', true
3995                         tok_state = tok_state_data
3996                         return tok_cur_tag
3997                 if c is '' # EOF
3998                         parse_error()
3999                         tok_state = tok_state_data
4000                         tok_cur_tag.flag 'force-quirks', true
4001                         cur -= 1 # Reconsume
4002                         return tok_cur_tag
4003                 # Anything else
4004                 parse_error()
4005                 tok_cur_tag.flag 'force-quirks', true
4006                 tok_state = tok_state_bogus_doctype
4007                 return null
4008
4009         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4010         tok_state_before_doctype_public_identifier = ->
4011                 c = txt.charAt(cur++)
4012                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4013                         return
4014                 if c is '"'
4015                         parse_error()
4016                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4017                         tok_state = tok_state_doctype_public_identifier_double_quoted
4018                         return
4019                 if c is "'"
4020                         parse_error()
4021                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
4022                         tok_state = tok_state_doctype_public_identifier_single_quoted
4023                         return
4024                 if c is '>'
4025                         parse_error()
4026                         tok_cur_tag.flag 'force-quirks', true
4027                         tok_state = tok_state_data
4028                         return tok_cur_tag
4029                 if c is '' # EOF
4030                         parse_error()
4031                         tok_state = tok_state_data
4032                         tok_cur_tag.flag 'force-quirks', true
4033                         cur -= 1 # Reconsume
4034                         return tok_cur_tag
4035                 # Anything else
4036                 parse_error()
4037                 tok_cur_tag.flag 'force-quirks', true
4038                 tok_state = tok_state_bogus_doctype
4039                 return null
4040
4041
4042         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4043         tok_state_doctype_public_identifier_double_quoted = ->
4044                 c = txt.charAt(cur++)
4045                 if c is '"'
4046                         tok_state = tok_state_after_doctype_public_identifier
4047                         return
4048                 if c is "\u0000"
4049                         parse_error()
4050                         tok_cur_tag.public_identifier += "\ufffd"
4051                         return
4052                 if c is '>'
4053                         parse_error()
4054                         tok_cur_tag.flag 'force-quirks', true
4055                         tok_state = tok_state_data
4056                         return tok_cur_tag
4057                 if c is '' # EOF
4058                         parse_error()
4059                         tok_state = tok_state_data
4060                         tok_cur_tag.flag 'force-quirks', true
4061                         cur -= 1 # Reconsume
4062                         return tok_cur_tag
4063                 # Anything else
4064                 tok_cur_tag.public_identifier += c
4065                 return null
4066
4067         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4068         tok_state_doctype_public_identifier_single_quoted = ->
4069                 c = txt.charAt(cur++)
4070                 if c is "'"
4071                         tok_state = tok_state_after_doctype_public_identifier
4072                         return
4073                 if c is "\u0000"
4074                         parse_error()
4075                         tok_cur_tag.public_identifier += "\ufffd"
4076                         return
4077                 if c is '>'
4078                         parse_error()
4079                         tok_cur_tag.flag 'force-quirks', true
4080                         tok_state = tok_state_data
4081                         return tok_cur_tag
4082                 if c is '' # EOF
4083                         parse_error()
4084                         tok_state = tok_state_data
4085                         tok_cur_tag.flag 'force-quirks', true
4086                         cur -= 1 # Reconsume
4087                         return tok_cur_tag
4088                 # Anything else
4089                 tok_cur_tag.public_identifier += c
4090                 return null
4091
4092         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4093         tok_state_after_doctype_public_identifier = ->
4094                 c = txt.charAt(cur++)
4095                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4096                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4097                         return
4098                 if c is '>'
4099                         tok_state = tok_state_data
4100                         return tok_cur_tag
4101                 if c is '"'
4102                         parse_error()
4103                         tok_cur_tag.system_identifier = ''
4104                         tok_state = tok_state_doctype_system_identifier_double_quoted
4105                         return
4106                 if c is "'"
4107                         parse_error()
4108                         tok_cur_tag.system_identifier = ''
4109                         tok_state = tok_state_doctype_system_identifier_single_quoted
4110                         return
4111                 if c is '' # EOF
4112                         parse_error()
4113                         tok_state = tok_state_data
4114                         tok_cur_tag.flag 'force-quirks', true
4115                         cur -= 1 # Reconsume
4116                         return tok_cur_tag
4117                 # Anything else
4118                 parse_error()
4119                 tok_cur_tag.flag 'force-quirks', true
4120                 tok_state = tok_state_bogus_doctype
4121                 return null
4122
4123         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4124         tok_state_between_doctype_public_and_system_identifiers = ->
4125                 c = txt.charAt(cur++)
4126                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4127                         return
4128                 if c is '>'
4129                         tok_state = tok_state_data
4130                         return tok_cur_tag
4131                 if c is '"'
4132                         parse_error()
4133                         tok_cur_tag.system_identifier = ''
4134                         tok_state = tok_state_doctype_system_identifier_double_quoted
4135                         return
4136                 if c is "'"
4137                         parse_error()
4138                         tok_cur_tag.system_identifier = ''
4139                         tok_state = tok_state_doctype_system_identifier_single_quoted
4140                         return
4141                 if c is '' # EOF
4142                         parse_error()
4143                         tok_state = tok_state_data
4144                         tok_cur_tag.flag 'force-quirks', true
4145                         cur -= 1 # Reconsume
4146                         return tok_cur_tag
4147                 # Anything else
4148                 parse_error()
4149                 tok_cur_tag.flag 'force-quirks', true
4150                 tok_state = tok_state_bogus_doctype
4151                 return null
4152
4153         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4154         tok_state_after_doctype_system_keyword = ->
4155                 c = txt.charAt(cur++)
4156                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4157                         tok_state = tok_state_before_doctype_system_identifier
4158                         return
4159                 if c is '"'
4160                         parse_error()
4161                         tok_cur_tag.system_identifier = ''
4162                         tok_state = tok_state_doctype_system_identifier_double_quoted
4163                         return
4164                 if c is "'"
4165                         parse_error()
4166                         tok_cur_tag.system_identifier = ''
4167                         tok_state = tok_state_doctype_system_identifier_single_quoted
4168                         return
4169                 if c is '>'
4170                         parse_error()
4171                         tok_cur_tag.flag 'force-quirks', true
4172                         tok_state = tok_state_data
4173                         return tok_cur_tag
4174                 if c is '' # EOF
4175                         parse_error()
4176                         tok_state = tok_state_data
4177                         tok_cur_tag.flag 'force-quirks', true
4178                         cur -= 1 # Reconsume
4179                         return tok_cur_tag
4180                 # Anything else
4181                 parse_error()
4182                 tok_cur_tag.flag 'force-quirks', true
4183                 tok_state = tok_state_bogus_doctype
4184                 return null
4185
4186         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4187         tok_state_before_doctype_system_identifier = ->
4188                 c = txt.charAt(cur++)
4189                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4190                         return
4191                 if c is '"'
4192                         tok_cur_tag.system_identifier = ''
4193                         tok_state = tok_state_doctype_system_identifier_double_quoted
4194                         return
4195                 if c is "'"
4196                         tok_cur_tag.system_identifier = ''
4197                         tok_state = tok_state_doctype_system_identifier_single_quoted
4198                         return
4199                 if c is '>'
4200                         parse_error()
4201                         tok_cur_tag.flag 'force-quirks', true
4202                         tok_state = tok_state_data
4203                         return tok_cur_tag
4204                 if c is '' # EOF
4205                         parse_error()
4206                         tok_state = tok_state_data
4207                         tok_cur_tag.flag 'force-quirks', true
4208                         cur -= 1 # Reconsume
4209                         return tok_cur_tag
4210                 # Anything else
4211                 parse_error()
4212                 tok_cur_tag.flag 'force-quirks', true
4213                 tok_state = tok_state_bogus_doctype
4214                 return null
4215
4216         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4217         tok_state_doctype_system_identifier_double_quoted = ->
4218                 c = txt.charAt(cur++)
4219                 if c is '"'
4220                         tok_state = tok_state_after_doctype_system_identifier
4221                         return
4222                 if c is "\u0000"
4223                         parse_error()
4224                         tok_cur_tag.system_identifier += "\ufffd"
4225                         return
4226                 if c is '>'
4227                         parse_error()
4228                         tok_cur_tag.flag 'force-quirks', true
4229                         tok_state = tok_state_data
4230                         return tok_cur_tag
4231                 if c is '' # EOF
4232                         parse_error()
4233                         tok_state = tok_state_data
4234                         tok_cur_tag.flag 'force-quirks', true
4235                         cur -= 1 # Reconsume
4236                         return tok_cur_tag
4237                 # Anything else
4238                 tok_cur_tag.system_identifier += c
4239                 return null
4240
4241         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4242         tok_state_doctype_system_identifier_single_quoted = ->
4243                 c = txt.charAt(cur++)
4244                 if c is "'"
4245                         tok_state = tok_state_after_doctype_system_identifier
4246                         return
4247                 if c is "\u0000"
4248                         parse_error()
4249                         tok_cur_tag.system_identifier += "\ufffd"
4250                         return
4251                 if c is '>'
4252                         parse_error()
4253                         tok_cur_tag.flag 'force-quirks', true
4254                         tok_state = tok_state_data
4255                         return tok_cur_tag
4256                 if c is '' # EOF
4257                         parse_error()
4258                         tok_state = tok_state_data
4259                         tok_cur_tag.flag 'force-quirks', true
4260                         cur -= 1 # Reconsume
4261                         return tok_cur_tag
4262                 # Anything else
4263                 tok_cur_tag.system_identifier += c
4264                 return null
4265
4266         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4267         tok_state_after_doctype_system_identifier = ->
4268                 c = txt.charAt(cur++)
4269                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4270                         return
4271                 if c is '>'
4272                         tok_state = tok_state_data
4273                         return tok_cur_tag
4274                 if c is '' # EOF
4275                         parse_error()
4276                         tok_state = tok_state_data
4277                         tok_cur_tag.flag 'force-quirks', true
4278                         cur -= 1 # Reconsume
4279                         return tok_cur_tag
4280                 # Anything else
4281                 parse_error()
4282                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4283                 tok_state = tok_state_bogus_doctype
4284                 return null
4285
4286         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4287         tok_state_bogus_doctype = ->
4288                 c = txt.charAt(cur++)
4289                 if c is '>'
4290                         tok_state = tok_state_data
4291                         return tok_cur_tag
4292                 if c is '' # EOF
4293                         tok_state = tok_state_data
4294                         cur -= 1 # Reconsume
4295                         return tok_cur_tag
4296                 # Anything else
4297                 return null
4298
4299         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4300         tok_state_cdata_section = ->
4301                 tok_state = tok_state_data
4302                 next_gt = txt.indexOf ']]>', cur
4303                 if next_gt is -1
4304                         val = txt.substr cur
4305                         cur = txt.length
4306                 else
4307                         val = txt.substr cur, (next_gt - cur)
4308                         cur = next_gt + 3
4309                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4310                 val = val.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4311                 val = val.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4312                 return new_character_token val # fixfull split
4313
4314         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4315         # Don't set this as a state, just call it
4316         # returns a string (NOT a text node)
4317         parse_character_reference = (allowed_char = null, in_attr = false) ->
4318                 if cur >= txt.length
4319                         return '&'
4320                 switch c = txt.charAt(cur)
4321                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4322                                 # explicitly not a parse error
4323                                 return '&'
4324                         when ';'
4325                                 # there has to be "one or more" alnums between & and ; to be a parse error
4326                                 return '&'
4327                         when '#'
4328                                 if cur + 1 >= txt.length
4329                                         return '&'
4330                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4331                                         prefix = '#x'
4332                                         charset = hex_chars
4333                                         start = cur + 2
4334                                 else
4335                                         charset = digits
4336                                         start = cur + 1
4337                                         prefix = '#'
4338                                 i = 0
4339                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4340                                         i += 1
4341                                 if i is 0
4342                                         return '&'
4343                                 if txt.charAt(start + i) is ';'
4344                                         i += 1
4345                                 # FIXME This is supposed to generate parse errors for some chars
4346                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
4347                                 if decoded?
4348                                         cur = start + i
4349                                         return decoded
4350                                 return '&'
4351                         else
4352                                 for i in [0...31]
4353                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4354                                                 break
4355                                 if i is 0
4356                                         # exit early, because parse_error() below needs at least one alnum
4357                                         return '&'
4358                                 if txt.charAt(cur + i) is ';'
4359                                         i += 1 # include ';' terminator in value
4360                                         decoded = decode_named_char_ref txt.substr(cur, i)
4361                                         if decoded?
4362                                                 cur += i
4363                                                 return decoded
4364                                         parse_error()
4365                                         return '&'
4366                                 else
4367                                         # no ';' terminator (only legacy char refs)
4368                                         max = i
4369                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4370                                                 c = legacy_char_refs[txt.substr(cur, i)]
4371                                                 if c?
4372                                                         if in_attr
4373                                                                 if txt.charAt(cur + i) is '='
4374                                                                         # "because some legacy user agents will
4375                                                                         # misinterpret the markup in those cases"
4376                                                                         parse_error()
4377                                                                         return '&'
4378                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4379                                                                         # this makes attributes forgiving about url args
4380                                                                         return '&'
4381                                                         # ok, and besides the weird exceptions for attributes...
4382                                                         # return the matching char
4383                                                         cur += i # consume entity chars
4384                                                         parse_error() # because no terminating ";"
4385                                                         return c
4386                                         parse_error()
4387                                         return '&'
4388                 return # never reached
4389
4390         # tree constructor initialization
4391         # see comments on TYPE_TAG/etc for the structure of this data
4392         txt = args.html
4393         cur = 0
4394         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4395         open_els = []
4396         afe = [] # active formatting elements
4397         template_ins_modes = []
4398         ins_mode = ins_mode_initial
4399         original_ins_mode = ins_mode # TODO check spec
4400         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4401         flag_frameset_ok = true
4402         flag_parsing = true
4403         flag_foster_parenting = false
4404         form_element_pointer = null
4405         temporary_buffer = null
4406         pending_table_character_tokens = []
4407         head_element_pointer = null
4408         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4409         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4410
4411         # tokenizer initialization
4412         tok_state = tok_state_data
4413
4414         # proccess input
4415         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4416         while flag_parsing
4417                 t = tok_state()
4418                 if t?
4419                         process_token t
4420                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4421         return doc.children
4422
4423 serialize_els = (els, shallow, show_ids) ->
4424         serialized = ''
4425         sep = ''
4426         for t in els
4427                 serialized += sep
4428                 sep = ','
4429                 serialized += t.serialize shallow, show_ids
4430         return serialized
4431
4432 # TODO export TYPE_*
4433 module.exports.parse_html = parse_html
4434 module.exports.debug_log_reset = debug_log_reset
4435 module.exports.debug_log_each = debug_log_each
4436 module.exports.TYPE_TAG = TYPE_TAG
4437 module.exports.TYPE_TEXT = TYPE_TEXT
4438 module.exports.TYPE_COMMENT = TYPE_COMMENT
4439 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4440 module.exports.NS_HTML = NS_HTML
4441 module.exports.NS_MATHML = NS_MATHML
4442 module.exports.NS_SVG = NS_SVG