parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor.
  20
  21 # The implementation is a pretty direct implementation of the parsing algorithm
  22 # described here:
  23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
  24 #
  25 # Deviations from that spec:
  26 #
  27 #   Purposeful: search this file for "WHATWG"
  28 #
  29 #   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
  30
  31
  32 # stacks/lists
  33 #
# the spec uses many different words to indicate which ends of lists/stacks
# they are talking about (and relative movement within the lists/stacks). This
# section explains them. I'm implementing both "lists" (afe and open_els) the
# same way (both as stacks)
  38 #
  39 # stacks grow downward (current element is index=0)
  40 #
  41 # example: open_els = [a, b, c, d, e, f, g]
  42 #
  43 # "grows downwards" means it's visualized like this: (index: el, names)
  44 #
  45 #   6: g "start of the list", "topmost", "first"
  46 #   5: f
  47 #   4: e "previous" (to d), "above", "before"
  48 #   3: d   (previous/next are relative to this element)
  49 #   2: c "next", "after", "lower", "below"
  50 #   1: b
  51 #   0: a "end of the list", "current node", "bottommost", "last"
  52
  53
  54 # browser
  55 # note: to get this to run outside a browser, you'll have to write a native
  56 # implementation of decode_named_char_ref()
  57 unless module?.exports?
  58         window.wheic = {}
  59         module = exports: window.wheic
  60
  61 from_code_point = (x) ->
  62         if String.fromCodePoint?
  63                 return String.fromCodePoint x
  64         else
  65                 if x <= 0xffff
  66                         return String.fromCharCode x
  67                 x -= 0x10000
  68                 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
  69
  70 # Each node is an obect of the Node class. Here are the Node types:
  71 TYPE_TAG = 0 # name, {attributes}, [children]
  72 TYPE_TEXT = 1 # "text"
  73 TYPE_COMMENT = 2
  74 TYPE_DOCTYPE = 3
  75 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  77 TYPE_END_TAG = 5 # name
  78 TYPE_EOF = 6
  79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  81
  82 # namespace constants
  83 NS_HTML = 1
  84 NS_MATHML = 2
  85 NS_SVG = 3
  86
  87 # quirks mode constants
  88 QUIRKS_NO = 1
  89 QUIRKS_LIMITED = 2
  90 QUIRKS_YES = 3
  91
  92 g_debug_log = []
  93 debug_log_reset = ->
  94         g_debug_log = []
  95 debug_log = (str) ->
  96         g_debug_log.push str
  97 debug_log_each = (cb) ->
  98         for str in g_debug_log
  99                 cb str
 100
 101 prev_node_id = 0
 102 class Node
 103         constructor: (type, args = {}) ->
 104                 @type = type # one of the TYPE_* constants above
 105                 @name = args.name ? '' # tag name
 106                 @text = args.text ? '' # contents for text/comment nodes
 107                 @attrs = args.attrs ? {}
 108                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
 109                 @children = args.children ? []
 110                 @namespace = args.namespace ? NS_HTML
 111                 @parent = args.parent ? null
 112                 @token = args.token ? null
 113                 @flags = args.flags ? {}
 114                 if args.id?
 115                         @id = "#{args.id}+"
 116                 else
 117                         @id = "#{++prev_node_id}"
 118         acknowledge_self_closing: ->
 119                 if @token?
 120                         @token.flag 'did_self_close', true
 121                 else
 122                         @flag 'did_self_close', true
 123         flag: (key, value = null) ->
 124                 if value?
 125                         @flags[key] = value
 126                 else
 127                         return @flags[key]
 128         serialize: (shallow = false, show_ids = false) -> # for unit tests
 129                 ret = ''
 130                 switch @type
 131                         when TYPE_TAG
 132                                 ret += 'tag:'
 133                                 ret += JSON.stringify @name
 134                                 ret += ','
 135                                 if show_ids
 136                                         ret += "##{@id},"
 137                                 if shallow
 138                                         break
 139                                 attr_keys = []
 140                                 for k of @attrs
 141                                         attr_keys.push k
 142                                 attr_keys.sort()
 143                                 ret += '{'
 144                                 sep = ''
 145                                 for k in attr_keys
 146                                         ret += sep
 147                                         sep = ','
 148                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
 149                                 ret += '},['
 150                                 sep = ''
 151                                 for c in @children
 152                                         ret += sep
 153                                         sep = ','
 154                                         ret += c.serialize shallow, show_ids
 155                                 ret += ']'
 156                         when TYPE_TEXT
 157                                 ret += 'text:'
 158                                 ret += JSON.stringify @text
 159                         when TYPE_COMMENT
 160                                 ret += 'comment:'
 161                                 ret += JSON.stringify @text
 162                         when TYPE_DOCTYPE
 163                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
 164                         when TYPE_AFE_MARKER
 165                                 ret += 'marker'
 166                         when TYPE_AAA_BOOKMARK
 167                                 ret += 'aaa_bookmark'
 168                         else
 169                                 ret += 'unknown:'
 170                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 171                 return ret
 172
 173 # helpers: (only take args that are normally known when parser creates nodes)
 174 new_open_tag = (name) ->
 175         return new Node TYPE_START_TAG, name: name
 176 new_end_tag = (name) ->
 177         return new Node TYPE_END_TAG, name: name
 178 new_element = (name) ->
 179         return new Node TYPE_TAG, name: name
 180 new_text_node = (txt) ->
 181         return new Node TYPE_TEXT, text: txt
 182 new_character_token = new_text_node
 183 new_comment_token = (txt) ->
 184         return new Node TYPE_COMMENT, text: txt
 185 new_doctype_token = (name) ->
 186         return new Node TYPE_DOCTYPE, name: name
 187 new_eof_token = ->
 188         return new Node TYPE_EOF
 189 new_afe_marker = ->
 190         return new Node TYPE_AFE_MARKER
 191 new_aaa_bookmark = ->
 192         return new Node TYPE_AAA_BOOKMARK
 193
 194 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 195 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 196 digits = "0123456789"
 197 alnum = lc_alpha + uc_alpha + digits
 198 hex_chars = digits + "abcdefABCDEF"
 199
 200 is_uc_alpha = (str) ->
 201         return str.length is 1 and uc_alpha.indexOf(str) > -1
 202 is_lc_alpha = (str) ->
 203         return str.length is 1 and lc_alpha.indexOf(str) > -1
 204
 205 # some SVG elements have dashes in them
 206 tag_name_chars = alnum + "-"
 207
 208 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 209 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 210 is_space = (txt) ->
 211         return txt.length is 1 and space_chars.indexOf(txt) > -1
 212 is_space_tok = (t) ->
 213         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 214
 215 is_input_hidden_tok = (t) ->
 216         return false unless t.type is TYPE_START_TAG
 217         for a in t.attrs_a
 218                 if a[0] is 'type'
 219                         if a[1].toLowerCase() is 'hidden'
 220                                 return true
 221                         return false
 222         return false
 223
 224 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 225 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 226
 227 unicode_fixes = {}
 228 unicode_fixes[0x00] = "\uFFFD"
 229 unicode_fixes[0x80] = "\u20AC"
 230 unicode_fixes[0x82] = "\u201A"
 231 unicode_fixes[0x83] = "\u0192"
 232 unicode_fixes[0x84] = "\u201E"
 233 unicode_fixes[0x85] = "\u2026"
 234 unicode_fixes[0x86] = "\u2020"
 235 unicode_fixes[0x87] = "\u2021"
 236 unicode_fixes[0x88] = "\u02C6"
 237 unicode_fixes[0x89] = "\u2030"
 238 unicode_fixes[0x8A] = "\u0160"
 239 unicode_fixes[0x8B] = "\u2039"
 240 unicode_fixes[0x8C] = "\u0152"
 241 unicode_fixes[0x8E] = "\u017D"
 242 unicode_fixes[0x91] = "\u2018"
 243 unicode_fixes[0x92] = "\u2019"
 244 unicode_fixes[0x93] = "\u201C"
 245 unicode_fixes[0x94] = "\u201D"
 246 unicode_fixes[0x95] = "\u2022"
 247 unicode_fixes[0x96] = "\u2013"
 248 unicode_fixes[0x97] = "\u2014"
 249 unicode_fixes[0x98] = "\u02DC"
 250 unicode_fixes[0x99] = "\u2122"
 251 unicode_fixes[0x9A] = "\u0161"
 252 unicode_fixes[0x9B] = "\u203A"
 253 unicode_fixes[0x9C] = "\u0153"
 254 unicode_fixes[0x9E] = "\u017E"
 255 unicode_fixes[0x9F] = "\u0178"
 256
 257 quirks_yes_pi_prefixes = [
 258         "+//silmaril//dtd html pro v0r11 19970101//"
 259         "-//as//dtd html 3.0 aswedit + extensions//"
 260         "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
 261         "-//ietf//dtd html 2.0 level 1//"
 262         "-//ietf//dtd html 2.0 level 2//"
 263         "-//ietf//dtd html 2.0 strict level 1//"
 264         "-//ietf//dtd html 2.0 strict level 2//"
 265         "-//ietf//dtd html 2.0 strict//"
 266         "-//ietf//dtd html 2.0//"
 267         "-//ietf//dtd html 2.1e//"
 268         "-//ietf//dtd html 3.0//"
 269         "-//ietf//dtd html 3.2 final//"
 270         "-//ietf//dtd html 3.2//"
 271         "-//ietf//dtd html 3//"
 272         "-//ietf//dtd html level 0//"
 273         "-//ietf//dtd html level 1//"
 274         "-//ietf//dtd html level 2//"
 275         "-//ietf//dtd html level 3//"
 276         "-//ietf//dtd html strict level 0//"
 277         "-//ietf//dtd html strict level 1//"
 278         "-//ietf//dtd html strict level 2//"
 279         "-//ietf//dtd html strict level 3//"
 280         "-//ietf//dtd html strict//"
 281         "-//ietf//dtd html//"
 282         "-//metrius//dtd metrius presentational//"
 283         "-//microsoft//dtd internet explorer 2.0 html strict//"
 284         "-//microsoft//dtd internet explorer 2.0 html//"
 285         "-//microsoft//dtd internet explorer 2.0 tables//"
 286         "-//microsoft//dtd internet explorer 3.0 html strict//"
 287         "-//microsoft//dtd internet explorer 3.0 html//"
 288         "-//microsoft//dtd internet explorer 3.0 tables//"
 289         "-//netscape comm. corp.//dtd html//"
 290         "-//netscape comm. corp.//dtd strict html//"
 291         "-//o'reilly and associates//dtd html 2.0//"
 292         "-//o'reilly and associates//dtd html extended 1.0//"
 293         "-//o'reilly and associates//dtd html extended relaxed 1.0//"
 294         "-//sq//dtd html 2.0 hotmetal + extensions//"
 295         "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
 296         "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
 297         "-//spyglass//dtd html 2.0 extended//"
 298         "-//sun microsystems corp.//dtd hotjava html//"
 299         "-//sun microsystems corp.//dtd hotjava strict html//"
 300         "-//w3c//dtd html 3 1995-03-24//"
 301         "-//w3c//dtd html 3.2 draft//"
 302         "-//w3c//dtd html 3.2 final//"
 303         "-//w3c//dtd html 3.2//"
 304         "-//w3c//dtd html 3.2s draft//"
 305         "-//w3c//dtd html 4.0 frameset//"
 306         "-//w3c//dtd html 4.0 transitional//"
 307         "-//w3c//dtd html experimental 19960712//"
 308         "-//w3c//dtd html experimental 970421//"
 309         "-//w3c//dtd w3 html//"
 310         "-//w3o//dtd w3 html 3.0//"
 311         "-//webtechs//dtd mozilla html 2.0//"
 312         "-//webtechs//dtd mozilla html//"
 313 ]
 314
 315 # These are the character references that don't need a terminating semicolon
 316 # min length: 2, max: 6, none are a prefix of any other.
 317 legacy_char_refs = {
 318         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 319         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 320         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 321         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 322         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 323         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 324         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 325         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 326         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 327         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 328         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 329         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 330         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 331         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 332         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 333         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 334         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 335         yen: '¥', yuml: 'ÿ'
 336 }
 337
 338 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 339 raw_text_elements = ['script', 'style']
 340 escapable_raw_text_elements = ['textarea', 'title']
 341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 342 svg_elements = [
 343         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 344         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 345         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 346         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 347         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 348         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 349         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 350         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 351         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 352         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 353         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 354         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 355         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 356         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 357         'view', 'vkern'
 358 ]
 359
 360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 361 mathml_elements = [
 362         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 363         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 364         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 365         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 366         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 367         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 368         'determinant', 'diff', 'divergence', 'divide', 'domain',
 369         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 370         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 371         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 372         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 373         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 374         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 375         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 376         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 377         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 378         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 379         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 380         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 381         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 382         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 383         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 384         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 385         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 386         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 387         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 388         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 389         'vectorproduct', 'xor'
 390 ]
 391 # foreign_elements = [svg_elements..., mathml_elements...]
 392 #normal_elements = All other allowed HTML elements are normal elements.
 393
 394 special_elements = {
 395         # HTML:
 396         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 397         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 398         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 399         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 400         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 401         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 402         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 403         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 404         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 405         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 406         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
 407
 408         menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
 409
 410         meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
 411         noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
 412         plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
 413         select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
 414         table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
 415         textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
 416         tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
 417
 418         # MathML:
 419         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 420         'annotation-xml':NS_MATHML,
 421
 422         # SVG:
 423         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 424 }
 425
 426 formatting_elements = {
 427          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 428          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 429          u: true
 430 }
 431
 432 mathml_text_integration = {
 433         mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
 434 }
 435 is_mathml_text_integration_point = (el) ->
 436         return mathml_text_integration[el.name] is el.namespace
 437 is_html_integration = (el) -> # DON'T PASS A TOKEN
 438         if el.namespace is NS_MATHML
 439                 if el.name is 'annotation-xml'
 440                         if el.attrs.encoding?
 441                                 if el.attrs.encoding.toLowerCase() is 'text/html'
 442                                         return true
 443                                 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
 444                                         return true
 445                 return false
 446         if el.namespace is NS_SVG
 447                 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
 448                         return true
 449         return false
 450
 451 h_tags = {
 452         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
 453 }
 454
 455 foster_parenting_targets = {
 456         table: NS_HTML
 457         tbody: NS_HTML
 458         tfoot: NS_HTML
 459         thead: NS_HTML
 460         tr: NS_HTML
 461 }
 462
 463 end_tag_implied = {
 464         dd: NS_HTML
 465         dt: NS_HTML
 466         li: NS_HTML
 467         option: NS_HTML
 468         optgroup: NS_HTML
 469         p: NS_HTML
 470         rb: NS_HTML
 471         rp: NS_HTML
 472         rt: NS_HTML
 473         rtc: NS_HTML
 474 }
 475
 476 el_is_special = (e) ->
 477         return special_elements[e.name] is e.namespace
 478
 479 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
 480 el_is_special_not_adp = (el) ->
 481         return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
 482
 483 svg_name_fixes = {
 484         altglyph: 'altGlyph'
 485         altglyphdef: 'altGlyphDef'
 486         altglyphitem: 'altGlyphItem'
 487         animatecolor: 'animateColor'
 488         animatemotion: 'animateMotion'
 489         animatetransform: 'animateTransform'
 490         clippath: 'clipPath'
 491         feblend: 'feBlend'
 492         fecolormatrix: 'feColorMatrix'
 493         fecomponenttransfer: 'feComponentTransfer'
 494         fecomposite: 'feComposite'
 495         feconvolvematrix: 'feConvolveMatrix'
 496         fediffuselighting: 'feDiffuseLighting'
 497         fedisplacementmap: 'feDisplacementMap'
 498         fedistantlight: 'feDistantLight'
 499         fedropshadow: 'feDropShadow'
 500         feflood: 'feFlood'
 501         fefunca: 'feFuncA'
 502         fefuncb: 'feFuncB'
 503         fefuncg: 'feFuncG'
 504         fefuncr: 'feFuncR'
 505         fegaussianblur: 'feGaussianBlur'
 506         feimage: 'feImage'
 507         femerge: 'feMerge'
 508         femergenode: 'feMergeNode'
 509         femorphology: 'feMorphology'
 510         feoffset: 'feOffset'
 511         fepointlight: 'fePointLight'
 512         fespecularlighting: 'feSpecularLighting'
 513         fespotlight: 'feSpotLight'
 514         fetile: 'feTile'
 515         feturbulence: 'feTurbulence'
 516         foreignobject: 'foreignObject'
 517         glyphref: 'glyphRef'
 518         lineargradient: 'linearGradient'
 519         radialgradient: 'radialGradient'
 520         textpath: 'textPath'
 521 }
 522 svg_attribute_fixes = {
 523         attributename: 'attributeName'
 524         attributetype: 'attributeType'
 525         basefrequency: 'baseFrequency'
 526         baseprofile: 'baseProfile'
 527         calcmode: 'calcMode'
 528         clippathunits: 'clipPathUnits'
 529         contentscripttype: 'contentScriptType'
 530         contentstyletype: 'contentStyleType'
 531         diffuseconstant: 'diffuseConstant'
 532         edgemode: 'edgeMode'
 533         externalresourcesrequired: 'externalResourcesRequired'
 534         # WHATWG removes this: filterres: 'filterRes'
 535         filterunits: 'filterUnits'
 536         glyphref: 'glyphRef'
 537         gradienttransform: 'gradientTransform'
 538         gradientunits: 'gradientUnits'
 539         kernelmatrix: 'kernelMatrix'
 540         kernelunitlength: 'kernelUnitLength'
 541         keypoints: 'keyPoints'
 542         keysplines: 'keySplines'
 543         keytimes: 'keyTimes'
 544         lengthadjust: 'lengthAdjust'
 545         limitingconeangle: 'limitingConeAngle'
 546         markerheight: 'markerHeight'
 547         markerunits: 'markerUnits'
 548         markerwidth: 'markerWidth'
 549         maskcontentunits: 'maskContentUnits'
 550         maskunits: 'maskUnits'
 551         numoctaves: 'numOctaves'
 552         pathlength: 'pathLength'
 553         patterncontentunits: 'patternContentUnits'
 554         patterntransform: 'patternTransform'
 555         patternunits: 'patternUnits'
 556         pointsatx: 'pointsAtX'
 557         pointsaty: 'pointsAtY'
 558         pointsatz: 'pointsAtZ'
 559         preservealpha: 'preserveAlpha'
 560         preserveaspectratio: 'preserveAspectRatio'
 561         primitiveunits: 'primitiveUnits'
 562         refx: 'refX'
 563         refy: 'refY'
 564         repeatcount: 'repeatCount'
 565         repeatdur: 'repeatDur'
 566         requiredextensions: 'requiredExtensions'
 567         requiredfeatures: 'requiredFeatures'
 568         specularconstant: 'specularConstant'
 569         specularexponent: 'specularExponent'
 570         spreadmethod: 'spreadMethod'
 571         startoffset: 'startOffset'
 572         stddeviation: 'stdDeviation'
 573         stitchtiles: 'stitchTiles'
 574         surfacescale: 'surfaceScale'
 575         systemlanguage: 'systemLanguage'
 576         tablevalues: 'tableValues'
 577         targetx: 'targetX'
 578         targety: 'targetY'
 579         textlength: 'textLength'
 580         viewbox: 'viewBox'
 581         viewtarget: 'viewTarget'
 582         xchannelselector: 'xChannelSelector'
 583         ychannelselector: 'yChannelSelector'
 584         zoomandpan: 'zoomAndPan'
 585 }
 586 foreign_attr_fixes = {
 587         'xlink:actuate': 'xlink actuate'
 588         'xlink:arcrole': 'xlink arcrole'
 589         'xlink:href': 'xlink href'
 590         'xlink:role': 'xlink role'
 591         'xlink:show': 'xlink show'
 592         'xlink:title': 'xlink title'
 593         'xlink:type': 'xlink type'
 594         'xml:base': 'xml base'
 595         'xml:lang': 'xml lang'
 596         'xml:space': 'xml space'
 597         'xmlns': 'xmlns'
 598         'xmlns:xlink': 'xmlns xlink'
 599 }
 600 adjust_mathml_attributes = (t) ->
 601         for a in t.attrs_a
 602                 if a[0] is 'definitionurl'
 603                         a[0] = 'definitionURL'
 604         return
 605 adjust_svg_attributes = (t) ->
 606         for a in t.attrs_a
 607                 if svg_attribute_fixes[a[0]]?
 608                         a[0] = svg_attribute_fixes[a[0]]
 609         return
 610 adjust_foreign_attributes = (t) ->
 611         # fixfull
 612         for a in t.attrs_a
 613                 if foreign_attr_fixes[a[0]]?
 614                         a[0] = foreign_attr_fixes[a[0]]
 615         return
 616
 617 # decode_named_char_ref()
 618 #
 619 # The list of named character references is _huge_ so ask the browser to decode
 620 # for us instead of wasting bandwidth/space on including the table here.
 621 #
 622 # Pass without the "&" but with the ";" examples:
 623 #    for "&amp" pass "amp;"
 624 #    for "&#x2032" pass "x2032;"
 625 g_dncr = {
 626         cache: {}
 627         textarea: document.createElement('textarea')
 628 }
 629 # TODO test this in IE8
 630 decode_named_char_ref = (txt) ->
 631         txt = "&#{txt}"
 632         decoded = g_dncr.cache[txt]
 633         return decoded if decoded?
 634         g_dncr.textarea.innerHTML = txt
 635         decoded = g_dncr.textarea.value
 636         return null if decoded is txt
 637         return g_dncr.cache[txt] = decoded
 638
 639 parse_html = (args) ->
 640         txt = null
 641         cur = null # index of next char in txt to be parsed
 642         # declare doc and tokenizer variables so they're in scope below
 643         doc = null
 644         open_els = null # stack of open elements
 645         afe = null # active formatting elements
 646         template_ins_modes = null
 647         ins_mode = null
 648         original_ins_mode = null
 649         tok_state = null
 650         tok_cur_tag = null # partially parsed tag
 651         flag_scripting = null
 652         flag_frameset_ok = null
 653         flag_parsing = null
 654         flag_foster_parenting = null
 655         form_element_pointer = null
 656         temporary_buffer = null
 657         pending_table_character_tokens = null
 658         head_element_pointer = null
 659         flag_fragment_parsing = null
 660         context_element = null
 661
 662         stop_parsing = ->
 663                 flag_parsing = false
 664
 665         parse_error = ->
 666                 if args.error_cb?
 667                         args.error_cb cur
 668                 else
 669                         console.log "Parse error at character #{cur} of #{txt.length}"
 670
 671         afe_push = (new_el) ->
 672                 matches = 0
 673                 for el, i in afe
 674                         if el.name is new_el.name and el.namespace is new_el.namespace
 675                                 for k, v of el.attrs
 676                                         continue unless new_el.attrs[k] is v
 677                                 for k, v of new_el.attrs
 678                                         continue unless el.attrs[k] is v
 679                                 matches += 1
 680                                 if matches is 3
 681                                         afe.splice i, 1
 682                                         break
 683                 afe.unshift new_el
 684         afe_push_marker = ->
 685                 afe.unshift new_afe_marker()
 686
        # the functions below implement the Tree Construction algorithm
        # http://www.w3.org/TR/html5/syntax.html#tree-construction
 689
 690         # But first... the helpers
 691         template_tag_is_open = ->
 692                 for el in open_els
 693                         if el.name is 'template' and el.namespace is NS_HTML
 694                                 return true
 695                 return false
 696         is_in_scope_x = (tag_name, scope, namespace) ->
 697                 for el in open_els
 698                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
 699                                 return true
 700                         if scope[el.name] is el.namespace
 701                                 return false
 702                 return false
 703         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 704                 for el in open_els
 705                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
 706                                 return true
 707                         if scope[el.name] is el.namespace
 708                                 return false
 709                         if scope2[el.name] is el.namespace
 710                                 return false
 711                 return false
 712         standard_scopers = {
 713                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
 714                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
 715                 template: NS_HTML,
 716
 717                 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
 718                 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
 719
 720                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
 721         }
 722         button_scopers = button: NS_HTML
 723         li_scopers = ol: NS_HTML, ul: NS_HTML
 724         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 725         is_in_scope = (tag_name, namespace = null) ->
 726                 return is_in_scope_x tag_name, standard_scopers, namespace
 727         is_in_button_scope = (tag_name, namespace = null) ->
 728                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 729         is_in_table_scope = (tag_name, namespace = null) ->
 730                 return is_in_scope_x tag_name, table_scopers, namespace
 731         # aka is_in_list_item_scope
 732         is_in_li_scope = (tag_name, namespace = null) ->
 733                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
 734         is_in_select_scope = (tag_name, namespace = null) ->
 735                 for t in open_els
 736                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 737                                 return true
 738                         if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
 739                                 return false
 740                 return false
 741         # this checks for a particular element, not by name
 742         # this requires a namespace match
 743         el_is_in_scope = (needle) ->
 744                 for el in open_els
 745                         if el is needle
 746                                 return true
 747                         if standard_scopers[el.name] is el.namespace
 748                                 return false
 749                 return false
 750
 751         clear_to_table_stopers = {
 752                 'table': true
 753                 'template': true
 754                 'html': true
 755         }
 756         clear_stack_to_table_context = ->
 757                 loop
 758                         if clear_to_table_stopers[open_els[0].name]?
 759                                 break
 760                         open_els.shift()
 761                 return
 762         clear_to_table_body_stopers = {
 763                 tbody: NS_HTML
 764                 tfoot: NS_HTML
 765                 thead: NS_HTML
 766                 template: NS_HTML
 767                 html: NS_HTML
 768         }
 769         clear_stack_to_table_body_context = ->
 770                 loop
 771                         if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
 772                                 break
 773                         open_els.shift()
 774                 return
 775         clear_to_table_row_stopers = {
 776                 'tr': true
 777                 'template': true
 778                 'html': true
 779         }
 780         clear_stack_to_table_row_context = ->
 781                 loop
 782                         if clear_to_table_row_stopers[open_els[0].name]?
 783                                 break
 784                         open_els.shift()
 785                 return
 786         clear_afe_to_marker = ->
 787                 loop
 788                         return unless afe.length > 0 # this happens in fragment case, ?spec error
 789                         el = afe.shift()
 790                         if el.type is TYPE_AFE_MARKER
 791                                 return
 792                 return
 793
 794         # 8.2.3.1 ...
 795         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
 796         reset_ins_mode = ->
 797                 # 1. Let last be false.
 798                 last = false
 799                 # 2. Let node be the last node in the stack of open elements.
 800                 node_i = 0
 801                 node = open_els[node_i]
 802                 # 3. Loop: If node is the first node in the stack of open elements,
 803                 # then set last to true, and, if the parser was originally created as
 804                 # part of the HTML fragment parsing algorithm (fragment case) set node
 805                 # to the context element.
 806                 loop
 807                         if node_i is open_els.length - 1
 808                                 last = true
 809                                 # fixfull (fragment case)
 810
 811                         # 4. If node is a select element, run these substeps:
 812                         if node.name is 'select' and node.namespace is NS_HTML
 813                                 # 1. If last is true, jump to the step below labeled done.
 814                                 unless last
 815                                         # 2. Let ancestor be node.
 816                                         ancestor_i = node_i
 817                                         ancestor = node
 818                                         # 3. Loop: If ancestor is the first node in the stack of
 819                                         # open elements, jump to the step below labeled done.
 820                                         loop
 821                                                 if ancestor_i is open_els.length - 1
 822                                                         break
 823                                                 # 4. Let ancestor be the node before ancestor in the stack
 824                                                 # of open elements.
 825                                                 ancestor_i += 1
 826                                                 ancestor = open_els[ancestor_i]
 827                                                 # 5. If ancestor is a template node, jump to the step below
 828                                                 # labeled done.
 829                                                 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
 830                                                         break
 831                                                 # 6. If ancestor is a table node, switch the insertion mode
 832                                                 # to "in select in table" and abort these steps.
 833                                                 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
 834                                                         ins_mode = ins_mode_in_select_in_table
 835                                                         return
 836                                                 # 7. Jump back to the step labeled loop.
 837                                 # 8. Done: Switch the insertion mode to "in select" and abort
 838                                 # these steps.
 839                                 ins_mode = ins_mode_in_select
 840                                 return
 841                         # 5. If node is a td or th element and last is false, then switch
 842                         # the insertion mode to "in cell" and abort these steps.
 843                         if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
 844                                 ins_mode = ins_mode_in_cell
 845                                 return
 846                         # 6. If node is a tr element, then switch the insertion mode to "in
 847                         # row" and abort these steps.
 848                         if node.name is 'tr' and node.namespace is NS_HTML
 849                                 ins_mode = ins_mode_in_row
 850                                 return
 851                         # 7. If node is a tbody, thead, or tfoot element, then switch the
 852                         # insertion mode to "in table body" and abort these steps.
 853                         if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
 854                                 ins_mode = ins_mode_in_table_body
 855                                 return
 856                         # 8. If node is a caption element, then switch the insertion mode
 857                         # to "in caption" and abort these steps.
 858                         if node.name is 'caption' and node.namespace is NS_HTML
 859                                 ins_mode = ins_mode_in_caption
 860                                 return
 861                         # 9. If node is a colgroup element, then switch the insertion mode
 862                         # to "in column group" and abort these steps.
 863                         if node.name is 'colgroup' and node.namespace is NS_HTML
 864                                 ins_mode = ins_mode_in_column_group
 865                                 return
 866                         # 10. If node is a table element, then switch the insertion mode to
 867                         # "in table" and abort these steps.
 868                         if node.name is 'table' and node.namespace is NS_HTML
 869                                 ins_mode = ins_mode_in_table
 870                                 return
 871                         # 11. If node is a template element, then switch the insertion mode
 872                         # to the current template insertion mode and abort these steps.
 873                         if node.name is 'template' and node.namespace is NS_HTML
 874                                 ins_mode = template_ins_modes[0]
 875                                 return
 876                         # 12. If node is a head element and last is true, then switch the
 877                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
 878                         # these steps. (fragment case)
 879                         if node.name is 'head' and node.namespace is NS_HTML and last
 880                                 ins_mode = ins_mode_in_body
 881                                 return
 882                         # 13. If node is a head element and last is false, then switch the
 883                         # insertion mode to "in head" and abort these steps.
 884                         if node.name is 'head' and node.namespace is NS_HTML and last is false
 885                                 ins_mode = ins_mode_in_head
 886                                 return
 887                         # 14. If node is a body element, then switch the insertion mode to
 888                         # "in body" and abort these steps.
 889                         if node.name is 'body' and node.namespace is NS_HTML
 890                                 ins_mode = ins_mode_in_body
 891                                 return
 892                         # 15. If node is a frameset element, then switch the insertion mode
 893                         # to "in frameset" and abort these steps. (fragment case)
 894                         if node.name is 'frameset' and node.namespace is NS_HTML
 895                                 ins_mode = ins_mode_in_frameset
 896                                 return
 897                         # 16. If node is an html element, run these substeps:
 898                         if node.name is 'html' and node.namespace is NS_HTML
 899                                 # 1. If the head element pointer is null, switch the insertion
 900                                 # mode to "before head" and abort these steps. (fragment case)
 901                                 if head_element_pointer is null
 902                                         ins_mode = ins_mode_before_head
 903                                 else
 904                                         # 2. Otherwise, the head element pointer is not null,
 905                                         # switch the insertion mode to "after head" and abort these
 906                                         # steps.
 907                                         ins_mode = ins_mode_after_head
 908                                 return
 909                         # 17. If last is true, then switch the insertion mode to "in body"
 910                         # and abort these steps. (fragment case)
 911                         if last
 912                                 ins_mode = ins_mode_in_body
 913                                 return
 914                         # 18. Let node now be the node before node in the stack of open
 915                         # elements.
 916                         node_i += 1
 917                         node = open_els[node_i]
 918                         # 19. Return to the step labeled loop.
 919
 920         # 8.2.3.2
 921
 922         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
 923         adjusted_current_node = ->
 924                 if open_els.length is 1 and flag_fragment_parsing
 925                         return context_element
 926                 return open_els[0]
 927
 928         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 929         # this implementation is structured (mostly) as described at the link above.
 930         # capitalized comments are the "labels" described at the link above.
 931         reconstruct_afe = ->
 932                 return if afe.length is 0
 933                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
 934                         return
 935                 # Rewind
 936                 i = 0
 937                 loop
 938                         if i is afe.length - 1
 939                                 break
 940                         i += 1
 941                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
 942                                 i -= 1 # Advance
 943                                 break
 944                 # Create
 945                 loop
 946                         el = insert_html_element afe[i].token
 947                         afe[i] = el
 948                         break if i is 0
 949                         i -= 1 # Advance
 950
 951         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 952         # adoption agency algorithm
 953         # overview here:
 954         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 955         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 956         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
 957         adoption_agency = (subject) ->
 958                 debug_log "adoption_agency()"
 959                 debug_log "tree: #{serialize_els doc.children, false, true}"
 960                 debug_log "open_els: #{serialize_els open_els, true, true}"
 961                 debug_log "afe: #{serialize_els afe, true, true}"
 962 # this block implements the W3C spec (commented out in favor of the WHATWG version below)
 963 #               # 1. If the current node is an HTML element whose tag name is subject,
 964 #               # then run these substeps:
 965 #               #
 966 #               # 1. Let element be the current node.
 967 #               #
 968 #               # 2. Pop element off the stack of open elements.
 969 #               #
 970 #               # 3. If element is also in the list of active formatting elements,
 971 #               # remove the element from the list.
 972 #               #
 973 #               # 4. Abort the adoption agency algorithm.
 974 #               if open_els[0].name is subject and open_els[0].namespace is NS_HTML
 975 #                       el = open_els.shift()
 976 #                       # remove it from the list of active formatting elements (if found)
 977 #                       for t, i in afe
 978 #                               if t is el
 979 #                                       afe.splice i, 1
 980 #                                       break
 981 #                       debug_log "aaa: starting off with subject on top of stack, exiting"
 982 #                       return
 983 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 984                 # If the current node is an HTML element whose tag name is subject, and
 985                 # the current node is not in the list of active formatting elements,
 986                 # then pop the current node off the stack of open elements, and abort
 987                 # these steps.
 988                 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
 989                         debug_log "aaa: starting off with subject on top of stack, exiting"
 990                         # remove it from the list of active formatting elements (if found)
 991                         in_afe = false
 992                         for el, i in afe
 993                                 if el is open_els[0]
 994                                         in_afe = true
 995                                         break
 996                         unless in_afe
 997                                 debug_log "aaa: ...and not in afe, aaa done"
 998                                 open_els.shift()
 999                                 return
1000                         # fall through
1001 # END WHATWG
1002                 outer = 0
1003                 loop
1004                         if outer >= 8
1005                                 return
1006                         outer += 1
1007                         # 5. Let formatting element be the last element in the list of
1008                         # active formatting elements that: is between the end of the list
1009                         # and the last scope marker in the list, if any, or the start of
1010                         # the list otherwise, and  has the tag name subject.
1011                         fe = null
1012                         for t, fe_of_afe in afe
1013                                 if t.type is TYPE_AFE_MARKER
1014                                         break
1015                                 if t.name is subject
1016                                         fe = t
1017                                         break
1018                         # If there is no such element, then abort these steps and instead
1019                         # act as described in the "any other end tag" entry above.
1020                         if fe is null
1021                                 debug_log "aaa: fe not found in afe"
1022                                 in_body_any_other_end_tag subject
1023                                 return
1024                         # 6. If formatting element is not in the stack of open elements,
1025                         # then this is a parse error; remove the element from the list, and
1026                         # abort these steps.
1027                         in_open_els = false
1028                         for t, fe_of_open_els in open_els
1029                                 if t is fe
1030                                         in_open_els = true
1031                                         break
1032                         unless in_open_els
1033                                 debug_log "aaa: fe not found in open_els"
1034                                 parse_error()
1035                                 # "remove it from the list" must mean afe, since it's not in open_els
1036                                 afe.splice fe_of_afe, 1
1037                                 return
1038                         # 7. If formatting element is in the stack of open elements, but
1039                         # the element is not in scope, then this is a parse error; abort
1040                         # these steps.
1041                         unless el_is_in_scope fe
1042                                 debug_log "aaa: fe not in scope"
1043                                 parse_error()
1044                                 return
1045                         # 8. If formatting element is not the current node, this is a parse
1046                         # error. (But do not abort these steps.)
1047                         unless open_els[0] is fe
1048                                 parse_error()
1049                                 # continue
1050                         # 9. Let furthest block be the topmost node in the stack of open
1051                         # elements that is lower in the stack than formatting element, and
1052                         # is an element in the special category. There might not be one.
1053                         fb = null
1054                         fb_of_open_els = null
1055                         for t, i in open_els
1056                                 if t is fe
1057                                         break
1058                                 if el_is_special t
1059                                         fb = t
1060                                         fb_of_open_els = i
1061                                         # and continue, to see if there's one that's more "topmost"
1062                         # 10. If there is no furthest block, then the UA must first pop all
1063                         # the nodes from the bottom of the stack of open elements, from the
1064                         # current node up to and including formatting element, then remove
1065                         # formatting element from the list of active formatting elements,
1066                         # and finally abort these steps.
1067                         if fb is null
1068                                 debug_log "aaa: no fb"
1069                                 loop
1070                                         t = open_els.shift()
1071                                         if t is fe
1072                                                 afe.splice fe_of_afe, 1
1073                                                 return
1074                         # 11. Let common ancestor be the element immediately above
1075                         # formatting element in the stack of open elements.
1076                         ca = open_els[fe_of_open_els + 1] # common ancestor
1077
1078                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1079                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1080                         bookmark = new_aaa_bookmark()
1081                         for t, i in afe
1082                                 if t is fe
1083                                         afe.splice i, 0, bookmark
1084                                         break
1085                         node = last_node = fb
1086                         inner = 0
1087                         loop
1088                                 inner += 1
1089                                 # 3. Let node be the element immediately above node in the
1090                                 # stack of open elements, or if node is no longer in the stack
1091                                 # of open elements (e.g. because it got removed by this
1092                                 # algorithm), the element that was immediately above node in
1093                                 # the stack of open elements before node was removed.
1094                                 node_next = null
1095                                 for t, i in open_els
1096                                         if t is node
1097                                                 node_next = open_els[i + 1]
1098                                                 break
1099                                 node = node_next ? node_above
1100                                 debug_log "inner loop #{inner}"
1101                                 debug_log "tree: #{serialize_els doc.children, false, true}"
1102                                 debug_log "open_els: #{serialize_els open_els, true, true}"
1103                                 debug_log "afe: #{serialize_els afe, true, true}"
1104                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1105                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1106                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1107                                 debug_log "node: #{node.serialize true, true}"
1108                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
1109
1110                                 # 4. If node is formatting element, then go to the next step in
1111                                 # the overall algorithm.
1112                                 if node is fe
1113                                         break
1114                                 debug_log "the meat"
1115                                 # 5. If inner loop counter is greater than three and node is in
1116                                 # the list of active formatting elements, then remove node from
1117                                 # the list of active formatting elements.
1118                                 node_in_afe = false
1119                                 for t, i in afe
1120                                         if t is node
1121                                                 if inner > 3
1122                                                         afe.splice i, 1
1123                                                         debug_log "max out inner"
1124                                                 else
1125                                                         node_in_afe = true
1126                                                         debug_log "in afe"
1127                                                 break
1128                                 # 6. If node is not in the list of active formatting elements,
1129                                 # then remove node from the stack of open elements and then go
1130                                 # back to the step labeled inner loop.
1131                                 unless node_in_afe
1132                                         debug_log "not in afe"
1133                                         for t, i in open_els
1134                                                 if t is node
1135                                                         node_above = open_els[i + 1]
1136                                                         open_els.splice i, 1
1137                                                         break
1138                                         continue
1139                                 debug_log "the bones"
1140                                 # 7. create an element for the token for which the element node
1141                                 # was created, in the HTML namespace, with common ancestor as
1142                                 # the intended parent; replace the entry for node in the list
1143                                 # of active formatting elements with an entry for the new
1144                                 # element, replace the entry for node in the stack of open
1145                                 # elements with an entry for the new element, and let node be
1146                                 # the new element.
1147                                 new_node = token_to_element node.token, NS_HTML, ca
1148                                 for t, i in afe
1149                                         if t is node
1150                                                 afe[i] = new_node
1151                                                 debug_log "replaced in afe"
1152                                                 break
1153                                 for t, i in open_els
1154                                         if t is node
1155                                                 node_above = open_els[i + 1]
1156                                                 open_els[i] = new_node
1157                                                 debug_log "replaced in open_els"
1158                                                 break
1159                                 node = new_node
1160                                 # 8. If last node is furthest block, then move the
1161                                 # aforementioned bookmark to be immediately after the new node
1162                                 # in the list of active formatting elements.
1163                                 if last_node is fb
1164                                         for t, i in afe
1165                                                 if t is bookmark
1166                                                         afe.splice i, 1
1167                                                         debug_log "removed bookmark"
1168                                                         break
1169                                         for t, i in afe
1170                                                 if t is node
1171                                                         # "after" means lower
1172                                                         afe.splice i, 0, bookmark # "after as <-
1173                                                         debug_log "placed bookmark after node"
1174                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1175                                                         break
1176                                 # 9. Insert last node into node, first removing it from its
1177                                 # previous parent node if any.
1178                                 if last_node.parent?
1179                                         debug_log "last_node has parent"
1180                                         for c, i in last_node.parent.children
1181                                                 if c is last_node
1182                                                         debug_log "removing last_node from parent"
1183                                                         last_node.parent.children.splice i, 1
1184                                                         break
1185                                 node.children.push last_node
1186                                 last_node.parent = node
1187                                 # 10. Let last node be node.
1188                                 last_node = node
1189                                 debug_log "at last"
1190                                 # 11. Return to the step labeled inner loop.
1191                         # 14. Insert whatever last node ended up being in the previous step
1192                         # at the appropriate place for inserting a node, but using common
1193                         # ancestor as the override target.
1194
1195                         # In the case where fe is immediately followed by fb:
1196                         #   * inner loop exits out early (node==fe)
1197                         #   * last_node is fb
1198                         #   * last_node is still in the tree (not a duplicate)
1199                         if last_node.parent?
1200                                 debug_log "FEFIRST? last_node has parent"
1201                                 for c, i in last_node.parent.children
1202                                         if c is last_node
1203                                                 debug_log "removing last_node from parent"
1204                                                 last_node.parent.children.splice i, 1
1205                                                 break
1206
1207                         debug_log "after aaa inner loop"
1208                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1209                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1210                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1211                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1212                         debug_log "tree: #{serialize_els doc.children, false, true}"
1213
1214                         debug_log "insert"
1215
1216
1217                         # can't use standard insert token thing, because it's already in
1218                         # open_els and must stay at it's current position in open_els
1219                         dest = adjusted_insertion_location ca
1220                         dest[0].children.splice dest[1], 0, last_node
1221                         last_node.parent = dest[0]
1222
1223
1224                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1225                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1226                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1227                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1228                         debug_log "tree: #{serialize_els doc.children, false, true}"
1229
1230                         # 15. Create an element for the token for which formatting element
1231                         # was created, in the HTML namespace, with furthest block as the
1232                         # intended parent.
1233                         new_element = token_to_element fe.token, NS_HTML, fb
1234                         # 16. Take all of the child nodes of furthest block and append them
1235                         # to the element created in the last step.
1236                         while fb.children.length
1237                                 t = fb.children.shift()
1238                                 t.parent = new_element
1239                                 new_element.children.push t
1240                         # 17. Append that new element to furthest block.
1241                         new_element.parent = fb
1242                         fb.children.push new_element
1243                         # 18. Remove formatting element from the list of active formatting
1244                         # elements, and insert the new element into the list of active
1245                         # formatting elements at the position of the aforementioned
1246                         # bookmark.
1247                         for t, i in afe
1248                                 if t is fe
1249                                         afe.splice i, 1
1250                                         break
1251                         for t, i in afe
1252                                 if t is bookmark
1253                                         afe[i] = new_element
1254                                         break
1255                         # 19. Remove formatting element from the stack of open elements,
1256                         # and insert the new element into the stack of open elements
1257                         # immediately below the position of furthest block in that stack.
1258                         for t, i in open_els
1259                                 if t is fe
1260                                         open_els.splice i, 1
1261                                         break
1262                         for t, i in open_els
1263                                 if t is fb
1264                                         open_els.splice i, 0, new_element
1265                                         break
1266                         # 20. Jump back to the step labeled outer loop.
1267                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1268                         debug_log "tree: #{serialize_els doc.children, false, true}"
1269                         debug_log "open_els: #{serialize_els open_els, true, true}"
1270                         debug_log "afe: #{serialize_els afe, true, true}"
1271                 debug_log "AAA DONE"
1272
1273         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1274         close_p_element = ->
1275                 generate_implied_end_tags 'p' # arg is exception
1276                 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1277                         parse_error()
1278                 while open_els.length > 1 # just in case
1279                         el = open_els.shift()
1280                         if el.name is 'p' and el.namespace is NS_HTML
1281                                 return
1282         close_p_if_in_button_scope = ->
1283                 if is_in_button_scope 'p', NS_HTML
1284                         close_p_element()
1285
1286         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1287         # aka insert_a_character = (t) ->
1288         insert_character = (t) ->
1289                 dest = adjusted_insertion_location()
1290                 # fixfull check for Document node
1291                 if dest[1] > 0
1292                         prev = dest[0].children[dest[1] - 1]
1293                         if prev.type is TYPE_TEXT
1294                                 prev.text += t.text
1295                                 return
1296                 dest[0].children.splice dest[1], 0, t
1297
1298
1299         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1300         process_token = (t) ->
1301                 acn = adjusted_current_node()
1302                 unless acn?
1303                         ins_mode t
1304                         return
1305                 if acn.namespace is NS_HTML
1306                         ins_mode t
1307                         return
1308                 if is_mathml_text_integration_point(acn)
1309                         if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1310                                 ins_mode t
1311                                 return
1312                         if t.type is TYPE_TEXT
1313                                 ins_mode t
1314                                 return
1315                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1316                         ins_mode t
1317                         return
1318                 if is_html_integration acn
1319                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1320                                 ins_mode t
1321                                 return
1322                 if t.type is TYPE_EOF
1323                         ins_mode t
1324                         return
1325                 in_foreign_content t
1326                 return
1327
1328         # 8.2.5.1
1329         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1330         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1331         adjusted_insertion_location = (override_target = null) ->
1332                 # 1. If there was an override target specified, then let target be the
1333                 # override target.
1334                 if override_target?
1335                         target = override_target
1336                 else # Otherwise, let target be the current node.
1337                         target = open_els[0]
1338                 # 2. Determine the adjusted insertion location using the first matching
1339                 # steps from the following list:
1340                 #
1341                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1342                 # thead, or tr element Foster parenting happens when content is
1343                 # misnested in tables.
1344                 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1345                         loop # once. this is here so we can ``break`` to "abort these substeps"
1346                                 # 1. Let last template be the last template element in the
1347                                 # stack of open elements, if any.
1348                                 last_template = null
1349                                 last_template_i = null
1350                                 for el, i in open_els
1351                                         if el.name is 'template' and el.namespace is NS_HTML
1352                                                 last_template = el
1353                                                 last_template_i = i
1354                                                 break
1355                                 # 2. Let last table be the last table element in the stack of
1356                                 # open elements, if any.
1357                                 last_table = null
1358                                 last_table_i
1359                                 for el, i in open_els
1360                                         if el.name is 'table' and el.namespace is NS_HTML
1361                                                 last_table = el
1362                                                 last_table_i = i
1363                                                 break
1364                                 # 3. If there is a last template and either there is no last
1365                                 # table, or there is one, but last template is lower (more
1366                                 # recently added) than last table in the stack of open
1367                                 # elements, then: let adjusted insertion location be inside
1368                                 # last template's template contents, after its last child (if
1369                                 # any), and abort these substeps.
1370                                 if last_template and (last_table is null or last_template_i < last_table_i)
1371                                         target = last_template # fixfull should be it's contents
1372                                         target_i = target.children.length
1373                                         break
1374                                 # 4. If there is no last table, then let adjusted insertion
1375                                 # location be inside the first element in the stack of open
1376                                 # elements (the html element), after its last child (if any),
1377                                 # and abort these substeps. (fragment case)
1378                                 if last_table is null
1379                                         # this is odd
1380                                         target = open_els[open_els.length - 1]
1381                                         target_i = target.children.length
1382                                         break
1383                                 # 5. If last table has a parent element, then let adjusted
1384                                 # insertion location be inside last table's parent element,
1385                                 # immediately before last table, and abort these substeps.
1386                                 if last_table.parent?
1387                                         for c, i in last_table.parent.children
1388                                                 if c is last_table
1389                                                         target = last_table.parent
1390                                                         target_i = i
1391                                                         break
1392                                         break
1393                                 # 6. Let previous element be the element immediately above last
1394                                 # table in the stack of open elements.
1395                                 #
1396                                 # huh? how could it not have a parent?
1397                                 previous_element = open_els[last_table_i + 1]
1398                                 # 7. Let adjusted insertion location be inside previous
1399                                 # element, after its last child (if any).
1400                                 target = previous_element
1401                                 target_i = target.children.length
1402                                 # Note: These steps are involved in part because it's possible
1403                                 # for elements, the table element in this case in particular,
1404                                 # to have been moved by a script around in the DOM, or indeed
1405                                 # removed from the DOM entirely, after the element was inserted
1406                                 # by the parser.
1407                                 break # don't really loop
1408                 else
1409                         # Otherwise Let adjusted insertion location be inside target, after
1410                         # its last child (if any).
1411                         target_i = target.children.length
1412
1413                 # 3. If the adjusted insertion location is inside a template element,
1414                 # let it instead be inside the template element's template contents,
1415                 # after its last child (if any).
1416                 # fixfull (template)
1417
1418                 # 4. Return the adjusted insertion location.
1419                 return [target, target_i]
1420
1421         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1422         # aka create_an_element_for_token
1423         token_to_element = (t, namespace, intended_parent) ->
1424                 # convert attributes into a hash
1425                 attrs = {}
1426                 for a in t.attrs_a
1427                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1428                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1429
1430                 # TODO 2. If the newly created element has an xmlns attribute in the
1431                 # XMLNS namespace whose value is not exactly the same as the element's
1432                 # namespace, that is a parse error. Similarly, if the newly created
1433                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1434                 # value is not the XLink Namespace, that is a parse error.
1435
1436                 # fixfull: the spec says stuff about form pointers and ownerDocument
1437
1438                 return el
1439
1440         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1441         insert_foreign_element = (token, namespace) ->
1442                 ail = adjusted_insertion_location()
1443                 ail_el = ail[0]
1444                 ail_i = ail[1]
1445                 el = token_to_element token, namespace, ail_el
1446                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1447                 el.parent = ail_el
1448                 ail_el.children.splice ail_i, 0, el
1449                 open_els.unshift el
1450                 return el
1451         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1452         insert_html_element = (token) ->
1453                 insert_foreign_element token, NS_HTML
1454
1455         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1456         # position should be [node, index_within_children]
1457         insert_comment = (t, position = null) ->
1458                 position ?= adjusted_insertion_location()
1459                 position[0].children.splice position[1], 0, t
1460
1461         # 8.2.5.2
1462         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1463         parse_generic_raw_text = (t) ->
1464                 insert_html_element t
1465                 tok_state = tok_state_rawtext
1466                 original_ins_mode = ins_mode
1467                 ins_mode = ins_mode_text
1468         parse_generic_rcdata_text = (t) ->
1469                 insert_html_element t
1470                 tok_state = tok_state_rcdata
1471                 original_ins_mode = ins_mode
1472                 ins_mode = ins_mode_text
1473
1474         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1475         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1476         generate_implied_end_tags = (except = null) ->
1477                 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1478                         open_els.shift()
1479
1480         # 8.2.5.4 The rules for parsing tokens in HTML content
1481         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1482
1483         # 8.2.5.4.1 The "initial" insertion mode
1484         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1485         is_quirks_yes_doctype = (t) ->
1486                 if t.flag 'force-quirks'
1487                         return true
1488                 if t.name isnt 'html'
1489                         return true
1490                 if t.public_identifier?
1491                         pi = t.public_identifier.toLowerCase()
1492                         for p in quirks_yes_pi_prefixes
1493                                 if pi.substr(0, p.length) is p
1494                                         return true
1495                         if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1496                                 return true
1497                 if t.system_identifier?
1498                         if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1499                                 return true
1500                 else if t.public_identifier?
1501                         # already did this: pi = t.public_identifier.toLowerCase()
1502                         if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1503                                 return true
1504                 return false
1505         is_quirks_limited_doctype = (t) ->
1506                 if t.public_identifier?
1507                         pi = t.public_identifier.toLowerCase()
1508                         if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1509                                 return true
1510                         if t.system_identifier?
1511                                 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1512                                         return true
1513                 return false
1514         ins_mode_initial = (t) ->
1515                 if is_space_tok t
1516                         return
1517                 if t.type is TYPE_COMMENT
1518                         # ?fixfull
1519                         doc.children.push t
1520                         return
1521                 if t.type is TYPE_DOCTYPE
1522                         # fixfull syntax error from first paragraph and following bullets
1523                         # fixfull set doc.doctype
1524                         # fixfull is the "not an iframe srcdoc" thing relevant?
1525                         if is_quirks_yes_doctype t
1526                                 doc.flag 'quirks mode', QUIRKS_YES
1527                         else if is_quirks_limited_doctype t
1528                                 doc.flag 'quirks mode', QUIRKS_LIMITED
1529                         doc.children.push t
1530                         ins_mode = ins_mode_before_html
1531                         return
1532                 # Anything else
1533                 # fixfull not iframe srcdoc?
1534                 parse_error()
1535                 doc.flag 'quirks mode', QUIRKS_YES
1536                 ins_mode = ins_mode_before_html
1537                 process_token t
1538                 return
1539
1540         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1541         ins_mode_before_html = (t) ->
1542                 if t.type is TYPE_DOCTYPE
1543                         parse_error()
1544                         return
1545                 if t.type is TYPE_COMMENT
1546                         doc.children.push t
1547                         return
1548                 if is_space_tok t
1549                         return
1550                 if t.type is TYPE_START_TAG and t.name is 'html'
1551                         el = token_to_element t, NS_HTML, doc
1552                         doc.children.push el
1553                         open_els.unshift(el)
1554                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1555                         ins_mode = ins_mode_before_head
1556                         return
1557                 if t.type is TYPE_END_TAG
1558                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1559                                 # fall through to "anything else"
1560                         else
1561                                 parse_error()
1562                                 return
1563                 # Anything else
1564                 el = token_to_element new_open_tag('html'), NS_HTML, doc
1565                 doc.children.push el
1566                 el.parent = doc
1567                 open_els.unshift el
1568                 # ?fixfull browsing context
1569                 ins_mode = ins_mode_before_head
1570                 process_token t
1571                 return
1572
1573         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1574         ins_mode_before_head = (t) ->
1575                 if is_space_tok t
1576                         return
1577                 if t.type is TYPE_COMMENT
1578                         insert_comment t
1579                         return
1580                 if t.type is TYPE_DOCTYPE
1581                         parse_error()
1582                         return
1583                 if t.type is TYPE_START_TAG and t.name is 'html'
1584                         ins_mode_in_body t
1585                         return
1586                 if t.type is TYPE_START_TAG and t.name is 'head'
1587                         el = insert_html_element t
1588                         head_element_pointer = el
1589                         ins_mode = ins_mode_in_head
1590                         return
1591                 if t.type is TYPE_END_TAG
1592                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1593                                 # fall through to Anything else below
1594                         else
1595                                 parse_error()
1596                                 return
1597                 # Anything else
1598                 el = insert_html_element new_open_tag 'head'
1599                 head_element_pointer = el
1600                 ins_mode = ins_mode_in_head
1601                 process_token t
1602
1603         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1604         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1605                 open_els.shift() # spec says this will be a 'head' node
1606                 ins_mode = ins_mode_after_head
1607                 process_token t
1608         ins_mode_in_head = (t) ->
1609                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1610                         insert_character t
1611                         return
1612                 if t.type is TYPE_COMMENT
1613                         insert_comment t
1614                         return
1615                 if t.type is TYPE_DOCTYPE
1616                         parse_error()
1617                         return
1618                 if t.type is TYPE_START_TAG and t.name is 'html'
1619                         ins_mode_in_body t
1620                         return
1621                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1622                         el = insert_html_element t
1623                         open_els.shift()
1624                         t.acknowledge_self_closing()
1625                         return
1626                 if t.type is TYPE_START_TAG and t.name is 'meta'
1627                         el = insert_html_element t
1628                         open_els.shift()
1629                         t.acknowledge_self_closing()
1630                         # fixfull encoding stuff
1631                         return
1632                 if t.type is TYPE_START_TAG and t.name is 'title'
1633                         parse_generic_rcdata_text t
1634                         return
1635                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1636                         parse_generic_raw_text t
1637                         return
1638                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1639                         insert_html_element t
1640                         ins_mode = ins_mode_in_head_noscript
1641                         return
1642                 if t.type is TYPE_START_TAG and t.name is 'script'
1643                         ail = adjusted_insertion_location()
1644                         el = token_to_element t, NS_HTML, ail
1645                         el.flag 'parser-inserted', true
1646                         # fixfull frament case
1647                         ail[0].children.splice ail[1], 0, el
1648                         open_els.unshift el
1649                         tok_state = tok_state_script_data
1650                         original_ins_mode = ins_mode # make sure orig... is defined
1651                         ins_mode = ins_mode_text
1652                         return
1653                 if t.type is TYPE_END_TAG and t.name is 'head'
1654                         open_els.shift() # will be a head element... spec says so
1655                         ins_mode = ins_mode_after_head
1656                         return
1657                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1658                         ins_mode_in_head_else t
1659                         return
1660                 if t.type is TYPE_START_TAG and t.name is 'template'
1661                         insert_html_element t
1662                         afe_push_marker()
1663                         flag_frameset_ok = false
1664                         ins_mode = ins_mode_in_template
1665                         template_ins_modes.unshift ins_mode_in_template
1666                         return
1667                 if t.type is TYPE_END_TAG and t.name is 'template'
1668                         if template_tag_is_open()
1669                                 generate_implied_end_tags
1670                                 if open_els[0].name isnt 'template'
1671                                         parse_error()
1672                                 loop
1673                                         el = open_els.shift()
1674                                         if el.name is 'template' and el.namespace is NS_HTML
1675                                                 break
1676                                 clear_afe_to_marker()
1677                                 template_ins_modes.shift()
1678                                 reset_ins_mode()
1679                         else
1680                                 parse_error()
1681                         return
1682                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1683                         parse_error()
1684                         return
1685                 ins_mode_in_head_else t
1686
1687         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1688         ins_mode_in_head_noscript_else = (t) ->
1689                 parse_error()
1690                 open_els.shift()
1691                 ins_mode = ins_mode_in_head
1692                 process_token t
1693         ins_mode_in_head_noscript = (t) ->
1694                 if t.type is TYPE_DOCTYPE
1695                         parse_error()
1696                         return
1697                 if t.type is TYPE_START_TAG and t.name is 'html'
1698                         ins_mode_in_body t
1699                         return
1700                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1701                         open_els.shift()
1702                         ins_mode = ins_mode_in_head
1703                         return
1704                 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1705                         ins_mode_in_head t
1706                         return
1707                 if t.type is TYPE_END_TAG and t.name is 'br'
1708                         ins_mode_in_head_noscript_else t
1709                         return
1710                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1711                         parse_error()
1712                         return
1713                 # Anything else
1714                 ins_mode_in_head_noscript_else t
1715                 return
1716
1717
1718
1719         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1720         ins_mode_after_head_else = (t) ->
                     # "Anything else" rule of the "after head" insertion mode: insert
                     # an element for a freshly built 'body' open-tag token, switch to
                     # the "in body" insertion mode and reprocess the token that got us
                     # here. Always returns undefined.
1722                 insert_html_element new_open_tag('body')
1723                 ins_mode = ins_mode_in_body
1724                 process_token(t)
1725                 return
1726         ins_mode_after_head = (t) ->
                     # Token handler for the "after head" insertion mode. Whitespace is
                     # dealt with first; everything else is dispatched on the token
                     # type and then on the tag name. Always returns undefined.
1727                 if is_space_tok t
1728                         insert_character t
1729                         return
1730                 switch t.type
1731                         when TYPE_COMMENT
1732                                 insert_comment t
1733                         when TYPE_DOCTYPE
1734                                 parse_error() # the token is otherwise ignored
1735                         when TYPE_START_TAG
1736                                 switch t.name
1737                                         when 'html'
1738                                                 ins_mode_in_body t
1739                                         when 'body'
1740                                                 insert_html_element t
1741                                                 flag_frameset_ok = false
1742                                                 ins_mode = ins_mode_in_body
1743                                         when 'frameset'
1744                                                 insert_html_element t
1745                                                 ins_mode = ins_mode_in_frameset
1746                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
                                                     # parse error: put the head element back on
                                                     # the stack, process the token with the "in
                                                     # head" rules, then take the head element
                                                     # off again. It may no longer be at index 0,
                                                     # so it is looked up rather than shifted.
1747                                                 parse_error()
1748                                                 open_els.unshift head_element_pointer
1749                                                 ins_mode_in_head t
1750                                                 if head_element_pointer in open_els
1751                                                         open_els.splice open_els.indexOf(head_element_pointer), 1
1752                                                 else
1753                                                         console.log "warning: 23904 couldn't find head element in open_els"
1754                                         when 'head'
1755                                                 parse_error() # stray <head>: ignored
1756                                         else
1757                                                 ins_mode_after_head_else t
1758                         when TYPE_END_TAG
1759                                 switch t.name
1760                                         when 'template'
1761                                                 ins_mode_in_head t
1762                                         when 'body', 'html', 'br'
1763                                                 ins_mode_after_head_else t
1764                                         else
1765                                                 parse_error() # any other end tag: ignored
1766                         else
                                     # Anything else
1767                                 ins_mode_after_head_else t
1768                 return
1769
1770         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1771         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
                     # "Any other end tag" steps of the "in body" insertion mode.
                     #
                     # name: tag name of the end tag being processed.
                     #
                     # Walks open_els starting at the current node (index 0). The first
                     # HTML-namespace element called `name` is closed: implied end tags
                     # are generated, then nodes are popped up to and including that
                     # element. If an element listed in special_elements is met first,
                     # the end tag is a parse error and is ignored. Always returns
                     # undefined.
1772                 for el in open_els
1773                         if el.name is name and el.namespace is NS_HTML
1774                                 generate_implied_end_tags name # arg is exception
                                     # NOTE: generate_implied_end_tags is expected to pop
                                     # nodes from open_els, so the position el was found
                                     # at can be stale by now. Compare and pop by
                                     # identity instead of by the old index; popping by
                                     # index removed too many nodes whenever implied end
                                     # tags had been generated.
1775                                 parse_error() unless open_els[0] is el
1776                                 while open_els.length > 0
1777                                         break if open_els.shift() is el
1779                                 return
1780                         if special_elements[el.name] is el.namespace
1781                                 parse_error()
1782                                 return
1783                 return
1784         ins_mode_in_body = (t) ->
1785                 if t.type is TYPE_TEXT and t.text is "\u0000"
1786                         parse_error()
1787                         return
1788                 if is_space_tok t
1789                         reconstruct_afe()
1790                         insert_character t
1791                         return
1792                 if t.type is TYPE_TEXT
1793                         reconstruct_afe()
1794                         insert_character t
1795                         flag_frameset_ok = false
1796                         return
1797                 if t.type is TYPE_COMMENT
1798                         insert_comment t
1799                         return
1800                 if t.type is TYPE_DOCTYPE
1801                         parse_error()
1802                         return
1803                 if t.type is TYPE_START_TAG and t.name is 'html'
1804                         parse_error()
1805                         return if template_tag_is_open()
1806                         root_attrs = open_els[open_els.length - 1].attrs
1807                         for a in t.attrs_a
1808                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1809                         return
1810
1811                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1812                         ins_mode_in_head t
1813                         return
1814                 if t.type is TYPE_START_TAG and t.name is 'body'
1815                         parse_error()
1816                         return if open_els.length < 2
1817                         second = open_els[open_els.length - 2]
1818                         return unless second.namespace is NS_HTML
1819                         return unless second.name is 'body'
1820                         return if template_tag_is_open()
1821                         flag_frameset_ok = false
1822                         for a in t.attrs_a
1823                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1824                         return
1825                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1826                         parse_error()
1827                         return if open_els.length < 2
1828                         second_i = open_els.length - 2
1829                         second = open_els[second_i]
1830                         return unless second.namespace is NS_HTML
1831                         return unless second.name is 'body'
1832                         if flag_frameset_ok is false
1833                                 return
1834                         if second.parent?
1835                                 for el, i in second.parent.children
1836                                         if el is second
1837                                                 second.parent.children.splice i, 1
1838                                                 break
1839                         open_els.splice second_i, 1
1840                         # pop everything except the "root html element"
1841                         while open_els.length > 1
1842                                 open_els.shift()
1843                         insert_html_element t
1844                         ins_mode = ins_mode_in_frameset
1845                         return
1846                 if t.type is TYPE_EOF
1847                         ok_tags = {
1848                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1849                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1850                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1851                         }
1852                         for el in open_els
1853                                 unless ok_tags[t.name] is el.namespace
1854                                         parse_error()
1855                                         break
1856                         if template_ins_modes.length > 0
1857                                 ins_mode_in_template t
1858                         else
1859                                 stop_parsing()
1860                         return
1861                 if t.type is TYPE_END_TAG and t.name is 'body'
1862                         unless is_in_scope 'body', NS_HTML
1863                                 parse_error()
1864                                 return
1865                         ok_tags = {
1866                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1867                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1868                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1869                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1870                                 html:NS_HTML
1871                         }
1872                         for el in open_els
1873                                 unless ok_tags[t.name] is el.namespace
1874                                         parse_error()
1875                                         break
1876                         ins_mode = ins_mode_after_body
1877                         return
1878                 if t.type is TYPE_END_TAG and t.name is 'html'
1879                         unless is_in_scope 'body', NS_HTML
1880                                 parse_error()
1881                                 return
1882                         ok_tags = {
1883                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1884                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1885                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1886                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1887                                 html:NS_HTML
1888                         }
1889                         for el in open_els
1890                                 unless ok_tags[t.name] is el.namespace
1891                                         parse_error()
1892                                         break
1893                         ins_mode = ins_mode_after_body
1894                         process_token t
1895                         return
1896                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1897                         close_p_if_in_button_scope()
1898                         insert_html_element t
1899                         return
1900                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1901                         close_p_if_in_button_scope()
1902                         if h_tags[open_els[0].name] is open_els[0].namespace
1903                                 parse_error()
1904                                 open_els.shift()
1905                         insert_html_element t
1906                         return
1907                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1908                         close_p_if_in_button_scope()
1909                         insert_html_element t
1910                         # spec: If the next token is a "LF" (U+000A) character token, then
1911                         # ignore that token and move on to the next one. (Newlines at the
1912                         # start of pre blocks are ignored as an authoring convenience.)
1913                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1914                                 cur += 1
1915                         flag_frameset_ok = false
1916                         return
1917                 if t.type is TYPE_START_TAG and t.name is 'form'
1918                         unless form_element_pointer is null or template_tag_is_open()
1919                                 parse_error()
1920                                 return
1921                         close_p_if_in_button_scope()
1922                         el = insert_html_element t
1923                         unless template_tag_is_open()
1924                                 form_element_pointer = el
1925                         return
1926                 if t.type is TYPE_START_TAG and t.name is 'li'
1927                         flag_frameset_ok = false
1928                         for node in open_els
1929                                 if node.name is 'li' and node.namespace is NS_HTML
1930                                         generate_implied_end_tags 'li' # arg is exception
1931                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1932                                                 parse_error()
1933                                         loop
1934                                                 el = open_els.shift()
1935                                                 if el.name is 'li' and el.namespace is NS_HTML
1936                                                         break
1937                                         break
1938                                 if el_is_special_not_adp node
1939                                                 break
1940                         close_p_if_in_button_scope()
1941                         insert_html_element t
1942                         return
1943                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1944                         flag_frameset_ok = false
1945                         for node in open_els
1946                                 if node.name is 'dd' and node.namespace is NS_HTML
1947                                         generate_implied_end_tags 'dd' # arg is exception
1948                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1949                                                 parse_error()
1950                                         loop
1951                                                 el = open_els.shift()
1952                                                 if el.name is 'dd' and el.namespace is NS_HTML
1953                                                         break
1954                                         break
1955                                 if node.name is 'dt' and node.namespace is NS_HTML
1956                                         generate_implied_end_tags 'dt' # arg is exception
1957                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1958                                                 parse_error()
1959                                         loop
1960                                                 el = open_els.shift()
1961                                                 if el.name is 'dt' and el.namespace is NS_HTML
1962                                                         break
1963                                         break
1964                                 if el_is_special_not_adp node
1965                                         break
1966                         close_p_if_in_button_scope()
1967                         insert_html_element t
1968                         return
1969                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1970                         close_p_if_in_button_scope()
1971                         insert_html_element t
1972                         tok_state = tok_state_plaintext
1973                         return
1974                 if t.type is TYPE_START_TAG and t.name is 'button'
1975                         if is_in_scope 'button', NS_HTML
1976                                 parse_error()
1977                                 generate_implied_end_tags()
1978                                 loop
1979                                         el = open_els.shift()
1980                                         if el.name is 'button' and el.namespace is NS_HTML
1981                                                 break
1982                         reconstruct_afe()
1983                         insert_html_element t
1984                         flag_frameset_ok = false
1985                         return
1986                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1987                         unless is_in_scope t.name, NS_HTML
1988                                 parse_error()
1989                                 return
1990                         generate_implied_end_tags()
1991                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1992                                 parse_error()
1993                         loop
1994                                 el = open_els.shift()
1995                                 if el.name is t.name and el.namespace is NS_HTML
1996                                         return
1997                         return
1998                 if t.type is TYPE_END_TAG and t.name is 'form'
1999                         unless template_tag_is_open()
2000                                 node = form_element_pointer
2001                                 form_element_pointer = null
2002                                 if node is null or not el_is_in_scope node
2003                                         parse_error()
2004                                         return
2005                                 generate_implied_end_tags()
2006                                 if open_els[0] isnt node
2007                                         parse_error()
2008                                 for el, i in open_els
2009                                         if el is node
2010                                                 open_els.splice i, 1
2011                                                 break
2012                         else
2013                                 unless is_in_scope 'form', NS_HTML
2014                                         parse_error()
2015                                         return
2016                                 generate_implied_end_tags()
2017                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2018                                         parse_error()
2019                                 loop
2020                                         el = open_els.shift()
2021                                         if el.name is 'form' and el.namespace is NS_HTML
2022                                                 break
2023                         return
2024                 if t.type is TYPE_END_TAG and t.name is 'p'
2025                         unless is_in_button_scope 'p', NS_HTML
2026                                 parse_error()
2027                                 insert_html_element new_open_tag 'p'
2028                         close_p_element()
2029                         return
2030                 if t.type is TYPE_END_TAG and t.name is 'li'
2031                         unless is_in_li_scope 'li', NS_HTML
2032                                 parse_error()
2033                                 return
2034                         generate_implied_end_tags 'li' # arg is exception
2035                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2036                                 parse_error()
2037                         loop
2038                                 el = open_els.shift()
2039                                 if el.name is 'li' and el.namespace is NS_HTML
2040                                         break
2041                         return
2042                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2043                         unless is_in_scope t.name, NS_HTML
2044                                 parse_error()
2045                                 return
2046                         generate_implied_end_tags t.name # arg is exception
2047                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2048                                 parse_error()
2049                         loop
2050                                 el = open_els.shift()
2051                                 if el.name is t.name and el.namespace is NS_HTML
2052                                         break
2053                         return
2054                 if t.type is TYPE_END_TAG and h_tags[t.name]?
2055                         h_in_scope = false
2056                         for el in open_els
2057                                 if h_tags[el.name] is el.namespace
2058                                         h_in_scope = true
2059                                         break
2060                                 if standard_scopers[el.name] is el.namespace
2061                                         break
2062                         unless h_in_scope
2063                                 parse_error()
2064                                 return
2065                         generate_implied_end_tags()
2066                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2067                                 parse_error()
2068                         loop
2069                                 el = open_els.shift()
2070                                 if h_tags[el.name] is el.namespace
2071                                         break
2072                         return
2073                 # deep breath!
2074                 if t.type is TYPE_START_TAG and t.name is 'a'
2075                         # If the list of active formatting elements contains an a element
2076                         # between the end of the list and the last marker on the list (or
2077                         # the start of the list if there is no marker on the list), then
2078                         # this is a parse error; run the adoption agency algorithm for the
2079                         # tag name "a", then remove that element from the list of active
2080                         # formatting elements and the stack of open elements if the
2081                         # adoption agency algorithm didn't already remove it (it might not
2082                         # have if the element is not in table scope).
2083                         found = false
2084                         for el in afe
2085                                 if el.type is TYPE_AFE_MARKER
2086                                         break
2087                                 if el.name is 'a' and el.namespace is NS_HTML
2088                                         found = el
2089                         if found?
2090                                 parse_error()
2091                                 adoption_agency 'a'
2092                                 for el, i in afe
2093                                         if el is found
2094                                                 afe.splice i, 1
2095                                 for el, i in open_els
2096                                         if el is found
2097                                                 open_els.splice i, 1
2098                         reconstruct_afe()
2099                         el = insert_html_element t
2100                         afe_push el
2101                         return
2102                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2103                         reconstruct_afe()
2104                         el = insert_html_element t
2105                         afe_push el
2106                         return
2107                 if t.type is TYPE_START_TAG and t.name is 'nobr'
2108                         reconstruct_afe()
2109                         el = insert_html_element t
2110                         afe_push el
2111                         return
2112                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2113                         adoption_agency t.name
2114                         return
2115                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2116                         reconstruct_afe()
2117                         insert_html_element t
2118                         afe_push_marker()
2119                         flag_frameset_ok = false
2120                         return
2121                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2122                         unless is_in_scope t.name, NS_HTML
2123                                 parse_error()
2124                                 return
2125                         generate_implied_end_tags()
2126                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2127                                 parse_error()
2128                         loop
2129                                 el = open_els.shift()
2130                                 if el.name is t.name and el.namespace is NS_HTML
2131                                         break
2132                         clear_afe_to_marker()
2133                         return
2134                 if t.type is TYPE_START_TAG and t.name is 'table'
2135                         unless doc.flag('quirks mode') is QUIRKS_YES
2136                                 close_p_if_in_button_scope() # test
2137                         insert_html_element t
2138                         flag_frameset_ok = false
2139                         ins_mode = ins_mode_in_table
2140                         return
2141                 if t.type is TYPE_END_TAG and t.name is 'br'
2142                         parse_error()
2143                         t.type = TYPE_START_TAG
2144                         # fall through
2145                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2146                         reconstruct_afe()
2147                         insert_html_element t
2148                         open_els.shift()
2149                         t.acknowledge_self_closing()
2150                         flag_frameset_ok = false
2151                         return
2152                 if t.type is TYPE_START_TAG and t.name is 'input'
2153                         reconstruct_afe()
2154                         insert_html_element t
2155                         open_els.shift()
2156                         t.acknowledge_self_closing()
2157                         unless is_input_hidden_tok t
2158                                 flag_frameset_ok = false
2159                         return
2160                 if t.type is TYPE_START_TAG and (t.name is 'param' or t.name is 'source' or t.name is 'track')
2161                         insert_html_element t
2162                         open_els.shift()
2163                         t.acknowledge_self_closing()
2164                         return
2165                 if t.type is TYPE_START_TAG and t.name is 'hr'
2166                         close_p_if_in_button_scope()
2167                         insert_html_element t
2168                         open_els.shift()
2169                         t.acknowledge_self_closing()
2170                         flag_frameset_ok = false
2171                         return
2172                 if t.type is TYPE_START_TAG and t.name is 'image'
2173                         parse_error()
2174                         t.name = 'img'
2175                         process_token t
2176                         return
2177                 if t.type is TYPE_START_TAG and t.name is 'isindex'
2178                         parse_error()
2179                         if template_tag_is_open() is false and form_element_pointer isnt null
2180                                 return
2181                         t.acknowledge_self_closing()
2182                         flag_frameset_ok = false
2183                         close_p_if_in_button_scope()
2184                         el = insert_html_element new_open_tag 'form'
2185                         unless template_tag_is_open()
2186                                 form_element_pointer = el
2187                         for a in t.attrs_a
2188                                 if a[0] is 'action'
2189                                         el.attrs['action'] = a[1]
2190                                         break
2191                         insert_html_element new_open_tag 'hr'
2192                         open_els.shift()
2193                         reconstruct_afe()
2194                         insert_html_element new_open_tag 'label'
2195                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2196                         input_el = new_open_tag 'input'
2197                         prompt = null
2198                         for a in t.attrs_a
2199                                 if a[0] is 'prompt'
2200                                         prompt = a[1]
2201                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2202                                         input_el.attrs_a.push [a[0], a[1]]
2203                         input_el.attrs_a.push ['name', 'isindex']
2204                         # fixfull this next bit is in english... internationalize?
2205                         prompt ?= "This is a searchable index. Enter search keywords: "
2206                         insert_character new_character_token prompt # fixfull split
2207                         # TODO submit typo "balue" in spec
2208                         insert_html_element input_el
2209                         open_els.shift()
2210                         # insert_character '' # you can put chars here if prompt attr missing
2211                         open_els.shift()
2212                         insert_html_element new_open_tag 'hr'
2213                         open_els.shift()
2214                         open_els.shift()
2215                         unless template_tag_is_open()
2216                                 form_element_pointer = null
2217                         return
2218                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2219                         insert_html_element t
2220                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2221                                 cur += 1
2222                         tok_state = tok_state_rcdata
2223                         original_ins_mode = ins_mode
2224                         flag_frameset_ok = false
2225                         ins_mode = ins_mode_text
2226                         return
2227                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2228                         close_p_if_in_button_scope()
2229                         reconstruct_afe()
2230                         flag_frameset_ok = false
2231                         parse_generic_raw_text t
2232                         return
2233                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2234                         flag_frameset_ok = false
2235                         parse_generic_raw_text t
2236                         return
2237                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2238                         parse_generic_raw_text t
2239                         return
2240                 if t.type is TYPE_START_TAG and t.name is 'select'
2241                         reconstruct_afe()
2242                         insert_html_element t
2243                         flag_frameset_ok = false
2244                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2245                                 ins_mode = ins_mode_in_select_in_table
2246                         else
2247                                 ins_mode = ins_mode_in_select
2248                         return
2249                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2250                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2251                                 open_els.shift()
2252                         reconstruct_afe()
2253                         insert_html_element t
2254                         return
2255 # this comment block implements the W3C spec
2256 #               if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2257 #                       if is_in_scope 'ruby', NS_HTML
2258 #                               generate_implied_end_tags()
2259 #                               unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2260 #                                       parse_error()
2261 #                       insert_html_element t
2262 #                       return
2263 #               if t.type is TYPE_START_TAG and t.name is 'rt'
2264 #                       if is_in_scope 'ruby', NS_HTML
2265 #                               generate_implied_end_tags 'rtc' # arg is exception
2266 #                               unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2267 #                                       parse_error()
2268 #                       insert_html_element t
2269 #                       return
2270 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2271                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2272                         if is_in_scope 'ruby', NS_HTML
2273                                 generate_implied_end_tags()
2274                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2275                                         parse_error()
2276                         insert_html_element t
2277                         return
2278                 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2279                         if is_in_scope 'ruby', NS_HTML
2280                                 generate_implied_end_tags 'rtc'
2281                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2282                                         parse_error()
2283                         insert_html_element t
2284                         return
2285 # end WHATWG chunk
2286                 if t.type is TYPE_START_TAG and t.name is 'math'
2287                         reconstruct_afe()
2288                         adjust_mathml_attributes t
2289                         adjust_foreign_attributes t
2290                         insert_foreign_element t, NS_MATHML
2291                         if t.flag 'self-closing'
2292                                 open_els.shift()
2293                                 t.acknowledge_self_closing()
2294                         return
2295                 if t.type is TYPE_START_TAG and t.name is 'svg'
2296                         reconstruct_afe()
2297                         adjust_svg_attributes t
2298                         adjust_foreign_attributes t
2299                         insert_foreign_element t, NS_SVG
2300                         if t.flag 'self-closing'
2301                                 open_els.shift()
2302                                 t.acknowledge_self_closing()
2303                         return
2304                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2305                         parse_error()
2306                         return
2307                 if t.type is TYPE_START_TAG # any other start tag
2308                         reconstruct_afe()
2309                         insert_html_element t
2310                         return
2311                 if t.type is TYPE_END_TAG # any other end tag
2312                         in_body_any_other_end_tag t.name
2313                         return
2314                 return
2315
2316         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2317         ins_mode_text = (t) ->
2318                 # Handles one token while the parser is in the "text" insertion mode.
2319                 switch t.type
2320                         when TYPE_TEXT
2321                                 insert_character t
2322                                 return
2323                         when TYPE_EOF
2324                                 parse_error()
2325                                 if open_els[0].namespace is NS_HTML and open_els[0].name is 'script'
2326                                         open_els[0].flag 'already started', true
2327                                 open_els.shift()
2328                                 ins_mode = original_ins_mode
2329                                 process_token t
2330                                 return
2331                         when TYPE_END_TAG
2332                                 # </script> gets the same treatment as every other end tag:
2333                                 # fixfull the spec seems to assume that I'm going to run the script
2334                                 # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2335                                 open_els.shift()
2336                                 ins_mode = original_ins_mode
2337                                 return
2338                 # no other token type is expected in this mode
2339                 console.log 'warning: end of ins_mode_text reached'
2340
2341         # the functions below implement the tokenizer states described here:
2342         # http://www.w3.org/TR/html5/syntax.html#tokenization
2343
2344         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2345         ins_mode_in_table_else = (t) -> # "anything else" rule shared by the table insertion modes
2346                 parse_error()
2347                 flag_foster_parenting = true # set only for the duration of the in_body call below
2348                 ins_mode_in_body t
2349                 flag_foster_parenting = false
2350                 return
2351         ins_mode_in_table = (t) -> # handles one token in the "in table" insertion mode
2352                 switch t.type
2353                         when TYPE_TEXT
2354                                 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2355                                         pending_table_character_tokens = [] # start with an empty buffer
2356                                         original_ins_mode = ins_mode # ins_mode_in_table_text switches back to this
2357                                         ins_mode = ins_mode_in_table_text
2358                                         process_token t # reprocess t under the mode just selected
2359                                 else
2360                                         ins_mode_in_table_else t
2361                         when TYPE_COMMENT
2362                                 insert_comment t
2363                         when TYPE_DOCTYPE
2364                                 parse_error()
2365                         when TYPE_START_TAG
2366                                 switch t.name
2367                                         when 'caption'
2368                                                 clear_stack_to_table_context()
2369                                                 afe_push_marker()
2370                                                 insert_html_element t
2371                                                 ins_mode = ins_mode_in_caption
2372                                         when 'colgroup'
2373                                                 clear_stack_to_table_context()
2374                                                 insert_html_element t
2375                                                 ins_mode = ins_mode_in_column_group
2376                                         when 'col'
2377                                                 clear_stack_to_table_context()
2378                                                 insert_html_element new_open_tag 'colgroup' # implied <colgroup>
2379                                                 ins_mode = ins_mode_in_column_group
2380                                                 process_token t # reprocess the <col> in the new mode
2381                                         when 'tbody', 'tfoot', 'thead'
2382                                                 clear_stack_to_table_context()
2383                                                 insert_html_element t
2384                                                 ins_mode = ins_mode_in_table_body
2385                                         when 'td', 'th', 'tr'
2386                                                 clear_stack_to_table_context()
2387                                                 insert_html_element new_open_tag 'tbody' # implied <tbody>
2388                                                 ins_mode = ins_mode_in_table_body
2389                                                 process_token t # reprocess the tag in the new mode
2390                                         when 'table'
2391                                                 parse_error()
2392                                                 if is_in_table_scope 'table', NS_HTML
2393                                                         loop # pop up to and including the open table
2394                                                                 el = open_els.shift()
2395                                                                 if el.name is 'table' and el.namespace is NS_HTML
2396                                                                         break
2397                                                         reset_ins_mode()
2398                                                         process_token t # then handle the new <table> start tag again
2399                                         when 'style', 'script', 'template'
2400                                                 ins_mode_in_head t
2401                                         when 'input'
2402                                                 unless is_input_hidden_tok t
2403                                                         ins_mode_in_table_else t
2404                                                 else
2405                                                         parse_error()
2406                                                         el = insert_html_element t
2407                                                         open_els.shift() # the input is popped right after being inserted
2408                                                         t.acknowledge_self_closing()
2409                                         when 'form'
2410                                                 parse_error()
2411                                                 if form_element_pointer?
2412                                                         return # a form is already being tracked: ignore this one
2413                                                 if template_tag_is_open()
2414                                                         return
2415                                                 form_element_pointer = insert_html_element t
2416                                                 open_els.shift() # the form is popped right after being inserted
2417                                         else
2418                                                 ins_mode_in_table_else t
2419                         when TYPE_END_TAG
2420                                 switch t.name
2421                                         when 'table'
2422                                                 if is_in_table_scope 'table', NS_HTML
2423                                                         loop # pop up to and including the open table
2424                                                                 el = open_els.shift()
2425                                                                 if el.name is 'table' and el.namespace is NS_HTML
2426                                                                         break
2427                                                         reset_ins_mode()
2428                                                 else
2429                                                         parse_error()
2430                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2431                                                 parse_error() # these end tags are ignored here
2432                                         when 'template'
2433                                                 ins_mode_in_head t
2434                                         else
2435                                                 ins_mode_in_table_else t
2436                         when TYPE_EOF
2437                                 ins_mode_in_body t
2438                         else
2439                                 ins_mode_in_table_else t
2440
2441
2442         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2443         ins_mode_in_table_text = (t) ->
2444                 # Buffers the character tokens that arrive while in this mode; the first
2445                 # token of any other type flushes the buffer and is then reprocessed.
2446                 if t.type is TYPE_TEXT
2447                         if t.text is "\u0000"
2448                                 parse_error() # from javascript?
2449                         else
2450                                 pending_table_character_tokens.push t
2451                         return
2452                 # Anything else
2453                 all_space = true
2454                 for old in pending_table_character_tokens
2455                         continue if is_space_tok old
2456                         all_space = false
2457                         break
2458                 # whitespace-only text is inserted as-is; anything else takes the fallback rule
2459                 if all_space
2460                         insert_character old for old in pending_table_character_tokens
2461                 else
2462                         ins_mode_in_table_else old for old in pending_table_character_tokens
2463                 pending_table_character_tokens = []
2464                 ins_mode = original_ins_mode
2465                 process_token t
2466
2467         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2468         ins_mode_in_caption = (t) ->
2469                 # "in caption" insertion mode: handles the tokens that end a caption and
2470                 # hands every other token to the "in body" rules.
2471                 if t.type is TYPE_END_TAG and t.name is 'caption'
2472                         unless is_in_table_scope 'caption', NS_HTML
2473                                 parse_error() # fragment case
2474                                 return
2475                         generate_implied_end_tags()
2476                         parse_error() unless open_els[0].name is 'caption'
2477                         # pop everything up to and including the caption
2478                         loop
2479                                 el = open_els.shift()
2480                                 break if el.namespace is NS_HTML and el.name is 'caption'
2481                         clear_afe_to_marker()
2482                         ins_mode = ins_mode_in_table
2483                         return
2484                 # table-structure tags implicitly end the caption; close it and reprocess
2485                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']) or (t.type is TYPE_END_TAG and t.name is 'table')
2486                         parse_error()
2487                         return unless is_in_table_scope 'caption', NS_HTML # else fragment case
2488                         loop
2489                                 el = open_els.shift()
2490                                 break if el.namespace is NS_HTML and el.name is 'caption'
2491                         clear_afe_to_marker()
2492                         ins_mode = ins_mode_in_table
2493                         process_token t
2494                         return
2495                 # these end tags are reported and otherwise ignored
2496                 if t.type is TYPE_END_TAG and t.name in ['body', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']
2497                         parse_error()
2498                         return
2499                 # Anything else
2500                 ins_mode_in_body t
2501
2502         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2503         ins_mode_in_column_group = (t) -> # handles one token in the "in column group" insertion mode
2504                 if is_space_tok t
2505                         insert_character t
2506                         return
2507                 if t.type is TYPE_COMMENT
2508                         insert_comment t
2509                         return
2510                 if t.type is TYPE_DOCTYPE
2511                         parse_error()
2512                         return
2513                 if t.type is TYPE_START_TAG and t.name is 'html'
2514                         ins_mode_in_body t
2515                         return
2516                 if t.type is TYPE_START_TAG and t.name is 'col'
2517                         el = insert_html_element t
2518                         open_els.shift() # the col is popped right after being inserted
2519                         t.acknowledge_self_closing()
2520                         return
2521                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2522                         if open_els[0].name is 'colgroup' and open_els[0].namespace is NS_HTML # check the current node, not the array
2523                                 open_els.shift()
2524                                 ins_mode = ins_mode_in_table
2525                         else
2526                                 parse_error()
2527                         return
2528                 if t.type is TYPE_END_TAG and t.name is 'col'
2529                         parse_error()
2530                         return
2531                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2532                         ins_mode_in_head t
2533                         return
2534                 if t.type is TYPE_EOF
2535                         ins_mode_in_body t
2536                         return
2537                 # Anything else: close the colgroup if it is the current node, then reprocess
2538                 if open_els[0].name isnt 'colgroup' or open_els[0].namespace isnt NS_HTML
2539                         parse_error()
2540                         return
2541                 open_els.shift()
2542                 ins_mode = ins_mode_in_table
2543                 process_token t
2544                 return
2545
2546         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2547         ins_mode_in_table_body = (t) ->
2548                 # "in table body" insertion mode; unhandled tokens fall back to in_table.
2549                 if t.type is TYPE_START_TAG and t.name is 'tr'
2550                         clear_stack_to_table_body_context()
2551                         insert_html_element t
2552                         ins_mode = ins_mode_in_row
2553                         return
2554                 if t.type is TYPE_START_TAG and t.name in ['th', 'td']
2555                         # a cell without a row: insert a <tr> first, then reprocess the cell
2556                         parse_error()
2557                         clear_stack_to_table_body_context()
2558                         insert_html_element new_open_tag 'tr'
2559                         ins_mode = ins_mode_in_row
2560                         process_token t
2561                         return
2562                 if t.type is TYPE_END_TAG and t.name in ['tbody', 'tfoot', 'thead']
2563                         if is_in_table_scope t.name, NS_HTML
2564                                 clear_stack_to_table_body_context()
2565                                 open_els.shift()
2566                                 ins_mode = ins_mode_in_table
2567                         else
2568                                 parse_error()
2569                         return
2570                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead']) or (t.type is TYPE_END_TAG and t.name is 'table')
2571                         # look for an open tbody/tfoot/thead, stopping at a table scope boundary
2572                         has = false
2573                         for el in open_els
2574                                 has = el.namespace is NS_HTML and el.name in ['tbody', 'tfoot', 'thead']
2575                                 break if has or table_scopers[el.name] is el.namespace
2576                         unless has
2577                                 parse_error()
2578                                 return
2579                         clear_stack_to_table_body_context()
2580                         open_els.shift()
2581                         ins_mode = ins_mode_in_table
2582                         process_token t
2583                         return
2584                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr']
2585                         parse_error()
2586                         return
2587                 # Anything else
2588                 ins_mode_in_table t
2589
2590         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2591         ins_mode_in_row = (t) ->
2592                 # "in row" insertion mode; unhandled tokens fall back to in_table.
2593                 if t.type is TYPE_START_TAG and t.name in ['th', 'td']
2594                         clear_stack_to_table_row_context()
2595                         insert_html_element t
2596                         ins_mode = ins_mode_in_cell
2597                         afe_push_marker()
2598                         return
2599                 if t.type is TYPE_END_TAG and t.name is 'tr'
2600                         unless is_in_table_scope 'tr', NS_HTML
2601                                 parse_error()
2602                                 return
2603                         clear_stack_to_table_row_context()
2604                         open_els.shift()
2605                         ins_mode = ins_mode_in_table_body
2606                         return
2607                 if (t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr']) or (t.type is TYPE_END_TAG and t.name is 'table')
2608                         unless is_in_table_scope 'tr', NS_HTML
2609                                 parse_error()
2610                                 return
2611                         clear_stack_to_table_row_context()
2612                         open_els.shift()
2613                         ins_mode = ins_mode_in_table_body
2614                         process_token t
2615                         return
2616                 if t.type is TYPE_END_TAG and t.name in ['tbody', 'tfoot', 'thead']
2617                         unless is_in_table_scope t.name, NS_HTML
2618                                 parse_error()
2619                                 return
2620                         return unless is_in_table_scope 'tr', NS_HTML # no open row: ignored without an error
2621                         clear_stack_to_table_row_context()
2622                         open_els.shift()
2623                         ins_mode = ins_mode_in_table_body
2624                         process_token t
2625                         return
2626                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html', 'td', 'th']
2627                         parse_error()
2628                         return
2629                 ins_mode_in_table t # Anything else
2630
2631         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2632         close_the_cell = ->
2633                 # Closes the open td/th cell and switches back to the "in row" mode.
2634                 generate_implied_end_tags()
2635                 unless (open_els[0].name is 'td' or open_els[0].name is 'th') and open_els[0].namespace is NS_HTML
2636                         parse_error() # the current node is not the cell itself
2637                 loop
2638                         el = open_els.shift()
2639                         break if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2640                 clear_afe_to_marker()
2641                 ins_mode = ins_mode_in_row
2642
2643         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2644         ins_mode_in_cell = (t) ->
2645                 # "in cell" insertion mode: handles the tokens that end a td/th and hands
2646                 # every other token to the "in body" rules.
2647                 if t.type is TYPE_END_TAG and t.name in ['td', 'th']
2648                         unless is_in_table_scope t.name, NS_HTML
2649                                 parse_error()
2650                                 return
2651                         generate_implied_end_tags()
2652                         parse_error() unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2653                         # pop everything up to and including the matching cell
2654                         loop
2655                                 el = open_els.shift()
2656                                 break if el.name is t.name and el.namespace is NS_HTML
2657                         clear_afe_to_marker()
2658                         ins_mode = ins_mode_in_row
2659                         return
2660                 if t.type is TYPE_START_TAG and t.name in ['caption', 'col', 'colgroup', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr']
2661                         # look for an open td/th, stopping at a table scope boundary
2662                         has = false
2663                         for el in open_els
2664                                 has = el.namespace is NS_HTML and el.name in ['td', 'th']
2665                                 break if has or table_scopers[el.name] is el.namespace
2666                         unless has
2667                                 parse_error()
2668                                 return
2669                         close_the_cell()
2670                         process_token t
2671                         return
2672                 if t.type is TYPE_END_TAG and t.name in ['body', 'caption', 'col', 'colgroup', 'html']
2673                         parse_error()
2674                         return
2675                 if t.type is TYPE_END_TAG and t.name in ['table', 'tbody', 'tfoot', 'thead', 'tr']
2676                         # these end tags close the cell first and are then reprocessed
2677                         unless is_in_table_scope t.name, NS_HTML
2678                                 parse_error()
2679                                 return
2680                         close_the_cell()
2681                         process_token t
2682                         return
2683                 # Anything Else
2684                 ins_mode_in_body t
2685
2686         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2687         ins_mode_in_select = (t) ->
2688                 # Handles one token in the "in select" insertion mode.
2689                 if t.type is TYPE_TEXT and t.text is "\u0000"
2690                         parse_error()
2691                         return
2692                 if t.type is TYPE_TEXT
2693                         insert_character t
2694                         return
2695                 if t.type is TYPE_COMMENT
2696                         insert_comment t
2697                         return
2698                 if t.type is TYPE_DOCTYPE
2699                         parse_error()
2700                         return
2701                 if t.type is TYPE_START_TAG and t.name is 'html'
2702                         ins_mode_in_body t
2703                         return
2704                 if t.type is TYPE_START_TAG and t.name is 'option'
2705                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2706                                 open_els.shift()
2707                         insert_html_element t
2708                         return
2709                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2710                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2711                                 open_els.shift()
2712                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2713                                 open_els.shift()
2714                         insert_html_element t
2715                         return
2716                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2717                         # an open option sitting directly inside an optgroup is closed first
2718                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2719                                 if open_els[1]?.name is 'optgroup' and open_els[1].namespace is NS_HTML
2720                                         open_els.shift()
2721                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2722                                 open_els.shift()
2723                         else
2724                                 parse_error()
2725                         return
2726                 if t.type is TYPE_END_TAG and t.name is 'option'
2727                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2728                                 open_els.shift()
2729                         else
2730                                 parse_error()
2731                         return
2732                 if t.type is TYPE_END_TAG and t.name is 'select'
2733                         if is_in_select_scope 'select', NS_HTML
2734                                 loop
2735                                         el = open_els.shift()
2736                                         break if el.name is 'select' and el.namespace is NS_HTML
2737                                 reset_ins_mode()
2738                         else
2739                                 parse_error()
2740                         return
2741                 if t.type is TYPE_START_TAG and t.name is 'select'
2742                         parse_error()
2743                         # treated like </select>. WHATWG: ignore the token when no select is in
2744                         # select scope (fragment case); the W3C text does not mention that check
2745                         return unless is_in_select_scope 'select', NS_HTML
2746                         loop
2747                                 el = open_els.shift()
2748                                 break if el.name is 'select' and el.namespace is NS_HTML
2749                         reset_ins_mode()
2750                         return
2751                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2752                         parse_error()
2753                         # ignore the token unless there is a select to close (fragment case)
2754                         return unless is_in_select_scope 'select', NS_HTML
2755                         loop
2756                                 el = open_els.shift()
2757                                 break if el.name is 'select' and el.namespace is NS_HTML
2758                         reset_ins_mode()
2759                         process_token t
2760                         return
2761                 if (t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')) or (t.type is TYPE_END_TAG and t.name is 'template')
2762                         ins_mode_in_head t
2763                         return
2764                 if t.type is TYPE_EOF
2765                         ins_mode_in_body t
2766                         return
2767                 # Anything else
2768                 parse_error()
2769                 return
2770
2771         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
             #
             # Handler for the "in select in table" insertion mode.
             #
             # Table-structure start tags, and table-structure end tags whose
             # element is in table scope, close the open HTML <select> (popping
             # open_els up to and including it), reset the insertion mode and then
             # reprocess the same token. Every other token is handed to
             # ins_mode_in_select.
2772         ins_mode_in_select_in_table = (t) ->
2773                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2774                         parse_error()
                     # pop elements until the HTML <select> itself has been popped
2775                         loop
2776                                 el = open_els.shift()
2777                                 if el.name is 'select' and el.namespace is NS_HTML
2778                                         break
2779                         reset_ins_mode()
2780                         process_token t
2781                         return
2782                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2783                         parse_error()
                     # the end tag is ignored when no matching element is in table scope
2784                         unless is_in_table_scope t.name, NS_HTML
2785                                 return
2786                         loop
2787                                 el = open_els.shift()
2788                                 if el.name is 'select' and el.namespace is NS_HTML
2789                                         break
2790                         reset_ins_mode()
2791                         process_token t
2792                         return
2793                 # Anything else
2794                 ins_mode_in_select t
2795                 return
2796
2797         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
             #
             # Handler for the "in template" insertion mode.
             #
             # Text, comments and doctypes are delegated to ins_mode_in_body; the
             # listed head-level start tags and </template> are delegated to
             # ins_mode_in_head. Any other start tag replaces the entry at index 0
             # of template_ins_modes with a mode chosen from the tag name, makes
             # that mode the current ins_mode, and reprocesses the token. Other end
             # tags are parse errors and are dropped. EOF either stops parsing (no
             # template open) or pops through the <template> and reprocesses.
2798         ins_mode_in_template = (t) ->
2799                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2800                         ins_mode_in_body t
2801                         return
2802                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2803                         ins_mode_in_head t
2804                         return
2805                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
                     # shift + unshift: swap the current template insertion mode
2806                         template_ins_modes.shift()
2807                         template_ins_modes.unshift ins_mode_in_table
2808                         ins_mode = ins_mode_in_table
2809                         process_token t
2810                         return
2811                 if t.type is TYPE_START_TAG and t.name is 'col'
2812                         template_ins_modes.shift()
2813                         template_ins_modes.unshift ins_mode_in_column_group
2814                         ins_mode = ins_mode_in_column_group
2815                         process_token t
2816                         return
2817                 if t.type is TYPE_START_TAG and t.name is 'tr'
2818                         template_ins_modes.shift()
2819                         template_ins_modes.unshift ins_mode_in_table_body
2820                         ins_mode = ins_mode_in_table_body
2821                         process_token t
2822                         return
2823                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2824                         template_ins_modes.shift()
2825                         template_ins_modes.unshift ins_mode_in_row
2826                         ins_mode = ins_mode_in_row
2827                         process_token t
2828                         return
             # any start tag not matched above
2829                 if t.type is TYPE_START_TAG
2830                         template_ins_modes.shift()
2831                         template_ins_modes.unshift ins_mode_in_body
2832                         ins_mode = ins_mode_in_body
2833                         process_token t
2834                         return
             # any end tag other than </template> (handled above)
2835                 if t.type is TYPE_END_TAG
2836                         parse_error()
2837                         return
2838                 if t.type is TYPE_EOF
2839                         unless template_tag_is_open()
2840                                 stop_parsing()
2841                                 return
2842                         parse_error()
                     # pop elements until the HTML <template> itself has been popped
2843                         loop
2844                                 el = open_els.shift()
2845                                 if el.name is 'template' and el.namespace is NS_HTML
2846                                         break
2847                         clear_afe_to_marker()
2848                         template_ins_modes.shift()
2849                         reset_ins_mode()
                     # NOTE(review): no explicit return here, so this path
                     # returns whatever process_token returns
2850                         process_token t
2851
2852         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
             #
             # Handler for the "after body" insertion mode.
             #
             # Whitespace and <html> start tags are delegated to ins_mode_in_body,
             # comments are appended to the element at the far end of open_els,
             # </html> switches to ins_mode_after_after_body (unless parsing a
             # fragment), EOF stops parsing, and anything else is a parse error
             # that switches back to ins_mode_in_body and reprocesses the token.
2853         ins_mode_after_body = (t) ->
2854                 if is_space_tok t
2855                         ins_mode_in_body t
2856                         return
2857                 if t.type is TYPE_COMMENT
                     # last entry of open_els; the comment becomes its last child
2858                         first = open_els[open_els.length - 1]
2859                         insert_comment t, [first, first.children.length]
2860                         return
2861                 if t.type is TYPE_DOCTYPE
2862                         parse_error()
2863                         return
2864                 if t.type is TYPE_START_TAG and t.name is 'html'
2865                         ins_mode_in_body t
2866                         return
2867                 if t.type is TYPE_END_TAG and t.name is 'html'
2868                         if flag_fragment_parsing
2869                                 parse_error()
2870                                 return
2871                         ins_mode = ins_mode_after_after_body
2872                         return
2873                 if t.type is TYPE_EOF
2874                         stop_parsing()
2875                         return
2876                 # Anything else
2877                 parse_error()
2878                 ins_mode = ins_mode_in_body
             # NOTE(review): no explicit return, so the result of process_token
             # is returned on this path
2879                 process_token t
2880
2881         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
             #
             # Handler for the "in frameset" insertion mode.
             #
             # Inserts whitespace, comments, <frameset> and <frame> elements,
             # delegates <html> to ins_mode_in_body and <noframes> to
             # ins_mode_in_head, handles </frameset> and EOF, and reports a parse
             # error for every other token (which is then dropped).
2882         ins_mode_in_frameset = (t) ->
2883                 if is_space_tok t
2884                         insert_character t
2885                         return
2886                 if t.type is TYPE_COMMENT
2887                         insert_comment t
2888                         return
2889                 if t.type is TYPE_DOCTYPE
2890                         parse_error()
2891                         return
2892                 if t.type is TYPE_START_TAG and t.name is 'html'
2893                         ins_mode_in_body t
2894                         return
2895                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2896                         insert_html_element t
2897                         return
2898                 if t.type is TYPE_END_TAG and t.name is 'frameset'
                     # a single open element is treated as "only the root is
                     # open": nothing to close
2899                         if open_els.length is 1
2900                                 parse_error()
2901                                 return # fragment case
2902                         open_els.shift()
                     # leave this mode once the new current node is no longer
                     # a frameset (never when parsing a fragment)
2903                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2904                                 ins_mode = ins_mode_after_frameset
2905                         return
2906                 if t.type is TYPE_START_TAG and t.name is 'frame'
                     # <frame> is inserted and immediately popped again
2907                         insert_html_element t
2908                         open_els.shift()
2909                         t.acknowledge_self_closing()
2910                         return
2911                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2912                         ins_mode_in_head t
2913                         return
2914                 if t.type is TYPE_EOF
                     # more than one element still open at EOF is a parse error
2915                         if open_els.length isnt 1
2916                                 parse_error()
2917                         stop_parsing()
2918                         return
2919                 # Anything else
2920                 parse_error()
2921                 return
2922
2923         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
             #
             # Handler for the "after frameset" insertion mode.
             #
             # Inserts whitespace and comments, delegates <html> to
             # ins_mode_in_body and <noframes> to ins_mode_in_head, switches to
             # ins_mode_after_after_frameset on </html>, stops parsing on EOF, and
             # reports a parse error for everything else (the token is dropped).
2924         ins_mode_after_frameset = (t) ->
2925                 if is_space_tok t
2926                         insert_character t
2927                         return
2928                 if t.type is TYPE_COMMENT
2929                         insert_comment t
2930                         return
2931                 if t.type is TYPE_DOCTYPE
2932                         parse_error()
2933                         return
2934                 if t.type is TYPE_START_TAG and t.name is 'html'
2935                         ins_mode_in_body t
2936                         return
2937                 if t.type is TYPE_END_TAG and t.name is 'html'
2938                         ins_mode = ins_mode_after_after_frameset
2939                         return
2940                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2941                         ins_mode_in_head t
2942                         return
2943                 if t.type is TYPE_EOF
2944                         stop_parsing()
2945                         return
2946                 # Anything else
2947                 parse_error()
2948                 return
2949
2950         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
             #
             # Handler for the "after after body" insertion mode.
             #
             # Comments are appended as the last child of doc. Doctypes, whitespace
             # and <html> start tags are delegated to ins_mode_in_body. EOF stops
             # parsing. Anything else is a parse error that switches back to
             # ins_mode_in_body and reprocesses the token.
2951         ins_mode_after_after_body = (t) ->
2952                 if t.type is TYPE_COMMENT
2953                         insert_comment t, [doc, doc.children.length]
2954                         return
2955                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2956                         ins_mode_in_body t
2957                         return
2958                 if t.type is TYPE_EOF
2959                         stop_parsing()
2960                         return
2961                 # Anything else
2962                 parse_error()
2963                 ins_mode = ins_mode_in_body
2964                 process_token t
2965                 return
2966
2967         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
             #
             # Handler for the "after after frameset" insertion mode.
             #
             # Comments are appended as the last child of doc. Doctypes, whitespace
             # and <html> start tags are delegated to ins_mode_in_body, <noframes>
             # to ins_mode_in_head. EOF stops parsing. Anything else is a parse
             # error and the token is dropped.
2968         ins_mode_after_after_frameset = (t) ->
2969                 if t.type is TYPE_COMMENT
2970                         insert_comment t, [doc, doc.children.length]
2971                         return
2972                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2973                         ins_mode_in_body t
2974                         return
2975                 if t.type is TYPE_EOF
2976                         stop_parsing()
2977                         return
2978                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2979                         ins_mode_in_head t
2980                         return
2981                 # Anything else
2982                 parse_error()
2983                 return
2984
2985         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
             #
             # Returns true when any entry of t.attrs_a has 'color', 'face' or
             # 'size' as its first item (the attribute name), false otherwise.
2986         has_color_face_or_size = (t) ->
2987                 for a in t.attrs_a
2988                         continue unless a[0] in ['color', 'face', 'size']
2989                         return true
2990                 return false
2991         in_foreign_content_end_script = ->
             # Shared handling for the end of a foreign-content script element:
             # pops the current node off open_els. Nothing else is implemented
             # yet (see the "fixfull" marker below).
2992                 open_els.shift()
2993                 # fixfull
2994                 return
2995         in_foreign_content_other_start = (t) ->
             # "Any other start tag" handling for foreign content: adjusts the
             # token's name/attributes according to the namespace of the adjusted
             # current node, inserts the element in that namespace, and pops it
             # again right away when the token is flagged self-closing.
2996                 acn = adjusted_current_node()
2997                 if acn.namespace is NS_MATHML
2998                         adjust_mathml_attributes t
             # svg_name_fixes maps a tag name to its replacement; the token is
             # renamed in place
2999                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3000                         t.name = svg_name_fixes[t.name]
3001                 if acn.namespace is NS_SVG
3002                         adjust_svg_attributes t
3003                 adjust_foreign_attributes t
3004                 insert_foreign_element t, acn.namespace
3005                 if t.flag 'self-closing'
3006                         if t.name is 'script'
3007                                 t.acknowledge_self_closing()
3008                                 in_foreign_content_end_script()
3009                                 # fixfull
3010                         else
3011                                 open_els.shift()
3012                                 t.acknowledge_self_closing()
3013                 return
3014         in_foreign_content = (t) ->
             # Token handler for foreign (non-HTML namespace) content.
             #
             # Text and comments are inserted (a lone NULL character is replaced
             # with U+FFFD). The listed HTML start tags break out of foreign
             # content: elements are popped until an integration point or an HTML
             # element is current, then the token is reprocessed. Other start tags
             # go to in_foreign_content_other_start. End tags walk open_els
             # looking for a case-insensitive name match.
3015                 if t.type is TYPE_TEXT and t.text is "\u0000"
3016                         parse_error()
3017                         insert_character new_character_token "\ufffd"
3018                         return
3019                 if is_space_tok t
3020                         insert_character t
3021                         return
3022                 if t.type is TYPE_TEXT
                     # non-whitespace text clears the frameset-ok flag
3023                         flag_frameset_ok = false
3024                         insert_character t
3025                         return
3026                 if t.type is TYPE_COMMENT
3027                         insert_comment t
3028                         return
3029                 if t.type is TYPE_DOCTYPE
3030                         parse_error()
3031                         return
3032                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3033                         parse_error()
                     # when parsing a fragment the tag is treated like any
                     # other start tag
3034                         if flag_fragment_parsing
3035                                 in_foreign_content_other_start t
3036                                 return
                     # pop at least one element, then keep popping until the
                     # current node is an integration point or an HTML element
3037                         loop # is this safe?
3038                                 open_els.shift()
3039                                 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3040                                         break
3041                         process_token t
3042                         return
3043                 if t.type is TYPE_START_TAG
3044                         in_foreign_content_other_start t
3045                         return
3046                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3047                         in_foreign_content_end_script()
3048                         return
3049                 if t.type is TYPE_END_TAG
                     # walk from the current node (index 0) toward the far end
                     # of open_els
3050                         i = 0
3051                         node = open_els[i]
3052                         if node.name.toLowerCase() isnt t.name
3053                                 parse_error()
3054                         loop
                             # reached the last entry of open_els: give up
3055                                 if node is open_els[open_els.length - 1]
3056                                         return
                             # name matches: pop everything up to and
                             # including node
3057                                 if node.name.toLowerCase() is t.name
3058                                         loop
3059                                                 el = open_els.shift()
3060                                                 if el is node
3061                                                         return
3062                                 i += 1
3063                                 node = open_els[i]
                             # an HTML element ends the walk; the token is
                             # then handled by the current insertion mode
3064                                 if node.namespace is NS_HTML
3065                                         break
3066                         ins_mode t # explicitly call HTML insertion mode
3067
3068
3069         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
             #
             # Tokenizer "data" state. Consumes one character of txt and returns a
             # text token or an EOF token, or null when only tok_state changed.
3070         tok_state_data = ->
3071                 switch c = txt.charAt(cur++)
3072                         when '&'
3073                                 return new_text_node parse_character_reference()
3074                         when '<'
3075                                 tok_state = tok_state_tag_open
3076                         when "\u0000"
3077                                 parse_error()
                                 # In the data state the spec emits the NULL
                                 # character itself (not U+FFFD, unlike the
                                 # RCDATA/RAWTEXT/script data/PLAINTEXT states);
                                 # the tree construction stage decides what to
                                 # do with it (e.g. in_foreign_content replaces
                                 # it with U+FFFD).
3078                                 return new_text_node c
3079                         when '' # EOF
3080                                 return new_eof_token()
3081                         else
3082                                 return new_text_node c
3083                 return null
3084
3085         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3086         # not needed: tok_state_character_reference_in_data = ->
3087         # just call parse_character_reference()
3088
3089         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
             #
             # Tokenizer "RCDATA" state. Consumes one character of txt and returns
             # the token to emit, or null when only tok_state changed.
3090         tok_state_rcdata = ->
3091                 c = txt.charAt(cur++)
3092                 if c is '&'
3093                         return new_text_node parse_character_reference()
3094                 if c is '<'
3095                         tok_state = tok_state_rcdata_less_than_sign
3096                         return null
3097                 if c is "\u0000"
3098                         parse_error()
3099                         return new_character_token "\ufffd"
3100                 if c is '' # EOF
3101                         return new_eof_token()
3102                 # Anything else
3103                 return new_character_token c
3104
3105         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3106         # not needed: tok_state_character_reference_in_rcdata = ->
3107         # just call parse_character_reference()
3108
3109         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
             #
             # Tokenizer "RAWTEXT" state. Consumes one character of txt and returns
             # the token to emit, or null when only tok_state changed.
3110         tok_state_rawtext = ->
3111                 c = txt.charAt(cur++)
3112                 if c is '<'
3113                         tok_state = tok_state_rawtext_less_than_sign
3114                         return null
3115                 if c is "\u0000"
3116                         parse_error()
3117                         return new_character_token "\ufffd"
3118                 if c is '' # EOF
3119                         return new_eof_token()
3120                 # Anything else
3121                 return new_character_token c
3122
3123         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
             #
             # Tokenizer "script data" state. Consumes one character of txt and
             # returns the token to emit, or null when only tok_state changed.
3124         tok_state_script_data = ->
3125                 c = txt.charAt(cur++)
3126                 if c is '<'
3127                         tok_state = tok_state_script_data_less_than_sign
3128                         return null
3129                 if c is "\u0000"
3130                         parse_error()
3131                         return new_character_token "\ufffd"
3132                 if c is '' # EOF
3133                         return new_eof_token()
3134                 # Anything else
3135                 return new_character_token c
3136
3137         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
             #
             # Tokenizer "PLAINTEXT" state. Consumes one character of txt and
             # always returns a token: EOF, U+FFFD for NULL, or the character.
3138         tok_state_plaintext = ->
3139                 c = txt.charAt(cur++)
3140                 # charAt yields the empty string past the end of txt
3141                 if c is '' # EOF
3142                         return new_eof_token()
3143                 if c is "\u0000"
3144                         parse_error()
3145                         return new_character_token "\ufffd"
3146                 # Anything else
3147                 return new_character_token c
3148
3149
3150         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
             #
             # Tokenizer "tag open" state (the character after '<'). Either
             # switches tok_state (returning nothing), starting a new tag or
             # comment token in tok_cur_tag where needed, or falls back to the data
             # state and returns a text token for the literal '<'.
3151         tok_state_tag_open = ->
3152                 c = txt.charAt(cur++)
3153                 if c is '!'
3154                         tok_state = tok_state_markup_declaration_open
3155                         return
3156                 if c is '/'
3157                         tok_state = tok_state_end_tag_open
3158                         return
             # tag names are stored lowercased
3159                 if is_uc_alpha(c)
3160                         tok_cur_tag = new_open_tag c.toLowerCase()
3161                         tok_state = tok_state_tag_name
3162                         return
3163                 if is_lc_alpha(c)
3164                         tok_cur_tag = new_open_tag c
3165                         tok_state = tok_state_tag_name
3166                         return
3167                 if c is '?'
3168                         parse_error()
                     # the '?' becomes the first character of a bogus comment
3169                         tok_cur_tag = new_comment_token '?' # FIXME right?
3170                         tok_state = tok_state_bogus_comment
3171                         return
3172                 # Anything else
3173                 parse_error()
3174                 tok_state = tok_state_data
3175                 cur -= 1 # we didn't parse/handle the char after <
3176                 return new_text_node '<'
3177
3178         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
             #
             # Tokenizer "end tag open" state (the character after '</'). A letter
             # starts a new end tag token in tok_cur_tag; '>' and EOF go back to
             # the data state (EOF also returns the literal '</' as text); any
             # other character starts a bogus comment.
3179         tok_state_end_tag_open = ->
3180                 c = txt.charAt(cur++)
3181                 if is_uc_alpha(c)
3182                         tok_cur_tag = new_end_tag c.toLowerCase()
3183                         tok_state = tok_state_tag_name
3184                         return
3185                 if is_lc_alpha(c)
3186                         tok_cur_tag = new_end_tag c
3187                         tok_state = tok_state_tag_name
3188                         return
             # "</>" produces no token at all
3189                 if c is '>'
3190                         parse_error()
3191                         tok_state = tok_state_data
3192                         return
3193                 if c is '' # EOF
3194                         parse_error()
3195                         tok_state = tok_state_data
3196                         return new_text_node '</'
3197                 # Anything else
3198                 parse_error()
             # the offending character becomes the start of the comment data
3199                 tok_cur_tag = new_comment_token c
3200                 tok_state = tok_state_bogus_comment
3201                 return null
3202
3203         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
             #
             # Tokenizer "tag name" state. Appends characters to tok_cur_tag.name
             # (uppercase letters lowercased, NULL replaced by U+FFFD) until
             # whitespace, '/', '>' or EOF. Returns the finished tag token on '>'
             # and null otherwise.
3204         tok_state_tag_name = ->
3205                 switch c = txt.charAt(cur++)
3206                         when "\t", "\n", "\u000c", ' '
3207                                 tok_state = tok_state_before_attribute_name
3208                         when '/'
3209                                 tok_state = tok_state_self_closing_start_tag
3210                         when '>'
3211                                 tok_state = tok_state_data
                             # hand the tag out and clear the tokenizer's
                             # reference to it
3212                                 tmp = tok_cur_tag
3213                                 tok_cur_tag = null
3214                                 return tmp
3215                         when "\u0000"
3216                                 parse_error()
3217                                 tok_cur_tag.name += "\ufffd"
3218                         when '' # EOF
                             # the unfinished tag is not returned
3219                                 parse_error()
3220                                 tok_state = tok_state_data
3221                         else
3222                                 if is_uc_alpha(c)
3223                                         tok_cur_tag.name += c.toLowerCase()
3224                                 else
3225                                         tok_cur_tag.name += c
3226                 return null
3227
3228         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
             #
             # Tokenizer "RCDATA less-than sign" state. A '/' empties
             # temporary_buffer and moves on to the end-tag-open state (returning
             # null); anything else is reconsumed in the RCDATA state and a '<'
             # character token is returned.
3229         tok_state_rcdata_less_than_sign = ->
3230                 c = txt.charAt(cur++)
3231                 unless c is '/'
3232                         # Anything else
3233                         cur -= 1 # reconsume the input character
3234                         tok_state = tok_state_rcdata
3235                         return new_character_token '<'
3236                 temporary_buffer = ''
3237                 tok_state = tok_state_rcdata_end_tag_open
3238                 return null
3239
3240         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
             #
             # Tokenizer "RCDATA end tag open" state (the character after '</'
             # inside RCDATA). A letter starts an end tag token in tok_cur_tag and
             # is also recorded in temporary_buffer; anything else is reconsumed in
             # the RCDATA state and the literal '</' is returned as a character
             # token.
3241         tok_state_rcdata_end_tag_open = ->
3242                 c = txt.charAt(cur++)
3243                 if is_uc_alpha(c)
                     # the tag name is lowercased, temporary_buffer keeps the
                     # character as written
3244                         tok_cur_tag = new_end_tag c.toLowerCase()
3245                         temporary_buffer += c
3246                         tok_state = tok_state_rcdata_end_tag_name
3247                         return null
3248                 if is_lc_alpha(c)
3249                         tok_cur_tag = new_end_tag c
3250                         temporary_buffer += c
3251                         tok_state = tok_state_rcdata_end_tag_name
3252                         return null
3253                 # Anything else
3254                 tok_state = tok_state_rcdata
3255                 cur -= 1 # reconsume the input character
3256                 return new_character_token "</" # fixfull separate these
3257
3258         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
             #
             # Returns true when t is an end tag token whose name equals the name
             # of the current node (open_els[0]). Also writes a debug_log line
             # describing t and open_els on every call.
3259         is_appropriate_end_tag = (t) ->
3260                 # spec says to check against "the tag name of the last start tag to
3261                 # have been emitted from this tokenizer", but this is only called from
3262                 # the various "raw" states, so it's hopefully ok to assume that
3263                 # open_els[0].name will work instead TODO: verify this after the script
3264                 # data states are implemented
3265                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3266                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3267
3268         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
             #
             # Tokenizer "RCDATA end tag name" state. Letters extend both
             # tok_cur_tag.name (lowercased) and temporary_buffer (as written).
             # Whitespace, '/' and '>' finish the tag only when it is an
             # appropriate end tag; otherwise, and for every other character, the
             # text consumed so far ('</' + temporary_buffer) is returned as a
             # character token and the character is reconsumed in the RCDATA state.
3269         tok_state_rcdata_end_tag_name = ->
3270                 c = txt.charAt(cur++)
3271                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3272                         if is_appropriate_end_tag tok_cur_tag
3273                                 tok_state = tok_state_before_attribute_name
3274                                 return
3275                         # else fall through to "Anything else"
3276                 if c is '/'
3277                         if is_appropriate_end_tag tok_cur_tag
3278                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3279                                 return
3280                         # else fall through to "Anything else"
3281                 if c is '>'
3282                         if is_appropriate_end_tag tok_cur_tag
3283                                 tok_state = tok_state_data
3284                                 return tok_cur_tag
3285                         # else fall through to "Anything else"
3286                 if is_uc_alpha(c)
3287                         tok_cur_tag.name += c.toLowerCase()
3288                         temporary_buffer += c
3289                         return null
3290                 if is_lc_alpha(c)
3291                         tok_cur_tag.name += c
3292                         temporary_buffer += c
3293                         return null
3294                 # Anything else
3295                 tok_state = tok_state_rcdata
3296                 cur -= 1 # reconsume the input character
3297                 return new_character_token '</' + temporary_buffer # fixfull separate these
3298
3299         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
             #
             # Tokenizer "RAWTEXT less-than sign" state. A '/' empties
             # temporary_buffer and moves on to the end-tag-open state (returning
             # null); anything else is reconsumed in the RAWTEXT state and a '<'
             # character token is returned.
3300         tok_state_rawtext_less_than_sign = ->
3301                 c = txt.charAt(cur++)
3302                 if c isnt '/'
3303                         # Anything else
3304                         cur -= 1 # reconsume the input character
3305                         tok_state = tok_state_rawtext
3306                         return new_character_token '<'
3307                 temporary_buffer = ''
3308                 tok_state = tok_state_rawtext_end_tag_open
3309                 return null
3310
3311         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
             #
             # Tokenizer "RAWTEXT end tag open" state (the character after '</'
             # inside RAWTEXT). A letter starts an end tag token in tok_cur_tag and
             # is also recorded in temporary_buffer; anything else is reconsumed in
             # the RAWTEXT state and the literal '</' is returned as a character
             # token.
3312         tok_state_rawtext_end_tag_open = ->
3313                 c = txt.charAt(cur++)
3314                 if is_uc_alpha(c)
                     # the tag name is lowercased, temporary_buffer keeps the
                     # character as written
3315                         tok_cur_tag = new_end_tag c.toLowerCase()
3316                         temporary_buffer += c
3317                         tok_state = tok_state_rawtext_end_tag_name
3318                         return null
3319                 if is_lc_alpha(c)
3320                         tok_cur_tag = new_end_tag c
3321                         temporary_buffer += c
3322                         tok_state = tok_state_rawtext_end_tag_name
3323                         return null
3324                 # Anything else
3325                 tok_state = tok_state_rawtext
3326                 cur -= 1 # reconsume the input character
3327                 return new_character_token "</" # fixfull separate these
3328
3329         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
             #
             # Tokenizer "RAWTEXT end tag name" state. Letters extend both
             # tok_cur_tag.name (lowercased) and temporary_buffer (as written).
             # Whitespace, '/' and '>' finish the tag only when it is an
             # appropriate end tag; otherwise, and for every other character, the
             # text consumed so far ('</' + temporary_buffer) is returned as a
             # character token and the character is reconsumed in the RAWTEXT
             # state.
3330         tok_state_rawtext_end_tag_name = ->
3331                 c = txt.charAt(cur++)
3332                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3333                         if is_appropriate_end_tag tok_cur_tag
3334                                 tok_state = tok_state_before_attribute_name
3335                                 return
3336                         # else fall through to "Anything else"
3337                 if c is '/'
3338                         if is_appropriate_end_tag tok_cur_tag
3339                                 tok_state = tok_state_self_closing_start_tag
3340                                 return
3341                         # else fall through to "Anything else"
3342                 if c is '>'
3343                         if is_appropriate_end_tag tok_cur_tag
3344                                 tok_state = tok_state_data
3345                                 return tok_cur_tag
3346                         # else fall through to "Anything else"
3347                 if is_uc_alpha(c)
3348                         tok_cur_tag.name += c.toLowerCase()
3349                         temporary_buffer += c
3350                         return null
3351                 if is_lc_alpha(c)
3352                         tok_cur_tag.name += c
3353                         temporary_buffer += c
3354                         return null
3355                 # Anything else
3356                 tok_state = tok_state_rawtext
3357                 cur -= 1 # reconsume the input character
3358                 return new_character_token '</' + temporary_buffer # fixfull separate these
3359
3360         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3361         tok_state_script_data_less_than_sign = ->
3362                 c = txt.charAt(cur++)
3363                 if c is '/'
3364                         temporary_buffer = ''
3365                         tok_state = tok_state_script_data_end_tag_open
3366                         return
3367                 if c is '!'
3368                         tok_state = tok_state_script_data_escape_start
3369                         return new_character_token '<!' # fixfull split
3370                 # Anything else
3371                 tok_state = tok_state_script_data
3372                 cur -= 1 # Reconsume
3373                 return new_character_token '<'
3374
3375         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3376         tok_state_script_data_end_tag_open = ->
3377                 c = txt.charAt(cur++)
3378                 if is_uc_alpha(c)
3379                         tok_cur_tag = new_end_tag c.toLowerCase()
3380                         temporary_buffer += c
3381                         tok_state = tok_state_script_data_end_tag_name
3382                         return
3383                 if is_lc_alpha(c)
3384                         tok_cur_tag = new_end_tag c
3385                         temporary_buffer += c
3386                         tok_state = tok_state_script_data_end_tag_name
3387                         return
3388                 # Anything else
3389                 tok_state = tok_state_script_data
3390                 cur -= 1 # Reconsume
3391                 return new_character_token '</'
3392
3393         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-name-state
3394         tok_state_script_data_end_tag_name = ->
3395                 c = txt.charAt(cur++)
3396                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3397                         if is_appropriate_end_tag tok_cur_tag
3398                                 tok_state = tok_state_before_attribute_name
3399                                 return
3400                         # fall through
3401                 if c is '/'
3402                         if is_appropriate_end_tag tok_cur_tag
3403                                 tok_state = tok_state_self_closing_start_tag
3404                                 return
3405                         # fall through
3406                 if c is '>'
3407                         if is_appropriate_end_tag tok_cur_tag
3408                                 tok_state = tok_state_data
3409                                 return tok_cur_tag
3410                         # fall through
3411                 if is_uc_alpha(c)
3412                         tok_cur_tag.name += c.toLowerCase()
3413                         temporary_buffer += c
3414                         return
3415                 if is_lc_alpha(c)
3416                         tok_cur_tag.name += c
3417                         temporary_buffer += c
3418                         return
3419                 # Anything else
3420                 tok_state = tok_state_script_data
3421                 cur -= 1 # Reconsume
3422                 return new_character_token "</#{temporary_buffer}" # fixfull split
3423
3424         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3425         tok_state_script_data_escape_start = ->
3426                 c = txt.charAt(cur++)
3427                 if c is '-'
3428                         tok_state = tok_state_script_data_escape_start_dash
3429                         return new_character_token '-'
3430                 # Anything else
3431                 tok_state = tok_state_script_data
3432                 cur -= 1 # Reconsume
3433                 return
3434
3435         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3436         tok_state_script_data_escape_start_dash = ->
3437                 c = txt.charAt(cur++)
3438                 if c is '-'
3439                         tok_state = tok_state_script_data_escaped_dash_dash
3440                         return new_character_token '-'
3441                 # Anything else
3442                 tok_state = tok_state_script_data
3443                 cur -= 1 # Reconsume
3444                 return
3445
3446         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3447         tok_state_script_data_escaped = ->
3448                 c = txt.charAt(cur++)
3449                 if c is '-'
3450                         tok_state = tok_state_script_data_escaped_dash
3451                         return new_character_token '-'
3452                 if c is '<'
3453                         tok_state = tok_state_script_data_escaped_less_than_sign
3454                         return
3455                 if c is "\u0000"
3456                         parse_error()
3457                         return new_character_token "\ufffd"
3458                 if c is '' # EOF
3459                         tok_state = tok_state_data
3460                         parse_error()
3461                         cur -= 1 # Reconsume
3462                         return
3463                 # Anything else
3464                 return new_character_token c
3465
3466         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3467         tok_state_script_data_escaped_dash = ->
3468                 c = txt.charAt(cur++)
3469                 if c is '-'
3470                         tok_state = tok_state_script_data_escaped_dash_dash
3471                         return new_character_token '-'
3472                 if c is '<'
3473                         tok_state = tok_state_script_data_escaped_less_than_sign
3474                         return
3475                 if c is "\u0000"
3476                         parse_error()
3477                         tok_state = tok_state_script_data_escaped
3478                         return new_character_token "\ufffd"
3479                 if c is '' # EOF
3480                         tok_state = tok_state_data
3481                         parse_error()
3482                         cur -= 1 # Reconsume
3483                         return
3484                 # Anything else
3485                 tok_state = tok_state_script_data_escaped
3486                 return new_character_token c
3487
3488         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3489         tok_state_script_data_escaped_dash_dash = ->
3490                 c = txt.charAt(cur++)
3491                 if c is '-'
3492                         return new_character_token '-'
3493                 if c is '<'
3494                         tok_state = tok_state_script_data_escaped_less_than_sign
3495                         return
3496                 if c is '>'
3497                         tok_state = tok_state_script_data
3498                         return new_character_token '>'
3499                 if c is "\u0000"
3500                         parse_error()
3501                         tok_state = tok_state_script_data_escaped
3502                         return new_character_token "\ufffd"
3503                 if c is '' # EOF
3504                         parse_error()
3505                         tok_state = tok_state_data
3506                         cur -= 1 # Reconsume
3507                         return
3508                 # Anything else
3509                 tok_state = tok_state_script_data_escaped
3510                 return new_character_token c
3511
3512         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3513         tok_state_script_data_escaped_less_than_sign = ->
3514                 c = txt.charAt(cur++)
3515                 if c is '/'
3516                         temporary_buffer = ''
3517                         tok_state = tok_state_script_data_escaped_end_tag_open
3518                         return
3519                 if is_uc_alpha(c)
3520                         temporary_buffer = c.toLowerCase() # yes, really
3521                         tok_state = tok_state_script_data_double_escape_start
3522                         return new_character_token "<#{c}" # fixfull split
3523                 if is_lc_alpha(c)
3524                         temporary_buffer = c
3525                         tok_state = tok_state_script_data_double_escape_start
3526                         return new_character_token "<#{c}" # fixfull split
3527                 # Anything else
3528                 tok_state = tok_state_script_data_escaped
3529                 cur -= 1 # Reconsume
3530                 return new_character_token '<'
3531
3532         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3533         tok_state_script_data_escaped_end_tag_open = ->
3534                 c = txt.charAt(cur++)
3535                 if is_uc_alpha(c)
3536                         tok_cur_tag = new_end_tag c.toLowerCase()
3537                         temporary_buffer += c
3538                         tok_state = tok_state_script_data_escaped_end_tag_name
3539                         return
3540                 if is_lc_alpha(c)
3541                         tok_cur_tag = new_end_tag c
3542                         temporary_buffer += c
3543                         tok_state = tok_state_script_data_escaped_end_tag_name
3544                         return
3545                 # Anything else
3546                 tok_state = tok_state_script_data_escaped
3547                 cur -= 1 # Reconsume
3548                 return new_character_token '</' # fixfull split
3549
3550         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3551         tok_state_script_data_escaped_end_tag_name = ->
3552                 c = txt.charAt(cur++)
3553                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3554                         if is_appropriate_end_tag tok_cur_tag
3555                                 tok_state = tok_state_before_attribute_name
3556                                 return
3557                         # fall through
3558                 if c is '/'
3559                         if is_appropriate_end_tag tok_cur_tag
3560                                 tok_state = tok_state_self_closing_start_tag
3561                                 return
3562                         # fall through
3563                 if c is '>'
3564                         if is_appropriate_end_tag tok_cur_tag
3565                                 tok_state = tok_state_data
3566                                 return tok_cur_tag
3567                         # fall through
3568                 if is_uc_alpha(c)
3569                         tok_cur_tag.name += c.toLowerCase()
3570                         temporary_buffer += c.toLowerCase()
3571                         return
3572                 if is_lc_alpha(c)
3573                         tok_cur_tag.name += c
3574                         temporary_buffer += c.toLowerCase()
3575                         return
3576                 # Anything else
3577                 tok_state = tok_state_script_data_escaped
3578                 cur -= 1 # Reconsume
3579                 return new_character_token "</#{temporary_buffer}" # fixfull split
3580
3581         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3582         tok_state_script_data_double_escape_start = ->
3583                 c = txt.charAt(cur++)
3584                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3585                         if temporary_buffer is 'script'
3586                                 tok_state = tok_state_script_data_double_escaped
3587                         else
3588                                 tok_state = tok_state_script_data_escaped
3589                         return new_character_token c
3590                 if is_uc_alpha(c)
3591                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3592                         return new_character_token c
3593                 if is_lc_alpha(c)
3594                         temporary_buffer += c
3595                         return new_character_token c
3596                 # Anything else
3597                 tok_state = tok_state_script_data_escaped
3598                 cur -= 1 # Reconsume
3599                 return
3600
3601         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3602         tok_state_script_data_double_escaped = ->
3603                 c = txt.charAt(cur++)
3604                 if c is '-'
3605                         tok_state = tok_state_script_data_double_escaped_dash
3606                         return new_character_token '-'
3607                 if c is '<'
3608                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3609                         return new_character_token '<'
3610                 if c is "\u0000"
3611                         parse_error()
3612                         return new_character_token "\ufffd"
3613                 if c is '' # EOF
3614                         parse_error()
3615                         tok_state = tok_state_data
3616                         cur -= 1 # Reconsume
3617                         return
3618                 # Anything else
3619                 return new_character_token c
3620
3621         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3622         tok_state_script_data_double_escaped_dash = ->
3623                 c = txt.charAt(cur++)
3624                 if c is '-'
3625                         tok_state = tok_state_script_data_double_escaped_dash_dash
3626                         return new_character_token '-'
3627                 if c is '<'
3628                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3629                         return new_character_token '<'
3630                 if c is "\u0000"
3631                         parse_error()
3632                         tok_state = tok_state_script_data_double_escaped
3633                         return new_character_token "\ufffd"
3634                 if c is '' # EOF
3635                         parse_error()
3636                         tok_state = tok_state_data
3637                         cur -= 1 # Reconsume
3638                         return
3639                 # Anything else
3640                 tok_state = tok_state_script_data_double_escaped
3641                 return new_character_token c
3642
3643         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3644         tok_state_script_data_double_escaped_dash_dash = ->
3645                 c = txt.charAt(cur++)
3646                 if c is '-'
3647                         return new_character_token '-'
3648                 if c is '<'
3649                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3650                         return new_character_token '<'
3651                 if c is '>'
3652                         tok_state = tok_state_script_data
3653                         return new_character_token '>'
3654                 if c is "\u0000"
3655                         parse_error()
3656                         tok_state = tok_state_script_data_double_escaped
3657                         return new_character_token "\ufffd"
3658                 if c is '' # EOF
3659                         parse_error()
3660                         tok_state = tok_state_data
3661                         cur -= 1 # Reconsume
3662                         return
3663                 # Anything else
3664                 tok_state = tok_state_script_data_double_escaped
3665                 return new_character_token c
3666
3667         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3668         tok_state_script_data_double_escaped_less_than_sign = ->
3669                 c = txt.charAt(cur++)
3670                 if c is '/'
3671                         temporary_buffer = ''
3672                         tok_state = tok_state_script_data_double_escape_end
3673                         return new_character_token '/'
3674                 # Anything else
3675                 tok_state = tok_state_script_data_double_escaped
3676                 cur -= 1 # Reconsume
3677                 return
3678
3679         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3680         tok_state_script_data_double_escape_end = ->
3681                 c = txt.charAt(cur++)
3682                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3683                         if temporary_buffer is 'script'
3684                                 tok_state = tok_state_script_data_escaped
3685                         else
3686                                 tok_state = tok_state_script_data_double_escaped
3687                         return new_character_token c
3688                 if is_uc_alpha(c)
3689                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3690                         return new_character_token c
3691                 if is_lc_alpha(c)
3692                         temporary_buffer += c
3693                         return new_character_token c
3694                 # Anything else
3695                 tok_state = tok_state_script_data_double_escaped
3696                 cur -= 1 # Reconsume
3697                 return
3698
3699         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3700         tok_state_before_attribute_name = ->
3701                 attr_name = null
3702                 switch c = txt.charAt(cur++)
3703                         when "\t", "\n", "\u000c", ' '
3704                                 return null
3705                         when '/'
3706                                 tok_state = tok_state_self_closing_start_tag
3707                                 return null
3708                         when '>'
3709                                 tok_state = tok_state_data
3710                                 tmp = tok_cur_tag
3711                                 tok_cur_tag = null
3712                                 return tmp
3713                         when "\u0000"
3714                                 parse_error()
3715                                 attr_name = "\ufffd"
3716                         when '"', "'", '<', '='
3717                                 parse_error()
3718                                 attr_name = c
3719                         when '' # EOF
3720                                 parse_error()
3721                                 tok_state = tok_state_data
3722                         else
3723                                 if is_uc_alpha(c)
3724                                         attr_name = c.toLowerCase()
3725                                 else
3726                                         attr_name = c
3727                 if attr_name?
3728                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3729                         tok_state = tok_state_attribute_name
3730                 return null
3731
3732         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3733         tok_state_attribute_name = ->
3734                 switch c = txt.charAt(cur++)
3735                         when "\t", "\n", "\u000c", ' '
3736                                 tok_state = tok_state_after_attribute_name
3737                         when '/'
3738                                 tok_state = tok_state_self_closing_start_tag
3739                         when '='
3740                                 tok_state = tok_state_before_attribute_value
3741                         when '>'
3742                                 tok_state = tok_state_data
3743                                 tmp = tok_cur_tag
3744                                 tok_cur_tag = null
3745                                 return tmp
3746                         when "\u0000"
3747                                 parse_error()
3748                                 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3749                         when '"', "'", '<'
3750                                 parse_error()
3751                                 tok_cur_tag.attrs_a[0][0] += c
3752                         when '' # EOF
3753                                 parse_error()
3754                                 tok_state = tok_state_data
3755                         else
3756                                 if is_uc_alpha(c)
3757                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3758                                 else
3759                                         tok_cur_tag.attrs_a[0][0] += c
3760                 return null
3761
3762         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3763         tok_state_after_attribute_name = ->
3764                 c = txt.charAt(cur++)
3765                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3766                         return
3767                 if c is '/'
3768                         tok_state = tok_state_self_closing_start_tag
3769                         return
3770                 if c is '='
3771                         tok_state = tok_state_before_attribute_value
3772                         return
3773                 if c is '>'
3774                         tok_state = tok_state_data
3775                         return
3776                 if is_uc_alpha(c)
3777                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3778                         tok_state = tok_state_attribute_name
3779                         return
3780                 if c is "\u0000"
3781                         parse_error()
3782                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3783                         tok_state = tok_state_attribute_name
3784                         return
3785                 if c is '' # EOF
3786                         parse_error()
3787                         tok_state = tok_state_data
3788                         cur -= 1 # reconsume
3789                         return
3790                 if c is '"' or c is "'" or c is '<'
3791                         parse_error()
3792                         # fall through to Anything else
3793                 # Anything else
3794                 tok_cur_tag.attrs_a.unshift [c, '']
3795                 tok_state = tok_state_attribute_name
3796
3797         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3798         tok_state_before_attribute_value = ->
3799                 switch c = txt.charAt(cur++)
3800                         when "\t", "\n", "\u000c", ' '
3801                                 return null
3802                         when '"'
3803                                 tok_state = tok_state_attribute_value_double_quoted
3804                         when '&'
3805                                 tok_state = tok_state_attribute_value_unquoted
3806                                 cur -= 1
3807                         when "'"
3808                                 tok_state = tok_state_attribute_value_single_quoted
3809                         when "\u0000"
3810                                 # Parse error
3811                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3812                                 tok_state = tok_state_attribute_value_unquoted
3813                         when '>'
3814                                 # Parse error
3815                                 tok_state = tok_state_data
3816                                 tmp = tok_cur_tag
3817                                 tok_cur_tag = null
3818                                 return tmp
3819                         when '' # EOF
3820                                 parse_error()
3821                                 tok_state = tok_state_data
3822                         else
3823                                 tok_cur_tag.attrs_a[0][1] += c
3824                                 tok_state = tok_state_attribute_value_unquoted
3825                 return null
3826
3827         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3828         tok_state_attribute_value_double_quoted = ->
3829                 switch c = txt.charAt(cur++)
3830                         when '"'
3831                                 tok_state = tok_state_after_attribute_value_quoted
3832                         when '&'
3833                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3834                         when "\u0000"
3835                                 # Parse error
3836                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3837                         when '' # EOF
3838                                 parse_error()
3839                                 tok_state = tok_state_data
3840                         else
3841                                 tok_cur_tag.attrs_a[0][1] += c
3842                 return null
3843
3844         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3845         tok_state_attribute_value_single_quoted = ->
3846                 switch c = txt.charAt(cur++)
3847                         when "'"
3848                                 tok_state = tok_state_after_attribute_value_quoted
3849                         when '&'
3850                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3851                         when "\u0000"
3852                                 # Parse error
3853                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3854                         when '' # EOF
3855                                 parse_error()
3856                                 tok_state = tok_state_data
3857                         else
3858                                 tok_cur_tag.attrs_a[0][1] += c
3859                 return null
3860
3861         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3862         tok_state_attribute_value_unquoted = ->
3863                 switch c = txt.charAt(cur++)
3864                         when "\t", "\n", "\u000c", ' '
3865                                 tok_state = tok_state_before_attribute_name
3866                         when '&'
3867                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3868                         when '>'
3869                                 tok_state = tok_state_data
3870                                 tmp = tok_cur_tag
3871                                 tok_cur_tag = null
3872                                 return tmp
3873                         when "\u0000"
3874                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3875                         when '' # EOF
3876                                 parse_error()
3877                                 tok_state = tok_state_data
3878                         else
3879                                 # Parse Error if ', <, = or ` (backtick)
3880                                 tok_cur_tag.attrs_a[0][1] += c
3881                 return null
3882
3883         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3884         tok_state_after_attribute_value_quoted = ->
3885                 switch c = txt.charAt(cur++)
3886                         when "\t", "\n", "\u000c", ' '
3887                                 tok_state = tok_state_before_attribute_name
3888                         when '/'
3889                                 tok_state = tok_state_self_closing_start_tag
3890                         when '>'
3891                                 tok_state = tok_state_data
3892                                 tmp = tok_cur_tag
3893                                 tok_cur_tag = null
3894                                 return tmp
3895                         when '' # EOF
3896                                 parse_error()
3897                                 tok_state = tok_state_data
3898                         else
3899                                 # Parse Error
3900                                 tok_state = tok_state_before_attribute_name
3901                                 cur -= 1 # we didn't handle that char
3902                 return null
3903
3904         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3905         tok_state_self_closing_start_tag = ->
3906                 c = txt.charAt(cur++)
3907                 if c is '>'
3908                         tok_cur_tag.flag 'self-closing', true
3909                         tok_state = tok_state_data
3910                         return tok_cur_tag
3911                 if c is ''
3912                         parse_error()
3913                         tok_state = tok_state_data
3914                         cur -= 1 # Reconsume
3915                         return
3916                 # Anything else
3917                 parse_error()
3918                 tok_state = tok_state_before_attribute_name
3919                 cur -= 1 # Reconsume
3920                 return
3921
3922         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3923         # WARNING: put a comment token in tok_cur_tag before setting this state
3924         tok_state_bogus_comment = ->
3925                 next_gt = txt.indexOf '>', cur
3926                 if next_gt is -1
3927                         val = txt.substr cur
3928                         cur = txt.length
3929                 else
3930                         val = txt.substr cur, (next_gt - cur)
3931                         cur = next_gt + 1
3932                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3933                 tok_cur_tag.text += val
3934                 tok_state = tok_state_data
3935                 return tok_cur_tag
3936
3937         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3938         tok_state_markup_declaration_open = ->
3939                 if txt.substr(cur, 2) is '--'
3940                         cur += 2
3941                         tok_cur_tag = new_comment_token ''
3942                         tok_state = tok_state_comment_start
3943                         return
3944                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3945                         cur += 7
3946                         tok_state = tok_state_doctype
3947                         return
3948                 acn = adjusted_current_node()
3949                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3950                         cur += 7
3951                         tok_state = tok_state_cdata_section
3952                         return
3953                 # Otherwise
3954                 parse_error()
3955                 tok_cur_tag = new_comment_token ''
3956                 tok_state = tok_state_bogus_comment
3957                 return
3958
3959         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3960         tok_state_comment_start = ->
3961                 switch c = txt.charAt(cur++)
3962                         when '-'
3963                                 tok_state = tok_state_comment_start_dash
3964                         when "\u0000"
3965                                 parse_error()
3966                                 tok_state = tok_state_comment
3967                                 return new_character_token "\ufffd"
3968                         when '>'
3969                                 parse_error()
3970                                 tok_state = tok_state_data
3971                                 return tok_cur_tag
3972                         when '' # EOF
3973                                 parse_error()
3974                                 tok_state = tok_state_data
3975                                 cur -= 1 # Reconsume
3976                                 return tok_cur_tag
3977                         else
3978                                 tok_cur_tag.text += c
3979                                 tok_state = tok_state_comment
3980                 return null
3981
3982         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3983         tok_state_comment_start_dash = ->
3984                 switch c = txt.charAt(cur++)
3985                         when '-'
3986                                 tok_state = tok_state_comment_end
3987                         when "\u0000"
3988                                 parse_error()
3989                                 tok_cur_tag.text += "-\ufffd"
3990                                 tok_state = tok_state_comment
3991                         when '>'
3992                                 parse_error()
3993                                 tok_state = tok_state_data
3994                                 return tok_cur_tag
3995                         when '' # EOF
3996                                 parse_error()
3997                                 tok_state = tok_state_data
3998                                 cur -= 1 # Reconsume
3999                                 return tok_cur_tag
4000                         else
4001                                 tok_cur_tag.text += "-#{c}"
4002                                 tok_state = tok_state_comment
4003                 return null
4004
4005         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
4006         tok_state_comment = ->
4007                 switch c = txt.charAt(cur++)
4008                         when '-'
4009                                 tok_state = tok_state_comment_end_dash
4010                         when "\u0000"
4011                                 parse_error()
4012                                 tok_cur_tag.text += "\ufffd"
4013                         when '' # EOF
4014                                 parse_error()
4015                                 tok_state = tok_state_data
4016                                 cur -= 1 # Reconsume
4017                                 return tok_cur_tag
4018                         else
4019                                 tok_cur_tag.text += c
4020                 return null
4021
4022         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
4023         tok_state_comment_end_dash = ->
4024                 switch c = txt.charAt(cur++)
4025                         when '-'
4026                                 tok_state = tok_state_comment_end
4027                         when "\u0000"
4028                                 parse_error()
4029                                 tok_cur_tag.text += "-\ufffd"
4030                                 tok_state = tok_state_comment
4031                         when '' # EOF
4032                                 parse_error()
4033                                 tok_state = tok_state_data
4034                                 cur -= 1 # Reconsume
4035                                 return tok_cur_tag
4036                         else
4037                                 tok_cur_tag.text += "-#{c}"
4038                                 tok_state = tok_state_comment
4039                 return null
4040
4041         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
4042         tok_state_comment_end = ->
4043                 switch c = txt.charAt(cur++)
4044                         when '>'
4045                                 tok_state = tok_state_data
4046                                 return tok_cur_tag
4047                         when "\u0000"
4048                                 parse_error()
4049                                 tok_cur_tag.text += "--\ufffd"
4050                                 tok_state = tok_state_comment
4051                         when '!'
4052                                 parse_error()
4053                                 tok_state = tok_state_comment_end_bang
4054                         when '-'
4055                                 parse_error()
4056                                 tok_cur_tag.text += '-'
4057                         when '' # EOF
4058                                 parse_error()
4059                                 tok_state = tok_state_data
4060                                 cur -= 1 # Reconsume
4061                                 return tok_cur_tag
4062                         else
4063                                 parse_error()
4064                                 tok_cur_tag.text += "--#{c}"
4065                                 tok_state = tok_state_comment
4066                 return null
4067
4068         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4069         tok_state_comment_end_bang = ->
4070                 switch c = txt.charAt(cur++)
4071                         when '-'
4072                                 tok_cur_tag.text += "--!#{c}"
4073                                 tok_state = tok_state_comment_end_dash
4074                         when '>'
4075                                 tok_state = tok_state_data
4076                                 return tok_cur_tag
4077                         when "\u0000"
4078                                 parse_error()
4079                                 tok_cur_tag.text += "--!\ufffd"
4080                                 tok_state = tok_state_comment
4081                         when '' # EOF
4082                                 parse_error()
4083                                 tok_state = tok_state_data
4084                                 cur -= 1 # Reconsume
4085                                 return tok_cur_tag
4086                         else
4087                                 tok_cur_tag.text += "--!#{c}"
4088                                 tok_state = tok_state_comment
4089                 return null
4090
4091         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4092         tok_state_doctype = ->
4093                 switch c = txt.charAt(cur++)
4094                         when "\t", "\u000a", "\u000c", ' '
4095                                 tok_state = tok_state_before_doctype_name
4096                         when '' # EOF
4097                                 parse_error()
4098                                 tok_state = tok_state_data
4099                                 el = new_doctype_token ''
4100                                 el.flag 'force-quirks', true
4101                                 cur -= 1 # Reconsume
4102                                 return el
4103                         else
4104                                 parse_error()
4105                                 tok_state = tok_state_before_doctype_name
4106                                 cur -= 1 # Reconsume
4107                 return null
4108
        # 8.2.4.53 http://www.w3.org/TR/html5/syntax.html#before-doctype-name-state
4110         tok_state_before_doctype_name = ->
4111                 c = txt.charAt(cur++)
4112                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4113                         return
4114                 if is_uc_alpha(c)
4115                         tok_cur_tag = new_doctype_token c.toLowerCase()
4116                         tok_state = tok_state_doctype_name
4117                         return
4118                 if c is "\u0000"
4119                         parse_error()
4120                         tok_cur_tag = new_doctype_token "\ufffd"
4121                         tok_state = tok_state_doctype_name
4122                         return
4123                 if c is '>'
4124                         parse_error()
4125                         el = new_doctype_token ''
4126                         el.flag 'force-quirks', true
4127                         tok_state = tok_state_data
4128                         return el
4129                 if c is '' # EOF
4130                         parse_error()
4131                         tok_state = tok_state_data
4132                         el = new_doctype_token ''
4133                         el.flag 'force-quirks', true
4134                         cur -= 1 # Reconsume
4135                         return el
4136                 # Anything else
4137                 tok_cur_tag = new_doctype_token c
4138                 tok_state = tok_state_doctype_name
4139                 return null
4140
4141         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4142         tok_state_doctype_name = ->
4143                 c = txt.charAt(cur++)
4144                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4145                         tok_state = tok_state_after_doctype_name
4146                         return
4147                 if c is '>'
4148                         tok_state = tok_state_data
4149                         return tok_cur_tag
4150                 if is_uc_alpha(c)
4151                         tok_cur_tag.name += c.toLowerCase()
4152                         return
4153                 if c is "\u0000"
4154                         parse_error()
4155                         tok_cur_tag.name += "\ufffd"
4156                         return
4157                 if c is '' # EOF
4158                         parse_error()
4159                         tok_state = tok_state_data
4160                         tok_cur_tag.flag 'force-quirks', true
4161                         cur -= 1 # Reconsume
4162                         return tok_cur_tag
4163                 # Anything else
4164                 tok_cur_tag.name += c
4165                 return null
4166
4167         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4168         tok_state_after_doctype_name = ->
4169                 c = txt.charAt(cur++)
4170                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4171                         return
4172                 if c is '>'
4173                         tok_state = tok_state_data
4174                         return tok_cur_tag
4175                 if c is '' # EOF
4176                         parse_error()
4177                         tok_state = tok_state_data
4178                         tok_cur_tag.flag 'force-quirks', true
4179                         cur -= 1 # Reconsume
4180                         return tok_cur_tag
4181                 # Anything else
4182                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4183                         cur += 5
4184                         tok_state = tok_state_after_doctype_public_keyword
4185                         return
4186                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4187                         cur += 5
4188                         tok_state = tok_state_after_doctype_system_keyword
4189                         return
4190                 parse_error()
4191                 tok_cur_tag.flag 'force-quirks', true
4192                 tok_state = tok_state_bogus_doctype
4193                 return null
4194
4195         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4196         tok_state_after_doctype_public_keyword = ->
4197                 c = txt.charAt(cur++)
4198                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4199                         tok_state = tok_state_before_doctype_public_identifier
4200                         return
4201                 if c is '"'
4202                         parse_error()
4203                         tok_cur_tag.public_identifier = ''
4204                         tok_state = tok_state_doctype_public_identifier_double_quoted
4205                         return
4206                 if c is "'"
4207                         parse_error()
4208                         tok_cur_tag.public_identifier = ''
4209                         tok_state = tok_state_doctype_public_identifier_single_quoted
4210                         return
4211                 if c is '>'
4212                         parse_error()
4213                         tok_cur_tag.flag 'force-quirks', true
4214                         tok_state = tok_state_data
4215                         return tok_cur_tag
4216                 if c is '' # EOF
4217                         parse_error()
4218                         tok_state = tok_state_data
4219                         tok_cur_tag.flag 'force-quirks', true
4220                         cur -= 1 # Reconsume
4221                         return tok_cur_tag
4222                 # Anything else
4223                 parse_error()
4224                 tok_cur_tag.flag 'force-quirks', true
4225                 tok_state = tok_state_bogus_doctype
4226                 return null
4227
4228         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4229         tok_state_before_doctype_public_identifier = ->
4230                 c = txt.charAt(cur++)
4231                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4232                         return
4233                 if c is '"'
4234                         parse_error()
4235                         tok_cur_tag.public_identifier = ''
4236                         tok_state = tok_state_doctype_public_identifier_double_quoted
4237                         return
4238                 if c is "'"
4239                         parse_error()
4240                         tok_cur_tag.public_identifier = ''
4241                         tok_state = tok_state_doctype_public_identifier_single_quoted
4242                         return
4243                 if c is '>'
4244                         parse_error()
4245                         tok_cur_tag.flag 'force-quirks', true
4246                         tok_state = tok_state_data
4247                         return tok_cur_tag
4248                 if c is '' # EOF
4249                         parse_error()
4250                         tok_state = tok_state_data
4251                         tok_cur_tag.flag 'force-quirks', true
4252                         cur -= 1 # Reconsume
4253                         return tok_cur_tag
4254                 # Anything else
4255                 parse_error()
4256                 tok_cur_tag.flag 'force-quirks', true
4257                 tok_state = tok_state_bogus_doctype
4258                 return null
4259
4260
4261         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4262         tok_state_doctype_public_identifier_double_quoted = ->
4263                 c = txt.charAt(cur++)
4264                 if c is '"'
4265                         tok_state = tok_state_after_doctype_public_identifier
4266                         return
4267                 if c is "\u0000"
4268                         parse_error()
4269                         tok_cur_tag.public_identifier += "\ufffd"
4270                         return
4271                 if c is '>'
4272                         parse_error()
4273                         tok_cur_tag.flag 'force-quirks', true
4274                         tok_state = tok_state_data
4275                         return tok_cur_tag
4276                 if c is '' # EOF
4277                         parse_error()
4278                         tok_state = tok_state_data
4279                         tok_cur_tag.flag 'force-quirks', true
4280                         cur -= 1 # Reconsume
4281                         return tok_cur_tag
4282                 # Anything else
4283                 tok_cur_tag.public_identifier += c
4284                 return null
4285
4286         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4287         tok_state_doctype_public_identifier_single_quoted = ->
4288                 c = txt.charAt(cur++)
4289                 if c is "'"
4290                         tok_state = tok_state_after_doctype_public_identifier
4291                         return
4292                 if c is "\u0000"
4293                         parse_error()
4294                         tok_cur_tag.public_identifier += "\ufffd"
4295                         return
4296                 if c is '>'
4297                         parse_error()
4298                         tok_cur_tag.flag 'force-quirks', true
4299                         tok_state = tok_state_data
4300                         return tok_cur_tag
4301                 if c is '' # EOF
4302                         parse_error()
4303                         tok_state = tok_state_data
4304                         tok_cur_tag.flag 'force-quirks', true
4305                         cur -= 1 # Reconsume
4306                         return tok_cur_tag
4307                 # Anything else
4308                 tok_cur_tag.public_identifier += c
4309                 return null
4310
4311         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4312         tok_state_after_doctype_public_identifier = ->
4313                 c = txt.charAt(cur++)
4314                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4315                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4316                         return
4317                 if c is '>'
4318                         tok_state = tok_state_data
4319                         return tok_cur_tag
4320                 if c is '"'
4321                         parse_error()
4322                         tok_cur_tag.system_identifier = ''
4323                         tok_state = tok_state_doctype_system_identifier_double_quoted
4324                         return
4325                 if c is "'"
4326                         parse_error()
4327                         tok_cur_tag.system_identifier = ''
4328                         tok_state = tok_state_doctype_system_identifier_single_quoted
4329                         return
4330                 if c is '' # EOF
4331                         parse_error()
4332                         tok_state = tok_state_data
4333                         tok_cur_tag.flag 'force-quirks', true
4334                         cur -= 1 # Reconsume
4335                         return tok_cur_tag
4336                 # Anything else
4337                 parse_error()
4338                 tok_cur_tag.flag 'force-quirks', true
4339                 tok_state = tok_state_bogus_doctype
4340                 return null
4341
4342         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4343         tok_state_between_doctype_public_and_system_identifiers = ->
4344                 c = txt.charAt(cur++)
4345                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4346                         return
4347                 if c is '>'
4348                         tok_state = tok_state_data
4349                         return tok_cur_tag
4350                 if c is '"'
4351                         parse_error()
4352                         tok_cur_tag.system_identifier = ''
4353                         tok_state = tok_state_doctype_system_identifier_double_quoted
4354                         return
4355                 if c is "'"
4356                         parse_error()
4357                         tok_cur_tag.system_identifier = ''
4358                         tok_state = tok_state_doctype_system_identifier_single_quoted
4359                         return
4360                 if c is '' # EOF
4361                         parse_error()
4362                         tok_state = tok_state_data
4363                         tok_cur_tag.flag 'force-quirks', true
4364                         cur -= 1 # Reconsume
4365                         return tok_cur_tag
4366                 # Anything else
4367                 parse_error()
4368                 tok_cur_tag.flag 'force-quirks', true
4369                 tok_state = tok_state_bogus_doctype
4370                 return null
4371
4372         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4373         tok_state_after_doctype_system_keyword = ->
4374                 c = txt.charAt(cur++)
4375                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4376                         tok_state = tok_state_before_doctype_system_identifier
4377                         return
4378                 if c is '"'
4379                         parse_error()
4380                         tok_cur_tag.system_identifier = ''
4381                         tok_state = tok_state_doctype_system_identifier_double_quoted
4382                         return
4383                 if c is "'"
4384                         parse_error()
4385                         tok_cur_tag.system_identifier = ''
4386                         tok_state = tok_state_doctype_system_identifier_single_quoted
4387                         return
4388                 if c is '>'
4389                         parse_error()
4390                         tok_cur_tag.flag 'force-quirks', true
4391                         tok_state = tok_state_data
4392                         return tok_cur_tag
4393                 if c is '' # EOF
4394                         parse_error()
4395                         tok_state = tok_state_data
4396                         tok_cur_tag.flag 'force-quirks', true
4397                         cur -= 1 # Reconsume
4398                         return tok_cur_tag
4399                 # Anything else
4400                 parse_error()
4401                 tok_cur_tag.flag 'force-quirks', true
4402                 tok_state = tok_state_bogus_doctype
4403                 return null
4404
4405         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4406         tok_state_before_doctype_system_identifier = ->
4407                 c = txt.charAt(cur++)
4408                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4409                         return
4410                 if c is '"'
4411                         tok_cur_tag.system_identifier = ''
4412                         tok_state = tok_state_doctype_system_identifier_double_quoted
4413                         return
4414                 if c is "'"
4415                         tok_cur_tag.system_identifier = ''
4416                         tok_state = tok_state_doctype_system_identifier_single_quoted
4417                         return
4418                 if c is '>'
4419                         parse_error()
4420                         tok_cur_tag.flag 'force-quirks', true
4421                         tok_state = tok_state_data
4422                         return tok_cur_tag
4423                 if c is '' # EOF
4424                         parse_error()
4425                         tok_state = tok_state_data
4426                         tok_cur_tag.flag 'force-quirks', true
4427                         cur -= 1 # Reconsume
4428                         return tok_cur_tag
4429                 # Anything else
4430                 parse_error()
4431                 tok_cur_tag.flag 'force-quirks', true
4432                 tok_state = tok_state_bogus_doctype
4433                 return null
4434
4435         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4436         tok_state_doctype_system_identifier_double_quoted = ->
4437                 c = txt.charAt(cur++)
4438                 if c is '"'
4439                         tok_state = tok_state_after_doctype_system_identifier
4440                         return
4441                 if c is "\u0000"
4442                         parse_error()
4443                         tok_cur_tag.system_identifier += "\ufffd"
4444                         return
4445                 if c is '>'
4446                         parse_error()
4447                         tok_cur_tag.flag 'force-quirks', true
4448                         tok_state = tok_state_data
4449                         return tok_cur_tag
4450                 if c is '' # EOF
4451                         parse_error()
4452                         tok_state = tok_state_data
4453                         tok_cur_tag.flag 'force-quirks', true
4454                         cur -= 1 # Reconsume
4455                         return tok_cur_tag
4456                 # Anything else
4457                 tok_cur_tag.system_identifier += c
4458                 return null
4459
4460         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4461         tok_state_doctype_system_identifier_single_quoted = ->
4462                 c = txt.charAt(cur++)
4463                 if c is "'"
4464                         tok_state = tok_state_after_doctype_system_identifier
4465                         return
4466                 if c is "\u0000"
4467                         parse_error()
4468                         tok_cur_tag.system_identifier += "\ufffd"
4469                         return
4470                 if c is '>'
4471                         parse_error()
4472                         tok_cur_tag.flag 'force-quirks', true
4473                         tok_state = tok_state_data
4474                         return tok_cur_tag
4475                 if c is '' # EOF
4476                         parse_error()
4477                         tok_state = tok_state_data
4478                         tok_cur_tag.flag 'force-quirks', true
4479                         cur -= 1 # Reconsume
4480                         return tok_cur_tag
4481                 # Anything else
4482                 tok_cur_tag.system_identifier += c
4483                 return null
4484
4485         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4486         tok_state_after_doctype_system_identifier = ->
4487                 c = txt.charAt(cur++)
4488                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4489                         return
4490                 if c is '>'
4491                         tok_state = tok_state_data
4492                         return tok_cur_tag
4493                 if c is '' # EOF
4494                         parse_error()
4495                         tok_state = tok_state_data
4496                         tok_cur_tag.flag 'force-quirks', true
4497                         cur -= 1 # Reconsume
4498                         return tok_cur_tag
4499                 # Anything else
4500                 parse_error()
4501                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4502                 tok_state = tok_state_bogus_doctype
4503                 return null
4504
4505         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4506         tok_state_bogus_doctype = ->
4507                 c = txt.charAt(cur++)
4508                 if c is '>'
4509                         tok_state = tok_state_data
4510                         return tok_cur_tag
4511                 if c is '' # EOF
4512                         tok_state = tok_state_data
4513                         cur -= 1 # Reconsume
4514                         return tok_cur_tag
4515                 # Anything else
4516                 return null
4517
4518         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4519         tok_state_cdata_section = ->
4520                 tok_state = tok_state_data
4521                 next_gt = txt.indexOf ']]>', cur
4522                 if next_gt is -1
4523                         val = txt.substr cur
4524                         cur = txt.length
4525                 else
4526                         val = txt.substr cur, (next_gt - cur)
4527                         cur = next_gt + 3
4528                 return new_character_token val # fixfull split
4529
4530         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4531         # Don't set this as a state, just call it
4532         # returns a string (NOT a text node)
4533         parse_character_reference = (allowed_char = null, in_attr = false) ->
4534                 if cur >= txt.length
4535                         return '&'
4536                 switch c = txt.charAt(cur)
4537                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4538                                 # explicitly not a parse error
4539                                 return '&'
4540                         when ';'
4541                                 # there has to be "one or more" alnums between & and ; to be a parse error
4542                                 return '&'
4543                         when '#'
4544                                 if cur + 1 >= txt.length
4545                                         return '&'
4546                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4547                                         base = 16
4548                                         charset = hex_chars
4549                                         start = cur + 2
4550                                 else
4551                                         charset = digits
4552                                         start = cur + 1
4553                                         base = 10
4554                                 i = 0
4555                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4556                                         i += 1
4557                                 if i is 0
4558                                         return '&'
4559                                 cur = start + i
4560                                 if txt.charAt(start + i) is ';'
4561                                         cur += 1
4562                                 else
4563                                         parse_error()
4564                                 code_point = txt.substr(start, i)
4565                                 while code_point.charAt(0) is '0' and code_point.length > 1
4566                                         code_point = code_point.substr 1
4567                                 code_point = parseInt(code_point, base)
4568                                 if unicode_fixes[code_point]?
4569                                         parse_error()
4570                                         return unicode_fixes[code_point]
4571                                 else
4572                                         if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4573                                                 parse_error()
4574                                                 return "\ufffd"
4575                                         else
4576                                                 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4577                                                         parse_error()
4578                                                 return from_code_point code_point
4579                                 return
4580                         else
4581                                 for i in [0...31]
4582                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4583                                                 break
4584                                 if i is 0
4585                                         # exit early, because parse_error() below needs at least one alnum
4586                                         return '&'
4587                                 if txt.charAt(cur + i) is ';'
4588                                         i += 1 # include ';' terminator in value
4589                                         decoded = decode_named_char_ref txt.substr(cur, i)
4590                                         if decoded?
4591                                                 cur += i
4592                                                 return decoded
4593                                         parse_error()
4594                                         return '&'
4595                                 else
4596                                         # no ';' terminator (only legacy char refs)
4597                                         max = i
4598                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4599                                                 c = legacy_char_refs[txt.substr(cur, i)]
4600                                                 if c?
4601                                                         if in_attr
4602                                                                 if txt.charAt(cur + i) is '='
4603                                                                         # "because some legacy user agents will
4604                                                                         # misinterpret the markup in those cases"
4605                                                                         parse_error()
4606                                                                         return '&'
4607                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4608                                                                         # this makes attributes forgiving about url args
4609                                                                         return '&'
4610                                                         # ok, and besides the weird exceptions for attributes...
4611                                                         # return the matching char
4612                                                         cur += i # consume entity chars
4613                                                         parse_error() # because no terminating ";"
4614                                                         return c
4615                                         parse_error()
4616                                         return '&'
4617                 return # never reached
4618
4619         # tree constructor initialization
4620         # see comments on TYPE_TAG/etc for the structure of this data
4621         txt = args.html
4622         cur = 0
4623         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4624         doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4625         open_els = []
4626         afe = [] # active formatting elements
4627         template_ins_modes = []
4628         ins_mode = ins_mode_initial
4629         original_ins_mode = ins_mode # TODO check spec
4630         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4631         flag_frameset_ok = true
4632         flag_parsing = true
4633         flag_foster_parenting = false
4634         form_element_pointer = null
4635         temporary_buffer = null
4636         pending_table_character_tokens = []
4637         head_element_pointer = null
4638         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4639         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4640         prev_node_id = 0 # just for debugging
4641
4642         # tokenizer initialization
4643         tok_state = tok_state_data
4644
4645         # text pre-processing
4646         # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4647         txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4648         txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4649         txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4650
4651         if args.name is "tests20.dat #22"
4652                 console.log "hi"
4653         # proccess input
4654         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4655         while flag_parsing
4656                 t = tok_state()
4657                 if t?
4658                         process_token t
4659                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4660         return doc.children
4661
# Serialize a list of nodes into one comma-separated string.
#
# els: list of objects that have a serialize(shallow, show_ids) method
# shallow, show_ids: passed through unchanged to each element's serialize()
#
# Returns '' for an empty list; no leading or trailing comma is emitted.
serialize_els = (els, shallow, show_ids) ->
        joined_output = ''
        for one_el, el_index in els
                # separator goes between elements, so skip it before the first
                joined_output += ',' if el_index > 0
                joined_output += one_el.serialize shallow, show_ids
        return joined_output
4670
# Public interface of this module: the parser entry point, the debug log
# helpers, and the node type / namespace / quirks mode constants.
exported_names =
        parse_html: parse_html
        debug_log_reset: debug_log_reset
        debug_log_each: debug_log_each
        TYPE_TAG: TYPE_TAG
        TYPE_TEXT: TYPE_TEXT
        TYPE_COMMENT: TYPE_COMMENT
        TYPE_DOCTYPE: TYPE_DOCTYPE
        NS_HTML: NS_HTML
        NS_MATHML: NS_MATHML
        NS_SVG: NS_SVG
        QUIRKS_NO: QUIRKS_NO
        QUIRKS_LIMITED: QUIRKS_LIMITED
        QUIRKS_YES: QUIRKS_YES
for own export_key, export_value of exported_names
        module.exports[export_key] = export_value