parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor.
  20
  21 # The implementation is a pretty direct implementation of the parsing algorithm
  22 # described here:
  23 # http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
  24 #
  25 # Deviations from that spec:
  26 #
  27 #   Purposeful: search this file for "WHATWG"
  28 #
  29 #   Not finished yet: search this file for "fixfull", "TODO" and "FIXME"
  30
  31
  32 # stacks/lists
  33 #
  34 # the spec uses a many different words do indicate which ends of lists/stacks
  35 # they are talking about (and relative movement within the lists/stacks). This
  36 # section splains. I'm implementing "lists" (afe and open_els) the same way
  37 # (both as stacks)
  38 #
  39 # stacks grow downward (current element is index=0)
  40 #
  41 # example: open_els = [a, b, c, d, e, f, g]
  42 #
  43 # "grows downwards" means it's visualized like this: (index: el, names)
  44 #
  45 #   6: g "start of the list", "topmost", "first"
  46 #   5: f
  47 #   4: e "previous" (to d), "above", "before"
  48 #   3: d   (previous/next are relative to this element)
  49 #   2: c "next", "after", "lower", "below"
  50 #   1: b
  51 #   0: a "end of the list", "current node", "bottommost", "last"
  52
  53
  54 # browser
  55 # note: to get this to run outside a browser, you'll have to write a native
  56 # implementation of decode_named_char_ref()
  57 unless module?.exports?
  58         window.wheic = {}
  59         module = exports: window.wheic
  60
  61 from_code_point = (x) ->
  62         if String.fromCodePoint?
  63                 return String.fromCodePoint x
  64         else
  65                 if x <= 0xffff
  66                         return String.fromCharCode x
  67                 x -= 0x10000
  68                 return String.fromCharCode((x >> 10) + 0xd800, (x % 0x400) + 0xdc00)
  69
  70 # Each node is an obect of the Node class. Here are the Node types:
  71 TYPE_TAG = 0 # name, {attributes}, [children]
  72 TYPE_TEXT = 1 # "text"
  73 TYPE_COMMENT = 2
  74 TYPE_DOCTYPE = 3
  75 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  76 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  77 TYPE_END_TAG = 5 # name
  78 TYPE_EOF = 6
  79 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  80 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  81
  82 # namespace constants
  83 NS_HTML = 1
  84 NS_MATHML = 2
  85 NS_SVG = 3
  86
  87 # quirks mode constants
  88 QUIRKS_NO = 1
  89 QUIRKS_LIMITED = 2
  90 QUIRKS_YES = 3
  91
  92 g_debug_log = []
  93 debug_log_reset = ->
  94         g_debug_log = []
  95 debug_log = (str) ->
  96         g_debug_log.push str
  97 debug_log_each = (cb) ->
  98         for str in g_debug_log
  99                 cb str
 100
 101 prev_node_id = 0
 102 class Node
 103         constructor: (type, args = {}) ->
 104                 @type = type # one of the TYPE_* constants above
 105                 @name = args.name ? '' # tag name
 106                 @text = args.text ? '' # contents for text/comment nodes
 107                 @attrs = args.attrs ? {}
 108                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
 109                 @children = args.children ? []
 110                 @namespace = args.namespace ? NS_HTML
 111                 @parent = args.parent ? null
 112                 @token = args.token ? null
 113                 @flags = args.flags ? {}
 114                 if args.id?
 115                         @id = "#{args.id}+"
 116                 else
 117                         @id = "#{++prev_node_id}"
 118         acknowledge_self_closing: ->
 119                 if @token?
 120                         @token.flag 'did_self_close', true
 121                 else
 122                         @flag 'did_self_close', true
 123         flag: (key, value = null) ->
 124                 if value?
 125                         @flags[key] = value
 126                 else
 127                         return @flags[key]
 128         serialize: (shallow = false, show_ids = false) -> # for unit tests
 129                 ret = ''
 130                 switch @type
 131                         when TYPE_TAG
 132                                 ret += 'tag:'
 133                                 ret += JSON.stringify @name
 134                                 ret += ','
 135                                 if show_ids
 136                                         ret += "##{@id},"
 137                                 if shallow
 138                                         break
 139                                 attr_keys = []
 140                                 for k of @attrs
 141                                         attr_keys.push k
 142                                 attr_keys.sort()
 143                                 ret += '{'
 144                                 sep = ''
 145                                 for k in attr_keys
 146                                         ret += sep
 147                                         sep = ','
 148                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
 149                                 ret += '},['
 150                                 sep = ''
 151                                 for c in @children
 152                                         ret += sep
 153                                         sep = ','
 154                                         ret += c.serialize shallow, show_ids
 155                                 ret += ']'
 156                         when TYPE_TEXT
 157                                 ret += 'text:'
 158                                 ret += JSON.stringify @text
 159                         when TYPE_COMMENT
 160                                 ret += 'comment:'
 161                                 ret += JSON.stringify @text
 162                         when TYPE_DOCTYPE
 163                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
 164                         when TYPE_AFE_MARKER
 165                                 ret += 'marker'
 166                         when TYPE_AAA_BOOKMARK
 167                                 ret += 'aaa_bookmark'
 168                         else
 169                                 ret += 'unknown:'
 170                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 171                 return ret
 172
 173 # helpers: (only take args that are normally known when parser creates nodes)
 174 new_open_tag = (name) ->
 175         return new Node TYPE_START_TAG, name: name
 176 new_end_tag = (name) ->
 177         return new Node TYPE_END_TAG, name: name
 178 new_element = (name) ->
 179         return new Node TYPE_TAG, name: name
 180 new_text_node = (txt) ->
 181         return new Node TYPE_TEXT, text: txt
 182 new_character_token = new_text_node
 183 new_comment_token = (txt) ->
 184         return new Node TYPE_COMMENT, text: txt
 185 new_doctype_token = (name) ->
 186         return new Node TYPE_DOCTYPE, name: name
 187 new_eof_token = ->
 188         return new Node TYPE_EOF
 189 new_afe_marker = ->
 190         return new Node TYPE_AFE_MARKER
 191 new_aaa_bookmark = ->
 192         return new Node TYPE_AAA_BOOKMARK
 193
 194 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 195 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 196 digits = "0123456789"
 197 alnum = lc_alpha + uc_alpha + digits
 198 hex_chars = digits + "abcdefABCDEF"
 199
 200 is_uc_alpha = (str) ->
 201         return str.length is 1 and uc_alpha.indexOf(str) > -1
 202 is_lc_alpha = (str) ->
 203         return str.length is 1 and lc_alpha.indexOf(str) > -1
 204
 205 # some SVG elements have dashes in them
 206 tag_name_chars = alnum + "-"
 207
 208 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 209 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 210 is_space = (txt) ->
 211         return txt.length is 1 and space_chars.indexOf(txt) > -1
 212 is_space_tok = (t) ->
 213         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 214
 215 is_input_hidden_tok = (t) ->
 216         return false unless t.type is TYPE_START_TAG
 217         for a in t.attrs_a
 218                 if a[0] is 'type'
 219                         if a[1].toLowerCase() is 'hidden'
 220                                 return true
 221                         return false
 222         return false
 223
 224 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 225 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 226
 227 unicode_fixes = {}
 228 unicode_fixes[0x00] = "\uFFFD"
 229 unicode_fixes[0x80] = "\u20AC"
 230 unicode_fixes[0x82] = "\u201A"
 231 unicode_fixes[0x83] = "\u0192"
 232 unicode_fixes[0x84] = "\u201E"
 233 unicode_fixes[0x85] = "\u2026"
 234 unicode_fixes[0x86] = "\u2020"
 235 unicode_fixes[0x87] = "\u2021"
 236 unicode_fixes[0x88] = "\u02C6"
 237 unicode_fixes[0x89] = "\u2030"
 238 unicode_fixes[0x8A] = "\u0160"
 239 unicode_fixes[0x8B] = "\u2039"
 240 unicode_fixes[0x8C] = "\u0152"
 241 unicode_fixes[0x8E] = "\u017D"
 242 unicode_fixes[0x91] = "\u2018"
 243 unicode_fixes[0x92] = "\u2019"
 244 unicode_fixes[0x93] = "\u201C"
 245 unicode_fixes[0x94] = "\u201D"
 246 unicode_fixes[0x95] = "\u2022"
 247 unicode_fixes[0x96] = "\u2013"
 248 unicode_fixes[0x97] = "\u2014"
 249 unicode_fixes[0x98] = "\u02DC"
 250 unicode_fixes[0x99] = "\u2122"
 251 unicode_fixes[0x9A] = "\u0161"
 252 unicode_fixes[0x9B] = "\u203A"
 253 unicode_fixes[0x9C] = "\u0153"
 254 unicode_fixes[0x9E] = "\u017E"
 255 unicode_fixes[0x9F] = "\u0178"
 256
 257 quirks_yes_pi_prefixes = [
 258         "+//silmaril//dtd html pro v0r11 19970101//"
 259         "-//as//dtd html 3.0 aswedit + extensions//"
 260         "-//advasoft ltd//dtd html 3.0 aswedit + extensions//"
 261         "-//ietf//dtd html 2.0 level 1//"
 262         "-//ietf//dtd html 2.0 level 2//"
 263         "-//ietf//dtd html 2.0 strict level 1//"
 264         "-//ietf//dtd html 2.0 strict level 2//"
 265         "-//ietf//dtd html 2.0 strict//"
 266         "-//ietf//dtd html 2.0//"
 267         "-//ietf//dtd html 2.1e//"
 268         "-//ietf//dtd html 3.0//"
 269         "-//ietf//dtd html 3.2 final//"
 270         "-//ietf//dtd html 3.2//"
 271         "-//ietf//dtd html 3//"
 272         "-//ietf//dtd html level 0//"
 273         "-//ietf//dtd html level 1//"
 274         "-//ietf//dtd html level 2//"
 275         "-//ietf//dtd html level 3//"
 276         "-//ietf//dtd html strict level 0//"
 277         "-//ietf//dtd html strict level 1//"
 278         "-//ietf//dtd html strict level 2//"
 279         "-//ietf//dtd html strict level 3//"
 280         "-//ietf//dtd html strict//"
 281         "-//ietf//dtd html//"
 282         "-//metrius//dtd metrius presentational//"
 283         "-//microsoft//dtd internet explorer 2.0 html strict//"
 284         "-//microsoft//dtd internet explorer 2.0 html//"
 285         "-//microsoft//dtd internet explorer 2.0 tables//"
 286         "-//microsoft//dtd internet explorer 3.0 html strict//"
 287         "-//microsoft//dtd internet explorer 3.0 html//"
 288         "-//microsoft//dtd internet explorer 3.0 tables//"
 289         "-//netscape comm. corp.//dtd html//"
 290         "-//netscape comm. corp.//dtd strict html//"
 291         "-//o'reilly and associates//dtd html 2.0//"
 292         "-//o'reilly and associates//dtd html extended 1.0//"
 293         "-//o'reilly and associates//dtd html extended relaxed 1.0//"
 294         "-//sq//dtd html 2.0 hotmetal + extensions//"
 295         "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//"
 296         "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//"
 297         "-//spyglass//dtd html 2.0 extended//"
 298         "-//sun microsystems corp.//dtd hotjava html//"
 299         "-//sun microsystems corp.//dtd hotjava strict html//"
 300         "-//w3c//dtd html 3 1995-03-24//"
 301         "-//w3c//dtd html 3.2 draft//"
 302         "-//w3c//dtd html 3.2 final//"
 303         "-//w3c//dtd html 3.2//"
 304         "-//w3c//dtd html 3.2s draft//"
 305         "-//w3c//dtd html 4.0 frameset//"
 306         "-//w3c//dtd html 4.0 transitional//"
 307         "-//w3c//dtd html experimental 19960712//"
 308         "-//w3c//dtd html experimental 970421//"
 309         "-//w3c//dtd w3 html//"
 310         "-//w3o//dtd w3 html 3.0//"
 311         "-//webtechs//dtd mozilla html 2.0//"
 312         "-//webtechs//dtd mozilla html//"
 313 ]
 314
 315 # These are the character references that don't need a terminating semicolon
 316 # min length: 2, max: 6, none are a prefix of any other.
 317 legacy_char_refs = {
 318         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 319         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 320         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 321         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 322         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 323         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 324         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 325         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 326         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 327         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 328         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 329         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 330         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 331         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 332         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 333         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 334         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 335         yen: '¥', yuml: 'ÿ'
 336 }
 337
 338 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 339 raw_text_elements = ['script', 'style']
 340 escapable_raw_text_elements = ['textarea', 'title']
 341 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 342 svg_elements = [
 343         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 344         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 345         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 346         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 347         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 348         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 349         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 350         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 351         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 352         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 353         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 354         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 355         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 356         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 357         'view', 'vkern'
 358 ]
 359
 360 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 361 mathml_elements = [
 362         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 363         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 364         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 365         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 366         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 367         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 368         'determinant', 'diff', 'divergence', 'divide', 'domain',
 369         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 370         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 371         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 372         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 373         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 374         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 375         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 376         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 377         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 378         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 379         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 380         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 381         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 382         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 383         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 384         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 385         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 386         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 387         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 388         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 389         'vectorproduct', 'xor'
 390 ]
 391 # foreign_elements = [svg_elements..., mathml_elements...]
 392 #normal_elements = All other allowed HTML elements are normal elements.
 393
 394 special_elements = {
 395         # HTML:
 396         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 397         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 398         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 399         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 400         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 401         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 402         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 403         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 404         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 405         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 406         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML,
 407
 408         menu:NS_HTML,menuitem:NS_HTML, # WHATWG adds these
 409
 410         meta:NS_HTML, nav:NS_HTML, noembed:NS_HTML, noframes:NS_HTML,
 411         noscript:NS_HTML, object:NS_HTML, ol:NS_HTML, p:NS_HTML, param:NS_HTML,
 412         plaintext:NS_HTML, pre:NS_HTML, script:NS_HTML, section:NS_HTML,
 413         select:NS_HTML, source:NS_HTML, style:NS_HTML, summary:NS_HTML,
 414         table:NS_HTML, tbody:NS_HTML, td:NS_HTML, template:NS_HTML,
 415         textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML, title:NS_HTML,
 416         tr:NS_HTML, track:NS_HTML, ul:NS_HTML, wbr:NS_HTML, xmp:NS_HTML,
 417
 418         # MathML:
 419         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 420         'annotation-xml':NS_MATHML,
 421
 422         # SVG:
 423         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 424 }
 425
 426 formatting_elements = {
 427          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 428          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 429          u: true
 430 }
 431
 432 mathml_text_integration = {
 433         mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML
 434 }
 435 is_mathml_text_integration_point = (el) ->
 436         return mathml_text_integration[el.name] is el.namespace
 437 is_html_integration = (el) -> # DON'T PASS A TOKEN
 438         if el.namespace is NS_MATHML
 439                 if el.name is 'annotation-xml'
 440                         if el.attrs.encoding?
 441                                 if el.attrs.encoding.toLowerCase() is 'text/html'
 442                                         return true
 443                                 if el.attrs.encoding.toLowerCase() is 'application/xhtml+xml'
 444                                         return true
 445                 return false
 446         if el.namespace is NS_SVG
 447                 if el.name is 'foreignObject' or el.name is 'desc' or el.name is 'title'
 448                         return true
 449         return false
 450
 451 h_tags = {
 452         h1:NS_HTML, h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML
 453 }
 454
 455 foster_parenting_targets = {
 456         table: NS_HTML
 457         tbody: NS_HTML
 458         tfoot: NS_HTML
 459         thead: NS_HTML
 460         tr: NS_HTML
 461 }
 462
 463 end_tag_implied = {
 464         dd: NS_HTML
 465         dt: NS_HTML
 466         li: NS_HTML
 467         option: NS_HTML
 468         optgroup: NS_HTML
 469         p: NS_HTML
 470         rb: NS_HTML
 471         rp: NS_HTML
 472         rt: NS_HTML
 473         rtc: NS_HTML
 474 }
 475
 476 el_is_special = (e) ->
 477         return special_elements[e.name] is e.namespace
 478
 479 adp_els = { address: NS_HTML, div: NS_HTML, p: NS_HTML }
 480 el_is_special_not_adp = (el) ->
 481         return special_elements[el.name] is el.namespace and adp_els[el.name] isnt el.namespace
 482
 483 svg_name_fixes = {
 484         altglyph: 'altGlyph'
 485         altglyphdef: 'altGlyphDef'
 486         altglyphitem: 'altGlyphItem'
 487         animatecolor: 'animateColor'
 488         animatemotion: 'animateMotion'
 489         animatetransform: 'animateTransform'
 490         clippath: 'clipPath'
 491         feblend: 'feBlend'
 492         fecolormatrix: 'feColorMatrix'
 493         fecomponenttransfer: 'feComponentTransfer'
 494         fecomposite: 'feComposite'
 495         feconvolvematrix: 'feConvolveMatrix'
 496         fediffuselighting: 'feDiffuseLighting'
 497         fedisplacementmap: 'feDisplacementMap'
 498         fedistantlight: 'feDistantLight'
 499         fedropshadow: 'feDropShadow'
 500         feflood: 'feFlood'
 501         fefunca: 'feFuncA'
 502         fefuncb: 'feFuncB'
 503         fefuncg: 'feFuncG'
 504         fefuncr: 'feFuncR'
 505         fegaussianblur: 'feGaussianBlur'
 506         feimage: 'feImage'
 507         femerge: 'feMerge'
 508         femergenode: 'feMergeNode'
 509         femorphology: 'feMorphology'
 510         feoffset: 'feOffset'
 511         fepointlight: 'fePointLight'
 512         fespecularlighting: 'feSpecularLighting'
 513         fespotlight: 'feSpotLight'
 514         fetile: 'feTile'
 515         feturbulence: 'feTurbulence'
 516         foreignobject: 'foreignObject'
 517         glyphref: 'glyphRef'
 518         lineargradient: 'linearGradient'
 519         radialgradient: 'radialGradient'
 520         textpath: 'textPath'
 521 }
 522 svg_attribute_fixes = {
 523         attributename: 'attributeName'
 524         attributetype: 'attributeType'
 525         basefrequency: 'baseFrequency'
 526         baseprofile: 'baseProfile'
 527         calcmode: 'calcMode'
 528         clippathunits: 'clipPathUnits'
 529         contentscripttype: 'contentScriptType'
 530         contentstyletype: 'contentStyleType'
 531         diffuseconstant: 'diffuseConstant'
 532         edgemode: 'edgeMode'
 533         externalresourcesrequired: 'externalResourcesRequired'
 534         # WHATWG removes this: filterres: 'filterRes'
 535         filterunits: 'filterUnits'
 536         glyphref: 'glyphRef'
 537         gradienttransform: 'gradientTransform'
 538         gradientunits: 'gradientUnits'
 539         kernelmatrix: 'kernelMatrix'
 540         kernelunitlength: 'kernelUnitLength'
 541         keypoints: 'keyPoints'
 542         keysplines: 'keySplines'
 543         keytimes: 'keyTimes'
 544         lengthadjust: 'lengthAdjust'
 545         limitingconeangle: 'limitingConeAngle'
 546         markerheight: 'markerHeight'
 547         markerunits: 'markerUnits'
 548         markerwidth: 'markerWidth'
 549         maskcontentunits: 'maskContentUnits'
 550         maskunits: 'maskUnits'
 551         numoctaves: 'numOctaves'
 552         pathlength: 'pathLength'
 553         patterncontentunits: 'patternContentUnits'
 554         patterntransform: 'patternTransform'
 555         patternunits: 'patternUnits'
 556         pointsatx: 'pointsAtX'
 557         pointsaty: 'pointsAtY'
 558         pointsatz: 'pointsAtZ'
 559         preservealpha: 'preserveAlpha'
 560         preserveaspectratio: 'preserveAspectRatio'
 561         primitiveunits: 'primitiveUnits'
 562         refx: 'refX'
 563         refy: 'refY'
 564         repeatcount: 'repeatCount'
 565         repeatdur: 'repeatDur'
 566         requiredextensions: 'requiredExtensions'
 567         requiredfeatures: 'requiredFeatures'
 568         specularconstant: 'specularConstant'
 569         specularexponent: 'specularExponent'
 570         spreadmethod: 'spreadMethod'
 571         startoffset: 'startOffset'
 572         stddeviation: 'stdDeviation'
 573         stitchtiles: 'stitchTiles'
 574         surfacescale: 'surfaceScale'
 575         systemlanguage: 'systemLanguage'
 576         tablevalues: 'tableValues'
 577         targetx: 'targetX'
 578         targety: 'targetY'
 579         textlength: 'textLength'
 580         viewbox: 'viewBox'
 581         viewtarget: 'viewTarget'
 582         xchannelselector: 'xChannelSelector'
 583         ychannelselector: 'yChannelSelector'
 584         zoomandpan: 'zoomAndPan'
 585 }
 586 foreign_attr_fixes = {
 587         'xlink:actuate': 'xlink actuate'
 588         'xlink:arcrole': 'xlink arcrole'
 589         'xlink:href': 'xlink href'
 590         'xlink:role': 'xlink role'
 591         'xlink:show': 'xlink show'
 592         'xlink:title': 'xlink title'
 593         'xlink:type': 'xlink type'
 594         'xml:base': 'xml base'
 595         'xml:lang': 'xml lang'
 596         'xml:space': 'xml space'
 597         'xmlns': 'xmlns'
 598         'xmlns:xlink': 'xmlns xlink'
 599 }
 600 adjust_mathml_attributes = (t) ->
 601         for a in t.attrs_a
 602                 if a[0] is 'definitionurl'
 603                         a[0] = 'definitionURL'
 604         return
 605 adjust_svg_attributes = (t) ->
 606         for a in t.attrs_a
 607                 if svg_attribute_fixes[a[0]]?
 608                         a[0] = svg_attribute_fixes[a[0]]
 609         return
 610 adjust_foreign_attributes = (t) ->
 611         # fixfull
 612         for a in t.attrs_a
 613                 if foreign_attr_fixes[a[0]]?
 614                         a[0] = foreign_attr_fixes[a[0]]
 615         return
 616
 617 # decode_named_char_ref()
 618 #
 619 # The list of named character references is _huge_ so ask the browser to decode
 620 # for us instead of wasting bandwidth/space on including the table here.
 621 #
 622 # Pass without the "&" but with the ";" examples:
 623 #    for "&amp" pass "amp;"
 624 #    for "&#x2032" pass "x2032;"
 625 g_dncr = {
 626         cache: {}
 627         textarea: document.createElement('textarea')
 628 }
 629 # TODO test this in IE8
 630 decode_named_char_ref = (txt) ->
 631         txt = "&#{txt}"
 632         decoded = g_dncr.cache[txt]
 633         return decoded if decoded?
 634         g_dncr.textarea.innerHTML = txt
 635         decoded = g_dncr.textarea.value
 636         return null if decoded is txt
 637         return g_dncr.cache[txt] = decoded
 638
 639 parse_html = (args) ->
 640         txt = null
 641         cur = null # index of next char in txt to be parsed
 642         # declare doc and tokenizer variables so they're in scope below
 643         doc = null
 644         open_els = null # stack of open elements
 645         afe = null # active formatting elements
 646         template_ins_modes = null
 647         ins_mode = null
 648         original_ins_mode = null
 649         tok_state = null
 650         tok_cur_tag = null # partially parsed tag
 651         flag_scripting = null
 652         flag_frameset_ok = null
 653         flag_parsing = null
 654         flag_foster_parenting = null
 655         form_element_pointer = null
 656         temporary_buffer = null
 657         pending_table_character_tokens = null
 658         head_element_pointer = null
 659         flag_fragment_parsing = null
 660         context_element = null
 661
 662         stop_parsing = ->
 663                 flag_parsing = false
 664
 665         parse_error = ->
 666                 if args.error_cb?
 667                         args.error_cb cur
 668                 else
 669                         console.log "Parse error at character #{cur} of #{txt.length}"
 670
 671         # http://www.w3.org/TR/html5/syntax.html#push-onto-the-list-of-active-formatting-elements
 672         # "Noah's Ark clause" but with three
 673         afe_push = (new_el) ->
 674                 matches = 0
 675                 for el, i in afe
 676                         if el.type is TYPE_AFE_MARKER
 677                                 break
 678                         if el.name is new_el.name and el.namespace is new_el.namespace
 679                                 attrs_match = true
 680                                 for k, v of el.attrs
 681                                         unless new_el.attrs[k] is v
 682                                                 attrs_match = false
 683                                                 break
 684                                 if attrs_match
 685                                         for k, v of new_el.attrs
 686                                                 unless el.attrs[k] is v
 687                                                         attrs_match = false
 688                                                         break
 689                                 if attrs_match
 690                                         matches += 1
 691                                         if matches is 3
 692                                                 afe.splice i, 1
 693                                                 break
 694                 afe.unshift new_el
 695         afe_push_marker = ->
 696                 afe.unshift new_afe_marker()
 697
 698         # the functions below impliment the Tree Contstruction algorithm
 699         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 700
 701         # But first... the helpers
 702         template_tag_is_open = ->
 703                 for el in open_els
 704                         if el.name is 'template' and el.namespace is NS_HTML
 705                                 return true
 706                 return false
 707         is_in_scope_x = (tag_name, scope, namespace) ->
 708                 for el in open_els
 709                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
 710                                 return true
 711                         if scope[el.name] is el.namespace
 712                                 return false
 713                 return false
 714         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 715                 for el in open_els
 716                         if el.name is tag_name and (namespace is null or namespace is el.namespace)
 717                                 return true
 718                         if scope[el.name] is el.namespace
 719                                 return false
 720                         if scope2[el.name] is el.namespace
 721                                 return false
 722                 return false
 723         standard_scopers = {
 724                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
 725                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
 726                 template: NS_HTML,
 727
 728                 mi: NS_MATHML, mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML,
 729                 mtext: NS_MATHML, 'annotation-xml': NS_MATHML,
 730
 731                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
 732         }
 733         button_scopers = button: NS_HTML
 734         li_scopers = ol: NS_HTML, ul: NS_HTML
 735         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 736         is_in_scope = (tag_name, namespace = null) ->
 737                 return is_in_scope_x tag_name, standard_scopers, namespace
 738         is_in_button_scope = (tag_name, namespace = null) ->
 739                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 740         is_in_table_scope = (tag_name, namespace = null) ->
 741                 return is_in_scope_x tag_name, table_scopers, namespace
 742         # aka is_in_list_item_scope
 743         is_in_li_scope = (tag_name, namespace = null) ->
 744                 return is_in_scope_x_y tag_name, standard_scopers, li_scopers, namespace
 745         is_in_select_scope = (tag_name, namespace = null) ->
 746                 for t in open_els
 747                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 748                                 return true
 749                         if t.namespace isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
 750                                 return false
 751                 return false
 752         # this checks for a particular element, not by name
 753         # this requires a namespace match
 754         el_is_in_scope = (needle) ->
 755                 for el in open_els
 756                         if el is needle
 757                                 return true
 758                         if standard_scopers[el.name] is el.namespace
 759                                 return false
 760                 return false
 761
 762         clear_to_table_stopers = {
 763                 'table': true
 764                 'template': true
 765                 'html': true
 766         }
 767         clear_stack_to_table_context = ->
 768                 loop
 769                         if clear_to_table_stopers[open_els[0].name]?
 770                                 break
 771                         open_els.shift()
 772                 return
 773         clear_to_table_body_stopers = {
 774                 tbody: NS_HTML
 775                 tfoot: NS_HTML
 776                 thead: NS_HTML
 777                 template: NS_HTML
 778                 html: NS_HTML
 779         }
 780         clear_stack_to_table_body_context = ->
 781                 loop
 782                         if clear_to_table_body_stopers[open_els[0].name] is open_els[0].namespace
 783                                 break
 784                         open_els.shift()
 785                 return
 786         clear_to_table_row_stopers = {
 787                 'tr': true
 788                 'template': true
 789                 'html': true
 790         }
 791         clear_stack_to_table_row_context = ->
 792                 loop
 793                         if clear_to_table_row_stopers[open_els[0].name]?
 794                                 break
 795                         open_els.shift()
 796                 return
 797         clear_afe_to_marker = ->
 798                 loop
 799                         return unless afe.length > 0 # this happens in fragment case, ?spec error
 800                         el = afe.shift()
 801                         if el.type is TYPE_AFE_MARKER
 802                                 return
 803                 return
 804
 805         # 8.2.3.1 ...
 806         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
 807         reset_ins_mode = ->
 808                 # 1. Let last be false.
 809                 last = false
 810                 # 2. Let node be the last node in the stack of open elements.
 811                 node_i = 0
 812                 node = open_els[node_i]
 813                 # 3. Loop: If node is the first node in the stack of open elements,
 814                 # then set last to true, and, if the parser was originally created as
 815                 # part of the HTML fragment parsing algorithm (fragment case) set node
 816                 # to the context element.
 817                 loop
 818                         if node_i is open_els.length - 1
 819                                 last = true
 820                                 # fixfull (fragment case)
 821
 822                         # 4. If node is a select element, run these substeps:
 823                         if node.name is 'select' and node.namespace is NS_HTML
 824                                 # 1. If last is true, jump to the step below labeled done.
 825                                 unless last
 826                                         # 2. Let ancestor be node.
 827                                         ancestor_i = node_i
 828                                         ancestor = node
 829                                         # 3. Loop: If ancestor is the first node in the stack of
 830                                         # open elements, jump to the step below labeled done.
 831                                         loop
 832                                                 if ancestor_i is open_els.length - 1
 833                                                         break
 834                                                 # 4. Let ancestor be the node before ancestor in the stack
 835                                                 # of open elements.
 836                                                 ancestor_i += 1
 837                                                 ancestor = open_els[ancestor_i]
 838                                                 # 5. If ancestor is a template node, jump to the step below
 839                                                 # labeled done.
 840                                                 if ancestor.name is 'template' and ancestor.namespace is NS_HTML
 841                                                         break
 842                                                 # 6. If ancestor is a table node, switch the insertion mode
 843                                                 # to "in select in table" and abort these steps.
 844                                                 if ancestor.name is 'table' and ancestor.namespace is NS_HTML
 845                                                         ins_mode = ins_mode_in_select_in_table
 846                                                         return
 847                                                 # 7. Jump back to the step labeled loop.
 848                                 # 8. Done: Switch the insertion mode to "in select" and abort
 849                                 # these steps.
 850                                 ins_mode = ins_mode_in_select
 851                                 return
 852                         # 5. If node is a td or th element and last is false, then switch
 853                         # the insertion mode to "in cell" and abort these steps.
 854                         if (node.name is 'td' or node.name is 'th') and node.namespace is NS_HTML and last is false
 855                                 ins_mode = ins_mode_in_cell
 856                                 return
 857                         # 6. If node is a tr element, then switch the insertion mode to "in
 858                         # row" and abort these steps.
 859                         if node.name is 'tr' and node.namespace is NS_HTML
 860                                 ins_mode = ins_mode_in_row
 861                                 return
 862                         # 7. If node is a tbody, thead, or tfoot element, then switch the
 863                         # insertion mode to "in table body" and abort these steps.
 864                         if (node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot') and node.namespace is NS_HTML
 865                                 ins_mode = ins_mode_in_table_body
 866                                 return
 867                         # 8. If node is a caption element, then switch the insertion mode
 868                         # to "in caption" and abort these steps.
 869                         if node.name is 'caption' and node.namespace is NS_HTML
 870                                 ins_mode = ins_mode_in_caption
 871                                 return
 872                         # 9. If node is a colgroup element, then switch the insertion mode
 873                         # to "in column group" and abort these steps.
 874                         if node.name is 'colgroup' and node.namespace is NS_HTML
 875                                 ins_mode = ins_mode_in_column_group
 876                                 return
 877                         # 10. If node is a table element, then switch the insertion mode to
 878                         # "in table" and abort these steps.
 879                         if node.name is 'table' and node.namespace is NS_HTML
 880                                 ins_mode = ins_mode_in_table
 881                                 return
 882                         # 11. If node is a template element, then switch the insertion mode
 883                         # to the current template insertion mode and abort these steps.
 884                         if node.name is 'template' and node.namespace is NS_HTML
 885                                 ins_mode = template_ins_modes[0]
 886                                 return
 887                         # 12. If node is a head element and last is true, then switch the
 888                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
 889                         # these steps. (fragment case)
 890                         if node.name is 'head' and node.namespace is NS_HTML and last
 891                                 ins_mode = ins_mode_in_body
 892                                 return
 893                         # 13. If node is a head element and last is false, then switch the
 894                         # insertion mode to "in head" and abort these steps.
 895                         if node.name is 'head' and node.namespace is NS_HTML and last is false
 896                                 ins_mode = ins_mode_in_head
 897                                 return
 898                         # 14. If node is a body element, then switch the insertion mode to
 899                         # "in body" and abort these steps.
 900                         if node.name is 'body' and node.namespace is NS_HTML
 901                                 ins_mode = ins_mode_in_body
 902                                 return
 903                         # 15. If node is a frameset element, then switch the insertion mode
 904                         # to "in frameset" and abort these steps. (fragment case)
 905                         if node.name is 'frameset' and node.namespace is NS_HTML
 906                                 ins_mode = ins_mode_in_frameset
 907                                 return
 908                         # 16. If node is an html element, run these substeps:
 909                         if node.name is 'html' and node.namespace is NS_HTML
 910                                 # 1. If the head element pointer is null, switch the insertion
 911                                 # mode to "before head" and abort these steps. (fragment case)
 912                                 if head_element_pointer is null
 913                                         ins_mode = ins_mode_before_head
 914                                 else
 915                                         # 2. Otherwise, the head element pointer is not null,
 916                                         # switch the insertion mode to "after head" and abort these
 917                                         # steps.
 918                                         ins_mode = ins_mode_after_head
 919                                 return
 920                         # 17. If last is true, then switch the insertion mode to "in body"
 921                         # and abort these steps. (fragment case)
 922                         if last
 923                                 ins_mode = ins_mode_in_body
 924                                 return
 925                         # 18. Let node now be the node before node in the stack of open
 926                         # elements.
 927                         node_i += 1
 928                         node = open_els[node_i]
 929                         # 19. Return to the step labeled loop.
 930
 931         # 8.2.3.2
 932
 933         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
 934         adjusted_current_node = ->
 935                 if open_els.length is 1 and flag_fragment_parsing
 936                         return context_element
 937                 return open_els[0]
 938
 939         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 940         # this implementation is structured (mostly) as described at the link above.
 941         # capitalized comments are the "labels" described at the link above.
 942         reconstruct_afe = ->
 943                 return if afe.length is 0
 944                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
 945                         return
 946                 # Rewind
 947                 i = 0
 948                 loop
 949                         if i is afe.length - 1
 950                                 break
 951                         i += 1
 952                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
 953                                 i -= 1 # Advance
 954                                 break
 955                 # Create
 956                 loop
 957                         el = insert_html_element afe[i].token
 958                         afe[i] = el
 959                         break if i is 0
 960                         i -= 1 # Advance
 961
 962         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 963         # adoption agency algorithm
 964         # overview here:
 965         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 966         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 967         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
 968         adoption_agency = (subject) ->
 969                 debug_log "adoption_agency()"
 970                 debug_log "tree: #{serialize_els doc.children, false, true}"
 971                 debug_log "open_els: #{serialize_els open_els, true, true}"
 972                 debug_log "afe: #{serialize_els afe, true, true}"
 973 # this block implements tha W3C spec
 974 #               # 1. If the current node is an HTML element whose tag name is subject,
 975 #               # then run these substeps:
 976 #               #
 977 #               # 1. Let element be the current node.
 978 #               #
 979 #               # 2. Pop element off the stack of open elements.
 980 #               #
 981 #               # 3. If element is also in the list of active formatting elements,
 982 #               # remove the element from the list.
 983 #               #
 984 #               # 4. Abort the adoption agency algorithm.
 985 #               if open_els[0].name is subject and open_els[0].namespace is NS_HTML
 986 #                       el = open_els.shift()
 987 #                       # remove it from the list of active formatting elements (if found)
 988 #                       for t, i in afe
 989 #                               if t is el
 990 #                                       afe.splice i, 1
 991 #                                       break
 992 #                       debug_log "aaa: starting off with subject on top of stack, exiting"
 993 #                       return
 994 # WHATWG: https://html.spec.whatwg.org/multipage/syntax.html#adoption-agency-algorithm
 995                 # If the current node is an HTML element whose tag name is subject, and
 996                 # the current node is not in the list of active formatting elements,
 997                 # then pop the current node off the stack of open elements, and abort
 998                 # these steps.
 999                 if open_els[0].name is subject and open_els[0].namespace is NS_HTML
1000                         debug_log "aaa: starting off with subject on top of stack, exiting"
1001                         # remove it from the list of active formatting elements (if found)
1002                         in_afe = false
1003                         for el, i in afe
1004                                 if el is open_els[0]
1005                                         in_afe = true
1006                                         break
1007                         unless in_afe
1008                                 debug_log "aaa: ...and not in afe, aaa done"
1009                                 open_els.shift()
1010                                 return
1011                         # fall through
1012 # END WHATWG
1013                 outer = 0
1014                 loop
1015                         if outer >= 8
1016                                 return
1017                         outer += 1
1018                         # 5. Let formatting element be the last element in the list of
1019                         # active formatting elements that: is between the end of the list
1020                         # and the last scope marker in the list, if any, or the start of
1021                         # the list otherwise, and  has the tag name subject.
1022                         fe = null
1023                         for t, fe_of_afe in afe
1024                                 if t.type is TYPE_AFE_MARKER
1025                                         break
1026                                 if t.name is subject
1027                                         fe = t
1028                                         break
1029                         # If there is no such element, then abort these steps and instead
1030                         # act as described in the "any other end tag" entry above.
1031                         if fe is null
1032                                 debug_log "aaa: fe not found in afe"
1033                                 in_body_any_other_end_tag subject
1034                                 return
1035                         # 6. If formatting element is not in the stack of open elements,
1036                         # then this is a parse error; remove the element from the list, and
1037                         # abort these steps.
1038                         in_open_els = false
1039                         for t, fe_of_open_els in open_els
1040                                 if t is fe
1041                                         in_open_els = true
1042                                         break
1043                         unless in_open_els
1044                                 debug_log "aaa: fe not found in open_els"
1045                                 parse_error()
1046                                 # "remove it from the list" must mean afe, since it's not in open_els
1047                                 afe.splice fe_of_afe, 1
1048                                 return
1049                         # 7. If formatting element is in the stack of open elements, but
1050                         # the element is not in scope, then this is a parse error; abort
1051                         # these steps.
1052                         unless el_is_in_scope fe
1053                                 debug_log "aaa: fe not in scope"
1054                                 parse_error()
1055                                 return
1056                         # 8. If formatting element is not the current node, this is a parse
1057                         # error. (But do not abort these steps.)
1058                         unless open_els[0] is fe
1059                                 parse_error()
1060                                 # continue
1061                         # 9. Let furthest block be the topmost node in the stack of open
1062                         # elements that is lower in the stack than formatting element, and
1063                         # is an element in the special category. There might not be one.
1064                         fb = null
1065                         fb_of_open_els = null
1066                         for t, i in open_els
1067                                 if t is fe
1068                                         break
1069                                 if el_is_special t
1070                                         fb = t
1071                                         fb_of_open_els = i
1072                                         # and continue, to see if there's one that's more "topmost"
1073                         # 10. If there is no furthest block, then the UA must first pop all
1074                         # the nodes from the bottom of the stack of open elements, from the
1075                         # current node up to and including formatting element, then remove
1076                         # formatting element from the list of active formatting elements,
1077                         # and finally abort these steps.
1078                         if fb is null
1079                                 debug_log "aaa: no fb"
1080                                 loop
1081                                         t = open_els.shift()
1082                                         if t is fe
1083                                                 afe.splice fe_of_afe, 1
1084                                                 return
1085                         # 11. Let common ancestor be the element immediately above
1086                         # formatting element in the stack of open elements.
1087                         ca = open_els[fe_of_open_els + 1] # common ancestor
1088
1089                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
1090                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
1091                         bookmark = new_aaa_bookmark()
1092                         for t, i in afe
1093                                 if t is fe
1094                                         afe.splice i, 0, bookmark
1095                                         break
1096                         node = last_node = fb
1097                         inner = 0
1098                         loop
1099                                 inner += 1
1100                                 # 3. Let node be the element immediately above node in the
1101                                 # stack of open elements, or if node is no longer in the stack
1102                                 # of open elements (e.g. because it got removed by this
1103                                 # algorithm), the element that was immediately above node in
1104                                 # the stack of open elements before node was removed.
1105                                 node_next = null
1106                                 for t, i in open_els
1107                                         if t is node
1108                                                 node_next = open_els[i + 1]
1109                                                 break
1110                                 node = node_next ? node_above
1111                                 debug_log "inner loop #{inner}"
1112                                 debug_log "tree: #{serialize_els doc.children, false, true}"
1113                                 debug_log "open_els: #{serialize_els open_els, true, true}"
1114                                 debug_log "afe: #{serialize_els afe, true, true}"
1115                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1116                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1117                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1118                                 debug_log "node: #{node.serialize true, true}"
1119                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
1120
1121                                 # 4. If node is formatting element, then go to the next step in
1122                                 # the overall algorithm.
1123                                 if node is fe
1124                                         break
1125                                 debug_log "the meat"
1126                                 # 5. If inner loop counter is greater than three and node is in
1127                                 # the list of active formatting elements, then remove node from
1128                                 # the list of active formatting elements.
1129                                 node_in_afe = false
1130                                 for t, i in afe
1131                                         if t is node
1132                                                 if inner > 3
1133                                                         afe.splice i, 1
1134                                                         debug_log "max out inner"
1135                                                 else
1136                                                         node_in_afe = true
1137                                                         debug_log "in afe"
1138                                                 break
1139                                 # 6. If node is not in the list of active formatting elements,
1140                                 # then remove node from the stack of open elements and then go
1141                                 # back to the step labeled inner loop.
1142                                 unless node_in_afe
1143                                         debug_log "not in afe"
1144                                         for t, i in open_els
1145                                                 if t is node
1146                                                         node_above = open_els[i + 1]
1147                                                         open_els.splice i, 1
1148                                                         break
1149                                         continue
1150                                 debug_log "the bones"
1151                                 # 7. create an element for the token for which the element node
1152                                 # was created, in the HTML namespace, with common ancestor as
1153                                 # the intended parent; replace the entry for node in the list
1154                                 # of active formatting elements with an entry for the new
1155                                 # element, replace the entry for node in the stack of open
1156                                 # elements with an entry for the new element, and let node be
1157                                 # the new element.
1158                                 new_node = token_to_element node.token, NS_HTML, ca
1159                                 for t, i in afe
1160                                         if t is node
1161                                                 afe[i] = new_node
1162                                                 debug_log "replaced in afe"
1163                                                 break
1164                                 for t, i in open_els
1165                                         if t is node
1166                                                 node_above = open_els[i + 1]
1167                                                 open_els[i] = new_node
1168                                                 debug_log "replaced in open_els"
1169                                                 break
1170                                 node = new_node
1171                                 # 8. If last node is furthest block, then move the
1172                                 # aforementioned bookmark to be immediately after the new node
1173                                 # in the list of active formatting elements.
1174                                 if last_node is fb
1175                                         for t, i in afe
1176                                                 if t is bookmark
1177                                                         afe.splice i, 1
1178                                                         debug_log "removed bookmark"
1179                                                         break
1180                                         for t, i in afe
1181                                                 if t is node
1182                                                         # "after" means lower
1183                                                         afe.splice i, 0, bookmark # "after as <-
1184                                                         debug_log "placed bookmark after node"
1185                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
1186                                                         break
1187                                 # 9. Insert last node into node, first removing it from its
1188                                 # previous parent node if any.
1189                                 if last_node.parent?
1190                                         debug_log "last_node has parent"
1191                                         for c, i in last_node.parent.children
1192                                                 if c is last_node
1193                                                         debug_log "removing last_node from parent"
1194                                                         last_node.parent.children.splice i, 1
1195                                                         break
1196                                 node.children.push last_node
1197                                 last_node.parent = node
1198                                 # 10. Let last node be node.
1199                                 last_node = node
1200                                 debug_log "at last"
1201                                 # 11. Return to the step labeled inner loop.
1202                         # 14. Insert whatever last node ended up being in the previous step
1203                         # at the appropriate place for inserting a node, but using common
1204                         # ancestor as the override target.
1205
1206                         # In the case where fe is immediately followed by fb:
1207                         #   * inner loop exits out early (node==fe)
1208                         #   * last_node is fb
1209                         #   * last_node is still in the tree (not a duplicate)
1210                         if last_node.parent?
1211                                 debug_log "FEFIRST? last_node has parent"
1212                                 for c, i in last_node.parent.children
1213                                         if c is last_node
1214                                                 debug_log "removing last_node from parent"
1215                                                 last_node.parent.children.splice i, 1
1216                                                 break
1217
1218                         debug_log "after aaa inner loop"
1219                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1220                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1221                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1222                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1223                         debug_log "tree: #{serialize_els doc.children, false, true}"
1224
1225                         debug_log "insert"
1226
1227
1228                         # can't use standard insert token thing, because it's already in
1229                         # open_els and must stay at it's current position in open_els
1230                         dest = adjusted_insertion_location ca
1231                         dest[0].children.splice dest[1], 0, last_node
1232                         last_node.parent = dest[0]
1233
1234
1235                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
1236                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
1237                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
1238                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
1239                         debug_log "tree: #{serialize_els doc.children, false, true}"
1240
1241                         # 15. Create an element for the token for which formatting element
1242                         # was created, in the HTML namespace, with furthest block as the
1243                         # intended parent.
1244                         new_element = token_to_element fe.token, NS_HTML, fb
1245                         # 16. Take all of the child nodes of furthest block and append them
1246                         # to the element created in the last step.
1247                         while fb.children.length
1248                                 t = fb.children.shift()
1249                                 t.parent = new_element
1250                                 new_element.children.push t
1251                         # 17. Append that new element to furthest block.
1252                         new_element.parent = fb
1253                         fb.children.push new_element
1254                         # 18. Remove formatting element from the list of active formatting
1255                         # elements, and insert the new element into the list of active
1256                         # formatting elements at the position of the aforementioned
1257                         # bookmark.
1258                         for t, i in afe
1259                                 if t is fe
1260                                         afe.splice i, 1
1261                                         break
1262                         for t, i in afe
1263                                 if t is bookmark
1264                                         afe[i] = new_element
1265                                         break
1266                         # 19. Remove formatting element from the stack of open elements,
1267                         # and insert the new element into the stack of open elements
1268                         # immediately below the position of furthest block in that stack.
1269                         for t, i in open_els
1270                                 if t is fe
1271                                         open_els.splice i, 1
1272                                         break
1273                         for t, i in open_els
1274                                 if t is fb
1275                                         open_els.splice i, 0, new_element
1276                                         break
1277                         # 20. Jump back to the step labeled outer loop.
1278                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
1279                         debug_log "tree: #{serialize_els doc.children, false, true}"
1280                         debug_log "open_els: #{serialize_els open_els, true, true}"
1281                         debug_log "afe: #{serialize_els afe, true, true}"
1282                 debug_log "AAA DONE"
1283
1284         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
1285         close_p_element = ->
1286                 generate_implied_end_tags 'p' # arg is exception
1287                 unless open_els[0].name is 'p' and open_els[0].namespace is NS_HTML
1288                         parse_error()
1289                 while open_els.length > 1 # just in case
1290                         el = open_els.shift()
1291                         if el.name is 'p' and el.namespace is NS_HTML
1292                                 return
1293         close_p_if_in_button_scope = ->
1294                 if is_in_button_scope 'p', NS_HTML
1295                         close_p_element()
1296
1297         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
1298         # aka insert_a_character = (t) ->
1299         insert_character = (t) ->
1300                 dest = adjusted_insertion_location()
1301                 # fixfull check for Document node
1302                 if dest[1] > 0
1303                         prev = dest[0].children[dest[1] - 1]
1304                         if prev.type is TYPE_TEXT
1305                                 prev.text += t.text
1306                                 return
1307                 dest[0].children.splice dest[1], 0, t
1308
1309
1310         # 8.2.5 http://www.w3.org/TR/html5/syntax.html#tree-construction
1311         process_token = (t) ->
1312                 acn = adjusted_current_node()
1313                 unless acn?
1314                         ins_mode t
1315                         return
1316                 if acn.namespace is NS_HTML
1317                         ins_mode t
1318                         return
1319                 if is_mathml_text_integration_point(acn)
1320                         if t.type is TYPE_START_TAG and not (t.name is 'mglyph' or t.name is 'malignmark')
1321                                 ins_mode t
1322                                 return
1323                         if t.type is TYPE_TEXT
1324                                 ins_mode t
1325                                 return
1326                 if acn.namespace is NS_MATHML and acn.name is 'annotation-xml' and t.type is TYPE_START_TAG and t.name is 'svg'
1327                         ins_mode t
1328                         return
1329                 if is_html_integration acn
1330                         if t.type is TYPE_START_TAG or t.type is TYPE_TEXT
1331                                 ins_mode t
1332                                 return
1333                 if t.type is TYPE_EOF
1334                         ins_mode t
1335                         return
1336                 in_foreign_content t
1337                 return
1338
1339         # 8.2.5.1
1340         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
1341         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
1342         adjusted_insertion_location = (override_target = null) ->
1343                 # 1. If there was an override target specified, then let target be the
1344                 # override target.
1345                 if override_target?
1346                         target = override_target
1347                 else # Otherwise, let target be the current node.
1348                         target = open_els[0]
1349                 # 2. Determine the adjusted insertion location using the first matching
1350                 # steps from the following list:
1351                 #
1352                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1353                 # thead, or tr element Foster parenting happens when content is
1354                 # misnested in tables.
1355                 if flag_foster_parenting and foster_parenting_targets[target.name] is target.namespace
1356                         loop # once. this is here so we can ``break`` to "abort these substeps"
1357                                 # 1. Let last template be the last template element in the
1358                                 # stack of open elements, if any.
1359                                 last_template = null
1360                                 last_template_i = null
1361                                 for el, i in open_els
1362                                         if el.name is 'template' and el.namespace is NS_HTML
1363                                                 last_template = el
1364                                                 last_template_i = i
1365                                                 break
1366                                 # 2. Let last table be the last table element in the stack of
1367                                 # open elements, if any.
1368                                 last_table = null
1369                                 last_table_i
1370                                 for el, i in open_els
1371                                         if el.name is 'table' and el.namespace is NS_HTML
1372                                                 last_table = el
1373                                                 last_table_i = i
1374                                                 break
1375                                 # 3. If there is a last template and either there is no last
1376                                 # table, or there is one, but last template is lower (more
1377                                 # recently added) than last table in the stack of open
1378                                 # elements, then: let adjusted insertion location be inside
1379                                 # last template's template contents, after its last child (if
1380                                 # any), and abort these substeps.
1381                                 if last_template and (last_table is null or last_template_i < last_table_i)
1382                                         target = last_template # fixfull should be it's contents
1383                                         target_i = target.children.length
1384                                         break
1385                                 # 4. If there is no last table, then let adjusted insertion
1386                                 # location be inside the first element in the stack of open
1387                                 # elements (the html element), after its last child (if any),
1388                                 # and abort these substeps. (fragment case)
1389                                 if last_table is null
1390                                         # this is odd
1391                                         target = open_els[open_els.length - 1]
1392                                         target_i = target.children.length
1393                                         break
1394                                 # 5. If last table has a parent element, then let adjusted
1395                                 # insertion location be inside last table's parent element,
1396                                 # immediately before last table, and abort these substeps.
1397                                 if last_table.parent?
1398                                         for c, i in last_table.parent.children
1399                                                 if c is last_table
1400                                                         target = last_table.parent
1401                                                         target_i = i
1402                                                         break
1403                                         break
1404                                 # 6. Let previous element be the element immediately above last
1405                                 # table in the stack of open elements.
1406                                 #
1407                                 # huh? how could it not have a parent?
1408                                 previous_element = open_els[last_table_i + 1]
1409                                 # 7. Let adjusted insertion location be inside previous
1410                                 # element, after its last child (if any).
1411                                 target = previous_element
1412                                 target_i = target.children.length
1413                                 # Note: These steps are involved in part because it's possible
1414                                 # for elements, the table element in this case in particular,
1415                                 # to have been moved by a script around in the DOM, or indeed
1416                                 # removed from the DOM entirely, after the element was inserted
1417                                 # by the parser.
1418                                 break # don't really loop
1419                 else
1420                         # Otherwise Let adjusted insertion location be inside target, after
1421                         # its last child (if any).
1422                         target_i = target.children.length
1423
1424                 # 3. If the adjusted insertion location is inside a template element,
1425                 # let it instead be inside the template element's template contents,
1426                 # after its last child (if any).
1427                 # fixfull (template)
1428
1429                 # 4. Return the adjusted insertion location.
1430                 return [target, target_i]
1431
1432         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1433         # aka create_an_element_for_token
1434         token_to_element = (t, namespace, intended_parent) ->
1435                 # convert attributes into a hash
1436                 attrs = {}
1437                 for a in t.attrs_a
1438                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1439                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1440
1441                 # TODO 2. If the newly created element has an xmlns attribute in the
1442                 # XMLNS namespace whose value is not exactly the same as the element's
1443                 # namespace, that is a parse error. Similarly, if the newly created
1444                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1445                 # value is not the XLink Namespace, that is a parse error.
1446
1447                 # fixfull: the spec says stuff about form pointers and ownerDocument
1448
1449                 return el
1450
1451         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1452         insert_foreign_element = (token, namespace) ->
1453                 ail = adjusted_insertion_location()
1454                 ail_el = ail[0]
1455                 ail_i = ail[1]
1456                 el = token_to_element token, namespace, ail_el
1457                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1458                 el.parent = ail_el
1459                 ail_el.children.splice ail_i, 0, el
1460                 open_els.unshift el
1461                 return el
1462         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1463         insert_html_element = (token) ->
1464                 insert_foreign_element token, NS_HTML
1465
1466         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1467         # position should be [node, index_within_children]
1468         insert_comment = (t, position = null) ->
1469                 position ?= adjusted_insertion_location()
1470                 position[0].children.splice position[1], 0, t
1471
1472         # 8.2.5.2
1473         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1474         parse_generic_raw_text = (t) ->
1475                 insert_html_element t
1476                 tok_state = tok_state_rawtext
1477                 original_ins_mode = ins_mode
1478                 ins_mode = ins_mode_text
1479         parse_generic_rcdata_text = (t) ->
1480                 insert_html_element t
1481                 tok_state = tok_state_rcdata
1482                 original_ins_mode = ins_mode
1483                 ins_mode = ins_mode_text
1484
1485         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1486         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1487         generate_implied_end_tags = (except = null) ->
1488                 while end_tag_implied[open_els[0].name] is open_els[0].namespace and open_els[0].name isnt except
1489                         open_els.shift()
1490
1491         # 8.2.5.4 The rules for parsing tokens in HTML content
1492         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1493
1494         # 8.2.5.4.1 The "initial" insertion mode
1495         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1496         is_quirks_yes_doctype = (t) ->
1497                 if t.flag 'force-quirks'
1498                         return true
1499                 if t.name isnt 'html'
1500                         return true
1501                 if t.public_identifier?
1502                         pi = t.public_identifier.toLowerCase()
1503                         for p in quirks_yes_pi_prefixes
1504                                 if pi.substr(0, p.length) is p
1505                                         return true
1506                         if pi is '-//w3o//dtd w3 html strict 3.0//en//' or pi is '-/w3c/dtd html 4.0 transitional/en' or pi is 'html'
1507                                 return true
1508                 if t.system_identifier?
1509                         if t.system_identifier.toLowerCase() is 'http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd'
1510                                 return true
1511                 else if t.public_identifier?
1512                         # already did this: pi = t.public_identifier.toLowerCase()
1513                         if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1514                                 return true
1515                 return false
1516         is_quirks_limited_doctype = (t) ->
1517                 if t.public_identifier?
1518                         pi = t.public_identifier.toLowerCase()
1519                         if pi.substr(0, 32) is '-//w3c//dtd xhtml 1.0 frameset//' or pi.substr(0, 36) is '-//w3c//dtd xhtml 1.0 transitional//'
1520                                 return true
1521                         if t.system_identifier?
1522                                 if pi.substr(0, 32) is '-//w3c//dtd html 4.01 frameset//' or pi.substr(0, 36) is '-//w3c//dtd html 4.01 transitional//'
1523                                         return true
1524                 return false
1525         ins_mode_initial = (t) ->
1526                 if is_space_tok t
1527                         return
1528                 if t.type is TYPE_COMMENT
1529                         # ?fixfull
1530                         doc.children.push t
1531                         return
1532                 if t.type is TYPE_DOCTYPE
1533                         # fixfull syntax error from first paragraph and following bullets
1534                         # fixfull set doc.doctype
1535                         # fixfull is the "not an iframe srcdoc" thing relevant?
1536                         if is_quirks_yes_doctype t
1537                                 doc.flag 'quirks mode', QUIRKS_YES
1538                         else if is_quirks_limited_doctype t
1539                                 doc.flag 'quirks mode', QUIRKS_LIMITED
1540                         doc.children.push t
1541                         ins_mode = ins_mode_before_html
1542                         return
1543                 # Anything else
1544                 # fixfull not iframe srcdoc?
1545                 parse_error()
1546                 doc.flag 'quirks mode', QUIRKS_YES
1547                 ins_mode = ins_mode_before_html
1548                 process_token t
1549                 return
1550
1551         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1552         ins_mode_before_html = (t) ->
1553                 if t.type is TYPE_DOCTYPE
1554                         parse_error()
1555                         return
1556                 if t.type is TYPE_COMMENT
1557                         doc.children.push t
1558                         return
1559                 if is_space_tok t
1560                         return
1561                 if t.type is TYPE_START_TAG and t.name is 'html'
1562                         el = token_to_element t, NS_HTML, doc
1563                         doc.children.push el
1564                         open_els.unshift(el)
1565                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1566                         ins_mode = ins_mode_before_head
1567                         return
1568                 if t.type is TYPE_END_TAG
1569                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1570                                 # fall through to "anything else"
1571                         else
1572                                 parse_error()
1573                                 return
1574                 # Anything else
1575                 el = token_to_element new_open_tag('html'), NS_HTML, doc
1576                 doc.children.push el
1577                 el.parent = doc
1578                 open_els.unshift el
1579                 # ?fixfull browsing context
1580                 ins_mode = ins_mode_before_head
1581                 process_token t
1582                 return
1583
1584         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1585         ins_mode_before_head = (t) ->
1586                 if is_space_tok t
1587                         return
1588                 if t.type is TYPE_COMMENT
1589                         insert_comment t
1590                         return
1591                 if t.type is TYPE_DOCTYPE
1592                         parse_error()
1593                         return
1594                 if t.type is TYPE_START_TAG and t.name is 'html'
1595                         ins_mode_in_body t
1596                         return
1597                 if t.type is TYPE_START_TAG and t.name is 'head'
1598                         el = insert_html_element t
1599                         head_element_pointer = el
1600                         ins_mode = ins_mode_in_head
1601                         return
1602                 if t.type is TYPE_END_TAG
1603                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1604                                 # fall through to Anything else below
1605                         else
1606                                 parse_error()
1607                                 return
1608                 # Anything else
1609                 el = insert_html_element new_open_tag 'head'
1610                 head_element_pointer = el
1611                 ins_mode = ins_mode_in_head
1612                 process_token t
1613
1614         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1615         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1616                 open_els.shift() # spec says this will be a 'head' node
1617                 ins_mode = ins_mode_after_head
1618                 process_token t
1619         ins_mode_in_head = (t) ->
1620                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1621                         insert_character t
1622                         return
1623                 if t.type is TYPE_COMMENT
1624                         insert_comment t
1625                         return
1626                 if t.type is TYPE_DOCTYPE
1627                         parse_error()
1628                         return
1629                 if t.type is TYPE_START_TAG and t.name is 'html'
1630                         ins_mode_in_body t
1631                         return
1632                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1633                         el = insert_html_element t
1634                         open_els.shift()
1635                         t.acknowledge_self_closing()
1636                         return
1637                 if t.type is TYPE_START_TAG and t.name is 'meta'
1638                         el = insert_html_element t
1639                         open_els.shift()
1640                         t.acknowledge_self_closing()
1641                         # fixfull encoding stuff
1642                         return
1643                 if t.type is TYPE_START_TAG and t.name is 'title'
1644                         parse_generic_rcdata_text t
1645                         return
1646                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or t.name is 'noframes' or t.name is 'style')
1647                         parse_generic_raw_text t
1648                         return
1649                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1650                         insert_html_element t
1651                         ins_mode = ins_mode_in_head_noscript
1652                         return
1653                 if t.type is TYPE_START_TAG and t.name is 'script'
1654                         ail = adjusted_insertion_location()
1655                         el = token_to_element t, NS_HTML, ail
1656                         el.flag 'parser-inserted', true
1657                         # fixfull frament case
1658                         ail[0].children.splice ail[1], 0, el
1659                         open_els.unshift el
1660                         tok_state = tok_state_script_data
1661                         original_ins_mode = ins_mode # make sure orig... is defined
1662                         ins_mode = ins_mode_text
1663                         return
1664                 if t.type is TYPE_END_TAG and t.name is 'head'
1665                         open_els.shift() # will be a head element... spec says so
1666                         ins_mode = ins_mode_after_head
1667                         return
1668                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1669                         ins_mode_in_head_else t
1670                         return
1671                 if t.type is TYPE_START_TAG and t.name is 'template'
1672                         insert_html_element t
1673                         afe_push_marker()
1674                         flag_frameset_ok = false
1675                         ins_mode = ins_mode_in_template
1676                         template_ins_modes.unshift ins_mode_in_template
1677                         return
1678                 if t.type is TYPE_END_TAG and t.name is 'template'
1679                         if template_tag_is_open()
1680                                 generate_implied_end_tags
1681                                 if open_els[0].name isnt 'template'
1682                                         parse_error()
1683                                 loop
1684                                         el = open_els.shift()
1685                                         if el.name is 'template' and el.namespace is NS_HTML
1686                                                 break
1687                                 clear_afe_to_marker()
1688                                 template_ins_modes.shift()
1689                                 reset_ins_mode()
1690                         else
1691                                 parse_error()
1692                         return
1693                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1694                         parse_error()
1695                         return
1696                 ins_mode_in_head_else t
1697
1698         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1699         ins_mode_in_head_noscript_else = (t) ->
1700                 parse_error()
1701                 open_els.shift()
1702                 ins_mode = ins_mode_in_head
1703                 process_token t
1704         ins_mode_in_head_noscript = (t) ->
1705                 if t.type is TYPE_DOCTYPE
1706                         parse_error()
1707                         return
1708                 if t.type is TYPE_START_TAG and t.name is 'html'
1709                         ins_mode_in_body t
1710                         return
1711                 if t.type is TYPE_END_TAG and t.name is 'noscript'
1712                         open_els.shift()
1713                         ins_mode = ins_mode_in_head
1714                         return
1715                 if is_space_tok(t) or t.type is TYPE_COMMENT or (t.type is TYPE_START_TAG and (t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'style'))
1716                         ins_mode_in_head t
1717                         return
1718                 if t.type is TYPE_END_TAG and t.name is 'br'
1719                         ins_mode_in_head_noscript_else t
1720                         return
1721                 if (t.type is TYPE_START_TAG and (t.name is 'head' or t.name is 'noscript')) or t.type is TYPE_END_TAG
1722                         parse_error()
1723                         return
1724                 # Anything else
1725                 ins_mode_in_head_noscript_else t
1726                 return
1727
1728
1729
1730         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1731         ins_mode_after_head_else = (t) ->
1732                 body_tok = new_open_tag 'body'
1733                 insert_html_element body_tok
1734                 ins_mode = ins_mode_in_body
1735                 process_token t
1736                 return
1737         ins_mode_after_head = (t) ->
1738                 if is_space_tok t
1739                         insert_character t
1740                         return
1741                 if t.type is TYPE_COMMENT
1742                         insert_comment t
1743                         return
1744                 if t.type is TYPE_DOCTYPE
1745                         parse_error()
1746                         return
1747                 if t.type is TYPE_START_TAG and t.name is 'html'
1748                         ins_mode_in_body t
1749                         return
1750                 if t.type is TYPE_START_TAG and t.name is 'body'
1751                         insert_html_element t
1752                         flag_frameset_ok = false
1753                         ins_mode = ins_mode_in_body
1754                         return
1755                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1756                         insert_html_element t
1757                         ins_mode = ins_mode_in_frameset
1758                         return
1759                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1760                         parse_error()
1761                         open_els.unshift head_element_pointer
1762                         ins_mode_in_head t
1763                         for el, i in open_els
1764                                 if el is head_element_pointer
1765                                         open_els.splice i, 1
1766                                         return
1767                         console.log "warning: 23904 couldn't find head element in open_els"
1768                         return
1769                 if t.type is TYPE_END_TAG and t.name is 'template'
1770                         ins_mode_in_head t
1771                         return
1772                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1773                         ins_mode_after_head_else t
1774                         return
1775                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1776                         parse_error()
1777                         return
1778                 # Anything else
1779                 ins_mode_after_head_else t
1780
1781         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1782         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1783                 node = open_els[0]
1784                 loop
1785                         if node.name is name and node.namespace is NS_HTML
1786                                 generate_implied_end_tags name # arg is exception
1787                                 unless node is open_els[0]
1788                                         parse_error()
1789                                 loop
1790                                         el = open_els.shift()
1791                                         if el is node
1792                                                 return
1793                         if special_elements[node.name] is node.namespace
1794                                 parse_error()
1795                                 return
1796                         for el, i in open_els
1797                                 if node is el
1798                                         node = open_els[i + 1]
1799                                         break
1800                 return
1801         ins_mode_in_body = (t) ->
1802                 if t.type is TYPE_TEXT and t.text is "\u0000"
1803                         parse_error()
1804                         return
1805                 if is_space_tok t
1806                         reconstruct_afe()
1807                         insert_character t
1808                         return
1809                 if t.type is TYPE_TEXT
1810                         reconstruct_afe()
1811                         insert_character t
1812                         flag_frameset_ok = false
1813                         return
1814                 if t.type is TYPE_COMMENT
1815                         insert_comment t
1816                         return
1817                 if t.type is TYPE_DOCTYPE
1818                         parse_error()
1819                         return
1820                 if t.type is TYPE_START_TAG and t.name is 'html'
1821                         parse_error()
1822                         return if template_tag_is_open()
1823                         root_attrs = open_els[open_els.length - 1].attrs
1824                         for a in t.attrs_a
1825                                 root_attrs[a[0]] = a[1] unless root_attrs[a[0]]?
1826                         return
1827
1828                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
1829                         ins_mode_in_head t
1830                         return
1831                 if t.type is TYPE_START_TAG and t.name is 'body'
1832                         parse_error()
1833                         return if open_els.length < 2
1834                         second = open_els[open_els.length - 2]
1835                         return unless second.namespace is NS_HTML
1836                         return unless second.name is 'body'
1837                         return if template_tag_is_open()
1838                         flag_frameset_ok = false
1839                         for a in t.attrs_a
1840                                 second.attrs[a[0]] = a[1] unless second.attrs[a[0]]?
1841                         return
1842                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1843                         parse_error()
1844                         return if open_els.length < 2
1845                         second_i = open_els.length - 2
1846                         second = open_els[second_i]
1847                         return unless second.namespace is NS_HTML
1848                         return unless second.name is 'body'
1849                         if flag_frameset_ok is false
1850                                 return
1851                         if second.parent?
1852                                 for el, i in second.parent.children
1853                                         if el is second
1854                                                 second.parent.children.splice i, 1
1855                                                 break
1856                         open_els.splice second_i, 1
1857                         # pop everything except the "root html element"
1858                         while open_els.length > 1
1859                                 open_els.shift()
1860                         insert_html_element t
1861                         ins_mode = ins_mode_in_frameset
1862                         return
1863                 if t.type is TYPE_EOF
1864                         ok_tags = {
1865                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, p:NS_HTML, tbody:NS_HTML,
1866                                 td:NS_HTML, tfoot:NS_HTML, th:NS_HTML, thead:NS_HTML,
1867                                 tr:NS_HTML, body:NS_HTML, html:NS_HTML,
1868                         }
1869                         for el in open_els
1870                                 unless ok_tags[t.name] is el.namespace
1871                                         parse_error()
1872                                         break
1873                         if template_ins_modes.length > 0
1874                                 ins_mode_in_template t
1875                         else
1876                                 stop_parsing()
1877                         return
1878                 if t.type is TYPE_END_TAG and t.name is 'body'
1879                         unless is_in_scope 'body', NS_HTML
1880                                 parse_error()
1881                                 return
1882                         ok_tags = {
1883                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1884                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1885                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1886                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1887                                 html:NS_HTML
1888                         }
1889                         for el in open_els
1890                                 unless ok_tags[t.name] is el.namespace
1891                                         parse_error()
1892                                         break
1893                         ins_mode = ins_mode_after_body
1894                         return
1895                 if t.type is TYPE_END_TAG and t.name is 'html'
1896                         unless is_in_scope 'body', NS_HTML
1897                                 parse_error()
1898                                 return
1899                         ok_tags = {
1900                                 dd:NS_HTML, dt:NS_HTML, li:NS_HTML, optgroup:NS_HTML,
1901                                 option:NS_HTML, p:NS_HTML, rb:NS_HTML, rp:NS_HTML, rt:NS_HTML,
1902                                 rtc:NS_HTML, tbody:NS_HTML, td:NS_HTML, tfoot:NS_HTML,
1903                                 th:NS_HTML, thead:NS_HTML, tr:NS_HTML, body:NS_HTML,
1904                                 html:NS_HTML
1905                         }
1906                         for el in open_els
1907                                 unless ok_tags[t.name] is el.namespace
1908                                         parse_error()
1909                                         break
1910                         ins_mode = ins_mode_after_body
1911                         process_token t
1912                         return
1913                 if t.type is TYPE_START_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'p' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
1914                         close_p_if_in_button_scope()
1915                         insert_html_element t
1916                         return
1917                 if t.type is TYPE_START_TAG and h_tags[t.name]?
1918                         close_p_if_in_button_scope()
1919                         if h_tags[open_els[0].name] is open_els[0].namespace
1920                                 parse_error()
1921                                 open_els.shift()
1922                         insert_html_element t
1923                         return
1924                 if t.type is TYPE_START_TAG and (t.name is 'pre' or t.name is 'listing')
1925                         close_p_if_in_button_scope()
1926                         insert_html_element t
1927                         # spec: If the next token is a "LF" (U+000A) character token, then
1928                         # ignore that token and move on to the next one. (Newlines at the
1929                         # start of pre blocks are ignored as an authoring convenience.)
1930                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
1931                                 cur += 1
1932                         flag_frameset_ok = false
1933                         return
1934                 if t.type is TYPE_START_TAG and t.name is 'form'
1935                         unless form_element_pointer is null or template_tag_is_open()
1936                                 parse_error()
1937                                 return
1938                         close_p_if_in_button_scope()
1939                         el = insert_html_element t
1940                         unless template_tag_is_open()
1941                                 form_element_pointer = el
1942                         return
1943                 if t.type is TYPE_START_TAG and t.name is 'li'
1944                         flag_frameset_ok = false
1945                         for node in open_els
1946                                 if node.name is 'li' and node.namespace is NS_HTML
1947                                         generate_implied_end_tags 'li' # arg is exception
1948                                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
1949                                                 parse_error()
1950                                         loop
1951                                                 el = open_els.shift()
1952                                                 if el.name is 'li' and el.namespace is NS_HTML
1953                                                         break
1954                                         break
1955                                 if el_is_special_not_adp node
1956                                                 break
1957                         close_p_if_in_button_scope()
1958                         insert_html_element t
1959                         return
1960                 if t.type is TYPE_START_TAG and (t.name is 'dd' or t.name is 'dt')
1961                         flag_frameset_ok = false
1962                         for node in open_els
1963                                 if node.name is 'dd' and node.namespace is NS_HTML
1964                                         generate_implied_end_tags 'dd' # arg is exception
1965                                         if open_els[0].name isnt 'dd' or open_els[0].namespace isnt NS_HTML
1966                                                 parse_error()
1967                                         loop
1968                                                 el = open_els.shift()
1969                                                 if el.name is 'dd' and el.namespace is NS_HTML
1970                                                         break
1971                                         break
1972                                 if node.name is 'dt' and node.namespace is NS_HTML
1973                                         generate_implied_end_tags 'dt' # arg is exception
1974                                         if open_els[0].name isnt 'dt' or open_els[0].namespace isnt NS_HTML
1975                                                 parse_error()
1976                                         loop
1977                                                 el = open_els.shift()
1978                                                 if el.name is 'dt' and el.namespace is NS_HTML
1979                                                         break
1980                                         break
1981                                 if el_is_special_not_adp node
1982                                         break
1983                         close_p_if_in_button_scope()
1984                         insert_html_element t
1985                         return
1986                 if t.type is TYPE_START_TAG and t.name is 'plaintext'
1987                         close_p_if_in_button_scope()
1988                         insert_html_element t
1989                         tok_state = tok_state_plaintext
1990                         return
1991                 if t.type is TYPE_START_TAG and t.name is 'button'
1992                         if is_in_scope 'button', NS_HTML
1993                                 parse_error()
1994                                 generate_implied_end_tags()
1995                                 loop
1996                                         el = open_els.shift()
1997                                         if el.name is 'button' and el.namespace is NS_HTML
1998                                                 break
1999                         reconstruct_afe()
2000                         insert_html_element t
2001                         flag_frameset_ok = false
2002                         return
2003                 if t.type is TYPE_END_TAG and (t.name is 'address' or t.name is 'article' or t.name is 'aside' or t.name is 'blockquote' or t.name is 'button' or t.name is 'center' or t.name is 'details' or t.name is 'dialog' or t.name is 'dir' or t.name is 'div' or t.name is 'dl' or t.name is 'fieldset' or t.name is 'figcaption' or t.name is 'figure' or t.name is 'footer' or t.name is 'header' or t.name is 'hgroup' or t.name is 'listing' or t.name is 'main' or t.name is 'nav' or t.name is 'ol' or t.name is 'pre' or t.name is 'section' or t.name is 'summary' or t.name is 'ul')
2004                         unless is_in_scope t.name, NS_HTML
2005                                 parse_error()
2006                                 return
2007                         generate_implied_end_tags()
2008                         unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
2009                                 parse_error()
2010                         loop
2011                                 el = open_els.shift()
2012                                 if el.name is t.name and el.namespace is NS_HTML
2013                                         return
2014                         return
2015                 if t.type is TYPE_END_TAG and t.name is 'form'
2016                         unless template_tag_is_open()
2017                                 node = form_element_pointer
2018                                 form_element_pointer = null
2019                                 if node is null or not el_is_in_scope node
2020                                         parse_error()
2021                                         return
2022                                 generate_implied_end_tags()
2023                                 if open_els[0] isnt node
2024                                         parse_error()
2025                                 for el, i in open_els
2026                                         if el is node
2027                                                 open_els.splice i, 1
2028                                                 break
2029                         else
2030                                 unless is_in_scope 'form', NS_HTML
2031                                         parse_error()
2032                                         return
2033                                 generate_implied_end_tags()
2034                                 if open_els[0].name isnt 'form' or open_els[0].namespace isnt NS_HTML
2035                                         parse_error()
2036                                 loop
2037                                         el = open_els.shift()
2038                                         if el.name is 'form' and el.namespace is NS_HTML
2039                                                 break
2040                         return
2041                 if t.type is TYPE_END_TAG and t.name is 'p'
2042                         unless is_in_button_scope 'p', NS_HTML
2043                                 parse_error()
2044                                 insert_html_element new_open_tag 'p'
2045                         close_p_element()
2046                         return
2047                 if t.type is TYPE_END_TAG and t.name is 'li'
2048                         unless is_in_li_scope 'li', NS_HTML
2049                                 parse_error()
2050                                 return
2051                         generate_implied_end_tags 'li' # arg is exception
2052                         if open_els[0].name isnt 'li' or open_els[0].namespace isnt NS_HTML
2053                                 parse_error()
2054                         loop
2055                                 el = open_els.shift()
2056                                 if el.name is 'li' and el.namespace is NS_HTML
2057                                         break
2058                         return
2059                 if t.type is TYPE_END_TAG and (t.name is 'dd' or t.name is 'dt')
2060                         unless is_in_scope t.name, NS_HTML
2061                                 parse_error()
2062                                 return
2063                         generate_implied_end_tags t.name # arg is exception
2064                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2065                                 parse_error()
2066                         loop
2067                                 el = open_els.shift()
2068                                 if el.name is t.name and el.namespace is NS_HTML
2069                                         break
2070                         return
2071                 if t.type is TYPE_END_TAG and h_tags[t.name]?
2072                         h_in_scope = false
2073                         for el in open_els
2074                                 if h_tags[el.name] is el.namespace
2075                                         h_in_scope = true
2076                                         break
2077                                 if standard_scopers[el.name] is el.namespace
2078                                         break
2079                         unless h_in_scope
2080                                 parse_error()
2081                                 return
2082                         generate_implied_end_tags()
2083                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2084                                 parse_error()
2085                         loop
2086                                 el = open_els.shift()
2087                                 if h_tags[el.name] is el.namespace
2088                                         break
2089                         return
2090                 # deep breath!
2091                 if t.type is TYPE_START_TAG and t.name is 'a'
2092                         # If the list of active formatting elements contains an a element
2093                         # between the end of the list and the last marker on the list (or
2094                         # the start of the list if there is no marker on the list), then
2095                         # this is a parse error; run the adoption agency algorithm for the
2096                         # tag name "a", then remove that element from the list of active
2097                         # formatting elements and the stack of open elements if the
2098                         # adoption agency algorithm didn't already remove it (it might not
2099                         # have if the element is not in table scope).
2100                         found = false
2101                         for el in afe
2102                                 if el.type is TYPE_AFE_MARKER
2103                                         break
2104                                 if el.name is 'a' and el.namespace is NS_HTML
2105                                         found = el
2106                         if found?
2107                                 parse_error()
2108                                 adoption_agency 'a'
2109                                 for el, i in afe
2110                                         if el is found
2111                                                 afe.splice i, 1
2112                                 for el, i in open_els
2113                                         if el is found
2114                                                 open_els.splice i, 1
2115                         reconstruct_afe()
2116                         el = insert_html_element t
2117                         afe_push el
2118                         return
2119                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2120                         reconstruct_afe()
2121                         el = insert_html_element t
2122                         afe_push el
2123                         return
2124                 if t.type is TYPE_START_TAG and t.name is 'nobr'
2125                         reconstruct_afe()
2126                         if is_in_scope 'nobr', NS_HTML
2127                                 parse_error()
2128                                 adoption_agency 'nobr'
2129                                 reconstruct_afe()
2130                         el = insert_html_element t
2131                         afe_push el
2132                         return
2133                 if t.type is TYPE_END_TAG and (t.name is 'a' or t.name is 'b' or t.name is 'big' or t.name is 'code' or t.name is 'em' or t.name is 'font' or t.name is 'i' or t.name is 'nobr' or t.name is 's' or t.name is 'small' or t.name is 'strike' or t.name is 'strong' or t.name is 'tt' or t.name is 'u')
2134                         adoption_agency t.name
2135                         return
2136                 if t.type is TYPE_START_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2137                         reconstruct_afe()
2138                         insert_html_element t
2139                         afe_push_marker()
2140                         flag_frameset_ok = false
2141                         return
2142                 if t.type is TYPE_END_TAG and (t.name is 'applet' or t.name is 'marquee' or t.name is 'object')
2143                         unless is_in_scope t.name, NS_HTML
2144                                 parse_error()
2145                                 return
2146                         generate_implied_end_tags()
2147                         if open_els[0].name isnt t.name or open_els[0].namespace isnt NS_HTML
2148                                 parse_error()
2149                         loop
2150                                 el = open_els.shift()
2151                                 if el.name is t.name and el.namespace is NS_HTML
2152                                         break
2153                         clear_afe_to_marker()
2154                         return
2155                 if t.type is TYPE_START_TAG and t.name is 'table'
2156                         unless doc.flag('quirks mode') is QUIRKS_YES
2157                                 close_p_if_in_button_scope() # test
2158                         insert_html_element t
2159                         flag_frameset_ok = false
2160                         ins_mode = ins_mode_in_table
2161                         return
2162                 if t.type is TYPE_END_TAG and t.name is 'br'
2163                         parse_error()
2164                         t.type = TYPE_START_TAG
2165                         # fall through
2166                 if t.type is TYPE_START_TAG and (t.name is 'area' or t.name is 'br' or t.name is 'embed' or t.name is 'img' or t.name is 'keygen' or t.name is 'wbr')
2167                         reconstruct_afe()
2168                         insert_html_element t
2169                         open_els.shift()
2170                         t.acknowledge_self_closing()
2171                         flag_frameset_ok = false
2172                         return
2173                 if t.type is TYPE_START_TAG and t.name is 'input'
2174                         reconstruct_afe()
2175                         insert_html_element t
2176                         open_els.shift()
2177                         t.acknowledge_self_closing()
2178                         unless is_input_hidden_tok t
2179                                 flag_frameset_ok = false
2180                         return
2181                 if t.type is TYPE_START_TAG and (t.name is 'menuitem' or t.name is 'param' or t.name is 'source' or t.name is 'track')
2182                         # WHATWG adds 'menuitem' for this block
2183                         insert_html_element t
2184                         open_els.shift()
2185                         t.acknowledge_self_closing()
2186                         return
2187                 if t.type is TYPE_START_TAG and t.name is 'hr'
2188                         close_p_if_in_button_scope()
2189                         insert_html_element t
2190                         open_els.shift()
2191                         t.acknowledge_self_closing()
2192                         flag_frameset_ok = false
2193                         return
2194                 if t.type is TYPE_START_TAG and t.name is 'image'
2195                         parse_error()
2196                         t.name = 'img'
2197                         process_token t
2198                         return
2199                 if t.type is TYPE_START_TAG and t.name is 'isindex'
2200                         parse_error()
2201                         if template_tag_is_open() is false and form_element_pointer isnt null
2202                                 return
2203                         t.acknowledge_self_closing()
2204                         flag_frameset_ok = false
2205                         close_p_if_in_button_scope()
2206                         el = insert_html_element new_open_tag 'form'
2207                         unless template_tag_is_open()
2208                                 form_element_pointer = el
2209                         for a in t.attrs_a
2210                                 if a[0] is 'action'
2211                                         el.attrs['action'] = a[1]
2212                                         break
2213                         insert_html_element new_open_tag 'hr'
2214                         open_els.shift()
2215                         reconstruct_afe()
2216                         insert_html_element new_open_tag 'label'
2217                         # note: this is a little out-of-spec-order so we only have to scan t.attrs_a once
2218                         input_el = new_open_tag 'input'
2219                         prompt = null
2220                         for a in t.attrs_a
2221                                 if a[0] is 'prompt'
2222                                         prompt = a[1]
2223                                 if a[0] isnt 'name' and a[0] isnt 'action' and a[0] isnt 'prompt'
2224                                         input_el.attrs_a.push [a[0], a[1]]
2225                         input_el.attrs_a.push ['name', 'isindex']
2226                         # fixfull this next bit is in english... internationalize?
2227                         prompt ?= "This is a searchable index. Enter search keywords: "
2228                         insert_character new_character_token prompt # fixfull split
2229                         # TODO submit typo "balue" in spec
2230                         insert_html_element input_el
2231                         open_els.shift()
2232                         # insert_character '' # you can put chars here if promt attr missing
2233                         open_els.shift()
2234                         insert_html_element new_open_tag 'hr'
2235                         open_els.shift()
2236                         open_els.shift()
2237                         unless template_tag_is_open()
2238                                 form_element_pointer = null
2239                         return
2240                 if t.type is TYPE_START_TAG and t.name is 'textarea'
2241                         insert_html_element t
2242                         if txt.charAt(cur) is "\u000a" # FIXME check for crlf?
2243                                 cur += 1
2244                         tok_state = tok_state_rcdata
2245                         original_ins_mode = ins_mode
2246                         flag_frameset_ok = false
2247                         ins_mode = ins_mode_text
2248                         return
2249                 if t.type is TYPE_START_TAG and t.name is 'xmp'
2250                         close_p_if_in_button_scope()
2251                         reconstruct_afe()
2252                         flag_frameset_ok = false
2253                         parse_generic_raw_text t
2254                         return
2255                 if t.type is TYPE_START_TAG and t.name is 'iframe'
2256                         flag_frameset_ok = false
2257                         parse_generic_raw_text t
2258                         return
2259                 if t.type is TYPE_START_TAG and (t.name is 'noembed' or (t.name is 'noscript' and flag_scripting))
2260                         parse_generic_raw_text t
2261                         return
2262                 if t.type is TYPE_START_TAG and t.name is 'select'
2263                         reconstruct_afe()
2264                         insert_html_element t
2265                         flag_frameset_ok = false
2266                         if ins_mode is ins_mode_in_table or ins_mode is ins_mode_in_caption or ins_mode is ins_mode_in_table_body or ins_mode is ins_mode_in_row or ins_mode is ins_mode_in_cell
2267                                 ins_mode = ins_mode_in_select_in_table
2268                         else
2269                                 ins_mode = ins_mode_in_select
2270                         return
2271                 if t.type is TYPE_START_TAG and (t.name is 'optgroup' or t.name is 'option')
2272                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2273                                 open_els.shift()
2274                         reconstruct_afe()
2275                         insert_html_element t
2276                         return
2277 # this comment block implements the W3C spec
2278 #               if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rp' or t.name is 'rtc')
2279 #                       if is_in_scope 'ruby', NS_HTML
2280 #                               generate_implied_end_tags()
2281 #                               unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2282 #                                       parse_error()
2283 #                       insert_html_element t
2284 #                       return
2285 #               if t.type is TYPE_START_TAG and t.name is 'rt'
2286 #                       if is_in_scope 'ruby', NS_HTML
2287 #                               generate_implied_end_tags 'rtc' # arg is exception
2288 #                               unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2289 #                                       parse_error()
2290 #                       insert_html_element t
2291 #                       return
2292 # below implements the WHATWG spec https://html.spec.whatwg.org/multipage/syntax.html#parsing-main-inbody
2293                 if t.type is TYPE_START_TAG and (t.name is 'rb' or t.name is 'rtc')
2294                         if is_in_scope 'ruby', NS_HTML
2295                                 generate_implied_end_tags()
2296                                 unless open_els[0].name is 'ruby' and open_els[0].namespace is NS_HTML
2297                                         parse_error()
2298                         insert_html_element t
2299                         return
2300                 if t.type is TYPE_START_TAG and (t.name is 'rp' or t.name is 'rt')
2301                         if is_in_scope 'ruby', NS_HTML
2302                                 generate_implied_end_tags 'rtc'
2303                                 unless (open_els[0].name is 'ruby' or open_els[0].name is 'rtc') and open_els[0].namespace is NS_HTML
2304                                         parse_error()
2305                         insert_html_element t
2306                         return
2307 # end WHATWG chunk
2308                 if t.type is TYPE_START_TAG and t.name is 'math'
2309                         reconstruct_afe()
2310                         adjust_mathml_attributes t
2311                         adjust_foreign_attributes t
2312                         insert_foreign_element t, NS_MATHML
2313                         if t.flag 'self-closing'
2314                                 open_els.shift()
2315                                 t.acknowledge_self_closing()
2316                         return
2317                 if t.type is TYPE_START_TAG and t.name is 'svg'
2318                         reconstruct_afe()
2319                         adjust_svg_attributes t
2320                         adjust_foreign_attributes t
2321                         insert_foreign_element t, NS_SVG
2322                         if t.flag 'self-closing'
2323                                 open_els.shift()
2324                                 t.acknowledge_self_closing()
2325                         return
2326                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'frame' or t.name is 'head' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2327                         parse_error()
2328                         return
2329                 if t.type is TYPE_START_TAG # any other start tag
2330                         reconstruct_afe()
2331                         insert_html_element t
2332                         return
2333                 if t.type is TYPE_END_TAG # any other end tag
2334                         in_body_any_other_end_tag t.name
2335                         return
2336                 return
2337
2338         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
2339         ins_mode_text = (t) ->
2340                 if t.type is TYPE_TEXT
2341                         insert_character t
2342                         return
2343                 if t.type is TYPE_EOF
2344                         parse_error()
2345                         if open_els[0].name is 'script' and open_els[0].namespace is NS_HTML
2346                                 open_els[0].flag 'already started', true
2347                         open_els.shift()
2348                         ins_mode = original_ins_mode
2349                         process_token t
2350                         return
2351                 if t.type is TYPE_END_TAG and t.name is 'script'
2352                         open_els.shift()
2353                         ins_mode = original_ins_mode
2354                         # fixfull the spec seems to assume that I'm going to run the script
2355                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
2356                         return
2357                 if t.type is TYPE_END_TAG
2358                         open_els.shift()
2359                         ins_mode = original_ins_mode
2360                         return
2361                 console.log 'warning: end of ins_mode_text reached'
2362
2363         # the functions below implement the tokenizer stats described here:
2364         # http://www.w3.org/TR/html5/syntax.html#tokenization
2365
2366         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
2367         ins_mode_in_table_else = (t) ->
2368                 parse_error()
2369                 flag_foster_parenting = true
2370                 ins_mode_in_body t
2371                 flag_foster_parenting = false
2372                 return
2373         ins_mode_in_table = (t) ->
2374                 switch t.type
2375                         when TYPE_TEXT
2376                                 if (open_els[0].name is 'table' or open_els[0].name is 'tbody' or open_els[0].name is 'tfoot' or open_els[0].name is 'thead' or open_els[0].name is 'tr') and open_els[0].namespace is NS_HTML
2377                                         pending_table_character_tokens = []
2378                                         original_ins_mode = ins_mode
2379                                         ins_mode = ins_mode_in_table_text
2380                                         process_token t
2381                                 else
2382                                         ins_mode_in_table_else t
2383                         when TYPE_COMMENT
2384                                 insert_comment t
2385                         when TYPE_DOCTYPE
2386                                 parse_error()
2387                         when TYPE_START_TAG
2388                                 switch t.name
2389                                         when 'caption'
2390                                                 clear_stack_to_table_context()
2391                                                 afe_push_marker()
2392                                                 insert_html_element t
2393                                                 ins_mode = ins_mode_in_caption
2394                                         when 'colgroup'
2395                                                 clear_stack_to_table_context()
2396                                                 insert_html_element t
2397                                                 ins_mode = ins_mode_in_column_group
2398                                         when 'col'
2399                                                 clear_stack_to_table_context()
2400                                                 insert_html_element new_open_tag 'colgroup'
2401                                                 ins_mode = ins_mode_in_column_group
2402                                                 process_token t
2403                                         when 'tbody', 'tfoot', 'thead'
2404                                                 clear_stack_to_table_context()
2405                                                 insert_html_element t
2406                                                 ins_mode = ins_mode_in_table_body
2407                                         when 'td', 'th', 'tr'
2408                                                 clear_stack_to_table_context()
2409                                                 insert_html_element new_open_tag 'tbody'
2410                                                 ins_mode = ins_mode_in_table_body
2411                                                 process_token t
2412                                         when 'table'
2413                                                 parse_error()
2414                                                 if is_in_table_scope 'table', NS_HTML
2415                                                         loop
2416                                                                 el = open_els.shift()
2417                                                                 if el.name is 'table' and el.namespace is NS_HTML
2418                                                                         break
2419                                                         reset_ins_mode()
2420                                                         process_token t
2421                                         when 'style', 'script', 'template'
2422                                                 ins_mode_in_head t
2423                                         when 'input'
2424                                                 unless is_input_hidden_tok t
2425                                                         ins_mode_in_table_else t
2426                                                 else
2427                                                         parse_error()
2428                                                         el = insert_html_element t
2429                                                         open_els.shift()
2430                                                         t.acknowledge_self_closing()
2431                                         when 'form'
2432                                                 parse_error()
2433                                                 if form_element_pointer?
2434                                                         return
2435                                                 if template_tag_is_open()
2436                                                         return
2437                                                 form_element_pointer = insert_html_element t
2438                                                 open_els.shift()
2439                                         else
2440                                                 ins_mode_in_table_else t
2441                         when TYPE_END_TAG
2442                                 switch t.name
2443                                         when 'table'
2444                                                 if is_in_table_scope 'table', NS_HTML
2445                                                         loop
2446                                                                 el = open_els.shift()
2447                                                                 if el.name is 'table' and el.namespace is NS_HTML
2448                                                                         break
2449                                                         reset_ins_mode()
2450                                                 else
2451                                                         parse_error()
2452                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
2453                                                 parse_error()
2454                                         when 'template'
2455                                                 ins_mode_in_head t
2456                                         else
2457                                                 ins_mode_in_table_else t
2458                         when TYPE_EOF
2459                                 ins_mode_in_body t
2460                         else
2461                                 ins_mode_in_table_else t
2462
2463
2464         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
2465         ins_mode_in_table_text = (t) ->
2466                 if t.type is TYPE_TEXT and t.text is "\u0000"
2467                         # from javascript?
2468                         parse_error()
2469                         return
2470                 if t.type is TYPE_TEXT
2471                         pending_table_character_tokens.push t
2472                         return
2473                 # Anything else
2474                 all_space = true
2475                 for old in pending_table_character_tokens
2476                         unless is_space_tok old
2477                                 all_space = false
2478                                 break
2479                 if all_space
2480                         for old in pending_table_character_tokens
2481                                 insert_character old
2482                 else
2483                         for old in pending_table_character_tokens
2484                                 ins_mode_in_table_else old
2485                 pending_table_character_tokens = []
2486                 ins_mode = original_ins_mode
2487                 process_token t
2488
2489         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
2490         ins_mode_in_caption = (t) ->
2491                 if t.type is TYPE_END_TAG and t.name is 'caption'
2492                         if is_in_table_scope 'caption', NS_HTML
2493                                 generate_implied_end_tags()
2494                                 if open_els[0].name isnt 'caption'
2495                                         parse_error()
2496                                 loop
2497                                         el = open_els.shift()
2498                                         if el.name is 'caption' and el.namespace is NS_HTML
2499                                                 break
2500                                 clear_afe_to_marker()
2501                                 ins_mode = ins_mode_in_table
2502                         else
2503                                 parse_error()
2504                                 # fragment case
2505                         return
2506                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2507                         parse_error()
2508                         if is_in_table_scope 'caption', NS_HTML
2509                                 loop
2510                                         el = open_els.shift()
2511                                         if el.name is 'caption' and el.namespace is NS_HTML
2512                                                 break
2513                                 clear_afe_to_marker()
2514                                 ins_mode = ins_mode_in_table
2515                                 process_token t
2516                         # else fragment case
2517                         return
2518                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2519                         parse_error()
2520                         return
2521                 # Anything else
2522                 ins_mode_in_body t
2523
2524         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
2525         ins_mode_in_column_group = (t) ->
2526                 if is_space_tok t
2527                         insert_character t
2528                         return
2529                 if t.type is TYPE_COMMENT
2530                         insert_comment t
2531                         return
2532                 if t.type is TYPE_DOCTYPE
2533                         parse_error()
2534                         return
2535                 if t.type is TYPE_START_TAG and t.name is 'html'
2536                         ins_mode_in_body t
2537                         return
2538                 if t.type is TYPE_START_TAG and t.name is 'col'
2539                         el = insert_html_element t
2540                         open_els.shift()
2541                         t.acknowledge_self_closing()
2542                         return
2543                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
2544                         if open_els[0].name is 'colgroup' and open_els.namespace is NS_HTML
2545                                 open_els.shift()
2546                                 ins_mode = ins_mode_in_table
2547                         else
2548                                 parse_error()
2549                         return
2550                 if t.type is TYPE_END_TAG and t.name is 'col'
2551                         parse_error()
2552                         return
2553                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
2554                         ins_mode_in_head t
2555                         return
2556                 if t.type is TYPE_EOF
2557                         ins_mode_in_body t
2558                         return
2559                 # Anything else
2560                 if open_els[0].name isnt 'colgroup'
2561                         parse_error()
2562                         return
2563                 open_els.shift()
2564                 ins_mode = ins_mode_in_table
2565                 process_token t
2566                 return
2567
2568         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
2569         ins_mode_in_table_body = (t) ->
2570                 if t.type is TYPE_START_TAG and t.name is 'tr'
2571                         clear_stack_to_table_body_context()
2572                         insert_html_element t
2573                         ins_mode = ins_mode_in_row
2574                         return
2575                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2576                         parse_error()
2577                         clear_stack_to_table_body_context()
2578                         insert_html_element new_open_tag 'tr'
2579                         ins_mode = ins_mode_in_row
2580                         process_token t
2581                         return
2582                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2583                         unless is_in_table_scope t.name, NS_HTML
2584                                 parse_error()
2585                                 return
2586                         clear_stack_to_table_body_context()
2587                         open_els.shift()
2588                         ins_mode = ins_mode_in_table
2589                         return
2590                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
2591                         has = false
2592                         for el in open_els
2593                                 if el.namespace is NS_HTML and (el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead')
2594                                         has = true
2595                                         break
2596                                 if table_scopers[el.name] is el.namespace
2597                                         break
2598                         if !has
2599                                 parse_error()
2600                                 return
2601                         clear_stack_to_table_body_context()
2602                         open_els.shift()
2603                         ins_mode = ins_mode_in_table
2604                         process_token t
2605                         return
2606                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
2607                         parse_error()
2608                         return
2609                 # Anything else
2610                 ins_mode_in_table t
2611
2612         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
2613         ins_mode_in_row = (t) ->
2614                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
2615                         clear_stack_to_table_row_context()
2616                         insert_html_element t
2617                         ins_mode = ins_mode_in_cell
2618                         afe_push_marker()
2619                         return
2620                 if t.type is TYPE_END_TAG and t.name is 'tr'
2621                         if is_in_table_scope 'tr', NS_HTML
2622                                 clear_stack_to_table_row_context()
2623                                 open_els.shift()
2624                                 ins_mode = ins_mode_in_table_body
2625                         else
2626                                 parse_error()
2627                         return
2628                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
2629                         if is_in_table_scope 'tr', NS_HTML
2630                                 clear_stack_to_table_row_context()
2631                                 open_els.shift()
2632                                 ins_mode = ins_mode_in_table_body
2633                                 process_token t
2634                         else
2635                                 parse_error()
2636                         return
2637                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2638                         if is_in_table_scope t.name, NS_HTML
2639                                 if is_in_table_scope 'tr', NS_HTML
2640                                         clear_stack_to_table_row_context()
2641                                         open_els.shift()
2642                                         ins_mode = ins_mode_in_table_body
2643                                         process_token t
2644                         else
2645                                 parse_error()
2646                         return
2647                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
2648                         parse_error()
2649                         return
2650                 # Anything else
2651                 ins_mode_in_table t
2652
2653         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
2654         close_the_cell = ->
2655                 generate_implied_end_tags()
2656                 unless (open_els[0].name is 'td' or open_els[0] is 'th') and open_els[0].namespace is NS_HTML
2657                         parse_error()
2658                 loop
2659                         el = open_els.shift()
2660                         if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2661                                 break
2662                 clear_afe_to_marker()
2663                 ins_mode = ins_mode_in_row
2664
2665         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
2666         ins_mode_in_cell = (t) ->
2667                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
2668                         if is_in_table_scope t.name, NS_HTML
2669                                 generate_implied_end_tags()
2670                                 unless (open_els[0].name is t.name) and open_els[0].namespace is NS_HTML
2671                                         parse_error()
2672                                 loop
2673                                         el = open_els.shift()
2674                                         if el.name is t.name and el.namespace is NS_HTML
2675                                                 break
2676                                 clear_afe_to_marker()
2677                                 ins_mode = ins_mode_in_row
2678                         else
2679                                 parse_error()
2680                         return
2681                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
2682                         has = false
2683                         for el in open_els
2684                                 if el.namespace is NS_HTML and (el.name is 'td' or el.name is 'th')
2685                                         has = true
2686                                         break
2687                                 if table_scopers[el.name] is el.namespace
2688                                         break
2689                         if !has
2690                                 parse_error()
2691                                 return
2692                         close_the_cell()
2693                         process_token t
2694                         return
2695                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
2696                         parse_error()
2697                         return
2698                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
2699                         if is_in_table_scope t.name, NS_HTML
2700                                 close_the_cell()
2701                                 process_token t
2702                         else
2703                                 parse_error()
2704                         return
2705                 # Anything Else
2706                 ins_mode_in_body t
2707
2708         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
2709         ins_mode_in_select = (t) ->
2710                 if t.type is TYPE_TEXT and t.text is "\u0000"
2711                         parse_error()
2712                         return
2713                 if t.type is TYPE_TEXT
2714                         insert_character t
2715                         return
2716                 if t.type is TYPE_COMMENT
2717                         insert_comment t
2718                         return
2719                 if t.type is TYPE_DOCTYPE
2720                         parse_error()
2721                         return
2722                 if t.type is TYPE_START_TAG and t.name is 'html'
2723                         ins_mode_in_body t
2724                         return
2725                 if t.type is TYPE_START_TAG and t.name is 'option'
2726                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2727                                 open_els.shift()
2728                         insert_html_element t
2729                         return
2730                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
2731                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2732                                 open_els.shift()
2733                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2734                                 open_els.shift()
2735                         insert_html_element t
2736                         return
2737                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
2738                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2739                                 if open_els[1].name is 'optgroup' and open_els[0].namespace is NS_HTML
2740                                         open_els.shift()
2741                         if open_els[0].name is 'optgroup' and open_els[0].namespace is NS_HTML
2742                                 open_els.shift()
2743                         else
2744                                 parse_error()
2745                         return
2746                 if t.type is TYPE_END_TAG and t.name is 'option'
2747                         if open_els[0].name is 'option' and open_els[0].namespace is NS_HTML
2748                                 open_els.shift()
2749                         else
2750                                 parse_error()
2751                         return
2752                 if t.type is TYPE_END_TAG and t.name is 'select'
2753                         if is_in_select_scope 'select', NS_HTML
2754                                 loop
2755                                         el = open_els.shift()
2756                                         if el.name is 'select' and el.namespace is NS_HTML
2757                                                 break
2758                                 reset_ins_mode()
2759                         else
2760                                 parse_error()
2761                         return
2762                 if t.type is TYPE_START_TAG and t.name is 'select'
2763                         parse_error()
2764                         loop
2765                                 el = open_els.shift()
2766                                 if el.name is 'select' and el.namespace is NS_HTML
2767                                         break
2768                         reset_ins_mode()
2769                         # spec says that this is the same as </select> but it doesn't say
2770                         # to check scope first
2771                         return
2772                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
2773                         parse_error()
2774                         if is_in_select_scope 'select', NS_HTML
2775                                 return
2776                         loop
2777                                 el = open_els.shift()
2778                                 if el.name is 'select' and el.namespace is NS_HTML
2779                                         break
2780                         reset_ins_mode()
2781                         process_token t
2782                         return
2783                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
2784                         ins_mode_in_head t
2785                         return
2786                 if t.type is TYPE_EOF
2787                         ins_mode_in_body t
2788                         return
2789                 # Anything else
2790                 parse_error()
2791                 return
2792
2793         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
2794         ins_mode_in_select_in_table = (t) ->
2795                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2796                         parse_error()
2797                         loop
2798                                 el = open_els.shift()
2799                                 if el.name is 'select' and el.namespace is NS_HTML
2800                                         break
2801                         reset_ins_mode()
2802                         process_token t
2803                         return
2804                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
2805                         parse_error()
2806                         unless is_in_table_scope t.name, NS_HTML
2807                                 return
2808                         loop
2809                                 el = open_els.shift()
2810                                 if el.name is 'select' and el.namespace is NS_HTML
2811                                         break
2812                         reset_ins_mode()
2813                         process_token t
2814                         return
2815                 # Anything else
2816                 ins_mode_in_select t
2817                 return
2818
2819         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2820         ins_mode_in_template = (t) ->
2821                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2822                         ins_mode_in_body t
2823                         return
2824                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2825                         ins_mode_in_head t
2826                         return
2827                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2828                         template_ins_modes.shift()
2829                         template_ins_modes.unshift ins_mode_in_table
2830                         ins_mode = ins_mode_in_table
2831                         process_token t
2832                         return
2833                 if t.type is TYPE_START_TAG and t.name is 'col'
2834                         template_ins_modes.shift()
2835                         template_ins_modes.unshift ins_mode_in_column_group
2836                         ins_mode = ins_mode_in_column_group
2837                         process_token t
2838                         return
2839                 if t.type is TYPE_START_TAG and t.name is 'tr'
2840                         template_ins_modes.shift()
2841                         template_ins_modes.unshift ins_mode_in_table_body
2842                         ins_mode = ins_mode_in_table_body
2843                         process_token t
2844                         return
2845                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2846                         template_ins_modes.shift()
2847                         template_ins_modes.unshift ins_mode_in_row
2848                         ins_mode = ins_mode_in_row
2849                         process_token t
2850                         return
2851                 if t.type is TYPE_START_TAG
2852                         template_ins_modes.shift()
2853                         template_ins_modes.unshift ins_mode_in_body
2854                         ins_mode = ins_mode_in_body
2855                         process_token t
2856                         return
2857                 if t.type is TYPE_END_TAG
2858                         parse_error()
2859                         return
2860                 if t.type is TYPE_EOF
2861                         unless template_tag_is_open()
2862                                 stop_parsing()
2863                                 return
2864                         parse_error()
2865                         loop
2866                                 el = open_els.shift()
2867                                 if el.name is 'template' and el.namespace is NS_HTML
2868                                         break
2869                         clear_afe_to_marker()
2870                         template_ins_modes.shift()
2871                         reset_ins_mode()
2872                         process_token t
2873
2874         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2875         ins_mode_after_body = (t) ->
2876                 if is_space_tok t
2877                         ins_mode_in_body t
2878                         return
2879                 if t.type is TYPE_COMMENT
2880                         first = open_els[open_els.length - 1]
2881                         insert_comment t, [first, first.children.length]
2882                         return
2883                 if t.type is TYPE_DOCTYPE
2884                         parse_error()
2885                         return
2886                 if t.type is TYPE_START_TAG and t.name is 'html'
2887                         ins_mode_in_body t
2888                         return
2889                 if t.type is TYPE_END_TAG and t.name is 'html'
2890                         if flag_fragment_parsing
2891                                 parse_error()
2892                                 return
2893                         ins_mode = ins_mode_after_after_body
2894                         return
2895                 if t.type is TYPE_EOF
2896                         stop_parsing()
2897                         return
2898                 # Anything ELse
2899                 parse_error()
2900                 ins_mode = ins_mode_in_body
2901                 process_token t
2902
2903         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2904         ins_mode_in_frameset = (t) ->
2905                 if is_space_tok t
2906                         insert_character t
2907                         return
2908                 if t.type is TYPE_COMMENT
2909                         insert_comment t
2910                         return
2911                 if t.type is TYPE_DOCTYPE
2912                         parse_error()
2913                         return
2914                 if t.type is TYPE_START_TAG and t.name is 'html'
2915                         ins_mode_in_body t
2916                         return
2917                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2918                         insert_html_element t
2919                         return
2920                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2921                         if open_els.length is 1
2922                                 parse_error()
2923                                 return # fragment case
2924                         open_els.shift()
2925                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2926                                 ins_mode = ins_mode_after_frameset
2927                         return
2928                 if t.type is TYPE_START_TAG and t.name is 'frame'
2929                         insert_html_element t
2930                         open_els.shift()
2931                         t.acknowledge_self_closing()
2932                         return
2933                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2934                         ins_mode_in_head t
2935                         return
2936                 if t.type is TYPE_EOF
2937                         if open_els.length isnt 1
2938                                 parse_error()
2939                         stop_parsing()
2940                         return
2941                 # Anything else
2942                 parse_error()
2943                 return
2944
2945         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2946         ins_mode_after_frameset = (t) ->
2947                 if is_space_tok t
2948                         insert_character t
2949                         return
2950                 if t.type is TYPE_COMMENT
2951                         insert_comment t
2952                         return
2953                 if t.type is TYPE_DOCTYPE
2954                         parse_error()
2955                         return
2956                 if t.type is TYPE_START_TAG and t.name is 'html'
2957                         ins_mode_in_body t
2958                         return
2959                 if t.type is TYPE_END_TAG and t.name is 'html'
2960                         ins_mode = ins_mode_after_after_frameset
2961                         return
2962                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2963                         ins_mode_in_head t
2964                         return
2965                 if t.type is TYPE_EOF
2966                         stop_parsing()
2967                         return
2968                 # Anything else
2969                 parse_error()
2970                 return
2971
2972         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2973         ins_mode_after_after_body = (t) ->
2974                 if t.type is TYPE_COMMENT
2975                         insert_comment t, [doc, doc.children.length]
2976                         return
2977                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2978                         ins_mode_in_body t
2979                         return
2980                 if t.type is TYPE_EOF
2981                         stop_parsing()
2982                         return
2983                 # Anything else
2984                 parse_error()
2985                 ins_mode = ins_mode_in_body
2986                 process_token t
2987                 return
2988
2989         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2990         ins_mode_after_after_frameset = (t) ->
2991                 if t.type is TYPE_COMMENT
2992                         insert_comment t, [doc, doc.children.length]
2993                         return
2994                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2995                         ins_mode_in_body t
2996                         return
2997                 if t.type is TYPE_EOF
2998                         stop_parsing()
2999                         return
3000                 if t.type is TYPE_START_TAG and t.name is 'noframes'
3001                         ins_mode_in_head t
3002                         return
3003                 # Anything else
3004                 parse_error()
3005                 return
3006
3007         # 8.2.5.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inforeign
3008         has_color_face_or_size = (t) ->
3009                 for a in t.attrs_a
3010                         if a[0] is 'color' or a[0] is 'face' or a[0] is 'size'
3011                                 return true
3012                 return false
3013         in_foreign_content_end_script = ->
3014                 open_els.shift()
3015                 # fixfull
3016                 return
3017         in_foreign_content_other_start = (t) ->
3018                 acn = adjusted_current_node()
3019                 if acn.namespace is NS_MATHML
3020                         adjust_mathml_attributes t
3021                 if acn.namespace is NS_SVG and svg_name_fixes[t.name]?
3022                         t.name = svg_name_fixes[t.name]
3023                 if acn.namespace is NS_SVG
3024                         adjust_svg_attributes t
3025                 adjust_foreign_attributes t
3026                 insert_foreign_element t, acn.namespace
3027                 if t.flag 'self-closing'
3028                         if t.name is 'script'
3029                                 t.acknowledge_self_closing()
3030                                 in_foreign_content_end_script()
3031                                 # fixfull
3032                         else
3033                                 open_els.shift()
3034                                 t.acknowledge_self_closing()
3035                 return
3036         in_foreign_content = (t) ->
3037                 if t.type is TYPE_TEXT and t.text is "\u0000"
3038                         parse_error()
3039                         insert_character new_character_token "\ufffd"
3040                         return
3041                 if is_space_tok t
3042                         insert_character t
3043                         return
3044                 if t.type is TYPE_TEXT
3045                         flag_frameset_ok = false
3046                         insert_character t
3047                         return
3048                 if t.type is TYPE_COMMENT
3049                         insert_comment t
3050                         return
3051                 if t.type is TYPE_DOCTYPE
3052                         parse_error()
3053                         return
3054                 if t.type is TYPE_START_TAG and (t.name is 'b' or t.name is 'big' or t.name is 'blockquote' or t.name is 'body' or t.name is 'br' or t.name is 'center' or t.name is 'code' or t.name is 'dd' or t.name is 'div' or t.name is 'dl' or t.name is 'dt' or t.name is 'em' or t.name is 'embed' or t.name is 'h1' or t.name is 'h2' or t.name is 'h3' or t.name is 'h4' or t.name is 'h5' or t.name is 'h6' or t.name is 'head' or t.name is 'hr' or t.name is 'i' or t.name is 'img' or t.name is 'li' or t.name is 'listing' or t.name is 'main' or t.name is 'meta' or t.name is 'nobr' or t.name is 'ol' or t.name is 'p' or t.name is 'pre' or t.name is 'ruby' or t.name is 's' or t.name is 'small' or t.name is 'span' or t.name is 'strong' or t.name is 'strike' or t.name is 'sub' or t.name is 'sup' or t.name is 'table' or t.name is 'tt' or t.name is 'u' or t.name is 'ul' or t.name is 'var' or (t.name is 'font' and has_color_face_or_size(t)))
3055                         parse_error()
3056                         if flag_fragment_parsing
3057                                 in_foreign_content_other_start t
3058                                 return
3059                         loop # is this safe?
3060                                 open_els.shift()
3061                                 if is_mathml_text_integration_point(open_els[0]) or is_html_integration(open_els[0]) or open_els[0].namespace is NS_HTML
3062                                         break
3063                         process_token t
3064                         return
3065                 if t.type is TYPE_START_TAG
3066                         in_foreign_content_other_start t
3067                         return
3068                 if t.type is TYPE_END_TAG and t.name is 'script' and open_els[0].name is 'script' and open_els[0].namespace is NS_SVG
3069                         in_foreign_content_end_script()
3070                         return
3071                 if t.type is TYPE_END_TAG
3072                         i = 0
3073                         node = open_els[i]
3074                         if node.name.toLowerCase() isnt t.name
3075                                 parse_error()
3076                         loop
3077                                 if node is open_els[open_els.length - 1]
3078                                         return
3079                                 if node.name.toLowerCase() is t.name
3080                                         loop
3081                                                 el = open_els.shift()
3082                                                 if el is node
3083                                                         return
3084                                 i += 1
3085                                 node = open_els[i]
3086                                 if node.namespace is NS_HTML
3087                                         break
3088                         ins_mode t # explicitly call HTML insertion mode
3089
3090
3091         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
3092         tok_state_data = ->
3093                 switch c = txt.charAt(cur++)
3094                         when '&'
3095                                 return new_text_node parse_character_reference()
3096                         when '<'
3097                                 tok_state = tok_state_tag_open
3098                         when "\u0000"
3099                                 parse_error()
3100                                 return new_text_node "\ufffd"
3101                         when '' # EOF
3102                                 return new_eof_token()
3103                         else
3104                                 return new_text_node c
3105                 return null
3106
3107         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
3108         # not needed: tok_state_character_reference_in_data = ->
3109         # just call parse_character_reference()
3110
3111         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
3112         tok_state_rcdata = ->
3113                 switch c = txt.charAt(cur++)
3114                         when '&'
3115                                 return new_text_node parse_character_reference()
3116                         when '<'
3117                                 tok_state = tok_state_rcdata_less_than_sign
3118                         when "\u0000"
3119                                 parse_error()
3120                                 return new_character_token "\ufffd"
3121                         when '' # EOF
3122                                 return new_eof_token()
3123                         else
3124                                 return new_character_token c
3125                 return null
3126
3127         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
3128         # not needed: tok_state_character_reference_in_rcdata = ->
3129         # just call parse_character_reference()
3130
3131         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
3132         tok_state_rawtext = ->
3133                 switch c = txt.charAt(cur++)
3134                         when '<'
3135                                 tok_state = tok_state_rawtext_less_than_sign
3136                         when "\u0000"
3137                                 parse_error()
3138                                 return new_character_token "\ufffd"
3139                         when '' # EOF
3140                                 return new_eof_token()
3141                         else
3142                                 return new_character_token c
3143                 return null
3144
3145         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
3146         tok_state_script_data = ->
3147                 switch c = txt.charAt(cur++)
3148                         when '<'
3149                                 tok_state = tok_state_script_data_less_than_sign
3150                         when "\u0000"
3151                                 parse_error()
3152                                 return new_character_token "\ufffd"
3153                         when '' # EOF
3154                                 return new_eof_token()
3155                         else
3156                                 return new_character_token c
3157                 return null
3158
3159         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
3160         tok_state_plaintext = ->
3161                 switch c = txt.charAt(cur++)
3162                         when "\u0000"
3163                                 parse_error()
3164                                 return new_character_token "\ufffd"
3165                         when '' # EOF
3166                                 return new_eof_token()
3167                         else
3168                                 return new_character_token c
3169                 return null
3170
3171
3172         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
3173         tok_state_tag_open = ->
3174                 c = txt.charAt(cur++)
3175                 if c is '!'
3176                         tok_state = tok_state_markup_declaration_open
3177                         return
3178                 if c is '/'
3179                         tok_state = tok_state_end_tag_open
3180                         return
3181                 if is_uc_alpha(c)
3182                         tok_cur_tag = new_open_tag c.toLowerCase()
3183                         tok_state = tok_state_tag_name
3184                         return
3185                 if is_lc_alpha(c)
3186                         tok_cur_tag = new_open_tag c
3187                         tok_state = tok_state_tag_name
3188                         return
3189                 if c is '?'
3190                         parse_error()
3191                         tok_cur_tag = new_comment_token '?' # FIXME right?
3192                         tok_state = tok_state_bogus_comment
3193                         return
3194                 # Anything else
3195                 parse_error()
3196                 tok_state = tok_state_data
3197                 cur -= 1 # we didn't parse/handle the char after <
3198                 return new_text_node '<'
3199
3200         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
3201         tok_state_end_tag_open = ->
3202                 c = txt.charAt(cur++)
3203                 if is_uc_alpha(c)
3204                         tok_cur_tag = new_end_tag c.toLowerCase()
3205                         tok_state = tok_state_tag_name
3206                         return
3207                 if is_lc_alpha(c)
3208                         tok_cur_tag = new_end_tag c
3209                         tok_state = tok_state_tag_name
3210                         return
3211                 if c is '>'
3212                         parse_error()
3213                         tok_state = tok_state_data
3214                         return
3215                 if c is '' # EOF
3216                         parse_error()
3217                         tok_state = tok_state_data
3218                         return new_text_node '</'
3219                 # Anything else
3220                 parse_error()
3221                 tok_cur_tag = new_comment_token c
3222                 tok_state = tok_state_bogus_comment
3223                 return null
3224
3225         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
3226         tok_state_tag_name = ->
3227                 switch c = txt.charAt(cur++)
3228                         when "\t", "\n", "\u000c", ' '
3229                                 tok_state = tok_state_before_attribute_name
3230                         when '/'
3231                                 tok_state = tok_state_self_closing_start_tag
3232                         when '>'
3233                                 tok_state = tok_state_data
3234                                 tmp = tok_cur_tag
3235                                 tok_cur_tag = null
3236                                 return tmp
3237                         when "\u0000"
3238                                 parse_error()
3239                                 tok_cur_tag.name += "\ufffd"
3240                         when '' # EOF
3241                                 parse_error()
3242                                 tok_state = tok_state_data
3243                         else
3244                                 if is_uc_alpha(c)
3245                                         tok_cur_tag.name += c.toLowerCase()
3246                                 else
3247                                         tok_cur_tag.name += c
3248                 return null
3249
3250         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
3251         tok_state_rcdata_less_than_sign = ->
3252                 c = txt.charAt(cur++)
3253                 if c is '/'
3254                         temporary_buffer = ''
3255                         tok_state = tok_state_rcdata_end_tag_open
3256                         return null
3257                 # Anything else
3258                 tok_state = tok_state_rcdata
3259                 cur -= 1 # reconsume the input character
3260                 return new_character_token '<'
3261
3262         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
3263         tok_state_rcdata_end_tag_open = ->
3264                 c = txt.charAt(cur++)
3265                 if is_uc_alpha(c)
3266                         tok_cur_tag = new_end_tag c.toLowerCase()
3267                         temporary_buffer += c
3268                         tok_state = tok_state_rcdata_end_tag_name
3269                         return null
3270                 if is_lc_alpha(c)
3271                         tok_cur_tag = new_end_tag c
3272                         temporary_buffer += c
3273                         tok_state = tok_state_rcdata_end_tag_name
3274                         return null
3275                 # Anything else
3276                 tok_state = tok_state_rcdata
3277                 cur -= 1 # reconsume the input character
3278                 return new_character_token "</" # fixfull separate these
3279
3280         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
3281         is_appropriate_end_tag = (t) ->
3282                 # spec says to check against "the tag name of the last start tag to
3283                 # have been emitted from this tokenizer", but this is only called from
3284                 # the various "raw" states, so it's hopefully ok to assume that
3285                 # open_els[0].name will work instead TODO: verify this after the script
3286                 # data states are implemented
3287                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
3288                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
3289
3290         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
3291         tok_state_rcdata_end_tag_name = ->
3292                 c = txt.charAt(cur++)
3293                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3294                         if is_appropriate_end_tag tok_cur_tag
3295                                 tok_state = tok_state_before_attribute_name
3296                                 return
3297                         # else fall through to "Anything else"
3298                 if c is '/'
3299                         if is_appropriate_end_tag tok_cur_tag
3300                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
3301                                 return
3302                         # else fall through to "Anything else"
3303                 if c is '>'
3304                         if is_appropriate_end_tag tok_cur_tag
3305                                 tok_state = tok_state_data
3306                                 return tok_cur_tag
3307                         # else fall through to "Anything else"
3308                 if is_uc_alpha(c)
3309                         tok_cur_tag.name += c.toLowerCase()
3310                         temporary_buffer += c
3311                         return null
3312                 if is_lc_alpha(c)
3313                         tok_cur_tag.name += c
3314                         temporary_buffer += c
3315                         return null
3316                 # Anything else
3317                 tok_state = tok_state_rcdata
3318                 cur -= 1 # reconsume the input character
3319                 return new_character_token '</' + temporary_buffer # fixfull separate these
3320
3321         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
3322         tok_state_rawtext_less_than_sign = ->
3323                 c = txt.charAt(cur++)
3324                 if c is '/'
3325                         temporary_buffer = ''
3326                         tok_state = tok_state_rawtext_end_tag_open
3327                         return null
3328                 # Anything else
3329                 tok_state = tok_state_rawtext
3330                 cur -= 1 # reconsume the input character
3331                 return new_character_token '<'
3332
3333         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
3334         tok_state_rawtext_end_tag_open = ->
3335                 c = txt.charAt(cur++)
3336                 if is_uc_alpha(c)
3337                         tok_cur_tag = new_end_tag c.toLowerCase()
3338                         temporary_buffer += c
3339                         tok_state = tok_state_rawtext_end_tag_name
3340                         return null
3341                 if is_lc_alpha(c)
3342                         tok_cur_tag = new_end_tag c
3343                         temporary_buffer += c
3344                         tok_state = tok_state_rawtext_end_tag_name
3345                         return null
3346                 # Anything else
3347                 tok_state = tok_state_rawtext
3348                 cur -= 1 # reconsume the input character
3349                 return new_character_token "</" # fixfull separate these
3350
3351         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
3352         tok_state_rawtext_end_tag_name = ->
3353                 c = txt.charAt(cur++)
3354                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3355                         if is_appropriate_end_tag tok_cur_tag
3356                                 tok_state = tok_state_before_attribute_name
3357                                 return
3358                         # else fall through to "Anything else"
3359                 if c is '/'
3360                         if is_appropriate_end_tag tok_cur_tag
3361                                 tok_state = tok_state_self_closing_start_tag
3362                                 return
3363                         # else fall through to "Anything else"
3364                 if c is '>'
3365                         if is_appropriate_end_tag tok_cur_tag
3366                                 tok_state = tok_state_data
3367                                 return tok_cur_tag
3368                         # else fall through to "Anything else"
3369                 if is_uc_alpha(c)
3370                         tok_cur_tag.name += c.toLowerCase()
3371                         temporary_buffer += c
3372                         return null
3373                 if is_lc_alpha(c)
3374                         tok_cur_tag.name += c
3375                         temporary_buffer += c
3376                         return null
3377                 # Anything else
3378                 tok_state = tok_state_rawtext
3379                 cur -= 1 # reconsume the input character
3380                 return new_character_token '</' + temporary_buffer # fixfull separate these
3381
3382         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
3383         tok_state_script_data_less_than_sign = ->
3384                 c = txt.charAt(cur++)
3385                 if c is '/'
3386                         temporary_buffer = ''
3387                         tok_state = tok_state_script_data_end_tag_open
3388                         return
3389                 if c is '!'
3390                         tok_state = tok_state_script_data_escape_start
3391                         return new_character_token '<!' # fixfull split
3392                 # Anything else
3393                 tok_state = tok_state_script_data
3394                 cur -= 1 # Reconsume
3395                 return new_character_token '<'
3396
3397         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3398         tok_state_script_data_end_tag_open = ->
3399                 c = txt.charAt(cur++)
3400                 if is_uc_alpha(c)
3401                         tok_cur_tag = new_end_tag c.toLowerCase()
3402                         temporary_buffer += c
3403                         tok_state = tok_state_script_data_end_tag_name
3404                         return
3405                 if is_lc_alpha(c)
3406                         tok_cur_tag = new_end_tag c
3407                         temporary_buffer += c
3408                         tok_state = tok_state_script_data_end_tag_name
3409                         return
3410                 # Anything else
3411                 tok_state = tok_state_script_data
3412                 cur -= 1 # Reconsume
3413                 return new_character_token '</'
3414
3415         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
3416         tok_state_script_data_end_tag_name = ->
3417                 c = txt.charAt(cur++)
3418                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3419                         if is_appropriate_end_tag tok_cur_tag
3420                                 tok_state = tok_state_before_attribute_name
3421                                 return
3422                         # fall through
3423                 if c is '/'
3424                         if is_appropriate_end_tag tok_cur_tag
3425                                 tok_state = tok_state_self_closing_start_tag
3426                                 return
3427                         # fall through
3428                 if c is '>'
3429                         if is_appropriate_end_tag tok_cur_tag
3430                                 tok_state = tok_state_data
3431                                 return tok_cur_tag
3432                         # fall through
3433                 if is_uc_alpha(c)
3434                         tok_cur_tag.name += c.toLowerCase()
3435                         temporary_buffer += c
3436                         return
3437                 if is_lc_alpha(c)
3438                         tok_cur_tag.name += c
3439                         temporary_buffer += c
3440                         return
3441                 # Anything else
3442                 tok_state = tok_state_script_data
3443                 cur -= 1 # Reconsume
3444                 return new_character_token "</#{temporary_buffer}" # fixfull split
3445
3446         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
3447         tok_state_script_data_escape_start = ->
3448                 c = txt.charAt(cur++)
3449                 if c is '-'
3450                         tok_state = tok_state_script_data_escape_start_dash
3451                         return new_character_token '-'
3452                 # Anything else
3453                 tok_state = tok_state_script_data
3454                 cur -= 1 # Reconsume
3455                 return
3456
3457         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
3458         tok_state_script_data_escape_start_dash = ->
3459                 c = txt.charAt(cur++)
3460                 if c is '-'
3461                         tok_state = tok_state_script_data_escaped_dash_dash
3462                         return new_character_token '-'
3463                 # Anything else
3464                 tok_state = tok_state_script_data
3465                 cur -= 1 # Reconsume
3466                 return
3467
3468         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
3469         tok_state_script_data_escaped = ->
3470                 c = txt.charAt(cur++)
3471                 if c is '-'
3472                         tok_state = tok_state_script_data_escaped_dash
3473                         return new_character_token '-'
3474                 if c is '<'
3475                         tok_state = tok_state_script_data_escaped_less_than_sign
3476                         return
3477                 if c is "\u0000"
3478                         parse_error()
3479                         return new_character_token "\ufffd"
3480                 if c is '' # EOF
3481                         tok_state = tok_state_data
3482                         parse_error()
3483                         cur -= 1 # Reconsume
3484                         return
3485                 # Anything else
3486                 return new_character_token c
3487
3488         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
3489         tok_state_script_data_escaped_dash = ->
3490                 c = txt.charAt(cur++)
3491                 if c is '-'
3492                         tok_state = tok_state_script_data_escaped_dash_dash
3493                         return new_character_token '-'
3494                 if c is '<'
3495                         tok_state = tok_state_script_data_escaped_less_than_sign
3496                         return
3497                 if c is "\u0000"
3498                         parse_error()
3499                         tok_state = tok_state_script_data_escaped
3500                         return new_character_token "\ufffd"
3501                 if c is '' # EOF
3502                         tok_state = tok_state_data
3503                         parse_error()
3504                         cur -= 1 # Reconsume
3505                         return
3506                 # Anything else
3507                 tok_state = tok_state_script_data_escaped
3508                 return new_character_token c
3509
3510         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
3511         tok_state_script_data_escaped_dash_dash = ->
3512                 c = txt.charAt(cur++)
3513                 if c is '-'
3514                         return new_character_token '-'
3515                 if c is '<'
3516                         tok_state = tok_state_script_data_escaped_less_than_sign
3517                         return
3518                 if c is '>'
3519                         tok_state = tok_state_script_data
3520                         return new_character_token '>'
3521                 if c is "\u0000"
3522                         parse_error()
3523                         tok_state = tok_state_script_data_escaped
3524                         return new_character_token "\ufffd"
3525                 if c is '' # EOF
3526                         parse_error()
3527                         tok_state = tok_state_data
3528                         cur -= 1 # Reconsume
3529                         return
3530                 # Anything else
3531                 tok_state = tok_state_script_data_escaped
3532                 return new_character_token c
3533
3534         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
3535         tok_state_script_data_escaped_less_than_sign = ->
3536                 c = txt.charAt(cur++)
3537                 if c is '/'
3538                         temporary_buffer = ''
3539                         tok_state = tok_state_script_data_escaped_end_tag_open
3540                         return
3541                 if is_uc_alpha(c)
3542                         temporary_buffer = c.toLowerCase() # yes, really
3543                         tok_state = tok_state_script_data_double_escape_start
3544                         return new_character_token "<#{c}" # fixfull split
3545                 if is_lc_alpha(c)
3546                         temporary_buffer = c
3547                         tok_state = tok_state_script_data_double_escape_start
3548                         return new_character_token "<#{c}" # fixfull split
3549                 # Anything else
3550                 tok_state = tok_state_script_data_escaped
3551                 cur -= 1 # Reconsume
3552                 return new_character_token '<'
3553
3554         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
3555         tok_state_script_data_escaped_end_tag_open = ->
3556                 c = txt.charAt(cur++)
3557                 if is_uc_alpha(c)
3558                         tok_cur_tag = new_end_tag c.toLowerCase()
3559                         temporary_buffer += c
3560                         tok_state = tok_state_script_data_escaped_end_tag_name
3561                         return
3562                 if is_lc_alpha(c)
3563                         tok_cur_tag = new_end_tag c
3564                         temporary_buffer += c
3565                         tok_state = tok_state_script_data_escaped_end_tag_name
3566                         return
3567                 # Anything else
3568                 tok_state = tok_state_script_data_escaped
3569                 cur -= 1 # Reconsume
3570                 return new_character_token '</' # fixfull split
3571
3572         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
3573         tok_state_script_data_escaped_end_tag_name = ->
3574                 c = txt.charAt(cur++)
3575                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3576                         if is_appropriate_end_tag tok_cur_tag
3577                                 tok_state = tok_state_before_attribute_name
3578                                 return
3579                         # fall through
3580                 if c is '/'
3581                         if is_appropriate_end_tag tok_cur_tag
3582                                 tok_state = tok_state_self_closing_start_tag
3583                                 return
3584                         # fall through
3585                 if c is '>'
3586                         if is_appropriate_end_tag tok_cur_tag
3587                                 tok_state = tok_state_data
3588                                 return tok_cur_tag
3589                         # fall through
3590                 if is_uc_alpha(c)
3591                         tok_cur_tag.name += c.toLowerCase()
3592                         temporary_buffer += c.toLowerCase()
3593                         return
3594                 if is_lc_alpha(c)
3595                         tok_cur_tag.name += c
3596                         temporary_buffer += c.toLowerCase()
3597                         return
3598                 # Anything else
3599                 tok_state = tok_state_script_data_escaped
3600                 cur -= 1 # Reconsume
3601                 return new_character_token "</#{temporary_buffer}" # fixfull split
3602
3603         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
3604         tok_state_script_data_double_escape_start = ->
3605                 c = txt.charAt(cur++)
3606                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3607                         if temporary_buffer is 'script'
3608                                 tok_state = tok_state_script_data_double_escaped
3609                         else
3610                                 tok_state = tok_state_script_data_escaped
3611                         return new_character_token c
3612                 if is_uc_alpha(c)
3613                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3614                         return new_character_token c
3615                 if is_lc_alpha(c)
3616                         temporary_buffer += c
3617                         return new_character_token c
3618                 # Anything else
3619                 tok_state = tok_state_script_data_escaped
3620                 cur -= 1 # Reconsume
3621                 return
3622
3623         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
3624         tok_state_script_data_double_escaped = ->
3625                 c = txt.charAt(cur++)
3626                 if c is '-'
3627                         tok_state = tok_state_script_data_double_escaped_dash
3628                         return new_character_token '-'
3629                 if c is '<'
3630                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3631                         return new_character_token '<'
3632                 if c is "\u0000"
3633                         parse_error()
3634                         return new_character_token "\ufffd"
3635                 if c is '' # EOF
3636                         parse_error()
3637                         tok_state = tok_state_data
3638                         cur -= 1 # Reconsume
3639                         return
3640                 # Anything else
3641                 return new_character_token c
3642
3643         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
3644         tok_state_script_data_double_escaped_dash = ->
3645                 c = txt.charAt(cur++)
3646                 if c is '-'
3647                         tok_state = tok_state_script_data_double_escaped_dash_dash
3648                         return new_character_token '-'
3649                 if c is '<'
3650                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3651                         return new_character_token '<'
3652                 if c is "\u0000"
3653                         parse_error()
3654                         tok_state = tok_state_script_data_double_escaped
3655                         return new_character_token "\ufffd"
3656                 if c is '' # EOF
3657                         parse_error()
3658                         tok_state = tok_state_data
3659                         cur -= 1 # Reconsume
3660                         return
3661                 # Anything else
3662                 tok_state = tok_state_script_data_double_escaped
3663                 return new_character_token c
3664
3665         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
3666         tok_state_script_data_double_escaped_dash_dash = ->
3667                 c = txt.charAt(cur++)
3668                 if c is '-'
3669                         return new_character_token '-'
3670                 if c is '<'
3671                         tok_state = tok_state_script_data_double_escaped_less_than_sign
3672                         return new_character_token '<'
3673                 if c is '>'
3674                         tok_state = tok_state_script_data
3675                         return new_character_token '>'
3676                 if c is "\u0000"
3677                         parse_error()
3678                         tok_state = tok_state_script_data_double_escaped
3679                         return new_character_token "\ufffd"
3680                 if c is '' # EOF
3681                         parse_error()
3682                         tok_state = tok_state_data
3683                         cur -= 1 # Reconsume
3684                         return
3685                 # Anything else
3686                 tok_state = tok_state_script_data_double_escaped
3687                 return new_character_token c
3688
3689         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
3690         tok_state_script_data_double_escaped_less_than_sign = ->
3691                 c = txt.charAt(cur++)
3692                 if c is '/'
3693                         temporary_buffer = ''
3694                         tok_state = tok_state_script_data_double_escape_end
3695                         return new_character_token '/'
3696                 # Anything else
3697                 tok_state = tok_state_script_data_double_escaped
3698                 cur -= 1 # Reconsume
3699                 return
3700
3701         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
3702         tok_state_script_data_double_escape_end = ->
3703                 c = txt.charAt(cur++)
3704                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
3705                         if temporary_buffer is 'script'
3706                                 tok_state = tok_state_script_data_escaped
3707                         else
3708                                 tok_state = tok_state_script_data_double_escaped
3709                         return new_character_token c
3710                 if is_uc_alpha(c)
3711                         temporary_buffer += c.toLowerCase() # yes, really lowercase
3712                         return new_character_token c
3713                 if is_lc_alpha(c)
3714                         temporary_buffer += c
3715                         return new_character_token c
3716                 # Anything else
3717                 tok_state = tok_state_script_data_double_escaped
3718                 cur -= 1 # Reconsume
3719                 return
3720
3721         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
3722         tok_state_before_attribute_name = ->
3723                 attr_name = null
3724                 switch c = txt.charAt(cur++)
3725                         when "\t", "\n", "\u000c", ' '
3726                                 return null
3727                         when '/'
3728                                 tok_state = tok_state_self_closing_start_tag
3729                                 return null
3730                         when '>'
3731                                 tok_state = tok_state_data
3732                                 tmp = tok_cur_tag
3733                                 tok_cur_tag = null
3734                                 return tmp
3735                         when "\u0000"
3736                                 parse_error()
3737                                 attr_name = "\ufffd"
3738                         when '"', "'", '<', '='
3739                                 parse_error()
3740                                 attr_name = c
3741                         when '' # EOF
3742                                 parse_error()
3743                                 tok_state = tok_state_data
3744                         else
3745                                 if is_uc_alpha(c)
3746                                         attr_name = c.toLowerCase()
3747                                 else
3748                                         attr_name = c
3749                 if attr_name?
3750                         tok_cur_tag.attrs_a.unshift [attr_name, '']
3751                         tok_state = tok_state_attribute_name
3752                 return null
3753
3754         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
3755         tok_state_attribute_name = ->
3756                 switch c = txt.charAt(cur++)
3757                         when "\t", "\n", "\u000c", ' '
3758                                 tok_state = tok_state_after_attribute_name
3759                         when '/'
3760                                 tok_state = tok_state_self_closing_start_tag
3761                         when '='
3762                                 tok_state = tok_state_before_attribute_value
3763                         when '>'
3764                                 tok_state = tok_state_data
3765                                 tmp = tok_cur_tag
3766                                 tok_cur_tag = null
3767                                 return tmp
3768                         when "\u0000"
3769                                 parse_error()
3770                                 tok_cur_tag.attrs_a[0][0] += "\ufffd"
3771                         when '"', "'", '<'
3772                                 parse_error()
3773                                 tok_cur_tag.attrs_a[0][0] += c
3774                         when '' # EOF
3775                                 parse_error()
3776                                 tok_state = tok_state_data
3777                         else
3778                                 if is_uc_alpha(c)
3779                                         tok_cur_tag.attrs_a[0][0] += c.toLowerCase()
3780                                 else
3781                                         tok_cur_tag.attrs_a[0][0] += c
3782                 return null
3783
3784         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
3785         tok_state_after_attribute_name = ->
3786                 c = txt.charAt(cur++)
3787                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
3788                         return
3789                 if c is '/'
3790                         tok_state = tok_state_self_closing_start_tag
3791                         return
3792                 if c is '='
3793                         tok_state = tok_state_before_attribute_value
3794                         return
3795                 if c is '>'
3796                         tok_state = tok_state_data
3797                         return
3798                 if is_uc_alpha(c)
3799                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
3800                         tok_state = tok_state_attribute_name
3801                         return
3802                 if c is "\u0000"
3803                         parse_error()
3804                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
3805                         tok_state = tok_state_attribute_name
3806                         return
3807                 if c is '' # EOF
3808                         parse_error()
3809                         tok_state = tok_state_data
3810                         cur -= 1 # reconsume
3811                         return
3812                 if c is '"' or c is "'" or c is '<'
3813                         parse_error()
3814                         # fall through to Anything else
3815                 # Anything else
3816                 tok_cur_tag.attrs_a.unshift [c, '']
3817                 tok_state = tok_state_attribute_name
3818
3819         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
3820         tok_state_before_attribute_value = ->
3821                 switch c = txt.charAt(cur++)
3822                         when "\t", "\n", "\u000c", ' '
3823                                 return null
3824                         when '"'
3825                                 tok_state = tok_state_attribute_value_double_quoted
3826                         when '&'
3827                                 tok_state = tok_state_attribute_value_unquoted
3828                                 cur -= 1
3829                         when "'"
3830                                 tok_state = tok_state_attribute_value_single_quoted
3831                         when "\u0000"
3832                                 # Parse error
3833                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3834                                 tok_state = tok_state_attribute_value_unquoted
3835                         when '>'
3836                                 # Parse error
3837                                 tok_state = tok_state_data
3838                                 tmp = tok_cur_tag
3839                                 tok_cur_tag = null
3840                                 return tmp
3841                         when '' # EOF
3842                                 parse_error()
3843                                 tok_state = tok_state_data
3844                         else
3845                                 tok_cur_tag.attrs_a[0][1] += c
3846                                 tok_state = tok_state_attribute_value_unquoted
3847                 return null
3848
3849         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
3850         tok_state_attribute_value_double_quoted = ->
3851                 switch c = txt.charAt(cur++)
3852                         when '"'
3853                                 tok_state = tok_state_after_attribute_value_quoted
3854                         when '&'
3855                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
3856                         when "\u0000"
3857                                 # Parse error
3858                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3859                         when '' # EOF
3860                                 parse_error()
3861                                 tok_state = tok_state_data
3862                         else
3863                                 tok_cur_tag.attrs_a[0][1] += c
3864                 return null
3865
3866         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
3867         tok_state_attribute_value_single_quoted = ->
3868                 switch c = txt.charAt(cur++)
3869                         when "'"
3870                                 tok_state = tok_state_after_attribute_value_quoted
3871                         when '&'
3872                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
3873                         when "\u0000"
3874                                 # Parse error
3875                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3876                         when '' # EOF
3877                                 parse_error()
3878                                 tok_state = tok_state_data
3879                         else
3880                                 tok_cur_tag.attrs_a[0][1] += c
3881                 return null
3882
3883         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
3884         tok_state_attribute_value_unquoted = ->
3885                 switch c = txt.charAt(cur++)
3886                         when "\t", "\n", "\u000c", ' '
3887                                 tok_state = tok_state_before_attribute_name
3888                         when '&'
3889                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
3890                         when '>'
3891                                 tok_state = tok_state_data
3892                                 tmp = tok_cur_tag
3893                                 tok_cur_tag = null
3894                                 return tmp
3895                         when "\u0000"
3896                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
3897                         when '' # EOF
3898                                 parse_error()
3899                                 tok_state = tok_state_data
3900                         else
3901                                 # Parse Error if ', <, = or ` (backtick)
3902                                 tok_cur_tag.attrs_a[0][1] += c
3903                 return null
3904
3905         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
3906         tok_state_after_attribute_value_quoted = ->
3907                 switch c = txt.charAt(cur++)
3908                         when "\t", "\n", "\u000c", ' '
3909                                 tok_state = tok_state_before_attribute_name
3910                         when '/'
3911                                 tok_state = tok_state_self_closing_start_tag
3912                         when '>'
3913                                 tok_state = tok_state_data
3914                                 tmp = tok_cur_tag
3915                                 tok_cur_tag = null
3916                                 return tmp
3917                         when '' # EOF
3918                                 parse_error()
3919                                 tok_state = tok_state_data
3920                         else
3921                                 # Parse Error
3922                                 tok_state = tok_state_before_attribute_name
3923                                 cur -= 1 # we didn't handle that char
3924                 return null
3925
3926         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3927         tok_state_self_closing_start_tag = ->
3928                 c = txt.charAt(cur++)
3929                 if c is '>'
3930                         tok_cur_tag.flag 'self-closing', true
3931                         tok_state = tok_state_data
3932                         return tok_cur_tag
3933                 if c is ''
3934                         parse_error()
3935                         tok_state = tok_state_data
3936                         cur -= 1 # Reconsume
3937                         return
3938                 # Anything else
3939                 parse_error()
3940                 tok_state = tok_state_before_attribute_name
3941                 cur -= 1 # Reconsume
3942                 return
3943
3944         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3945         # WARNING: put a comment token in tok_cur_tag before setting this state
3946         tok_state_bogus_comment = ->
3947                 next_gt = txt.indexOf '>', cur
3948                 if next_gt is -1
3949                         val = txt.substr cur
3950                         cur = txt.length
3951                 else
3952                         val = txt.substr cur, (next_gt - cur)
3953                         cur = next_gt + 1
3954                 val = val.replace(new RegExp("\u0000", 'g'), "\ufffd")
3955                 tok_cur_tag.text += val
3956                 tok_state = tok_state_data
3957                 return tok_cur_tag
3958
3959         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3960         tok_state_markup_declaration_open = ->
3961                 if txt.substr(cur, 2) is '--'
3962                         cur += 2
3963                         tok_cur_tag = new_comment_token ''
3964                         tok_state = tok_state_comment_start
3965                         return
3966                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3967                         cur += 7
3968                         tok_state = tok_state_doctype
3969                         return
3970                 acn = adjusted_current_node()
3971                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3972                         cur += 7
3973                         tok_state = tok_state_cdata_section
3974                         return
3975                 # Otherwise
3976                 parse_error()
3977                 tok_cur_tag = new_comment_token ''
3978                 tok_state = tok_state_bogus_comment
3979                 return
3980
3981         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3982         tok_state_comment_start = ->
3983                 switch c = txt.charAt(cur++)
3984                         when '-'
3985                                 tok_state = tok_state_comment_start_dash
3986                         when "\u0000"
3987                                 parse_error()
3988                                 tok_state = tok_state_comment
3989                                 return new_character_token "\ufffd"
3990                         when '>'
3991                                 parse_error()
3992                                 tok_state = tok_state_data
3993                                 return tok_cur_tag
3994                         when '' # EOF
3995                                 parse_error()
3996                                 tok_state = tok_state_data
3997                                 cur -= 1 # Reconsume
3998                                 return tok_cur_tag
3999                         else
4000                                 tok_cur_tag.text += c
4001                                 tok_state = tok_state_comment
4002                 return null
4003
4004         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
4005         tok_state_comment_start_dash = ->
4006                 switch c = txt.charAt(cur++)
4007                         when '-'
4008                                 tok_state = tok_state_comment_end
4009                         when "\u0000"
4010                                 parse_error()
4011                                 tok_cur_tag.text += "-\ufffd"
4012                                 tok_state = tok_state_comment
4013                         when '>'
4014                                 parse_error()
4015                                 tok_state = tok_state_data
4016                                 return tok_cur_tag
4017                         when '' # EOF
4018                                 parse_error()
4019                                 tok_state = tok_state_data
4020                                 cur -= 1 # Reconsume
4021                                 return tok_cur_tag
4022                         else
4023                                 tok_cur_tag.text += "-#{c}"
4024                                 tok_state = tok_state_comment
4025                 return null
4026
4027         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
4028         tok_state_comment = ->
4029                 switch c = txt.charAt(cur++)
4030                         when '-'
4031                                 tok_state = tok_state_comment_end_dash
4032                         when "\u0000"
4033                                 parse_error()
4034                                 tok_cur_tag.text += "\ufffd"
4035                         when '' # EOF
4036                                 parse_error()
4037                                 tok_state = tok_state_data
4038                                 cur -= 1 # Reconsume
4039                                 return tok_cur_tag
4040                         else
4041                                 tok_cur_tag.text += c
4042                 return null
4043
4044         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
4045         tok_state_comment_end_dash = ->
4046                 switch c = txt.charAt(cur++)
4047                         when '-'
4048                                 tok_state = tok_state_comment_end
4049                         when "\u0000"
4050                                 parse_error()
4051                                 tok_cur_tag.text += "-\ufffd"
4052                                 tok_state = tok_state_comment
4053                         when '' # EOF
4054                                 parse_error()
4055                                 tok_state = tok_state_data
4056                                 cur -= 1 # Reconsume
4057                                 return tok_cur_tag
4058                         else
4059                                 tok_cur_tag.text += "-#{c}"
4060                                 tok_state = tok_state_comment
4061                 return null
4062
4063         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
4064         tok_state_comment_end = ->
4065                 switch c = txt.charAt(cur++)
4066                         when '>'
4067                                 tok_state = tok_state_data
4068                                 return tok_cur_tag
4069                         when "\u0000"
4070                                 parse_error()
4071                                 tok_cur_tag.text += "--\ufffd"
4072                                 tok_state = tok_state_comment
4073                         when '!'
4074                                 parse_error()
4075                                 tok_state = tok_state_comment_end_bang
4076                         when '-'
4077                                 parse_error()
4078                                 tok_cur_tag.text += '-'
4079                         when '' # EOF
4080                                 parse_error()
4081                                 tok_state = tok_state_data
4082                                 cur -= 1 # Reconsume
4083                                 return tok_cur_tag
4084                         else
4085                                 parse_error()
4086                                 tok_cur_tag.text += "--#{c}"
4087                                 tok_state = tok_state_comment
4088                 return null
4089
4090         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
4091         tok_state_comment_end_bang = ->
4092                 switch c = txt.charAt(cur++)
4093                         when '-'
4094                                 tok_cur_tag.text += "--!#{c}"
4095                                 tok_state = tok_state_comment_end_dash
4096                         when '>'
4097                                 tok_state = tok_state_data
4098                                 return tok_cur_tag
4099                         when "\u0000"
4100                                 parse_error()
4101                                 tok_cur_tag.text += "--!\ufffd"
4102                                 tok_state = tok_state_comment
4103                         when '' # EOF
4104                                 parse_error()
4105                                 tok_state = tok_state_data
4106                                 cur -= 1 # Reconsume
4107                                 return tok_cur_tag
4108                         else
4109                                 tok_cur_tag.text += "--!#{c}"
4110                                 tok_state = tok_state_comment
4111                 return null
4112
4113         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4114         tok_state_doctype = ->
4115                 switch c = txt.charAt(cur++)
4116                         when "\t", "\u000a", "\u000c", ' '
4117                                 tok_state = tok_state_before_doctype_name
4118                         when '' # EOF
4119                                 parse_error()
4120                                 tok_state = tok_state_data
4121                                 el = new_doctype_token ''
4122                                 el.flag 'force-quirks', true
4123                                 cur -= 1 # Reconsume
4124                                 return el
4125                         else
4126                                 parse_error()
4127                                 tok_state = tok_state_before_doctype_name
4128                                 cur -= 1 # Reconsume
4129                 return null
4130
4131         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
4132         tok_state_before_doctype_name = ->
4133                 c = txt.charAt(cur++)
4134                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4135                         return
4136                 if is_uc_alpha(c)
4137                         tok_cur_tag = new_doctype_token c.toLowerCase()
4138                         tok_state = tok_state_doctype_name
4139                         return
4140                 if c is "\u0000"
4141                         parse_error()
4142                         tok_cur_tag = new_doctype_token "\ufffd"
4143                         tok_state = tok_state_doctype_name
4144                         return
4145                 if c is '>'
4146                         parse_error()
4147                         el = new_doctype_token ''
4148                         el.flag 'force-quirks', true
4149                         tok_state = tok_state_data
4150                         return el
4151                 if c is '' # EOF
4152                         parse_error()
4153                         tok_state = tok_state_data
4154                         el = new_doctype_token ''
4155                         el.flag 'force-quirks', true
4156                         cur -= 1 # Reconsume
4157                         return el
4158                 # Anything else
4159                 tok_cur_tag = new_doctype_token c
4160                 tok_state = tok_state_doctype_name
4161                 return null
4162
4163         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
4164         tok_state_doctype_name = ->
4165                 c = txt.charAt(cur++)
4166                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4167                         tok_state = tok_state_after_doctype_name
4168                         return
4169                 if c is '>'
4170                         tok_state = tok_state_data
4171                         return tok_cur_tag
4172                 if is_uc_alpha(c)
4173                         tok_cur_tag.name += c.toLowerCase()
4174                         return
4175                 if c is "\u0000"
4176                         parse_error()
4177                         tok_cur_tag.name += "\ufffd"
4178                         return
4179                 if c is '' # EOF
4180                         parse_error()
4181                         tok_state = tok_state_data
4182                         tok_cur_tag.flag 'force-quirks', true
4183                         cur -= 1 # Reconsume
4184                         return tok_cur_tag
4185                 # Anything else
4186                 tok_cur_tag.name += c
4187                 return null
4188
4189         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
4190         tok_state_after_doctype_name = ->
4191                 c = txt.charAt(cur++)
4192                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4193                         return
4194                 if c is '>'
4195                         tok_state = tok_state_data
4196                         return tok_cur_tag
4197                 if c is '' # EOF
4198                         parse_error()
4199                         tok_state = tok_state_data
4200                         tok_cur_tag.flag 'force-quirks', true
4201                         cur -= 1 # Reconsume
4202                         return tok_cur_tag
4203                 # Anything else
4204                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
4205                         cur += 5
4206                         tok_state = tok_state_after_doctype_public_keyword
4207                         return
4208                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
4209                         cur += 5
4210                         tok_state = tok_state_after_doctype_system_keyword
4211                         return
4212                 parse_error()
4213                 tok_cur_tag.flag 'force-quirks', true
4214                 tok_state = tok_state_bogus_doctype
4215                 return null
4216
4217         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
4218         tok_state_after_doctype_public_keyword = ->
4219                 c = txt.charAt(cur++)
4220                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4221                         tok_state = tok_state_before_doctype_public_identifier
4222                         return
4223                 if c is '"'
4224                         parse_error()
4225                         tok_cur_tag.public_identifier = ''
4226                         tok_state = tok_state_doctype_public_identifier_double_quoted
4227                         return
4228                 if c is "'"
4229                         parse_error()
4230                         tok_cur_tag.public_identifier = ''
4231                         tok_state = tok_state_doctype_public_identifier_single_quoted
4232                         return
4233                 if c is '>'
4234                         parse_error()
4235                         tok_cur_tag.flag 'force-quirks', true
4236                         tok_state = tok_state_data
4237                         return tok_cur_tag
4238                 if c is '' # EOF
4239                         parse_error()
4240                         tok_state = tok_state_data
4241                         tok_cur_tag.flag 'force-quirks', true
4242                         cur -= 1 # Reconsume
4243                         return tok_cur_tag
4244                 # Anything else
4245                 parse_error()
4246                 tok_cur_tag.flag 'force-quirks', true
4247                 tok_state = tok_state_bogus_doctype
4248                 return null
4249
4250         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
4251         tok_state_before_doctype_public_identifier = ->
4252                 c = txt.charAt(cur++)
4253                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4254                         return
4255                 if c is '"'
4256                         parse_error()
4257                         tok_cur_tag.public_identifier = ''
4258                         tok_state = tok_state_doctype_public_identifier_double_quoted
4259                         return
4260                 if c is "'"
4261                         parse_error()
4262                         tok_cur_tag.public_identifier = ''
4263                         tok_state = tok_state_doctype_public_identifier_single_quoted
4264                         return
4265                 if c is '>'
4266                         parse_error()
4267                         tok_cur_tag.flag 'force-quirks', true
4268                         tok_state = tok_state_data
4269                         return tok_cur_tag
4270                 if c is '' # EOF
4271                         parse_error()
4272                         tok_state = tok_state_data
4273                         tok_cur_tag.flag 'force-quirks', true
4274                         cur -= 1 # Reconsume
4275                         return tok_cur_tag
4276                 # Anything else
4277                 parse_error()
4278                 tok_cur_tag.flag 'force-quirks', true
4279                 tok_state = tok_state_bogus_doctype
4280                 return null
4281
4282
4283         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
4284         tok_state_doctype_public_identifier_double_quoted = ->
4285                 c = txt.charAt(cur++)
4286                 if c is '"'
4287                         tok_state = tok_state_after_doctype_public_identifier
4288                         return
4289                 if c is "\u0000"
4290                         parse_error()
4291                         tok_cur_tag.public_identifier += "\ufffd"
4292                         return
4293                 if c is '>'
4294                         parse_error()
4295                         tok_cur_tag.flag 'force-quirks', true
4296                         tok_state = tok_state_data
4297                         return tok_cur_tag
4298                 if c is '' # EOF
4299                         parse_error()
4300                         tok_state = tok_state_data
4301                         tok_cur_tag.flag 'force-quirks', true
4302                         cur -= 1 # Reconsume
4303                         return tok_cur_tag
4304                 # Anything else
4305                 tok_cur_tag.public_identifier += c
4306                 return null
4307
4308         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
4309         tok_state_doctype_public_identifier_single_quoted = ->
4310                 c = txt.charAt(cur++)
4311                 if c is "'"
4312                         tok_state = tok_state_after_doctype_public_identifier
4313                         return
4314                 if c is "\u0000"
4315                         parse_error()
4316                         tok_cur_tag.public_identifier += "\ufffd"
4317                         return
4318                 if c is '>'
4319                         parse_error()
4320                         tok_cur_tag.flag 'force-quirks', true
4321                         tok_state = tok_state_data
4322                         return tok_cur_tag
4323                 if c is '' # EOF
4324                         parse_error()
4325                         tok_state = tok_state_data
4326                         tok_cur_tag.flag 'force-quirks', true
4327                         cur -= 1 # Reconsume
4328                         return tok_cur_tag
4329                 # Anything else
4330                 tok_cur_tag.public_identifier += c
4331                 return null
4332
4333         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
4334         tok_state_after_doctype_public_identifier = ->
4335                 c = txt.charAt(cur++)
4336                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4337                         tok_state = tok_state_between_doctype_public_and_system_identifiers
4338                         return
4339                 if c is '>'
4340                         tok_state = tok_state_data
4341                         return tok_cur_tag
4342                 if c is '"'
4343                         parse_error()
4344                         tok_cur_tag.system_identifier = ''
4345                         tok_state = tok_state_doctype_system_identifier_double_quoted
4346                         return
4347                 if c is "'"
4348                         parse_error()
4349                         tok_cur_tag.system_identifier = ''
4350                         tok_state = tok_state_doctype_system_identifier_single_quoted
4351                         return
4352                 if c is '' # EOF
4353                         parse_error()
4354                         tok_state = tok_state_data
4355                         tok_cur_tag.flag 'force-quirks', true
4356                         cur -= 1 # Reconsume
4357                         return tok_cur_tag
4358                 # Anything else
4359                 parse_error()
4360                 tok_cur_tag.flag 'force-quirks', true
4361                 tok_state = tok_state_bogus_doctype
4362                 return null
4363
4364         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
4365         tok_state_between_doctype_public_and_system_identifiers = ->
4366                 c = txt.charAt(cur++)
4367                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4368                         return
4369                 if c is '>'
4370                         tok_state = tok_state_data
4371                         return tok_cur_tag
4372                 if c is '"'
4373                         parse_error()
4374                         tok_cur_tag.system_identifier = ''
4375                         tok_state = tok_state_doctype_system_identifier_double_quoted
4376                         return
4377                 if c is "'"
4378                         parse_error()
4379                         tok_cur_tag.system_identifier = ''
4380                         tok_state = tok_state_doctype_system_identifier_single_quoted
4381                         return
4382                 if c is '' # EOF
4383                         parse_error()
4384                         tok_state = tok_state_data
4385                         tok_cur_tag.flag 'force-quirks', true
4386                         cur -= 1 # Reconsume
4387                         return tok_cur_tag
4388                 # Anything else
4389                 parse_error()
4390                 tok_cur_tag.flag 'force-quirks', true
4391                 tok_state = tok_state_bogus_doctype
4392                 return null
4393
4394         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
4395         tok_state_after_doctype_system_keyword = ->
4396                 c = txt.charAt(cur++)
4397                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4398                         tok_state = tok_state_before_doctype_system_identifier
4399                         return
4400                 if c is '"'
4401                         parse_error()
4402                         tok_cur_tag.system_identifier = ''
4403                         tok_state = tok_state_doctype_system_identifier_double_quoted
4404                         return
4405                 if c is "'"
4406                         parse_error()
4407                         tok_cur_tag.system_identifier = ''
4408                         tok_state = tok_state_doctype_system_identifier_single_quoted
4409                         return
4410                 if c is '>'
4411                         parse_error()
4412                         tok_cur_tag.flag 'force-quirks', true
4413                         tok_state = tok_state_data
4414                         return tok_cur_tag
4415                 if c is '' # EOF
4416                         parse_error()
4417                         tok_state = tok_state_data
4418                         tok_cur_tag.flag 'force-quirks', true
4419                         cur -= 1 # Reconsume
4420                         return tok_cur_tag
4421                 # Anything else
4422                 parse_error()
4423                 tok_cur_tag.flag 'force-quirks', true
4424                 tok_state = tok_state_bogus_doctype
4425                 return null
4426
4427         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
4428         tok_state_before_doctype_system_identifier = ->
4429                 c = txt.charAt(cur++)
4430                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4431                         return
4432                 if c is '"'
4433                         tok_cur_tag.system_identifier = ''
4434                         tok_state = tok_state_doctype_system_identifier_double_quoted
4435                         return
4436                 if c is "'"
4437                         tok_cur_tag.system_identifier = ''
4438                         tok_state = tok_state_doctype_system_identifier_single_quoted
4439                         return
4440                 if c is '>'
4441                         parse_error()
4442                         tok_cur_tag.flag 'force-quirks', true
4443                         tok_state = tok_state_data
4444                         return tok_cur_tag
4445                 if c is '' # EOF
4446                         parse_error()
4447                         tok_state = tok_state_data
4448                         tok_cur_tag.flag 'force-quirks', true
4449                         cur -= 1 # Reconsume
4450                         return tok_cur_tag
4451                 # Anything else
4452                 parse_error()
4453                 tok_cur_tag.flag 'force-quirks', true
4454                 tok_state = tok_state_bogus_doctype
4455                 return null
4456
4457         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
4458         tok_state_doctype_system_identifier_double_quoted = ->
4459                 c = txt.charAt(cur++)
4460                 if c is '"'
4461                         tok_state = tok_state_after_doctype_system_identifier
4462                         return
4463                 if c is "\u0000"
4464                         parse_error()
4465                         tok_cur_tag.system_identifier += "\ufffd"
4466                         return
4467                 if c is '>'
4468                         parse_error()
4469                         tok_cur_tag.flag 'force-quirks', true
4470                         tok_state = tok_state_data
4471                         return tok_cur_tag
4472                 if c is '' # EOF
4473                         parse_error()
4474                         tok_state = tok_state_data
4475                         tok_cur_tag.flag 'force-quirks', true
4476                         cur -= 1 # Reconsume
4477                         return tok_cur_tag
4478                 # Anything else
4479                 tok_cur_tag.system_identifier += c
4480                 return null
4481
4482         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
4483         tok_state_doctype_system_identifier_single_quoted = ->
4484                 c = txt.charAt(cur++)
4485                 if c is "'"
4486                         tok_state = tok_state_after_doctype_system_identifier
4487                         return
4488                 if c is "\u0000"
4489                         parse_error()
4490                         tok_cur_tag.system_identifier += "\ufffd"
4491                         return
4492                 if c is '>'
4493                         parse_error()
4494                         tok_cur_tag.flag 'force-quirks', true
4495                         tok_state = tok_state_data
4496                         return tok_cur_tag
4497                 if c is '' # EOF
4498                         parse_error()
4499                         tok_state = tok_state_data
4500                         tok_cur_tag.flag 'force-quirks', true
4501                         cur -= 1 # Reconsume
4502                         return tok_cur_tag
4503                 # Anything else
4504                 tok_cur_tag.system_identifier += c
4505                 return null
4506
4507         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
4508         tok_state_after_doctype_system_identifier = ->
4509                 c = txt.charAt(cur++)
4510                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
4511                         return
4512                 if c is '>'
4513                         tok_state = tok_state_data
4514                         return tok_cur_tag
4515                 if c is '' # EOF
4516                         parse_error()
4517                         tok_state = tok_state_data
4518                         tok_cur_tag.flag 'force-quirks', true
4519                         cur -= 1 # Reconsume
4520                         return tok_cur_tag
4521                 # Anything else
4522                 parse_error()
4523                 # do _not_ tok_cur_tag.flag 'force-quirks', true
4524                 tok_state = tok_state_bogus_doctype
4525                 return null
4526
4527         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
4528         tok_state_bogus_doctype = ->
4529                 c = txt.charAt(cur++)
4530                 if c is '>'
4531                         tok_state = tok_state_data
4532                         return tok_cur_tag
4533                 if c is '' # EOF
4534                         tok_state = tok_state_data
4535                         cur -= 1 # Reconsume
4536                         return tok_cur_tag
4537                 # Anything else
4538                 return null
4539
4540         # 8.2.4.68 http://www.w3.org/TR/html5/syntax.html#cdata-section-state
4541         tok_state_cdata_section = ->
4542                 tok_state = tok_state_data
4543                 next_gt = txt.indexOf ']]>', cur
4544                 if next_gt is -1
4545                         val = txt.substr cur
4546                         cur = txt.length
4547                 else
4548                         val = txt.substr cur, (next_gt - cur)
4549                         cur = next_gt + 3
4550                 if val.length > 0
4551                         return new_character_token val # fixfull split
4552                 return null
4553
4554         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
4555         # Don't set this as a state, just call it
4556         # returns a string (NOT a text node)
4557         parse_character_reference = (allowed_char = null, in_attr = false) ->
4558                 if cur >= txt.length
4559                         return '&'
4560                 switch c = txt.charAt(cur)
4561                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
4562                                 # explicitly not a parse error
4563                                 return '&'
4564                         when ';'
4565                                 # there has to be "one or more" alnums between & and ; to be a parse error
4566                                 return '&'
4567                         when '#'
4568                                 if cur + 1 >= txt.length
4569                                         return '&'
4570                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
4571                                         base = 16
4572                                         charset = hex_chars
4573                                         start = cur + 2
4574                                 else
4575                                         charset = digits
4576                                         start = cur + 1
4577                                         base = 10
4578                                 i = 0
4579                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
4580                                         i += 1
4581                                 if i is 0
4582                                         return '&'
4583                                 cur = start + i
4584                                 if txt.charAt(start + i) is ';'
4585                                         cur += 1
4586                                 else
4587                                         parse_error()
4588                                 code_point = txt.substr(start, i)
4589                                 while code_point.charAt(0) is '0' and code_point.length > 1
4590                                         code_point = code_point.substr 1
4591                                 code_point = parseInt(code_point, base)
4592                                 if unicode_fixes[code_point]?
4593                                         parse_error()
4594                                         return unicode_fixes[code_point]
4595                                 else
4596                                         if (code_point >= 0xd800 and code_point <= 0xdfff) or code_point > 0x10ffff
4597                                                 parse_error()
4598                                                 return "\ufffd"
4599                                         else
4600                                                 if (code_point >= 0x0001 and code_point <= 0x0008) or (code_point >= 0x000D and code_point <= 0x001F) or (code_point >= 0x007F and code_point <= 0x009F) or (code_point >= 0xFDD0 and code_point <= 0xFDEF) or code_point is 0x000B or code_point is 0xFFFE or code_point is 0xFFFF or code_point is 0x1FFFE or code_point is 0x1FFFF or code_point is 0x2FFFE or code_point is 0x2FFFF or code_point is 0x3FFFE or code_point is 0x3FFFF or code_point is 0x4FFFE or code_point is 0x4FFFF or code_point is 0x5FFFE or code_point is 0x5FFFF or code_point is 0x6FFFE or code_point is 0x6FFFF or code_point is 0x7FFFE or code_point is 0x7FFFF or code_point is 0x8FFFE or code_point is 0x8FFFF or code_point is 0x9FFFE or code_point is 0x9FFFF or code_point is 0xAFFFE or code_point is 0xAFFFF or code_point is 0xBFFFE or code_point is 0xBFFFF or code_point is 0xCFFFE or code_point is 0xCFFFF or code_point is 0xDFFFE or code_point is 0xDFFFF or code_point is 0xEFFFE or code_point is 0xEFFFF or code_point is 0xFFFFE or code_point is 0xFFFFF or code_point is 0x10FFFE or code_point is 0x10FFFF
4601                                                         parse_error()
4602                                                 return from_code_point code_point
4603                                 return
4604                         else
4605                                 for i in [0...31]
4606                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
4607                                                 break
4608                                 if i is 0
4609                                         # exit early, because parse_error() below needs at least one alnum
4610                                         return '&'
4611                                 if txt.charAt(cur + i) is ';'
4612                                         i += 1 # include ';' terminator in value
4613                                         decoded = decode_named_char_ref txt.substr(cur, i)
4614                                         if decoded?
4615                                                 cur += i
4616                                                 return decoded
4617                                         parse_error()
4618                                         return '&'
4619                                 else
4620                                         # no ';' terminator (only legacy char refs)
4621                                         max = i
4622                                         for i in [2..max] # no prefix matches, so ok to check shortest first
4623                                                 c = legacy_char_refs[txt.substr(cur, i)]
4624                                                 if c?
4625                                                         if in_attr
4626                                                                 if txt.charAt(cur + i) is '='
4627                                                                         # "because some legacy user agents will
4628                                                                         # misinterpret the markup in those cases"
4629                                                                         parse_error()
4630                                                                         return '&'
4631                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
4632                                                                         # this makes attributes forgiving about url args
4633                                                                         return '&'
4634                                                         # ok, and besides the weird exceptions for attributes...
4635                                                         # return the matching char
4636                                                         cur += i # consume entity chars
4637                                                         parse_error() # because no terminating ";"
4638                                                         return c
4639                                         parse_error()
4640                                         return '&'
4641                 return # never reached
4642
4643         # tree constructor initialization
4644         # see comments on TYPE_TAG/etc for the structure of this data
4645         txt = args.html
4646         cur = 0
4647         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
4648         doc.flag 'quirks mode', QUIRKS_NO # TODO bugreport spec for not specifying this
4649         open_els = []
4650         afe = [] # active formatting elements
4651         template_ins_modes = []
4652         ins_mode = ins_mode_initial
4653         original_ins_mode = ins_mode # TODO check spec
4654         flag_scripting = args.scripting ? true # TODO might need an extra flag to get <noscript> to parse correctly
4655         flag_frameset_ok = true
4656         flag_parsing = true
4657         flag_foster_parenting = false
4658         form_element_pointer = null
4659         temporary_buffer = null
4660         pending_table_character_tokens = []
4661         head_element_pointer = null
4662         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
4663         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
4664         prev_node_id = 0 # just for debugging
4665
4666         # tokenizer initialization
4667         tok_state = tok_state_data
4668
4669         # text pre-processing
4670         # FIXME http://www.w3.org/TR/html5/syntax.html#preprocessing-the-input-stream
4671         txt = txt.replace(new RegExp("\u0000", 'g'), "\ufffd") # fixfull spec doesn't say this
4672         txt = txt.replace(new RegExp("\r\n", 'g'), "\n") # fixfull spec doesn't say this
4673         txt = txt.replace(new RegExp("\r", 'g'), "\n") # fixfull spec doesn't say this
4674
4675         if args.name is "tests23.dat #1"
4676                 console.log "hi"
4677         # proccess input
4678         # http://www.w3.org/TR/html5/syntax.html#tree-construction
4679         while flag_parsing
4680                 t = tok_state()
4681                 if t?
4682                         process_token t
4683                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
4684         return doc.children
4685
4686 serialize_els = (els, shallow, show_ids) ->
4687         serialized = ''
4688         sep = ''
4689         for t in els
4690                 serialized += sep
4691                 sep = ','
4692                 serialized += t.serialize shallow, show_ids
4693         return serialized
4694
4695 module.exports.parse_html = parse_html
4696 module.exports.debug_log_reset = debug_log_reset
4697 module.exports.debug_log_each = debug_log_each
4698 module.exports.TYPE_TAG = TYPE_TAG
4699 module.exports.TYPE_TEXT = TYPE_TEXT
4700 module.exports.TYPE_COMMENT = TYPE_COMMENT
4701 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE
4702 module.exports.NS_HTML = NS_HTML
4703 module.exports.NS_MATHML = NS_MATHML
4704 module.exports.NS_SVG = NS_SVG
4705 module.exports.QUIRKS_NO = QUIRKS_NO
4706 module.exports.QUIRKS_LIMITED = QUIRKS_LIMITED
4707 module.exports.QUIRKS_YES = QUIRKS_YES