parse-html.coffee

   1 # HTML parser meant to run in a browser, in support of WYSIWYG editor
   2 # Copyright 2015 Jason Woofenden
   3 #
   4 # This program is free software: you can redistribute it and/or modify it under
   5 # the terms of the GNU Affero General Public License as published by the Free
   6 # Software Foundation, either version 3 of the License, or (at your option) any
   7 # later version.
   8 #
   9 # This program is distributed in the hope that it will be useful, but WITHOUT
  10 # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11 # FOR A PARTICULAR PURPOSE.  See the GNU Affero General Public License for more
  12 # details.
  13 #
  14 # You should have received a copy of the GNU Affero General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17
  18 # This file implements a parser for html snippets, meant to be used by a
  19 # WYSIWYG editor. Hence it does not attempt to parse doctypes, <html>, <head>
  20 # or <body> tags, nor does it produce the top level "document" node in the dom
  21 # tree, nor nodes for html, head or body. Comments containing "fixfull"
  22 # indicate places where additional code is needed for full HTML document
  23 # parsing.
  24 #
  25 # Instead, the data structure produced by this parser is an array of Nodes.
  26
  27
  28 # stacks/lists
  29 #
  30 # the spec uses a many different words do indicate which ends of lists/stacks
  31 # they are talking about (and relative movement within the lists/stacks). This
  32 # section splains. I'm implementing "lists" (afe and open_els) the same way
  33 # (both as stacks)
  34 #
  35 # stacks grow downward (current element is index=0)
  36 #
  37 # example: open_els = [a, b, c, d, e, f, g]
  38 #
  39 # "grows downwards" means it's visualized like this: (index: el, names)
  40 #
  41 #   6: g "start of the list", "topmost", "first"
  42 #   5: f
  43 #   4: e "previous" (to d), "above", "before"
  44 #   3: d   (previous/next are relative to this element)
  45 #   2: c "next", "after", "lower", "below"
  46 #   1: b
  47 #   0: a "end of the list", "current node", "bottommost", "last"
  48
  49
  50 # browser
  51 # note: to get this to run outside a browser, you'll have to write a native
  52 # implementation of decode_named_char_ref()
  53 unless module?.exports?
  54         window.wheic = {}
  55         module = exports: window.wheic
  56
  57 # Each node is an obect of the Node class. Here are the Node types:
  58 TYPE_TAG = 0 # name, {attributes}, [children]
  59 TYPE_TEXT = 1 # "text"
  60 TYPE_COMMENT = 2
  61 TYPE_DOCTYPE = 3
  62 # the following types are emited by the tokenizer, but shouldn't end up in the tree:
  63 TYPE_START_TAG = 4 # name, [attributes ([key,value]...) in reverse order], [children]
  64 TYPE_END_TAG = 5 # name
  65 TYPE_EOF = 6
  66 TYPE_AFE_MARKER = 7 # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
  67 TYPE_AAA_BOOKMARK = 8 # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
  68
  69 # namespace constants
  70 NS_HTML = 1
  71 NS_MATHML = 2
  72 NS_SVG = 3
  73
  74 g_debug_log = []
  75 debug_log_reset = ->
  76         g_debug_log = []
  77 debug_log = (str) ->
  78         g_debug_log.push str
  79 debug_log_each = (cb) ->
  80         for str in g_debug_log
  81                 cb str
  82
  83 prev_node_id = 0
  84 class Node
  85         constructor: (type, args = {}) ->
  86                 @type = type # one of the TYPE_* constants above
  87                 @name = args.name ? '' # tag name
  88                 @text = args.text ? '' # contents for text/comment nodes
  89                 @attrs = args.attrs ? {}
  90                 @attrs_a = args.attr_k ? [] # attrs in progress, TYPE_START_TAG only
  91                 @children = args.children ? []
  92                 @namespace = args.namespace ? NS_HTML
  93                 @parent = args.parent ? null
  94                 @token = args.token ? null
  95                 if args.id?
  96                         @id = "#{args.id}+"
  97                 else
  98                         @id = "#{++prev_node_id}"
  99         acknowledge_self_closing: ->
 100                 if @token?
 101                         @token.flag 'did_self_close'
 102                 else
 103                         @flag 'did_self_close', true
 104         flag: ->
 105                 # fixfull
 106         serialize: (shallow = false, show_ids = false) -> # for unit tests
 107                 ret = ''
 108                 switch @type
 109                         when TYPE_TAG
 110                                 ret += 'tag:'
 111                                 ret += JSON.stringify @name
 112                                 ret += ','
 113                                 if show_ids
 114                                         ret += "##{@id},"
 115                                 if shallow
 116                                         break
 117                                 attr_keys = []
 118                                 for k of @attrs
 119                                         attr_keys.push k
 120                                 attr_keys.sort()
 121                                 ret += '{'
 122                                 sep = ''
 123                                 for k in attr_keys
 124                                         ret += sep
 125                                         sep = ','
 126                                         ret += "#{JSON.stringify k}:#{JSON.stringify @attrs[k]}"
 127                                 ret += '},['
 128                                 sep = ''
 129                                 for c in @children
 130                                         ret += sep
 131                                         sep = ','
 132                                         ret += c.serialize shallow, show_ids
 133                                 ret += ']'
 134                         when TYPE_TEXT
 135                                 ret += 'text:'
 136                                 ret += JSON.stringify @text
 137                         when TYPE_COMMENT
 138                                 ret += 'comment:'
 139                                 ret += JSON.stringify @text
 140                         when TYPE_DOCTYPE
 141                                 ret += "doctype:#{@name},#{JSON.stringify(@public_identifier ? '')},#{JSON.stringify(@system_identifier ? '')}"
 142                         when TYPE_AFE_MARKER
 143                                 ret += 'marker'
 144                         when TYPE_AAA_BOOKMARK
 145                                 ret += 'aaa_bookmark'
 146                         else
 147                                 ret += 'unknown:'
 148                                 console.log "unknown: #{JSON.stringify @}" # backtrace is just as well
 149                 return ret
 150
 151 # helpers: (only take args that are normally known when parser creates nodes)
 152 new_open_tag = (name) ->
 153         return new Node TYPE_START_TAG, name: name
 154 new_end_tag = (name) ->
 155         return new Node TYPE_END_TAG, name: name
 156 new_element = (name) ->
 157         return new Node TYPE_TAG, name: name
 158 new_text_node = (txt) ->
 159         return new Node TYPE_TEXT, text: txt
 160 new_character_token = new_text_node
 161 new_comment_token = (txt) ->
 162         return new Node TYPE_COMMENT, text: txt
 163 new_doctype_token = (name) ->
 164         return new Node TYPE_DOCTYPE, name: name
 165 new_eof_token = ->
 166         return new Node TYPE_EOF
 167 new_afe_marker = ->
 168         return new Node TYPE_AFE_MARKER
 169 new_aaa_bookmark = ->
 170         return new Node TYPE_AAA_BOOKMARK
 171
 172 lc_alpha = "abcdefghijklmnopqrstuvwxyz"
 173 uc_alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
 174 digits = "0123456789"
 175 alnum = lc_alpha + uc_alpha + digits
 176 hex_chars = digits + "abcdefABCDEF"
 177
 178 is_uc_alpha = (str) ->
 179         return str.length is 1 and uc_alpha.indexOf(str) > -1
 180 is_lc_alpha = (str) ->
 181         return str.length is 1 and lc_alpha.indexOf(str) > -1
 182
 183 # some SVG elements have dashes in them
 184 tag_name_chars = alnum + "-"
 185
 186 # http://www.w3.org/TR/html5/infrastructure.html#space-character
 187 space_chars = "\u0009\u000a\u000c\u000d\u0020"
 188 is_space = (txt) ->
 189         return txt.length is 1 and space_chars.indexOf(txt) > -1
 190 is_space_tok = (t) ->
 191         return t.type is TYPE_TEXT && t.text.length is 1 and space_chars.indexOf(t.text) > -1
 192
 193 is_input_hidden_tok = (t) ->
 194         return unless t.type is TYPE_START_TAG
 195         for a of t.attrs_a
 196                 if a[0] is 'type'
 197                         if a[1].toLowerCase() is 'hidden'
 198                                 return true
 199                         return false
 200         return false
 201
 202 # https://en.wikipedia.org/wiki/Whitespace_character#Unicode
 203 whitespace_chars = "\u0009\u000a\u000b\u000c\u000d\u0020\u0085\u00a0\u1680\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u2028\u2029\u202f\u205f\u3000"
 204
 205 # These are the character references that don't need a terminating semicolon
 206 # min length: 2, max: 6, none are a prefix of any other.
 207 legacy_char_refs = {
 208         Aacute: 'Á', aacute: 'á', Acirc: 'Â', acirc: 'â', acute: '´', AElig: 'Æ',
 209         aelig: 'æ', Agrave: 'À', agrave: 'à', AMP: '&', amp: '&', Aring: 'Å',
 210         aring: 'å', Atilde: 'Ã', atilde: 'ã', Auml: 'Ä', auml: 'ä', brvbar: '¦',
 211         Ccedil: 'Ç', ccedil: 'ç', cedil: '¸', cent: '¢', COPY: '©', copy: '©',
 212         curren: '¤', deg: '°', divide: '÷', Eacute: 'É', eacute: 'é', Ecirc: 'Ê',
 213         ecirc: 'ê', Egrave: 'È', egrave: 'è', ETH: 'Ð', eth: 'ð', Euml: 'Ë',
 214         euml: 'ë', frac12: '½', frac14: '¼', frac34: '¾', GT: '>', gt: '>',
 215         Iacute: 'Í', iacute: 'í', Icirc: 'Î', icirc: 'î', iexcl: '¡', Igrave: 'Ì',
 216         igrave: 'ì', iquest: '¿', Iuml: 'Ï', iuml: 'ï', laquo: '«', LT: '<',
 217         lt: '<', macr: '¯', micro: 'µ', middot: '·', nbsp: "\u00a0", not: '¬',
 218         Ntilde: 'Ñ', ntilde: 'ñ', Oacute: 'Ó', oacute: 'ó', Ocirc: 'Ô', ocirc: 'ô',
 219         Ograve: 'Ò', ograve: 'ò', ordf: 'ª', ordm: 'º', Oslash: 'Ø', oslash: 'ø',
 220         Otilde: 'Õ', otilde: 'õ', Ouml: 'Ö', ouml: 'ö', para: '¶', plusmn: '±',
 221         pound: '£', QUOT: '"', quot: '"', raquo: '»', REG: '®', reg: '®', sect: '§',
 222         shy: '', sup1: '¹', sup2: '²', sup3: '³', szlig: 'ß', THORN: 'Þ', thorn: 'þ',
 223         times: '×', Uacute: 'Ú', uacute: 'ú', Ucirc: 'Û', ucirc: 'û', Ugrave: 'Ù',
 224         ugrave: 'ù', uml: '¨', Uuml: 'Ü', uuml: 'ü', Yacute: 'Ý', yacute: 'ý',
 225         yen: '¥', yuml: 'ÿ'
 226 }
 227
 228 void_elements = ['area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'meta', 'param', 'source', 'track', 'wbr']
 229 raw_text_elements = ['script', 'style']
 230 escapable_raw_text_elements = ['textarea', 'title']
 231 # http://www.w3.org/TR/SVG/ 1.1 (Second Edition)
 232 svg_elements = [
 233         'a', 'altGlyph', 'altGlyphDef', 'altGlyphItem', 'animate', 'animateColor',
 234         'animateMotion', 'animateTransform', 'circle', 'clipPath', 'color-profile',
 235         'cursor', 'defs', 'desc', 'ellipse', 'feBlend', 'feColorMatrix',
 236         'feComponentTransfer', 'feComposite', 'feConvolveMatrix',
 237         'feDiffuseLighting', 'feDisplacementMap', 'feDistantLight', 'feFlood',
 238         'feFuncA', 'feFuncB', 'feFuncG', 'feFuncR', 'feGaussianBlur', 'feImage',
 239         'feMerge', 'feMergeNode', 'feMorphology', 'feOffset', 'fePointLight',
 240         'feSpecularLighting', 'feSpotLight', 'feTile', 'feTurbulence', 'filter',
 241         'font', 'font-face', 'font-face-format', 'font-face-name', 'font-face-src',
 242         'font-face-uri', 'foreignObject', 'g', 'glyph', 'glyphRef', 'hkern',
 243         'image', 'line', 'linearGradient', 'marker', 'mask', 'metadata',
 244         'missing-glyph', 'mpath', 'path', 'pattern', 'polygon', 'polyline',
 245         'radialGradient', 'rect', 'script', 'set', 'stop', 'style', 'svg',
 246         'switch', 'symbol', 'text', 'textPath', 'title', 'tref', 'tspan', 'use',
 247         'view', 'vkern'
 248 ]
 249
 250 # http://www.w3.org/TR/MathML/ Version 3.0 2nd Edition
 251 mathml_elements = [
 252         'abs', 'and', 'annotation', 'annotation-xml', 'apply', 'approx', 'arccos',
 253         'arccosh', 'arccot', 'arccoth', 'arccsc', 'arccsch', 'arcsec', 'arcsech',
 254         'arcsin', 'arcsinh', 'arctan', 'arctanh', 'arg', 'bind', 'bvar', 'card',
 255         'cartesianproduct', 'cbytes', 'ceiling', 'cerror', 'ci', 'cn', 'codomain',
 256         'complexes', 'compose', 'condition', 'conjugate', 'cos', 'cosh', 'cot',
 257         'coth', 'cs', 'csc', 'csch', 'csymbol', 'curl', 'declare', 'degree',
 258         'determinant', 'diff', 'divergence', 'divide', 'domain',
 259         'domainofapplication', 'emptyset', 'eq', 'equivalent', 'eulergamma',
 260         'exists', 'exp', 'exponentiale', 'factorial', 'factorof', 'false', 'floor',
 261         'fn', 'forall', 'gcd', 'geq', 'grad', 'gt', 'ident', 'image', 'imaginary',
 262         'imaginaryi', 'implies', 'in', 'infinity', 'int', 'integers', 'intersect',
 263         'interval', 'inverse', 'lambda', 'laplacian', 'lcm', 'leq', 'limit',
 264         'list', 'ln', 'log', 'logbase', 'lowlimit', 'lt', 'maction', 'maligngroup',
 265         'malignmark', 'math', 'matrix', 'matrixrow', 'max', 'mean', 'median',
 266         'menclose', 'merror', 'mfenced', 'mfrac', 'mglyph', 'mi', 'mi', 'min',
 267         'minus', 'mlabeledtr', 'mlongdiv', 'mmultiscripts', 'mn', 'mo', 'mode',
 268         'moment', 'momentabout', 'mover', 'mpadded', 'mphantom', 'mprescripts',
 269         'mroot', 'mrow', 'ms', 'mscarries', 'mscarry', 'msgroup', 'msline',
 270         'mspace', 'msqrt', 'msrow', 'mstack', 'mstyle', 'msub', 'msubsup', 'msup',
 271         'mtable', 'mtd', 'mtext', 'mtr', 'munder', 'munderover', 'naturalnumbers',
 272         'neq', 'none', 'not', 'notanumber', 'notin', 'notprsubset', 'notsubset',
 273         'or', 'otherwise', 'outerproduct', 'partialdiff', 'pi', 'piece',
 274         'piecewise', 'plus', 'power', 'primes', 'product', 'prsubset', 'quotient',
 275         'rationals', 'real', 'reals', 'reln', 'rem', 'root', 'scalarproduct',
 276         'sdev', 'sec', 'sech', 'selector', 'semantics', 'sep', 'set', 'setdiff',
 277         'share', 'sin', 'sinh', 'span', 'subset', 'sum', 'tan', 'tanh', 'tendsto',
 278         'times', 'transpose', 'true', 'union', 'uplimit', 'variance', 'vector',
 279         'vectorproduct', 'xor'
 280 ]
 281 # foreign_elements = [svg_elements..., mathml_elements...]
 282 #normal_elements = All other allowed HTML elements are normal elements.
 283
 284 special_elements = {
 285         # HTML:
 286         address:NS_HTML, applet:NS_HTML, area:NS_HTML, article:NS_HTML,
 287         aside:NS_HTML, base:NS_HTML, basefont:NS_HTML, bgsound:NS_HTML,
 288         blockquote:NS_HTML, body:NS_HTML, br:NS_HTML, button:NS_HTML,
 289         caption:NS_HTML, center:NS_HTML, col:NS_HTML, colgroup:NS_HTML, dd:NS_HTML,
 290         details:NS_HTML, dir:NS_HTML, div:NS_HTML, dl:NS_HTML, dt:NS_HTML,
 291         embed:NS_HTML, fieldset:NS_HTML, figcaption:NS_HTML, figure:NS_HTML,
 292         footer:NS_HTML, form:NS_HTML, frame:NS_HTML, frameset:NS_HTML, h1:NS_HTML,
 293         h2:NS_HTML, h3:NS_HTML, h4:NS_HTML, h5:NS_HTML, h6:NS_HTML, head:NS_HTML,
 294         header:NS_HTML, hgroup:NS_HTML, hr:NS_HTML, html:NS_HTML, iframe:NS_HTML,
 295         img:NS_HTML, input:NS_HTML, isindex:NS_HTML, li:NS_HTML, link:NS_HTML,
 296         listing:NS_HTML, main:NS_HTML, marquee:NS_HTML, meta:NS_HTML, nav:NS_HTML,
 297         noembed:NS_HTML, noframes:NS_HTML, noscript:NS_HTML, object:NS_HTML,
 298         ol:NS_HTML, p:NS_HTML, param:NS_HTML, plaintext:NS_HTML, pre:NS_HTML,
 299         script:NS_HTML, section:NS_HTML, select:NS_HTML, source:NS_HTML,
 300         style:NS_HTML, summary:NS_HTML, table:NS_HTML, tbody:NS_HTML, td:NS_HTML,
 301         template:NS_HTML, textarea:NS_HTML, tfoot:NS_HTML, th:NS_HTML,
 302         thead:NS_HTML, title:NS_HTML, tr:NS_HTML, track:NS_HTML, ul:NS_HTML,
 303         wbr:NS_HTML, xmp:NS_HTML,
 304
 305         # MathML:
 306         mi:NS_MATHML, mo:NS_MATHML, mn:NS_MATHML, ms:NS_MATHML, mtext:NS_MATHML,
 307         'annotation-xml':NS_MATHML,
 308
 309         # SVG:
 310         foreignObject:NS_SVG, desc:NS_SVG, title:NS_SVG
 311 }
 312
 313 formatting_elements = {
 314          a: true, b: true, big: true, code: true, em: true, font: true, i: true,
 315          nobr: true, s: true, small: true, strike: true, strong: true, tt: true,
 316          u: true
 317 }
 318
 319 foster_parenting_targets = {
 320         table: true
 321         tbody: true
 322         tfoot: true
 323         thead: true
 324         tr: true
 325 }
 326
 327 # all html I presume
 328 end_tag_implied = {
 329         dd: true
 330         dt: true
 331         li: true
 332         option: true
 333         optgroup: true
 334         p: true
 335         rb: true
 336         rp: true
 337         rt: true
 338         rtc: true
 339 }
 340
 341 el_is_special = (e) ->
 342         return special_elements[e.name] is e.namespace
 343
 344 # decode_named_char_ref()
 345 #
 346 # The list of named character references is _huge_ so ask the browser to decode
 347 # for us instead of wasting bandwidth/space on including the table here.
 348 #
 349 # Pass without the "&" but with the ";" examples:
 350 #    for "&amp" pass "amp;"
 351 #    for "&#x2032" pass "x2032;"
 352 g_dncr = {
 353         cache: {}
 354         textarea: document.createElement('textarea')
 355 }
 356 # TODO test this in IE8
 357 decode_named_char_ref = (txt) ->
 358         txt = "&#{txt}"
 359         decoded = g_dncr.cache[txt]
 360         return decoded if decoded?
 361         g_dncr.textarea.innerHTML = txt
 362         decoded = g_dncr.textarea.value
 363         return null if decoded is txt
 364         return g_dncr.cache[txt] = decoded
 365
 366 parse_html = (txt, parse_error_cb = null) ->
 367         cur = 0 # index of next char in txt to be parsed
 368         # declare doc and tokenizer variables so they're in scope below
 369         doc = null
 370         open_els = null # stack of open elements
 371         afe = null # active formatting elements
 372         template_insertion_modes = null
 373         insertion_mode = null
 374         original_insertion_mode = null
 375         tok_state = null
 376         tok_cur_tag = null # partially parsed tag
 377         flag_scripting = null
 378         flag_frameset_ok = null
 379         flag_parsing = null
 380         flag_foster_parenting = null
 381         form_element_pointer = null
 382         temporary_buffer = null
 383         pending_table_character_tokens = null
 384         head_element_pointer = null
 385         flag_fragment_parsing = null
 386         context_element = null
 387
 388         stop_parsing = ->
 389                 flag_parsing = false
 390
 391         parse_error = ->
 392                 if parse_error_cb?
 393                         parse_error_cb cur
 394                 else
 395                         console.log "Parse error at character #{cur} of #{txt.length}"
 396
 397         afe_push = (new_el) ->
 398                 matches = 0
 399                 for el, i in afe
 400                         if el.name is new_el.name and el.namespace is new_el.namespace
 401                                 for k, v of el.attrs
 402                                         continue unless new_el.attrs[k] is v
 403                                 for k, v of new_el.attrs
 404                                         continue unless el.attrs[k] is v
 405                                 matches += 1
 406                                 if matches is 3
 407                                         afe.splice i, 1
 408                                         break
 409                 afe.unshift new_el
 410         afe_push_marker = ->
 411                 afe.unshift new_afe_marker()
 412
 413         # the functions below impliment the Tree Contstruction algorithm
 414         # http://www.w3.org/TR/html5/syntax.html#tree-construction
 415
 416         # But first... the helpers
 417         template_tag_is_open = ->
 418                 for t in open_els
 419                         if t.name is 'template' # maybe should also check: and t.namespace is 'html'
 420                                 return true
 421                 return false
 422         is_in_scope_x = (tag_name, scope, namespace) ->
 423                 for t in open_els
 424                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 425                                 return true
 426                         if scope[t.name] is t.namespace
 427                                 return false
 428                 return false
 429         is_in_scope_x_y = (tag_name, scope, scope2, namespace) ->
 430                 for t in open_els
 431                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 432                                 return true
 433                         if scope[t.name] is t.namespace
 434                                 return false
 435                         if scope2[t.name] is t.namespace
 436                                 return false
 437                 return false
 438         standard_scopers = {
 439                 applet: NS_HTML, caption: NS_HTML, html: NS_HTML, table: NS_HTML,
 440                 td: NS_HTML, th: NS_HTML, marquee: NS_HTML, object: NS_HTML,
 441                 template: NS_HTML, mi: NS_MATHML,
 442
 443                 mo: NS_MATHML, mn: NS_MATHML, ms: NS_MATHML, mtext: NS_MATHML,
 444                 'annotation-xml': NS_MATHML,
 445
 446                 foreignObject: NS_SVG, desc: NS_SVG, title: NS_SVG
 447         }
 448         button_scopers = button: NS_HTML
 449         li_scopers = ol: NS_HTML, ul: NS_HTML
 450         table_scopers = html: NS_HTML, table: NS_HTML, template: NS_HTML
 451         is_in_scope = (tag_name, namespace = null) ->
 452                 return is_in_scope_x tag_name, standard_scopers, namespace
 453         is_in_button_scope = (tag_name, namespace = null) ->
 454                 return is_in_scope_x_y tag_name, standard_scopers, button_scopers, namespace
 455         is_in_table_scope = (tag_name, namespace = null) ->
 456                 return is_in_scope_x tag_name, table_scopers, namespace
 457         is_in_select_scope = (tag_name, namespace = null) ->
 458                 for t in open_els
 459                         if t.name is tag_name and (namespace is null or namespace is t.namespace)
 460                                 return true
 461                         if t.ns isnt NS_HTML and t.name isnt 'optgroup' and t.name isnt 'option'
 462                                 return false
 463                 return false
 464         # this checks for a particular element, not by name
 465         el_is_in_scope = (el) ->
 466                 for t in open_els
 467                         if t is el
 468                                 return true
 469                         if standard_scopers[t.name] is t.namespace
 470                                 return false
 471                 return false
 472
 473         clear_to_table_stopers = {
 474                 'table': true
 475                 'template': true
 476                 'html': true
 477         }
 478         clear_stack_to_table_context = ->
 479                 loop
 480                         if clear_to_table_stopers[open_els[0].name]?
 481                                 break
 482                         open_els.shift()
 483                 return
 484         clear_to_table_body_stopers = {
 485                 'tbody': true
 486                 'tfoot': true
 487                 'thead': true
 488                 'template': true
 489                 'html': true
 490         }
 491         clear_stack_to_table_body_context = ->
 492                 loop
 493                         if clear_to_table_body_stopers[open_els[0].name]?
 494                                 break
 495                         open_els.shift()
 496                 return
 497         clear_to_table_row_stopers = {
 498                 'tr': true
 499                 'template': true
 500                 'html': true
 501         }
 502         clear_stack_to_table_row_context = ->
 503                 loop
 504                         if clear_to_table_row_stopers[open_els[0].name]?
 505                                 break
 506                         open_els.shift()
 507                 return
 508         clear_afe_to_marker = ->
 509                 loop
 510                         return unless afe.length > 0 # this happens in fragment case, ?spec error
 511                         el = afe.shift()
 512                         if el.type is TYPE_AFE_MARKER
 513                                 return
 514                 return
 515
 516         # 8.2.3.1 ...
 517         # http://www.w3.org/TR/html5/syntax.html#reset-the-insertion-mode-appropriately
 518         reset_insertion_mode = ->
 519                 # 1. Let last be false.
 520                 last = false
 521                 # 2. Let node be the last node in the stack of open elements.
 522                 node_i = 0
 523                 node = open_els[node_i]
 524                 # 3. Loop: If node is the first node in the stack of open elements,
 525                 # then set last to true, and, if the parser was originally created as
 526                 # part of the HTML fragment parsing algorithm (fragment case) set node
 527                 # to the context element.
 528                 loop
 529                         if node_i is open_els.length - 1
 530                                 last = true
 531                                 # fixfull (fragment case)
 532
 533                         # 4. If node is a select element, run these substeps:
 534                         if node.name is 'select'
 535                                 # 1. If last is true, jump to the step below labeled done.
 536                                 unless last
 537                                         # 2. Let ancestor be node.
 538                                         ancestor_i = node_i
 539                                         ancestor = node
 540                                         # 3. Loop: If ancestor is the first node in the stack of
 541                                         # open elements, jump to the step below labeled done.
 542                                         loop
 543                                                 if ancestor_i is open_els.length - 1
 544                                                         break
 545                                                 # 4. Let ancestor be the node before ancestor in the stack
 546                                                 # of open elements.
 547                                                 ancestor_i += 1
 548                                                 ancestor = open_els[ancestor_i]
 549                                                 # 5. If ancestor is a template node, jump to the step below
 550                                                 # labeled done.
 551                                                 if ancestor.name is 'template'
 552                                                         break
 553                                                 # 6. If ancestor is a table node, switch the insertion mode
 554                                                 # to "in select in table" and abort these steps.
 555                                                 if ancestor.name is 'table'
 556                                                         insertion_mode = ins_mode_in_select_in_table
 557                                                         return
 558                                                 # 7. Jump back to the step labeled loop.
 559                                 # 8. Done: Switch the insertion mode to "in select" and abort
 560                                 # these steps.
 561                                 insertion_mode = ins_mode_in_select
 562                                 return
 563                         # 5. If node is a td or th element and last is false, then switch
 564                         # the insertion mode to "in cell" and abort these steps.
 565                         if (node.name is 'td' or node.name is 'th') and last is false
 566                                 insertion_mode = ins_mode_in_cell
 567                                 return
 568                         # 6. If node is a tr element, then switch the insertion mode to "in
 569                         # row" and abort these steps.
 570                         if node.name is 'tr'
 571                                 insertion_mode = ins_mode_in_row
 572                                 return
 573                         # 7. If node is a tbody, thead, or tfoot element, then switch the
 574                         # insertion mode to "in table body" and abort these steps.
 575                         if node.name is 'tbody' or node.name is 'thead' or node.name is 'tfoot'
 576                                 insertion_mode = ins_mode_in_table_body
 577                                 return
 578                         # 8. If node is a caption element, then switch the insertion mode
 579                         # to "in caption" and abort these steps.
 580                         if node.name is 'caption'
 581                                 insertion_mode = ins_mode_in_caption
 582                                 return
 583                         # 9. If node is a colgroup element, then switch the insertion mode
 584                         # to "in column group" and abort these steps.
 585                         if node.name is 'colgroup'
 586                                 insertion_mode = ins_mode_in_column_group
 587                                 return
 588                         # 10. If node is a table element, then switch the insertion mode to
 589                         # "in table" and abort these steps.
 590                         if node.name is 'table'
 591                                 insertion_mode = ins_mode_in_table
 592                                 return
 593                         # 11. If node is a template element, then switch the insertion mode
 594                         # to the current template insertion mode and abort these steps.
 595                         # fixfull (template insertion mode stack)
 596
 597                         # 12. If node is a head element and last is true, then switch the
 598                         # insertion mode to "in body" ("in body"! not "in head"!) and abort
 599                         # these steps. (fragment case)
 600                         if node.name is 'head' and last
 601                                 insertion_mode = ins_mode_in_body
 602                                 return
 603                         # 13. If node is a head element and last is false, then switch the
 604                         # insertion mode to "in head" and abort these steps.
 605                         if node.name is 'head' and last is false
 606                                 insertion_mode = ins_mode_in_head
 607                                 return
 608                         # 14. If node is a body element, then switch the insertion mode to
 609                         # "in body" and abort these steps.
 610                         if node.name is 'body'
 611                                 insertion_mode = ins_mode_in_body
 612                                 return
 613                         # 15. If node is a frameset element, then switch the insertion mode
 614                         # to "in frameset" and abort these steps. (fragment case)
 615                         if node.name is 'frameset'
 616                                 insertion_mode = ins_mode_in_frameset
 617                                 return
 618                         # 16. If node is an html element, run these substeps:
 619                         if node.name is 'html'
 620                                 # 1. If the head element pointer is null, switch the insertion
 621                                 # mode to "before head" and abort these steps. (fragment case)
 622                                 if head_element_pointer is null
 623                                         ins_mode = ins_mode_before_head
 624                                 else
 625                                         # 2. Otherwise, the head element pointer is not null,
 626                                         # switch the insertion mode to "after head" and abort these
 627                                         # steps.
 628                                         insertion_mode = ins_mode_after_head
 629                                 return
 630                         # 17. If last is true, then switch the insertion mode to "in body"
 631                         # and abort these steps. (fragment case)
 632                         if last
 633                                 insertion_mode = ins_mode_in_body
 634                                 return
 635                         # 18. Let node now be the node before node in the stack of open
 636                         # elements.
 637                         node_i += 1
 638                         node = open_els[node_i]
 639                         # 19. Return to the step labeled loop.
 640
 641         # 8.2.3.2
 642
 643         # http://www.w3.org/TR/html5/syntax.html#adjusted-current-node
 644         adjusted_current_node = ->
 645                 if open_els.length is 1 and flag_fragment_parsing
 646                         return context_element
 647                 return open_els[0]
 648
 649         # http://www.w3.org/TR/html5/syntax.html#reconstruct-the-active-formatting-elements
 650         # this implementation is structured (mostly) as described at the link above.
 651         # capitalized comments are the "labels" described at the link above.
 652         reconstruct_active_formatting_elements = ->
 653                 return if afe.length is 0
 654                 if afe[0].type is TYPE_AFE_MARKER or afe[0] in open_els
 655                         return
 656                 # Rewind
 657                 i = 0
 658                 loop
 659                         if i is afe.length - 1
 660                                 break
 661                         i += 1
 662                         if afe[i].type is TYPE_AFE_MARKER or afe[i] in open_els
 663                                 i -= 1 # Advance
 664                                 break
 665                 # Create
 666                 loop
 667                         el = insert_html_element afe[i].token
 668                         afe[i] = el
 669                         break if i is 0
 670                         i -= 1 # Advance
 671
 672         # http://www.w3.org/TR/html5/syntax.html#adoption-agency-algorithm
 673         # adoption agency algorithm
 674         # overview here:
 675         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-i-/b-/i
 676         #   http://www.w3.org/TR/html5/syntax.html#misnested-tags:-b-p-/b-/p
 677         #   http://www.w3.org/TR/html5/syntax.html#unclosed-formatting-elements
 678         adoption_agency = (subject) ->
 679                 debug_log "adoption_agency()"
 680                 debug_log "tree: #{serialize_els doc.children, false, true}"
 681                 debug_log "open_els: #{serialize_els open_els, true, true}"
 682                 debug_log "afe: #{serialize_els afe, true, true}"
 683                 if open_els[0].name is subject
 684                         el = open_els[0]
 685                         open_els.shift()
 686                         # remove it from the list of active formatting elements (if found)
 687                         for t, i in afe
 688                                 if t is el
 689                                         afe.splice i, 1
 690                                         break
 691                         debug_log "aaa: starting off with subject on top of stack, exiting"
 692                         return
 693                 outer = 0
 694                 loop
 695                         if outer >= 8
 696                                 return
 697                         outer += 1
 698                         # 5. Let formatting element be the last element in the list of
 699                         # active formatting elements that: is between the end of the list
 700                         # and the last scope marker in the list, if any, or the start of
 701                         # the list otherwise, and  has the tag name subject.
 702                         fe = null
 703                         for t, fe_of_afe in afe
 704                                 if t.type is TYPE_AFE_MARKER
 705                                         break
 706                                 if t.name is subject
 707                                         fe = t
 708                                         break
 709                         # If there is no such element, then abort these steps and instead
 710                         # act as described in the "any other end tag" entry above.
 711                         if fe is null
 712                                 debug_log "aaa: fe not found in afe"
 713                                 in_body_any_other_end_tag subject
 714                                 return
 715                         # 6. If formatting element is not in the stack of open elements,
 716                         # then this is a parse error; remove the element from the list, and
 717                         # abort these steps.
 718                         in_open_els = false
 719                         for t, fe_of_open_els in open_els
 720                                 if t is fe
 721                                         in_open_els = true
 722                                         break
 723                         unless in_open_els
 724                                 debug_log "aaa: fe not found in open_els"
 725                                 parse_error()
 726                                 # "remove it from the list" must mean afe, since it's not in open_els
 727                                 afe.splice fe_of_afe, 1
 728                                 return
 729                         # 7. If formatting element is in the stack of open elements, but
 730                         # the element is not in scope, then this is a parse error; abort
 731                         # these steps.
 732                         unless el_is_in_scope fe
 733                                 debug_log "aaa: fe not in scope"
 734                                 parse_error()
 735                                 return
 736                         # 8. If formatting element is not the current node, this is a parse
 737                         # error. (But do not abort these steps.)
 738                         unless open_els[0] is fe
 739                                 parse_error()
 740                                 # continue
 741                         # 9. Let furthest block be the topmost node in the stack of open
 742                         # elements that is lower in the stack than formatting element, and
 743                         # is an element in the special category. There might not be one.
 744                         fb = null
 745                         fb_of_open_els = null
 746                         for t, i in open_els
 747                                 if t is fe
 748                                         break
 749                                 if el_is_special t
 750                                         fb = t
 751                                         fb_of_open_els = i
 752                                         # and continue, to see if there's one that's more "topmost"
 753                         # 10. If there is no furthest block, then the UA must first pop all
 754                         # the nodes from the bottom of the stack of open elements, from the
 755                         # current node up to and including formatting element, then remove
 756                         # formatting element from the list of active formatting elements,
 757                         # and finally abort these steps.
 758                         if fb is null
 759                                 debug_log "aaa: no fb"
 760                                 loop
 761                                         t = open_els.shift()
 762                                         if t is fe
 763                                                 afe.splice fe_of_afe, 1
 764                                                 return
 765                         # 11. Let common ancestor be the element immediately above
 766                         # formatting element in the stack of open elements.
 767                         ca = open_els[fe_of_open_els + 1] # common ancestor
 768
 769                         node_above = open_els[fb_of_open_els + 1] # next node if node isn't in open_els anymore
 770                         # 12. Let a bookmark note the position of formatting element in the list of active formatting elements relative to the elements on either side of it in the list.
 771                         bookmark = new_aaa_bookmark()
 772                         for t, i in afe
 773                                 if t is fe
 774                                         afe.splice i, 0, bookmark
 775                                         break
 776                         node = last_node = fb
 777                         inner = 0
 778                         loop
 779                                 inner += 1
 780                                 # 3. Let node be the element immediately above node in the
 781                                 # stack of open elements, or if node is no longer in the stack
 782                                 # of open elements (e.g. because it got removed by this
 783                                 # algorithm), the element that was immediately above node in
 784                                 # the stack of open elements before node was removed.
 785                                 node_next = null
 786                                 for t, i in open_els
 787                                         if t is node
 788                                                 node_next = open_els[i + 1]
 789                                                 break
 790                                 node = node_next ? node_above
 791                                 debug_log "inner loop #{inner}"
 792                                 debug_log "tree: #{serialize_els doc.children, false, true}"
 793                                 debug_log "open_els: #{serialize_els open_els, true, true}"
 794                                 debug_log "afe: #{serialize_els afe, true, true}"
 795                                 debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 796                                 debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 797                                 debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 798                                 debug_log "node: #{node.serialize true, true}"
 799                                 # TODO make sure node_above gets re-set if/when node is removed from open_els
 800
 801                                 # 4. If node is formatting element, then go to the next step in
 802                                 # the overall algorithm.
 803                                 if node is fe
 804                                         break
 805                                 debug_log "the meat"
 806                                 # 5. If inner loop counter is greater than three and node is in
 807                                 # the list of active formatting elements, then remove node from
 808                                 # the list of active formatting elements.
 809                                 node_in_afe = false
 810                                 for t, i in afe
 811                                         if t is node
 812                                                 if inner > 3
 813                                                         afe.splice i, 1
 814                                                         debug_log "max out inner"
 815                                                 else
 816                                                         node_in_afe = true
 817                                                         debug_log "in afe"
 818                                                 break
 819                                 # 6. If node is not in the list of active formatting elements,
 820                                 # then remove node from the stack of open elements and then go
 821                                 # back to the step labeled inner loop.
 822                                 unless node_in_afe
 823                                         debug_log "not in afe"
 824                                         for t, i in open_els
 825                                                 if t is node
 826                                                         node_above = open_els[i + 1]
 827                                                         open_els.splice i, 1
 828                                                         break
 829                                         continue
 830                                 debug_log "the bones"
 831                                 # 7. create an element for the token for which the element node
 832                                 # was created, in the HTML namespace, with common ancestor as
 833                                 # the intended parent; replace the entry for node in the list
 834                                 # of active formatting elements with an entry for the new
 835                                 # element, replace the entry for node in the stack of open
 836                                 # elements with an entry for the new element, and let node be
 837                                 # the new element.
 838                                 new_node = token_to_element node.token, NS_HTML, ca
 839                                 for t, i in afe
 840                                         if t is node
 841                                                 afe[i] = new_node
 842                                                 debug_log "replaced in afe"
 843                                                 break
 844                                 for t, i in open_els
 845                                         if t is node
 846                                                 node_above = open_els[i + 1]
 847                                                 open_els[i] = new_node
 848                                                 debug_log "replaced in open_els"
 849                                                 break
 850                                 node = new_node
 851                                 # 8. If last node is furthest block, then move the
 852                                 # aforementioned bookmark to be immediately after the new node
 853                                 # in the list of active formatting elements.
 854                                 if last_node is fb
 855                                         for t, i in afe
 856                                                 if t is bookmark
 857                                                         afe.splice i, 1
 858                                                         debug_log "removed bookmark"
 859                                                         break
 860                                         for t, i in afe
 861                                                 if t is node
 862                                                         # "after" means lower
 863                                                         afe.splice i, 0, bookmark # "after as <-
 864                                                         debug_log "placed bookmark after node"
 865                                                         debug_log "node: #{node.id} afe: #{serialize_els afe, true, true}"
 866                                                         break
 867                                 # 9. Insert last node into node, first removing it from its
 868                                 # previous parent node if any.
 869                                 if last_node.parent?
 870                                         debug_log "last_node has parent"
 871                                         for c, i in last_node.parent.children
 872                                                 if c is last_node
 873                                                         debug_log "removing last_node from parent"
 874                                                         last_node.parent.children.splice i, 1
 875                                                         break
 876                                 node.children.push last_node
 877                                 last_node.parent = node
 878                                 # 10. Let last node be node.
 879                                 last_node = node
 880                                 debug_log "at last"
 881                                 # 11. Return to the step labeled inner loop.
 882                         # 14. Insert whatever last node ended up being in the previous step
 883                         # at the appropriate place for inserting a node, but using common
 884                         # ancestor as the override target.
 885
 886                         # In the case where fe is immediately followed by fb:
 887                         #   * inner loop exits out early (node==fe)
 888                         #   * last_node is fb
 889                         #   * last_node is still in the tree (not a duplicate)
 890                         if last_node.parent?
 891                                 debug_log "FEFIRST? last_node has parent"
 892                                 for c, i in last_node.parent.children
 893                                         if c is last_node
 894                                                 debug_log "removing last_node from parent"
 895                                                 last_node.parent.children.splice i, 1
 896                                                 break
 897
 898                         debug_log "after aaa inner loop"
 899                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 900                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 901                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 902                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 903                         debug_log "tree: #{serialize_els doc.children, false, true}"
 904
 905                         debug_log "insert"
 906
 907
 908                         # can't use standard insert token thing, because it's already in
 909                         # open_els and must stay at it's current position in open_els
 910                         dest = adjusted_insertion_location ca
 911                         dest[0].children.splice dest[1], 0, last_node
 912                         last_node.parent = dest[0]
 913
 914
 915                         debug_log "ca: #{ca.name}##{ca.id} children: #{serialize_els ca.children, true, true}"
 916                         debug_log "fe: #{fe.name}##{fe.id} children: #{serialize_els fe.children, true, true}"
 917                         debug_log "fb: #{fb.name}##{fb.id} children: #{serialize_els fb.children, true, true}"
 918                         debug_log "last_node: #{last_node.name}##{last_node.id} children: #{serialize_els last_node.children, true, true}"
 919                         debug_log "tree: #{serialize_els doc.children, false, true}"
 920
 921                         # 15. Create an element for the token for which formatting element
 922                         # was created, in the HTML namespace, with furthest block as the
 923                         # intended parent.
 924                         new_element = token_to_element fe.token, NS_HTML, fb
 925                         # 16. Take all of the child nodes of furthest block and append them
 926                         # to the element created in the last step.
 927                         while fb.children.length
 928                                 t = fb.children.shift()
 929                                 t.parent = new_element
 930                                 new_element.children.push t
 931                         # 17. Append that new element to furthest block.
 932                         new_element.parent = fb
 933                         fb.children.push new_element
 934                         # 18. Remove formatting element from the list of active formatting
 935                         # elements, and insert the new element into the list of active
 936                         # formatting elements at the position of the aforementioned
 937                         # bookmark.
 938                         for t, i in afe
 939                                 if t is fe
 940                                         afe.splice i, 1
 941                                         break
 942                         for t, i in afe
 943                                 if t is bookmark
 944                                         afe[i] = new_element
 945                                         break
 946                         # 19. Remove formatting element from the stack of open elements,
 947                         # and insert the new element into the stack of open elements
 948                         # immediately below the position of furthest block in that stack.
 949                         for t, i in open_els
 950                                 if t is fe
 951                                         open_els.splice i, 1
 952                                         break
 953                         for t, i in open_els
 954                                 if t is fb
 955                                         open_els.splice i, 0, new_element
 956                                         break
 957                         # 20. Jump back to the step labeled outer loop.
 958                         debug_log "done wrapping fb's children. new_element: #{new_element.name}##{new_element.id}"
 959                         debug_log "tree: #{serialize_els doc.children, false, true}"
 960                         debug_log "open_els: #{serialize_els open_els, true, true}"
 961                         debug_log "afe: #{serialize_els afe, true, true}"
 962                 debug_log "AAA DONE"
 963
 964         # http://www.w3.org/TR/html5/syntax.html#close-a-p-element
 965         close_p_element = ->
 966                 generate_implied_end_tags 'p' # arg is exception
 967                 if open_els[0].name isnt 'p'
 968                         parse_error()
 969                 while open_els.length > 1 # just in case
 970                         el = open_els.shift()
 971                         if el.name is 'p'
 972                                 return
 973         close_p_if_in_button_scope = ->
 974                 if is_in_button_scope 'p'
 975                         close_p_element()
 976
 977         # http://www.w3.org/TR/html5/syntax.html#insert-a-character
 978         # aka insert_a_character = (t) ->
 979         insert_character = (t) ->
 980                 dest = adjusted_insertion_location()
 981                 # fixfull check for Document node
 982                 if dest[1] > 0
 983                         prev = dest[0].children[dest[1] - 1]
 984                         if prev.type is TYPE_TEXT
 985                                 prev.text += t.text
 986                                 return
 987                 dest[0].children.splice dest[1], 0, t
 988
 989         # 8.2.5.1
 990         # http://www.w3.org/TR/html5/syntax.html#creating-and-inserting-nodes
 991         # http://www.w3.org/TR/html5/syntax.html#appropriate-place-for-inserting-a-node
 992         adjusted_insertion_location = (override_target = null) ->
 993                 # 1. If there was an override target specified, then let target be the
 994                 # override target.
 995                 if override_target?
 996                         target = override_target
 997                 else # Otherwise, let target be the current node.
 998                         target = open_els[0]
 999                 # 2. Determine the adjusted insertion location using the first matching
1000                 # steps from the following list:
1001                 #
1002                 # If foster parenting is enabled and target is a table, tbody, tfoot,
1003                 # thead, or tr element Foster parenting happens when content is
1004                 # misnested in tables.
1005                 if flag_foster_parenting and foster_parenting_targets[target.name]
1006                         loop # once. this is here so we can ``break`` to "abort these substeps"
1007                                 # 1. Let last template be the last template element in the
1008                                 # stack of open elements, if any.
1009                                 last_template = null
1010                                 last_template_i = null
1011                                 for el, i in open_els
1012                                         if el.name is 'template'
1013                                                 last_template = el
1014                                                 last_template_i = i
1015                                                 break
1016                                 # 2. Let last table be the last table element in the stack of
1017                                 # open elements, if any.
1018                                 last_table = null
1019                                 last_table_i
1020                                 for el, i in open_els
1021                                         if el.name is 'table'
1022                                                 last_table = el
1023                                                 last_table_i = i
1024                                                 break
1025                                 # 3. If there is a last template and either there is no last
1026                                 # table, or there is one, but last template is lower (more
1027                                 # recently added) than last table in the stack of open
1028                                 # elements, then: let adjusted insertion location be inside
1029                                 # last template's template contents, after its last child (if
1030                                 # any), and abort these substeps.
1031                                 if last_template and (last_table is null or last_template_i < last_table_i)
1032                                         target = last_template # fixfull should be it's contents
1033                                         target_i = target.children.length
1034                                         break
1035                                 # 4. If there is no last table, then let adjusted insertion
1036                                 # location be inside the first element in the stack of open
1037                                 # elements (the html element), after its last child (if any),
1038                                 # and abort these substeps. (fragment case)
1039                                 if last_table is null
1040                                         # this is odd
1041                                         target = open_els[open_els.length - 1]
1042                                         target_i = target.children.length
1043                                 # 5. If last table has a parent element, then let adjusted
1044                                 # insertion location be inside last table's parent element,
1045                                 # immediately before last table, and abort these substeps.
1046                                 if last_table.parent?
1047                                         for c, i in last_table.parent.children
1048                                                 if c is last_table
1049                                                         target = last_table.parent
1050                                                         target_i = i
1051                                                         break
1052                                         break
1053                                 # 6. Let previous element be the element immediately above last
1054                                 # table in the stack of open elements.
1055                                 #
1056                                 # huh? how could it not have a parent?
1057                                 previous_element = open_els[last_table_i + 1]
1058                                 # 7. Let adjusted insertion location be inside previous
1059                                 # element, after its last child (if any).
1060                                 target = previous_element
1061                                 target_i = target.children.length
1062                                 # Note: These steps are involved in part because it's possible
1063                                 # for elements, the table element in this case in particular,
1064                                 # to have been moved by a script around in the DOM, or indeed
1065                                 # removed from the DOM entirely, after the element was inserted
1066                                 # by the parser.
1067                                 break # don't really loop
1068                 else
1069                         # Otherwise Let adjusted insertion location be inside target, after
1070                         # its last child (if any).
1071                         target_i = target.children.length
1072
1073                 # 3. If the adjusted insertion location is inside a template element,
1074                 # let it instead be inside the template element's template contents,
1075                 # after its last child (if any).
1076                 # fixfull (template)
1077
1078                 # 4. Return the adjusted insertion location.
1079                 return [target, target_i]
1080
1081         # http://www.w3.org/TR/html5/syntax.html#create-an-element-for-the-token
1082         # aka create_an_element_for_token
1083         token_to_element = (t, namespace, intended_parent) ->
1084                 # convert attributes into a hash
1085                 attrs = {}
1086                 for a in t.attrs_a
1087                         attrs[a[0]] = a[1] # TODO check what to do with dupilcate attrs
1088                 el = new Node TYPE_TAG, name: t.name, namespace: namespace, attrs: attrs, token: t
1089
1090                 # TODO 2. If the newly created element has an xmlns attribute in the
1091                 # XMLNS namespace whose value is not exactly the same as the element's
1092                 # namespace, that is a parse error. Similarly, if the newly created
1093                 # element has an xmlns:xlink attribute in the XMLNS namespace whose
1094                 # value is not the XLink Namespace, that is a parse error.
1095
1096                 # fixfull: the spec says stuff about form pointers and ownerDocument
1097
1098                 return el
1099
1100         # http://www.w3.org/TR/html5/syntax.html#insert-a-foreign-element
1101         insert_foreign_element = (token, namespace) ->
1102                 ail = adjusted_insertion_location()
1103                 ail_el = ail[0]
1104                 ail_i = ail[1]
1105                 el = token_to_element token, namespace, ail_el
1106                 # TODO skip this next step if it's broken (eg ail_el is document with child already)
1107                 el.parent = ail_el
1108                 ail_el.children.splice ail_i, 0, el
1109                 open_els.unshift el
1110                 return el
1111         # http://www.w3.org/TR/html5/syntax.html#insert-an-html-element
1112         insert_html_element = insert_foreign_element # (token, namespace) ->
1113
1114         # http://www.w3.org/TR/html5/syntax.html#insert-a-comment
1115         # position should be [node, index_within_children]
1116         insert_comment = (t, position = null) ->
1117                 position ?= adjusted_insertion_location()
1118                 position[0].children.splice position[1], 0, t
1119
1120         # 8.2.5.2
1121         # http://www.w3.org/TR/html5/syntax.html#generic-raw-text-element-parsing-algorithm
1122         parse_generic_raw_text = (t) ->
1123                 insert_html_element t
1124                 tok_state = tok_state_rawtext
1125                 original_insertion_mode = insertion_mode
1126                 insertion_mode = ins_mode_text
1127         parse_generic_rcdata_text = (t) ->
1128                 insert_html_element t
1129                 tok_state = tok_state_rcdata
1130                 original_insertion_mode = insertion_mode
1131                 insertion_mode = ins_mode_text
1132
1133         # 8.2.5.3 http://www.w3.org/TR/html5/syntax.html#closing-elements-that-have-implied-end-tags
1134         # http://www.w3.org/TR/html5/syntax.html#generate-implied-end-tags
1135         generate_implied_end_tags = (except = null) ->
1136                 while end_tag_implied[open_els[0].name] and open_els[0].name isnt except
1137                         open_els.shift()
1138
1139         # 8.2.5.4 The rules for parsing tokens in HTML content
1140         # http://www.w3.org/TR/html5/syntax.html#parsing-main-inhtml
1141
1142         # 8.2.5.4.1 The "initial" insertion mode
1143         # http://www.w3.org/TR/html5/syntax.html#the-initial-insertion-mode
1144         ins_mode_initial = (t) ->
1145                 if is_space_tok t
1146                         return
1147                 if t.type is TYPE_COMMENT
1148                         # ?fixfull
1149                         doc.children.push t
1150                         return
1151                 if t.type is TYPE_DOCTYPE
1152                         # FIXME check identifiers, set quirks, etc
1153                         # fixfull
1154                         doc.children.push t
1155                         insertion_mode = ins_mode_before_html
1156                         return
1157                 # Anything else
1158                 #fixfull (iframe, quirks)
1159                 insertion_mode = ins_mode_before_html
1160                 insertion_mode t # reprocess the token
1161                 return
1162
1163         # 8.2.5.4.2 http://www.w3.org/TR/html5/syntax.html#the-before-html-insertion-mode
1164         ins_mode_before_html = (t) ->
1165                 if t.type is TYPE_DOCTYPE
1166                         parse_error()
1167                         return
1168                 if t.type is TYPE_COMMENT
1169                         doc.children.push t
1170                         return
1171                 if is_space_tok t
1172                         return
1173                 if t.type is TYPE_START_TAG and t.name is 'html'
1174                         el = token_to_element t, NS_HTML, doc
1175                         doc.children.push el
1176                         open_els.unshift(el)
1177                         # fixfull (big paragraph in spec about manifest, fragment, urls, etc)
1178                         insertion_mode = ins_mode_before_head
1179                         return
1180                 if t.type is TYPE_END_TAG
1181                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1182                                 # fall through to "anything else"
1183                         else
1184                                 parse_error()
1185                                 return
1186                 # Anything else
1187                 html_tok = new_open_tag 'html'
1188                 el = token_to_element html_tok, NS_HTML, doc
1189                 doc.children.push el
1190                 open_els.unshift el
1191                 # ?fixfull browsing context
1192                 insertion_mode = ins_mode_before_head
1193                 insertion_mode t
1194                 return
1195
1196         # 8.2.5.4.3 http://www.w3.org/TR/html5/syntax.html#the-before-head-insertion-mode
1197         ins_mode_before_head = (t) ->
1198                 if is_space_tok t
1199                         return
1200                 if t.type is TYPE_COMMENT
1201                         insert_comment t
1202                         return
1203                 if t.type is TYPE_DOCTYPE
1204                         parse_error()
1205                         return
1206                 if t.type is TYPE_START_TAG and t.name is 'html'
1207                         ins_mode_in_body t
1208                         return
1209                 if t.type is TYPE_START_TAG and t.name is 'head'
1210                         el = insert_html_element t
1211                         head_element_pointer = el
1212                         insertion_mode = ins_mode_in_head
1213                 if t.type is TYPE_END_TAG
1214                         if t.name is 'head' or t.name is 'body' or t.name is 'html' or t.name is 'br'
1215                                 # fall through to Anything else below
1216                         else
1217                                 parse_error()
1218                                 return
1219                 # Anything else
1220                 head_tok = new_open_tag 'head'
1221                 el = insert_html_element head_tok
1222                 head_element_pointer = el
1223                 insertion_mode = ins_mode_in_head
1224                 insertion_mode t # reprocess current token
1225
1226         # 8.2.5.4.4 http://www.w3.org/TR/html5/syntax.html#parsing-main-inhead
1227         ins_mode_in_head_else = (t) -> # factored out for same-as-spec flow control
1228                 open_els.shift() # spec says this will be a 'head' node
1229                 insertion_mode = ins_mode_after_head
1230                 insertion_mode t
1231         ins_mode_in_head = (t) ->
1232                 if t.type is TYPE_TEXT and (t.text is "\t" or t.text is "\n" or t.text is "\u000c" or t.text is ' ')
1233                         insert_character t
1234                         return
1235                 if t.type is TYPE_COMMENT
1236                         insert_comment t
1237                         return
1238                 if t.type is TYPE_DOCTYPE
1239                         parse_error()
1240                         return
1241                 if t.type is TYPE_START_TAG and t.name is 'html'
1242                         ins_mode_in_body t
1243                         return
1244                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link')
1245                         el = insert_html_element t
1246                         open_els.shift()
1247                         t.acknowledge_self_closing()
1248                         return
1249                 if t.type is TYPE_START_TAG and t.name is 'meta'
1250                         el = insert_html_element t
1251                         open_els.shift()
1252                         t.acknowledge_self_closing()
1253                         # fixfull encoding stuff
1254                         return
1255                 if t.type is TYPE_START_TAG and t.name is 'title'
1256                         parse_generic_rcdata_text t
1257                         return
1258                 if t.type is TYPE_START_TAG and ((t.name is 'noscript' and flag_scripting) or (t.name is 'noframes' or t.name is 'style'))
1259                         parse_generic_raw_text t
1260                         return
1261                 if t.type is TYPE_START_TAG and t.name is 'noscript' and flag_scripting is false
1262                         insert_html_element t
1263                         insertion_mode = ins_mode_in_head_noscript # FIXME implement
1264                         return
1265                 if t.type is TYPE_START_TAG and t.name is 'script'
1266                         ail = adjusted_insertion_location()
1267                         el = token_to_element t, NS_HTML, ail
1268                         el.flag 'parser-inserted', true # FIXME implement
1269                         # fixfull frament case
1270                         ail[0].children.splice ail[1], 0, el
1271                         open_els.unshift el
1272                         tok_state = tok_state_script_data
1273                         original_insertion_mode = insertion_mode # make sure orig... is defined
1274                         insertion_mode = ins_mode_text # FIXME implement
1275                         return
1276                 if t.type is TYPE_END_TAG and t.name is 'head'
1277                         open_els.shift() # will be a head element... spec says so
1278                         insertion_mode = ins_mode_after_head
1279                         return
1280                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1281                         ins_mode_in_head_else t
1282                         return
1283                 if t.type is TYPE_START_TAG and t.name is 'template'
1284                         insert_html_element t
1285                         afe_push_marker()
1286                         flag_frameset_ok = false
1287                         insertion_mode = ins_mode_in_template
1288                         template_insertion_modes.unshift ins_mode_in_template # FIXME implement
1289                         return
1290                 if t.type is TYPE_END_TAG and t.name is 'template'
1291                         if template_tag_is_open()
1292                                 generate_implied_end_tags
1293                                 if open_els[0].name isnt 'template'
1294                                         parse_error()
1295                                 loop
1296                                         el = open_els.shift()
1297                                         if el.name is 'template'
1298                                                 break
1299                                 clear_afe_to_marker()
1300                                 template_insertion_modes.shift()
1301                                 reset_insertion_mode()
1302                         else
1303                                 parse_error()
1304                         return
1305                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1306                         parse_error()
1307                         return
1308                 ins_mode_in_head_else t
1309
1310         # 8.2.5.4.5 http://www.w3.org/TR/html5/syntax.html#parsing-main-inheadnoscript
1311         ins_mode_in_head_noscript = (t) ->
1312                 # FIXME ?fixfull
1313                 console.log "ins_mode_in_head_noscript unimplemented"
1314
1315         # 8.2.5.4.6 http://www.w3.org/TR/html5/syntax.html#the-after-head-insertion-mode
1316         ins_mode_after_head_else = (t) ->
1317                 body_tok = new_open_tag 'body'
1318                 insert_html_element body_tok
1319                 insertion_mode = ins_mode_in_body
1320                 insertion_mode t # reprocess token
1321                 return
1322         ins_mode_after_head = (t) ->
1323                 if is_space_tok t
1324                         insert_character t
1325                         return
1326                 if t.type is TYPE_COMMENT
1327                         insert_comment t
1328                         return
1329                 if t.type is TYPE_DOCTYPE
1330                         parse_error()
1331                         return
1332                 if t.type is TYPE_START_TAG and t.name is 'html'
1333                         ins_mode_in_body t
1334                         return
1335                 if t.type is TYPE_START_TAG and t.name is 'body'
1336                         insert_html_element t
1337                         flag_frameset_ok = false
1338                         insertion_mode = ins_mode_in_body
1339                         return
1340                 if t.type is TYPE_START_TAG and t.name is 'frameset'
1341                         insert_html_element t
1342                         insertion_mode = ins_mode_in_frameset
1343                         return
1344                 if t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')
1345                         parse_error()
1346                         open_els.unshift head_element_pointer
1347                         ins_mode_in_head t
1348                         for el, i of open_els
1349                                 if el is head_element_pointer
1350                                         open_els.splice i, 1
1351                                         return
1352                         console.log "warning: 23904 couldn't find head element in open_els"
1353                         return
1354                 if t.type is TYPE_END_TAG and t.name is 'template'
1355                         ins_mode_in_head t
1356                         return
1357                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'html' or t.name is 'br')
1358                         ins_mode_after_head_else t
1359                         return
1360                 if (t.type is TYPE_START_TAG and t.name is 'head') or t.type is TYPE_END_TAG
1361                         parse_error()
1362                         return
1363                 # Anything else
1364                 ins_mode_after_head_else t
1365
1366         # 8.2.5.4.7 http://www.w3.org/TR/html5/syntax.html#parsing-main-inbody
1367         in_body_any_other_end_tag = (name) -> # factored out because adoption agency calls it
1368                 for node, i in open_els
1369                         if node.name is name # FIXME check namespace too
1370                                 generate_implied_end_tags name # arg is exception
1371                                 parse_error() unless i is 0
1372                                 while i >= 0
1373                                         open_els.shift()
1374                                         i -= 1
1375                                 return
1376                         if special_elements[node.name]? # FIXME check namespac too
1377                                 parse_error()
1378                                 return
1379         ins_mode_in_body = (t) ->
1380                 switch t.type
1381                         when TYPE_TEXT
1382                                 switch t.text
1383                                         when "\u0000"
1384                                                 parse_error()
1385                                         when "\t", "\u000a", "\u000c", "\u000d", ' '
1386                                                 reconstruct_active_formatting_elements()
1387                                                 insert_character t
1388                                         else
1389                                                 reconstruct_active_formatting_elements()
1390                                                 insert_character t
1391                                                 flag_frameset_ok = false
1392                         when TYPE_COMMENT
1393                                 insert_comment t
1394                         when TYPE_DOCTYPE
1395                                 parse_error()
1396                         when TYPE_START_TAG
1397                                 switch t.name
1398                                         when 'html'
1399                                                 parse_error()
1400                                                 return if template_tag_is_open()
1401                                                 root_attrs = open_els[open_els.length - 1].attrs
1402                                                 for k, v of t.attrs
1403                                                         root_attrs[k] = v unless root_attrs[k]?
1404                                         when 'base', 'basefont', 'bgsound', 'link', 'meta', 'noframes', 'script', 'style', 'template', 'title'
1405                                                 # FIXME also do this for </template> (end tag)
1406                                                 return ins_mode_in_head t
1407                                         when 'body'
1408                                                 parse_error()
1409                                                 # TODO
1410                                         when 'frameset'
1411                                                 parse_error()
1412                                                 # TODO
1413                                         when 'address', 'article', 'aside', 'blockquote', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'main', 'nav', 'ol', 'p', 'section', 'summary', 'ul'
1414                                                 close_p_if_in_button_scope()
1415                                                 insert_html_element t
1416                                         when 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'
1417                                                 close_p_if_in_button_scope()
1418                                                 if open_els[0].name in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
1419                                                         parse_error()
1420                                                         open_els.shift()
1421                                                 insert_html_element t
1422                                         # TODO lots more to implement here
1423                                         when 'a'
1424                                                 # If the list of active formatting elements
1425                                                 # contains an a element between the end of the list and
1426                                                 # the last marker on the list (or the start of the list
1427                                                 # if there is no marker on the list), then this is a
1428                                                 # parse error; run the adoption agency algorithm for
1429                                                 # the tag name "a", then remove that element from the
1430                                                 # list of active formatting elements and the stack of
1431                                                 # open elements if the adoption agency algorithm didn't
1432                                                 # already remove it (it might not have if the element
1433                                                 # is not in table scope).
1434                                                 found = false
1435                                                 for el in afe
1436                                                         if el.type is TYPE_AFE_MARKER
1437                                                                 break
1438                                                         if el.name is 'a'
1439                                                                 found = el
1440                                                 if found?
1441                                                         parse_error()
1442                                                         adoption_agency 'a'
1443                                                         for el, i in afe
1444                                                                 if el is found
1445                                                                         afe.splice i, 1
1446                                                         for el, i in open_els
1447                                                                 if el is found
1448                                                                         open_els.splice i, 1
1449                                                 reconstruct_active_formatting_elements()
1450                                                 el = insert_html_element t
1451                                                 afe_push el
1452                                         when 'b', 'big', 'code', 'em', 'font', 'i', 's', 'small', 'strike', 'strong', 'tt', 'u'
1453                                                 reconstruct_active_formatting_elements()
1454                                                 el = insert_html_element t
1455                                                 afe_push el
1456                                         when 'table'
1457                                                 # fixfull quirksmode thing
1458                                                 close_p_if_in_button_scope()
1459                                                 insert_html_element t
1460                                                 insertion_mode = ins_mode_in_table
1461                                         # TODO lots more to implement here
1462                                         else # any other start tag
1463                                                 reconstruct_active_formatting_elements()
1464                                                 insert_html_element t
1465                         when TYPE_EOF
1466                                 ok_tags = {
1467                                         dd: true, dt: true, li: true, p: true, tbody: true, td: true,
1468                                         tfoot: true, th: true, thead: true, tr: true, body: true, html: true,
1469                                 }
1470                                 for t in open_els
1471                                         unless ok_tags[t.name]?
1472                                                 parse_error()
1473                                                 break
1474                                 # TODO stack of template insertion modes thing
1475                                 stop_parsing()
1476                         when TYPE_END_TAG
1477                                 switch t.name
1478                                         when 'body'
1479                                                 unless is_in_scope 'body'
1480                                                         parse_error()
1481                                                         return
1482                                                 # TODO implement parse error and move to tree_after_body
1483                                         when 'html'
1484                                                 unless is_in_scope 'body' # weird, but it's what the spec says
1485                                                         parse_error()
1486                                                         return
1487                                                 # TODO implement parse error and move to tree_after_body, reprocess
1488                                         when 'address', 'article', 'aside', 'blockquote', 'button', 'center', 'details', 'dialog', 'dir', 'div', 'dl', 'fieldset', 'figcaption', 'figure', 'footer', 'header', 'hgroup', 'listing', 'main', 'nav', 'ol', 'pre', 'section', 'summary', 'ul'
1489                                                 unless is_in_scope t.name, NS_HTML
1490                                                         parse_error()
1491                                                         return
1492                                                 generate_implied_end_tags()
1493                                                 unless open_els[0].name is t.name and open_els[0].namespace is NS_HTML
1494                                                         parse_error()
1495                                                 loop
1496                                                         el = open_els.shift()
1497                                                         if el.name is t.name and el.namespace is NS_HTML
1498                                                                 return
1499                                         # TODO lots more close tags to implement here
1500                                         when 'p'
1501                                                 unless is_in_button_scope 'p'
1502                                                         parse_error()
1503                                                         insert_html_element new_open_tag 'p'
1504                                                 close_p_element()
1505                                         # TODO lots more close tags to implement here
1506                                         when 'a', 'b', 'big', 'code', 'em', 'font', 'i', 'nobr', 's', 'small', 'strike', 'strong', 'tt', 'u'
1507                                                 adoption_agency t.name
1508                                         # TODO lots more close tags to implement here
1509                                         else
1510                                                 in_body_any_other_end_tag t.name
1511                 return
1512
1513         ins_mode_in_table_else = (t) ->
1514                 parse_error()
1515                 flag_foster_parenting = true # FIXME
1516                 ins_mode_in_body t
1517                 flag_foster_parenting = false
1518         can_in_table = { # FIXME do this inline like everywhere else
1519                 'table': true
1520                 'tbody': true
1521                 'tfoot': true
1522                 'thead': true
1523                 'tr': true
1524         }
1525
1526         # 8.2.5.4.8 http://www.w3.org/TR/html5/syntax.html#parsing-main-incdata
1527         ins_mode_text = (t) ->
1528                 if t.type is TYPE_TEXT
1529                         insert_character t
1530                         return
1531                 if t.type is TYPE_EOF
1532                         parse_error()
1533                         if open_els[0].name is 'script'
1534                                 open_els[0].flag 'already started', true
1535                         open_els.shift()
1536                         insertion_mode = original_insertion_mode
1537                         insertion_mode t
1538                         return
1539                 if t.type is TYPE_END_TAG and t.name is 'script'
1540                         open_els.shift()
1541                         insertion_mode = original_insertion_mode
1542                         # fixfull the spec seems to assume that I'm going to run the script
1543                         # http://www.w3.org/TR/html5/syntax.html#scriptEndTag
1544                         return
1545                 if t.type is TYPE_END_TAG
1546                         open_els.shift()
1547                         insertion_mode = original_insertion_mode
1548                         return
1549                 console.log 'warning: end of ins_mode_text reached'
1550
1551         # the functions below implement the tokenizer stats described here:
1552         # http://www.w3.org/TR/html5/syntax.html#tokenization
1553
1554         # 8.2.5.4.9 http://www.w3.org/TR/html5/syntax.html#parsing-main-intable
1555         ins_mode_in_table = (t) ->
1556                 switch t.type
1557                         when TYPE_TEXT
1558                                 if can_in_table[t.name]
1559                                         original_insertion_mode = insertion_mode
1560                                         insertion_mode = ins_mode_in_table_text
1561                                         insertion_mode t
1562                                 else
1563                                         ins_mode_in_table_else t
1564                         when TYPE_COMMENT
1565                                 insert_comment t
1566                         when TYPE_DOCTYPE
1567                                 parse_error()
1568                         when TYPE_START_TAG
1569                                 switch t.name
1570                                         when 'caption'
1571                                                 clear_stack_to_table_context()
1572                                                 afe_push_marker()
1573                                                 insert_html_element t
1574                                                 insertion_mode = ins_mode_in_caption
1575                                         when 'colgroup'
1576                                                 clear_stack_to_table_context()
1577                                                 insert_html_element t
1578                                                 insertion_mode = ins_mode_in_column_group
1579                                         when 'col'
1580                                                 clear_stack_to_table_context()
1581                                                 insert_html_element new_open_tag 'colgroup'
1582                                                 insertion_mode = ins_mode_in_column_group
1583                                                 insertion_mode t
1584                                         when 'tbody', 'tfoot', 'thead'
1585                                                 clear_stack_to_table_context()
1586                                                 insert_html_element t
1587                                                 insertion_mode = ins_mode_in_table_body
1588                                         when 'td', 'th', 'tr'
1589                                                 clear_stack_to_table_context()
1590                                                 insert_html_element new_open_tag 'tbody'
1591                                                 insertion_mode = ins_mode_in_table_body
1592                                                 insertion_mode t
1593                                         when 'table'
1594                                                 parse_error()
1595                                                 if is_in_table_scope 'table'
1596                                                         loop
1597                                                                 el = open_els.shift()
1598                                                                 if el.name is 'table'
1599                                                                         break
1600                                                         reset_insertion_mode()
1601                                                         insertion_mode t
1602                                         when 'style', 'script', 'template'
1603                                                 ins_mode_in_head t
1604                                         when 'input'
1605                                                 if is_input_hidden_tok t
1606                                                         ins_mode_in_table_else t
1607                                                 else
1608                                                         parse_error()
1609                                                         el = insert_html_element t
1610                                                         open_els.shift()
1611                                                         t.acknowledge_self_closing()
1612                                         when 'form'
1613                                                 parse_error()
1614                                                 if form_element_pointer?
1615                                                         return
1616                                                 if template_tag_is_open()
1617                                                         return
1618                                                 form_element_pointer = insert_html_element t
1619                                                 open_els.shift()
1620                                         else
1621                                                 ins_mode_in_table_else t
1622                         when TYPE_END_TAG
1623                                 switch t.name
1624                                         when 'table'
1625                                                 if is_in_table_scope 'table'
1626                                                         loop
1627                                                                 el = open_els.shift()
1628                                                                 if el.name is 'table'
1629                                                                         break
1630                                                         reset_insertion_mode()
1631                                                 else
1632                                                         parse_error
1633                                         when 'body', 'caption', 'col', 'colgroup', 'html', 'tbody', 'td', 'tfoot', 'th', 'thead', 'tr'
1634                                                 parse_error()
1635                                         when 'template'
1636                                                 ins_mode_in_head t
1637                                         else
1638                                                 ins_mode_in_table_else t
1639                         when TYPE_EOF
1640                                 ins_mode_in_body t
1641                         else
1642                                 ins_mode_in_table_else t
1643
1644
1645         # 8.2.5.4.10 http://www.w3.org/TR/html5/syntax.html#parsing-main-intabletext
1646         ins_mode_in_table_text = (t) ->
1647                 if t.type is TYPE_TEXT and t.text is "\u0000"
1648                         # huh? I thought the tokenizer didn't emit these
1649                         parse_error()
1650                         return
1651                 if t.type is TYPE_TEXT
1652                         pending_table_character_tokens.push t
1653                         return
1654                 # Anything else
1655                 all_space = true
1656                 for old in pending_table_character_tokens
1657                         unless is_space_tok old
1658                                 all_space = false
1659                                 break
1660                 if all_space
1661                         for old in pending_table_character_tokens
1662                                 insert_character old
1663                 else
1664                         for old in pending_table_character_tokens
1665                                 ins_mode_table_else old
1666                 pending_table_character_tokens = [] # FIXME test (spec doesn't say this)
1667                 insertion_mode = original_insertion_mode
1668                 insertion_mode t
1669
1670         # 8.2.5.4.11 http://www.w3.org/TR/html5/syntax.html#parsing-main-incaption
1671         ins_mode_in_caption = (t) ->
1672                 if t.type is TYPE_END_TAG and t.name is 'caption'
1673                         if is_in_table_scope 'caption'
1674                                 generate_implied_end_tags()
1675                                 if open_els[0].name isnt 'caption'
1676                                         parse_error()
1677                                 loop
1678                                         el = open_els.shift()
1679                                         if el.name is 'caption'
1680                                                 break
1681                                 clear_afe_to_marker()
1682                                 insertion_mode = ins_mode_in_table
1683                         else
1684                                 parse_error()
1685                                 # fragment case
1686                         return
1687                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1688                         parse_error()
1689                         if is_in_table_scope 'caption'
1690                                 loop
1691                                         el = open_els.shift()
1692                                         if el.name is 'caption'
1693                                                 break
1694                                 clear_afe_to_marker()
1695                                 insertion_mode = ins_mode_in_table
1696                                 insertion_mode t
1697                         # else fragment case
1698                         return
1699                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1700                         parse_error()
1701                         return
1702                 # Anything else
1703                 ins_mode_in_body t
1704
1705         # 8.2.5.4.12 http://www.w3.org/TR/html5/syntax.html#parsing-main-incolgroup
1706         ins_mode_in_column_group = (t) ->
1707                 if is_space_tok t
1708                         insert_character t
1709                         return
1710                 if t.type is TYPE_COMMENT
1711                         insert_comment t
1712                         return
1713                 if t.type is TYPE_DOCTYPE
1714                         parse_error()
1715                         return
1716                 if t.type is TYPE_START_TAG and t.name is 'html'
1717                         ins_mode_in_body t
1718                         return
1719                 if t.type is TYPE_START_TAG and t.name is 'col'
1720                         el = insert_html_element t
1721                         open_els.shift()
1722                         t.acknowledge_self_closing()
1723                         return
1724                 if t.type is TYPE_END_TAG and t.name is 'colgroup'
1725                         if open_els[0].name is 'colgroup'
1726                                 open_els.shift()
1727                                 insertion_mode = ins_mode_in_table
1728                         else
1729                                 parse_error()
1730                         return
1731                 if t.type is TYPE_END_TAG and t.name is 'col'
1732                         parse_error()
1733                         return
1734                 if (t.type is TYPE_START_TAG or t.type is TYPE_END_TAG) and t.name is 'template'
1735                         ins_mode_in_head t
1736                         return
1737                 if t.type is TYPE_EOF
1738                         ins_mode_in_body t
1739                         return
1740                 # Anything else
1741                 if open_els[0].name isnt 'colgroup'
1742                         parse_error()
1743                         return
1744                 open_els.shift()
1745                 insertion_mode = ins_mode_in_table
1746                 insertion_mode t
1747                 return
1748
1749         # 8.2.5.4.13 http://www.w3.org/TR/html5/syntax.html#parsing-main-intbody
1750         ins_mode_in_table_body = (t) ->
1751                 if t.type is TYPE_START_TAG and t.name is 'tr'
1752                         clear_stack_to_table_body_context()
1753                         insert_html_element t
1754                         insertion_mode = ins_mode_in_row
1755                         return
1756                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1757                         parse_error()
1758                         clear_stack_to_table_body_context()
1759                         insert_html_element new_open_tag 'tr'
1760                         insertion_mode = ins_mode_in_row
1761                         insertion_mode t
1762                         return
1763                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1764                         unless is_in_table_scope t.name # fixfull check namespace
1765                                 parse_error()
1766                                 return
1767                         clear_stack_to_table_body_context()
1768                         open_els.shift()
1769                         insertion_mode = ins_mode_in_table
1770                         return
1771                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')) or (t.type is TYPE_END_TAG and t.name is 'table')
1772                         has = false
1773                         for el in open_els
1774                                 if el.name is 'tbody' or el.name is 'tfoot' or el.name is 'thead'
1775                                         has = true
1776                                         break
1777                                 if table_scopers[el.name]
1778                                         break
1779                         if !has
1780                                 parse_error()
1781                                 return
1782                         clear_stack_to_table_body_context()
1783                         open_els.shift()
1784                         insertion_mode = ins_mode_in_table
1785                         insertion_mode t
1786                         return
1787                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th' or t.name is 'tr')
1788                         parse_error()
1789                         return
1790                 # Anything else
1791                 ins_mode_in_table t
1792
1793         # 8.2.5.4.14 http://www.w3.org/TR/html5/syntax.html#parsing-main-intr
1794         ins_mode_in_row = (t) ->
1795                 if t.type is TYPE_START_TAG and (t.name is 'th' or t.name is 'td')
1796                         clear_stack_to_table_row_context()
1797                         insert_html_element t
1798                         insertion_mode = ins_mode_in_cell
1799                         afe_push_marker()
1800                         return
1801                 if t.type is TYPE_END_TAG and t.name is 'tr'
1802                         if is_in_table_scope 'tr'
1803                                 clear_stack_to_table_row_context()
1804                                 open_els.shift()
1805                                 insertion_mode = ins_mode_in_table_body
1806                         else
1807                                 parse_error()
1808                         return
1809                 if (t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')) or t.type is TYPE_END_TAG and t.name is 'table'
1810                         if is_in_table_scope 'tr'
1811                                 clear_stack_to_table_row_context()
1812                                 open_els.shift()
1813                                 insertion_mode = ins_mode_in_table_body
1814                                 insertion_mode t
1815                         else
1816                                 parse_error()
1817                         return
1818                 if t.type is TYPE_END_TAG and (t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
1819                         if is_in_table_scope t.name # fixfull namespace
1820                                 if is_in_table_scope 'tr'
1821                                         clear_stack_to_table_row_context()
1822                                         open_els.shift()
1823                                         insertion_mode = ins_mode_in_table_body
1824                                         insertion_mode t
1825                         else
1826                                 parse_error()
1827                         return
1828                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html' or t.name is 'td' or t.name is 'th')
1829                         parse_error()
1830                         return
1831                 # Anything else
1832                 ins_mode_in_table t
1833
1834         # http://www.w3.org/TR/html5/syntax.html#close-the-cell
1835         close_the_cell = ->
1836                 generate_implied_end_tags()
1837                 unless open_els[0].name is 'td' or open_els[0] is 'th'
1838                         parse_error()
1839                 loop
1840                         el = open_els.shift()
1841                         if el.name is 'td' or el.name is 'th'
1842                                 break
1843                 clear_afe_to_marker()
1844                 insertion_mode = ins_mode_in_row
1845
1846         # 8.2.5.4.15 http://www.w3.org/TR/html5/syntax.html#parsing-main-intd
1847         ins_mode_in_cell = (t) ->
1848                 if t.type is TYPE_END_TAG and (t.name is 'td' or t.name is 'th')
1849                         if is_in_table_scope t.name
1850                                 generate_implied_end_tags()
1851                                 if open_els[0].name isnt t.name
1852                                         parse_error
1853                                 loop
1854                                         el = open_els.shift()
1855                                         if el.name is t.name
1856                                                 break
1857                                 clear_afe_to_marker()
1858                                 insertion_mode = ins_mode_in_row
1859                         else
1860                                 parse_error()
1861                         return
1862                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'td' or t.name is 'tfoot' or t.name is 'th' or t.name is 'thead' or t.name is 'tr')
1863                         has = false
1864                         for el in open_els
1865                                 if el.name is 'td' or el.name is 'th'
1866                                         has = true
1867                                         break
1868                                 if table_scopers[el.name]
1869                                         break
1870                         if !has
1871                                 parse_error()
1872                                 return
1873                         close_the_cell()
1874                         insertion_mode t
1875                         return
1876                 if t.type is TYPE_END_TAG and (t.name is 'body' or t.name is 'caption' or t.name is 'col' or t.name is 'colgroup' or t.name is 'html')
1877                         parse_error()
1878                         return
1879                 if t.type is TYPE_END_TAG and (t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr')
1880                         if is_in_table_scope t.name # fixfull namespace
1881                                 close_the_cell()
1882                                 insertion_mode t
1883                         else
1884                                 parse_error()
1885                         return
1886                 # Anything Else
1887                 ins_mode_in_body t
1888
1889         # 8.2.5.4.16 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselect
1890         ins_mode_in_select = (t) ->
1891                 if t.type is TYPE_TEXT and t.text is "\u0000"
1892                         parse_error()
1893                         return
1894                 if t.type is TYPE_TEXT
1895                         insert_character t
1896                         return
1897                 if t.type is TYPE_COMMENT
1898                         insert_comment t
1899                         return
1900                 if t.type is TYPE_DOCTYPE
1901                         parse_error()
1902                         return
1903                 if t.type is TYPE_START_TAG and t.name is 'html'
1904                         ins_mode_in_body t
1905                         return
1906                 if t.type is TYPE_START_TAG and t.name is 'option'
1907                         if open_els[0].name is 'option'
1908                                 open_els.shift()
1909                         insert_html_element t
1910                         return
1911                 if t.type is TYPE_START_TAG and t.name is 'optgroup'
1912                         if open_els[0].name is 'option'
1913                                 open_els.shift()
1914                         if open_els[0].name is 'optgroup'
1915                                 open_els.shift()
1916                         insert_html_element t
1917                         return
1918                 if t.type is TYPE_END_TAG and t.name is 'optgroup'
1919                         if open_els[0].name is 'option' and open_els[1].name is 'optgroup'
1920                                 open_els.shift()
1921                         if open_els[0].name is 'optgroup'
1922                                 open_els.shift()
1923                         else
1924                                 parse_error()
1925                         return
1926                 if t.type is TYPE_END_TAG and t.name is 'option'
1927                         if open_els[0].name is 'option'
1928                                 open_els.shift()
1929                         else
1930                                 parse_error()
1931                         return
1932                 if t.type is TYPE_END_TAG and t.name is 'select'
1933                         if is_in_select_scope 'select'
1934                                 loop
1935                                         el = open_els.shift()
1936                                         if el.name is 'select'
1937                                                 break
1938                                 reset_insertion_mode()
1939                         else
1940                                 parse_error()
1941                         return
1942                 if t.type is TYPE_START_TAG and t.name is 'select'
1943                         parse_error()
1944                         loop
1945                                 el = open_els.shift()
1946                                 if el.name is 'select'
1947                                         break
1948                         reset_insertion_mode()
1949                         # spec says that this is the same as </select> but it doesn't say
1950                         # to check scope first
1951                         return
1952                 if t.type is TYPE_START_TAG and (t.name is 'input' or t.name is 'keygen' or t.name is 'textarea')
1953                         parse_error()
1954                         if is_in_select_scope 'select'
1955                                 return
1956                         loop
1957                                 el = open_els.shift()
1958                                 if el.name is 'select'
1959                                         break
1960                         reset_insertion_mode()
1961                         insertion_mode t
1962                         return
1963                 if t.type is TYPE_START_TAG and (t.name is 'script' or t.name is 'template')
1964                         ins_mode_in_head t
1965                         return
1966                 if t.type is TYPE_EOF
1967                         ins_mode_in_body t
1968                         return
1969                 # Anything else
1970                 parse_error()
1971                 return
1972
1973         # 8.2.5.4.17 http://www.w3.org/TR/html5/syntax.html#parsing-main-inselectintable
1974         ins_mode_in_select_in_table = (t) ->
1975                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1976                         parse_error()
1977                         loop
1978                                 el = open_els.shift()
1979                                 if el.name is 'select'
1980                                         break
1981                         reset_insertion_mode()
1982                         insertion_mode t
1983                         return
1984                 if t.type is TYPE_END_TAG and (t.name is 'caption' or t.name is 'table' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead' or t.name is 'tr' or t.name is 'td' or t.name is 'th')
1985                         parse_error()
1986                         unless is_in_table_scope t.name, NS_HTML
1987                                 return
1988                         loop
1989                                 el = open_els.shift()
1990                                 if el.name is 'select'
1991                                         break
1992                         reset_insertion_mode()
1993                         insertion_mode t
1994                         return
1995                 # Anything else
1996                 ins_mode_in_select t
1997                 return
1998
1999         # 8.2.5.4.18 http://www.w3.org/TR/html5/syntax.html#parsing-main-intemplate
2000         ins_mode_in_template = (t) ->
2001                 if t.type is TYPE_TEXT or t.type is TYPE_COMMENT or t.type is TYPE_DOCTYPE
2002                         ins_mode_in_body t
2003                         return
2004                 if (t.type is TYPE_START_TAG and (t.name is 'base' or t.name is 'basefont' or t.name is 'bgsound' or t.name is 'link' or t.name is 'meta' or t.name is 'noframes' or t.name is 'script' or t.name is 'style' or t.name is 'template' or t.name is 'title')) or (t.type is TYPE_END_TAG and t.name is 'template')
2005                         ins_mode_in_head t
2006                         return
2007                 if t.type is TYPE_START_TAG and (t.name is 'caption' or t.name is 'colgroup' or t.name is 'tbody' or t.name is 'tfoot' or t.name is 'thead')
2008                         template_insertion_modes.shift()
2009                         template_insertion_modes.unshift ins_mode_in_table
2010                         insertion_mode = ins_mode_in_table
2011                         insertion_mode t
2012                         return
2013                 if t.type is TYPE_START_TAG and t.name is 'col'
2014                         template_insertion_modes.shift()
2015                         template_insertion_modes.unshift ins_mode_in_column_group
2016                         insertion_mode = ins_mode_in_column_group
2017                         insertion_mode t
2018                         return
2019                 if t.type is TYPE_START_TAG and t.name is 'tr'
2020                         template_insertion_modes.shift()
2021                         template_insertion_modes.unshift ins_mode_in_table_body
2022                         insertion_mode = ins_mode_in_table_body
2023                         insertion_mode t
2024                         return
2025                 if t.type is TYPE_START_TAG and (t.name is 'td' or t.name is 'th')
2026                         template_insertion_modes.shift()
2027                         template_insertion_modes.unshift ins_mode_in_row
2028                         insertion_mode = ins_mode_in_row
2029                         insertion_mode t
2030                         return
2031                 if t.type is TYPE_START_TAG
2032                         template_insertion_modes.shift()
2033                         template_insertion_modes.unshift ins_mode_in_body
2034                         insertion_mode = ins_mode_in_body
2035                         insertion_mode t
2036                         return
2037                 if t.type is TYPE_END_TAG
2038                         parse_error()
2039                         return
2040                 if t.type is TYPE_EOF
2041                         unless template_tag_is_open()
2042                                 stop_parsing()
2043                                 return
2044                         parse_error()
2045                         loop
2046                                 el = open_els.shift()
2047                                 if el.name is 'template' # fixfull check namespace
2048                                         break
2049                         clear_afe_to_marker()
2050                         template_insertion_modes.shift()
2051                         reset_insertion_mode()
2052                         insertion_mode t
2053
2054         # 8.2.5.4.19 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterbody
2055         ins_mode_after_body = (t) ->
2056                 if is_space_tok t
2057                         ins_mode_in_body t
2058                         return
2059                 if t.type is TYPE_COMMENT
2060                         insert_comment t, [open_els[0], open_els[0].children.length]
2061                         return
2062                 if t.type is TYPE_DOCTYPE
2063                         parse_error()
2064                         return
2065                 if t.type is TYPE_START_TAG and t.name is 'html'
2066                         ins_mode_in_body t
2067                         return
2068                 if t.type is TYPE_END_TAG and t.name is 'html'
2069                         # fixfull fragment case
2070                         insertion_mode = ins_mode_after_after_body
2071                         return
2072                 if t.type is TYPE_EOF
2073                         stop_parsing()
2074                         return
2075                 # Anything ELse
2076                 parse_error()
2077                 insertion_mode = ins_mode_in_body
2078                 insertion_mode t
2079
2080         # 8.2.5.4.20 http://www.w3.org/TR/html5/syntax.html#parsing-main-inframeset
2081         ins_mode_in_frameset = (t) ->
2082                 if is_space_tok t
2083                         insert_character t
2084                         return
2085                 if t.type is TYPE_COMMENT
2086                         insert_comment t
2087                         return
2088                 if t.type is TYPE_DOCTYPE
2089                         parse_error()
2090                         return
2091                 if t.type is TYPE_START_TAG and t.name is 'html'
2092                         ins_mode_in_body t
2093                         return
2094                 if t.type is TYPE_START_TAG and t.name is 'frameset'
2095                         insert_html_element t
2096                         return
2097                 if t.type is TYPE_END_TAG and t.name is 'frameset'
2098                         # TODO ?correct for: "if the current node is the root html element"
2099                         if open_els.length is 1
2100                                 parse_error()
2101                                 return # fragment case
2102                         open_els.shift()
2103                         if flag_fragment_parsing is false and open_els[0].name isnt 'frameset'
2104                                 insertion_mode = ins_mode_after_frameset
2105                         return
2106                 if t.type is TYPE_START_TAG and t.name is 'frame'
2107                         insert_html_element t
2108                         open_els.shift()
2109                         t.acknowledge_self_closing()
2110                         return
2111                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2112                         ins_mode_in_head t
2113                         return
2114                 if t.type is TYPE_EOF
2115                         # TODO ?correct for: "if the current node is not the root html element"
2116                         if open_els.length isnt 1
2117                                 parse_error()
2118                         stop_parsing()
2119                         return
2120                 # Anything else
2121                 parse_error()
2122                 return
2123
2124         # 8.2.5.4.21 http://www.w3.org/TR/html5/syntax.html#parsing-main-afterframeset
2125         ins_mode_after_frameset = (t) ->
2126                 if is_space_tok t
2127                         insert_character t
2128                         return
2129                 if t.type is TYPE_COMMENT
2130                         insert_comment t
2131                         return
2132                 if t.type is TYPE_DOCTYPE
2133                         parse_error()
2134                         return
2135                 if t.type is TYPE_START_TAG and t.name is 'html'
2136                         ins_mode_in_body t
2137                         return
2138                 if t.type is TYPE_END_TAG and t.name is 'html'
2139                         insert_mode = ins_mode_after_after_frameset
2140                         return
2141                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2142                         ins_mode_in_head t
2143                         return
2144                 if t.type is TYPE_EOF
2145                         stop_parsing()
2146                         return
2147                 # Anything else
2148                 parse_error()
2149                 return
2150
2151         # 8.2.5.4.22 http://www.w3.org/TR/html5/syntax.html#the-after-after-body-insertion-mode
2152         ins_mode_after_after_body = (t) ->
2153                 if t.type is TYPE_COMMENT
2154                         insert_comment t, [doc, doc.children.length]
2155                         return
2156                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2157                         ins_mode_in_body t
2158                         return
2159                 if t.type is TYPE_EOF
2160                         stop_parsing()
2161                         return
2162                 # Anything else
2163                 parse_error()
2164                 insertion_mode = ins_mode_in_body
2165                 return
2166
2167         # 8.2.5.4.23 http://www.w3.org/TR/html5/syntax.html#the-after-after-frameset-insertion-mode
2168         ins_mode_after_after_frameset = (t) ->
2169                 if t.type is TYPE_COMMENT
2170                         insert_comment t, [doc, doc.children.length]
2171                         return
2172                 if t.type is TYPE_DOCTYPE or is_space_tok(t) or (t.type is TYPE_START_TAG and t.name is 'html')
2173                         ins_mode_in_body t
2174                         return
2175                 if t.type is TYPE_EOF
2176                         stop_parsing()
2177                         return
2178                 if t.type is TYPE_START_TAG and t.name is 'noframes'
2179                         ins_mode_in_head t
2180                         return
2181                 # Anything else
2182                 parse_error()
2183                 return
2184
2185
2186
2187
2188
2189         # 8.2.4.1 http://www.w3.org/TR/html5/syntax.html#data-state
2190         tok_state_data = ->
2191                 switch c = txt.charAt(cur++)
2192                         when '&'
2193                                 return new_text_node parse_character_reference()
2194                         when '<'
2195                                 tok_state = tok_state_tag_open
2196                         when "\u0000"
2197                                 parse_error()
2198                                 return new_text_node c
2199                         when '' # EOF
2200                                 return new_eof_token()
2201                         else
2202                                 return new_text_node c
2203                 return null
2204
2205         # 8.2.4.2 http://www.w3.org/TR/html5/syntax.html#character-reference-in-data-state
2206         # not needed: tok_state_character_reference_in_data = ->
2207         # just call parse_character_reference()
2208
2209         # 8.2.4.3 http://www.w3.org/TR/html5/syntax.html#rcdata-state
2210         tok_state_rcdata = ->
2211                 switch c = txt.charAt(cur++)
2212                         when '&'
2213                                 return new_text_node parse_character_reference()
2214                         when '<'
2215                                 tok_state = tok_state_rcdata_less_than_sign
2216                         when "\u0000"
2217                                 parse_error()
2218                                 return new_character_token "\ufffd"
2219                         when '' # EOF
2220                                 return new_eof_token()
2221                         else
2222                                 return new_character_token c
2223                 return null
2224
2225         # 8.2.4.4 http://www.w3.org/TR/html5/syntax.html#character-reference-in-rcdata-state
2226         # not needed: tok_state_character_reference_in_rcdata = ->
2227         # just call parse_character_reference()
2228
2229         # 8.2.4.5 http://www.w3.org/TR/html5/syntax.html#rawtext-state
2230         tok_state_rawtext = ->
2231                 switch c = txt.charAt(cur++)
2232                         when '<'
2233                                 tok_state = tok_state_rawtext_less_than_sign
2234                         when "\u0000"
2235                                 parse_error()
2236                                 return new_character_token "\ufffd"
2237                         when '' # EOF
2238                                 return new_eof_token()
2239                         else
2240                                 return new_character_token c
2241                 return null
2242
2243         # 8.2.4.6 http://www.w3.org/TR/html5/syntax.html#script-data-state
2244         tok_state_script_data = ->
2245                 switch c = txt.charAt(cur++)
2246                         when '<'
2247                                 tok_state = tok_state_script_data_less_than_sign
2248                         when "\u0000"
2249                                 parse_error()
2250                                 return new_character_token "\ufffd"
2251                         when '' # EOF
2252                                 return new_eof_token()
2253                         else
2254                                 return new_character_token c
2255                 return null
2256
2257         # 8.2.4.7 http://www.w3.org/TR/html5/syntax.html#plaintext-state
2258         tok_state_plaintext = ->
2259                 switch c = txt.charAt(cur++)
2260                         when "\u0000"
2261                                 parse_error()
2262                                 return new_character_token "\ufffd"
2263                         when '' # EOF
2264                                 return new_eof_token()
2265                         else
2266                                 return new_character_token c
2267                 return null
2268
2269
2270         # 8.2.4.8 http://www.w3.org/TR/html5/syntax.html#tag-open-state
2271         tok_state_tag_open = ->
2272                 switch c = txt.charAt(cur++)
2273                         when '!'
2274                                 tok_state = tok_state_markup_declaration_open
2275                         when '/'
2276                                 tok_state = tok_state_end_tag_open
2277                         when '?'
2278                                 parse_error()
2279                                 tok_cur_tag = new_comment_token '?'
2280                                 tok_state = tok_state_bogus_comment
2281                         else
2282                                 if is_lc_alpha(c)
2283                                         tok_cur_tag = new_open_tag c
2284                                         tok_state = tok_state_tag_name
2285                                 else if is_uc_alpha(c)
2286                                         tok_cur_tag = new_open_tag c.toLowerCase()
2287                                         tok_state = tok_state_tag_name
2288                                 else
2289                                         parse_error()
2290                                         tok_state = tok_state_data
2291                                         cur -= 1 # we didn't parse/handle the char after <
2292                                         return new_text_node '<'
2293                 return null
2294
2295         # 8.2.4.9 http://www.w3.org/TR/html5/syntax.html#end-tag-open-state
2296         tok_state_end_tag_open = ->
2297                 switch c = txt.charAt(cur++)
2298                         when '>'
2299                                 parse_error()
2300                                 tok_state = tok_state_data
2301                         when '' # EOF
2302                                 parse_error()
2303                                 tok_state = tok_state_data
2304                                 return new_text_node '</'
2305                         else
2306                                 if is_uc_alpha(c)
2307                                         tok_cur_tag = new_end_tag c.toLowerCase()
2308                                         tok_state = tok_state_tag_name
2309                                 else if is_lc_alpha(c)
2310                                         tok_cur_tag = new_end_tag c
2311                                         tok_state = tok_state_tag_name
2312                                 else
2313                                         parse_error()
2314                                         tok_cur_tag = new_comment_token '/'
2315                                         tok_state = tok_state_bogus_comment
2316                 return null
2317
2318         # 8.2.4.10 http://www.w3.org/TR/html5/syntax.html#tag-name-state
2319         tok_state_tag_name = ->
2320                 switch c = txt.charAt(cur++)
2321                         when "\t", "\n", "\u000c", ' '
2322                                 tok_state = tok_state_before_attribute_name
2323                         when '/'
2324                                 tok_state = tok_state_self_closing_start_tag
2325                         when '>'
2326                                 tok_state = tok_state_data
2327                                 tmp = tok_cur_tag
2328                                 tok_cur_tag = null
2329                                 return tmp
2330                         when "\u0000"
2331                                 parse_error()
2332                                 tok_cur_tag.name += "\ufffd"
2333                         when '' # EOF
2334                                 parse_error()
2335                                 tok_state = tok_state_data
2336                         else
2337                                 if is_uc_alpha(c)
2338                                         tok_cur_tag.name += c.toLowerCase()
2339                                 else
2340                                         tok_cur_tag.name += c
2341                 return null
2342
2343         # 8.2.4.11 http://www.w3.org/TR/html5/syntax.html#rcdata-less-than-sign-state
2344         tok_state_rcdata_less_than_sign = ->
2345                 c = txt.charAt(cur++)
2346                 if c is '/'
2347                         temporary_buffer = ''
2348                         tok_state = tok_state_rcdata_end_tag_open
2349                         return null
2350                 # Anything else
2351                 tok_state = tok_state_rcdata
2352                 cur -= 1 # reconsume the input character
2353                 return new_character_token '<'
2354
2355         # 8.2.4.12 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-open-state
2356         tok_state_rcdata_end_tag_open = ->
2357                 c = txt.charAt(cur++)
2358                 if is_uc_alpha(c)
2359                         tok_cur_tag = new_end_tag c.toLowerCase()
2360                         temporary_buffer += c
2361                         tok_state = tok_state_rcdata_end_tag_name
2362                         return null
2363                 if is_lc_alpha(c)
2364                         tok_cur_tag = new_end_tag c
2365                         temporary_buffer += c
2366                         tok_state = tok_state_rcdata_end_tag_name
2367                         return null
2368                 # Anything else
2369                 tok_state = tok_state_rcdata
2370                 cur -= 1 # reconsume the input character
2371                 return new_character_token "</" # fixfull separate these
2372
2373         # http://www.w3.org/TR/html5/syntax.html#appropriate-end-tag-token
2374         is_appropriate_end_tag = (t) ->
2375                 # spec says to check against "the tag name of the last start tag to
2376                 # have been emitted from this tokenizer", but this is only called from
2377                 # the various "raw" states, which I'm pretty sure all push the start
2378                 # token onto open_els. TODO: verify this after the script data states
2379                 # are implemented
2380                 debug_log "#{t.type}, #{t.name} open_els: #{serialize_els open_els, true, true}"
2381                 return t.type is TYPE_END_TAG and t.name is open_els[0].name
2382
2383         # 8.2.4.13 http://www.w3.org/TR/html5/syntax.html#rcdata-end-tag-name-state
2384         tok_state_rcdata_end_tag_name = ->
2385                 c = txt.charAt(cur++)
2386                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2387                         if is_appropriate_end_tag tok_cur_tag
2388                                 tok_state = tok_state_before_attribute_name
2389                                 return
2390                         # else fall through to "Anything else"
2391                 if c is '/'
2392                         if is_appropriate_end_tag tok_cur_tag
2393                                 tok_state = tok_state_self_closing_start_tag # FIXME spec typo?
2394                                 return
2395                         # else fall through to "Anything else"
2396                 if c is '>'
2397                         if is_appropriate_end_tag tok_cur_tag
2398                                 tok_state = tok_state_data
2399                                 return tok_cur_tag
2400                         # else fall through to "Anything else"
2401                 if is_uc_alpha(c)
2402                         tok_cur_tag.name += c.toLowerCase()
2403                         temporary_buffer += c
2404                         return null
2405                 if is_lc_alpha(c)
2406                         tok_cur_tag.name += c
2407                         temporary_buffer += c
2408                         return null
2409                 # Anything else
2410                 tok_state = tok_state_rcdata
2411                 cur -= 1 # reconsume the input character
2412                 return new_character_token '</' + temporary_buffer # fixfull separate these
2413
2414         # 8.2.4.14 http://www.w3.org/TR/html5/syntax.html#rawtext-less-than-sign-state
2415         tok_state_rawtext_less_than_sign = ->
2416                 c = txt.charAt(cur++)
2417                 if c is '/'
2418                         temporary_buffer = ''
2419                         tok_state = tok_state_rawtext_end_tag_open
2420                         return null
2421                 # Anything else
2422                 tok_state = tok_state_rawtext
2423                 cur -= 1 # reconsume the input character
2424                 return new_character_token '<'
2425
2426         # 8.2.4.15 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-open-state
2427         tok_state_rawtext_end_tag_open = ->
2428                 c = txt.charAt(cur++)
2429                 if is_uc_alpha(c)
2430                         tok_cur_tag = new_end_tag c.toLowerCase()
2431                         temporary_buffer += c
2432                         tok_state = tok_state_rawtext_end_tag_name
2433                         return null
2434                 if is_lc_alpha(c)
2435                         tok_cur_tag = new_end_tag c
2436                         temporary_buffer += c
2437                         tok_state = tok_state_rawtext_end_tag_name
2438                         return null
2439                 # Anything else
2440                 tok_state = tok_state_rawtext
2441                 cur -= 1 # reconsume the input character
2442                 return new_character_token "</" # fixfull separate these
2443
2444         # 8.2.4.16 http://www.w3.org/TR/html5/syntax.html#rawtext-end-tag-name-state
2445         tok_state_rawtext_end_tag_name = ->
2446                 c = txt.charAt(cur++)
2447                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2448                         if is_appropriate_end_tag tok_cur_tag
2449                                 tok_state = tok_state_before_attribute_name
2450                                 return
2451                         # else fall through to "Anything else"
2452                 if c is '/'
2453                         if is_appropriate_end_tag tok_cur_tag
2454                                 tok_state = tok_state_self_closing_start_tag
2455                                 return
2456                         # else fall through to "Anything else"
2457                 if c is '>'
2458                         if is_appropriate_end_tag tok_cur_tag
2459                                 tok_state = tok_state_data
2460                                 return tok_cur_tag
2461                         # else fall through to "Anything else"
2462                 if is_uc_alpha(c)
2463                         tok_cur_tag.name += c.toLowerCase()
2464                         temporary_buffer += c
2465                         return null
2466                 if is_lc_alpha(c)
2467                         tok_cur_tag.name += c
2468                         temporary_buffer += c
2469                         return null
2470                 # Anything else
2471                 tok_state = tok_state_rawtext
2472                 cur -= 1 # reconsume the input character
2473                 return new_character_token '</' + temporary_buffer # fixfull separate these
2474
2475         # 8.2.4.17 http://www.w3.org/TR/html5/syntax.html#script-data-less-than-sign-state
2476         tok_state_script_data_less_than_sign = ->
2477                 c = txt.charAt(cur++)
2478                 if c is '/'
2479                         temporary_buffer = ''
2480                         tok_state = tok_state_script_data_end_tag_open
2481                         return
2482                 if c is '!'
2483                         tok_state = tok_state_script_data_escape_start
2484                         return new_character_token '<!' # fixfull split
2485                 # Anything else
2486                 tok_state = tok_state_script_data
2487                 cur -= 1 # Reconsume
2488                 return new_character_token '<'
2489
2490         # 8.2.4.18 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2491         tok_state_script_data_end_tag_open = ->
2492                 c = txt.charAt(cur++)
2493                 if is_uc_alpha(c)
2494                         tok_cur_tag = new_end_tag c.toLowerCase()
2495                         temporary_buffer += c
2496                         tok_state = tok_state_script_data_end_tag_name
2497                         return
2498                 if is_lc_alpha(c)
2499                         tok_cur_tag = new_end_tag c
2500                         temporary_buffer += c
2501                         tok_state = tok_state_script_data_end_tag_name
2502                         return
2503                 # Anything else
2504                 tok_state = tok_state_script_data
2505                 cur -= 1 # Reconsume
2506                 return new_character_token '</'
2507
2508         # 8.2.4.19 http://www.w3.org/TR/html5/syntax.html#script-data-end-tag-open-state
2509         tok_state_script_data_end_tag_name = ->
2510                 c = txt.charAt(cur++)
2511                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2512                         if is_appropriate_end_tag tok_cur_tag
2513                                 tok_state = tok_state_before_attribute_name
2514                                 return
2515                         # fall through
2516                 if c is '/'
2517                         if is_appropriate_end_tag tok_cur_tag
2518                                 tok_state = tok_state_self_closing_start_tag
2519                                 return
2520                         # fall through
2521                 if is_uc_alpha(c)
2522                         tok_cur_tag.name += c.toLowerCase()
2523                         temporary_buffer += c
2524                         return
2525                 if is_lc_alpha(c)
2526                         tok_cur_tag.name += c
2527                         temporary_buffer += c
2528                         return
2529                 # Anything else
2530                 tok_state = tok_state_script_data
2531                 cur -= 1 # Reconsume
2532                 return new_character_token "</#{temporary_buffer}" # fixfull split
2533
2534         # 8.2.4.20 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-state
2535         tok_state_script_data_escape_start = ->
2536                 c = txt.charAt(cur++)
2537                 if c is '-'
2538                         tok_state = tok_state_script_data_escape_start_dash
2539                         return new_character_token '-'
2540                 # Anything else
2541                 tok_state = tok_state_script_data
2542                 cur -= 1 # Reconsume
2543                 return
2544
2545         # 8.2.4.21 http://www.w3.org/TR/html5/syntax.html#script-data-escape-start-dash-state
2546         tok_state_script_data_escape_start_dash = ->
2547                 c = txt.charAt(cur++)
2548                 if c is '-'
2549                         tok_state = tok_state_script_data_escaped_dash_dash
2550                         return new_character_token '-'
2551                 # Anything else
2552                 tok_state = tok_state_script_data
2553                 cur -= 1 # Reconsume
2554                 return
2555
2556         # 8.2.4.22 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-state
2557         tok_state_script_data_escaped = ->
2558                 c = txt.charAt(cur++)
2559                 if c is '-'
2560                         tok_state = tok_state_script_data_escaped_dash
2561                         return new_character_token '-'
2562                 if c is '<'
2563                         tok_state = tok_state_script_data_escaped_less_than_sign
2564                         return
2565                 if c is "\u0000"
2566                         parse_error()
2567                         return new_character_token "\ufffd"
2568                 if c is '' # EOF
2569                         tok_state = tok_state_data
2570                         parse_error()
2571                         cur -= 1 # Reconsume
2572                         return
2573                 # Anything else
2574                 return new_character_token c
2575
2576         # 8.2.4.23 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-state
2577         tok_state_script_data_escaped_dash = ->
2578                 c = txt.charAt(cur++)
2579                 if c is '-'
2580                         tok_state = tok_state_script_data_escaped_dash_dash
2581                         return new_character_token '-'
2582                 if c is '<'
2583                         tok_state = tok_state_script_data_escaped_less_than_sign
2584                         return
2585                 if c is "\u0000"
2586                         parse_error()
2587                         tok_state = tok_state_script_data_escaped
2588                         return new_character_token "\ufffd"
2589                 if c is '' # EOF
2590                         tok_state = tok_state_data
2591                         parse_error()
2592                         cur -= 1 # Reconsume
2593                         return
2594                 # Anything else
2595                 tok_state = tok_state_script_data_escaped
2596                 return new_character_token c
2597
2598         # 8.2.4.24 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-dash-dash-state
2599         tok_state_script_data_escaped_dash_dash = ->
2600                 c = txt.charAt(cur++)
2601                 if c is '-'
2602                         return new_character_token '-'
2603                 if c is '<'
2604                         tok_state = tok_state_script_data_escaped_less_than_sign
2605                         return
2606                 if c is '>'
2607                         tok_state = tok_state_script_data
2608                         return new_character_token '>'
2609                 if c is "\u0000"
2610                         parse_error()
2611                         tok_state = tok_state_script_data_escaped
2612                         return new_character_token "\ufffd"
2613                 if c is '' # EOF
2614                         parse_error()
2615                         tok_state = tok_state_data
2616                         cur -= 1 # Reconsume
2617                         return
2618                 # Anything else
2619                 tok_state = tok_state_script_data_escaped
2620                 return new_character_token c
2621
2622         # 8.2.4.25 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-less-than-sign-state
2623         tok_state_script_data_escaped_less_than_sign = ->
2624                 c = txt.charAt(cur++)
2625                 if c is '/'
2626                         temporary_buffer = ''
2627                         tok_state = tok_state_script_data_escaped_end_tag_open
2628                         return
2629                 if is_uc_alpha(c)
2630                         temporary_buffer = c.toLowerCase() # yes, really
2631                         tok_state = tok_state_script_data_double_escape_start
2632                         return new_character_token "<#{c}" # fixfull split
2633                 if is_lc_alpha(c)
2634                         temporary_buffer = c
2635                         tok_state = tok_state_script_data_double_escape_start
2636                         return new_character_token "<#{c}" # fixfull split
2637                 # Anything else
2638                 tok_state = tok_state_script_data_escaped
2639                 cur -= 1 # Reconsume
2640                 return new_character_token c
2641
2642         # 8.2.4.26 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-open-state
2643         tok_state_script_data_escaped_end_tag_open = ->
2644                 c = txt.charAt(cur++)
2645                 if is_uc_alpha(c)
2646                         tok_cur_tag = new_end_tag c.toLowerCase()
2647                         temporary_buffer += c
2648                         tok_state = tok_state_script_data_escaped_end_tag_name
2649                         return
2650                 if is_lc_alpha(c)
2651                         tok_cur_tag = new_end_tag c
2652                         temporary_buffer += c
2653                         tok_state = tok_state_script_data_escaped_end_tag_name
2654                         return
2655                 # Anything else
2656                 tok_state = tok_state_script_data_escaped
2657                 cur -= 1 # Reconsume
2658                 return new_character_token '</' # fixfull split
2659
2660         # 8.2.4.27 http://www.w3.org/TR/html5/syntax.html#script-data-escaped-end-tag-name-state
2661         tok_state_script_data_escaped_end_tag_name = ->
2662                 c = txt.charAt(cur++)
2663                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
2664                         if is_appropriate_end_tag tok_cur_tag
2665                                 tok_state = tok_state_before_attribute_name
2666                                 return
2667                         # fall through
2668                 if c is '/'
2669                         if is_appropriate_end_tag tok_cur_tag
2670                                 tok_state = tok_state_self_closing_start_tag
2671                                 return
2672                         # fall through
2673                 if is_uc_alpha(c)
2674                         tok_cur_tag.name += c.toLowerCase()
2675                         temporary_buffer += c.toLowerCase()
2676                         return
2677                 if is_lc_alpha(c)
2678                         tok_cur_tag.name += c
2679                         temporary_buffer += c.toLowerCase()
2680                         return
2681                 # Anything else
2682                 tok_state = tok_state_script_data_escaped
2683                 cur -= 1 # Reconsume
2684                 return new_character_token "</#{temporary_buffer}" # fixfull split
2685
2686         # 8.2.4.28 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-start-state
2687         tok_state_script_data_double_escape_start = ->
2688                 c = txt.charAt(cur++)
2689                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2690                         if temporary_buffer is 'script'
2691                                 tok_state = tok_state_script_data_double_escaped
2692                         else
2693                                 tok_state = tok_state_script_data_escaped
2694                         return new_character_token c
2695                 if is_uc_alpha(c)
2696                         temporary_buffer += c.toLowerCase() # yes, really lowercase
2697                         return new_character_token c
2698                 if is_lc_alpha(c)
2699                         temporary_buffer += c
2700                         return new_character_token c
2701                 # Anything else
2702                 tok_state = tok_state_script_data_escaped
2703                 cur -= 1 # Reconsume
2704                 return
2705
2706         # 8.2.4.29 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-state
2707         tok_state_script_data_double_escaped = ->
2708                 c = txt.charAt(cur++)
2709                 if c is '-'
2710                         tok_state = tok_state_script_data_double_escaped_dash
2711                         return new_character_token '-'
2712                 if c is '<'
2713                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2714                         return new_character_token '<'
2715                 if c is "\u0000"
2716                         parse_error()
2717                         return new_character_token "\ufffd"
2718                 if c is '' # EOF
2719                         parse_error()
2720                         tok_state = tok_state_data
2721                         cur -= 1 # Reconsume
2722                         return
2723                 # Anything else
2724                 return new_character_token c
2725
2726         # 8.2.4.30 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-state
2727         tok_state_script_data_double_escaped_dash = ->
2728                 c = txt.charAt(cur++)
2729                 if c is '-'
2730                         tok_state = tok_state_script_data_double_escaped_dash_dash
2731                         return new_character_token '-'
2732                 if c is '<'
2733                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2734                         return new_character_token '<'
2735                 if c is "\u0000"
2736                         parse_error()
2737                         tok_state = tok_state_script_data_double_escaped
2738                         return new_character_token "\ufffd"
2739                 if c is '' # EOF
2740                         parse_error()
2741                         tok_state = tok_state_data
2742                         cur -= 1 # Reconsume
2743                         return
2744                 # Anything else
2745                 tok_state = tok_state_script_data_double_escaped
2746                 return new_character_token c
2747
2748         # 8.2.4.31 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-dash-dash-state
2749         tok_state_script_data_double_escaped_dash_dash = ->
2750                 c = txt.charAt(cur++)
2751                 if c is '-'
2752                         return new_character_token '-'
2753                 if c is '<'
2754                         tok_state = tok_state_script_data_double_escaped_less_than_sign
2755                         return new_character_token '<'
2756                 if c is '>'
2757                         tok_state = tok_state_script_data
2758                         return new_character_token '>'
2759                 if c is "\u0000"
2760                         parse_error()
2761                         tok_state = tok_state_script_data_double_escaped
2762                         return new_character_token "\ufffd"
2763                 if c is '' # EOF
2764                         parse_error()
2765                         tok_state = tok_state_data
2766                         cur -= 1 # Reconsume
2767                         return
2768                 # Anything else
2769                 tok_state = tok_state_script_data_double_escaped
2770                 return new_character_token c
2771
2772         # 8.2.4.32 http://www.w3.org/TR/html5/syntax.html#script-data-double-escaped-less-than-sign-state
2773         tok_state_script_data_double_escaped_less_than_sign = ->
2774                 c = txt.charAt(cur++)
2775                 if c is '/'
2776                         temporary_buffer = ''
2777                         tok_state = tok_state_script_data_double_escape_end
2778                         return new_character_token '/'
2779                 # Anything else
2780                 tok_state = tok_state_script_data_double_escaped
2781                 cur -= 1 # Reconsume
2782                 return
2783
2784         # 8.2.4.33 http://www.w3.org/TR/html5/syntax.html#script-data-double-escape-end-state
2785         tok_state_script_data_double_escape_end = ->
2786                 c = txt.charAt(cur++)
2787                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' ' or c is '/' or c is '>'
2788                         if temporary_buffer is 'script'
2789                                 tok_state = tok_state_script_data_escaped
2790                         else
2791                                 tok_state = tok_state_script_data_double_escaped
2792                         return new_character_token c
2793                 if is_uc_alpha(c)
2794                         temporary_buffer += c.toLowerCase() # yes, really lowercase
2795                         return new_character_token c
2796                 if is_lc_alpha(c)
2797                         temporary_buffer += c
2798                         return new_character_token c
2799                 # Anything else
2800                 tok_state = tok_state_script_data_double_escaped
2801                 cur -= 1 # Reconsume
2802                 return
2803
2804         # 8.2.4.34 http://www.w3.org/TR/html5/syntax.html#before-attribute-name-state
2805         tok_state_before_attribute_name = ->
2806                 attr_name = null
2807                 switch c = txt.charAt(cur++)
2808                         when "\t", "\n", "\u000c", ' '
2809                                 return null
2810                         when '/'
2811                                 tok_state = tok_state_self_closing_start_tag
2812                                 return null
2813                         when '>'
2814                                 tok_state = tok_state_data
2815                                 tmp = tok_cur_tag
2816                                 tok_cur_tag = null
2817                                 return tmp
2818                         when "\u0000"
2819                                 parse_error()
2820                                 attr_name = "\ufffd"
2821                         when '"', "'", '<', '='
2822                                 parse_error()
2823                                 attr_name = c
2824                         when '' # EOF
2825                                 parse_error()
2826                                 tok_state = tok_state_data
2827                         else
2828                                 if is_uc_alpha(c)
2829                                         attr_name = c.toLowerCase()
2830                                 else
2831                                         attr_name = c
2832                 if attr_name?
2833                         tok_cur_tag.attrs_a.unshift [attr_name, '']
2834                         tok_state = tok_state_attribute_name
2835                 return null
2836
2837         # 8.2.4.35 http://www.w3.org/TR/html5/syntax.html#attribute-name-state
2838         tok_state_attribute_name = ->
2839                 switch c = txt.charAt(cur++)
2840                         when "\t", "\n", "\u000c", ' '
2841                                 tok_state = tok_state_after_attribute_name
2842                         when '/'
2843                                 tok_state = tok_state_self_closing_start_tag
2844                         when '='
2845                                 tok_state = tok_state_before_attribute_value
2846                         when '>'
2847                                 tok_state = tok_state_data
2848                                 tmp = tok_cur_tag
2849                                 tok_cur_tag = null
2850                                 return tmp
2851                         when "\u0000"
2852                                 parse_error()
2853                                 tok_cur_tag.attrs_a[0][0] = "\ufffd"
2854                         when '"', "'", '<'
2855                                 parse_error()
2856                                 tok_cur_tag.attrs_a[0][0] = c
2857                         when '' # EOF
2858                                 parse_error()
2859                                 tok_state = tok_state_data
2860                         else
2861                                 if is_uc_alpha(c)
2862                                         tok_cur_tag.attrs_a[0][0] = c.toLowerCase()
2863                                 else
2864                                         tok_cur_tag.attrs_a[0][0] += c
2865                 return null
2866
2867         # 8.2.4.36 http://www.w3.org/TR/html5/syntax.html#after-attribute-name-state
2868         tok_state_after_attribute_name = ->
2869                 c = txt.charAt(cur++)
2870                 if c is "\t" or c is "\n" or c is "\u000c" or c is ' '
2871                         return
2872                 if c is '/'
2873                         tok_state = tok_state_self_closing_start_tag
2874                         return
2875                 if c is '='
2876                         tok_state = tok_state_before_attribute_value
2877                         return
2878                 if c is '>'
2879                         tok_state = tok_state_data
2880                         return
2881                 if is_uc_alpha(c)
2882                         tok_cur_tag.attrs_a.unshift [c.toLowerCase(), '']
2883                         tok_state = tok_state_attribute_name
2884                         return
2885                 if c is "\u0000"
2886                         parse_error()
2887                         tok_cur_tag.attrs_a.unshift ["\ufffd", '']
2888                         tok_state = tok_state_attribute_name
2889                         return
2890                 if c is '' # EOF
2891                         parse_error()
2892                         tok_state = tok_state_data
2893                         cur -= 1 # reconsume
2894                         return
2895                 if c is '"' or c is "'" or c is '<'
2896                         parse_error()
2897                         # fall through to Anything else
2898                 # Anything else
2899                 tok_cur_tag.attrs_a.unshift [c, '']
2900                 tok_state = tok_state_attribute_name
2901
2902         # 8.2.4.37 http://www.w3.org/TR/html5/syntax.html#before-attribute-value-state
2903         tok_state_before_attribute_value = ->
2904                 switch c = txt.charAt(cur++)
2905                         when "\t", "\n", "\u000c", ' '
2906                                 return null
2907                         when '"'
2908                                 tok_state = tok_state_attribute_value_double_quoted
2909                         when '&'
2910                                 tok_state = tok_state_attribute_value_unquoted
2911                                 cur -= 1
2912                         when "'"
2913                                 tok_state = tok_state_attribute_value_single_quoted
2914                         when "\u0000"
2915                                 # Parse error
2916                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2917                                 tok_state = tok_state_attribute_value_unquoted
2918                         when '>'
2919                                 # Parse error
2920                                 tok_state = tok_state_data
2921                                 tmp = tok_cur_tag
2922                                 tok_cur_tag = null
2923                                 return tmp
2924                         when '' # EOF
2925                                 parse_error()
2926                                 tok_state = tok_state_data
2927                         else
2928                                 tok_cur_tag.attrs_a[0][1] += c
2929                                 tok_state = tok_state_attribute_value_unquoted
2930                 return null
2931
2932         # 8.2.4.38 http://www.w3.org/TR/html5/syntax.html#attribute-value-(double-quoted)-state
2933         tok_state_attribute_value_double_quoted = ->
2934                 switch c = txt.charAt(cur++)
2935                         when '"'
2936                                 tok_state = tok_state_after_attribute_value_quoted
2937                         when '&'
2938                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '"', true
2939                         when "\u0000"
2940                                 # Parse error
2941                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2942                         when '' # EOF
2943                                 parse_error()
2944                                 tok_state = tok_state_data
2945                         else
2946                                 tok_cur_tag.attrs_a[0][1] += c
2947                 return null
2948
2949         # 8.2.4.39 http://www.w3.org/TR/html5/syntax.html#attribute-value-(single-quoted)-state
2950         tok_state_attribute_value_single_quoted = ->
2951                 switch c = txt.charAt(cur++)
2952                         when "'"
2953                                 tok_state = tok_state_after_attribute_value_quoted
2954                         when '&'
2955                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference "'", true
2956                         when "\u0000"
2957                                 # Parse error
2958                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2959                         when '' # EOF
2960                                 parse_error()
2961                                 tok_state = tok_state_data
2962                         else
2963                                 tok_cur_tag.attrs_a[0][1] += c
2964                 return null
2965
2966         # 8.2.4.40 http://www.w3.org/TR/html5/syntax.html#attribute-value-(unquoted)-state
2967         tok_state_attribute_value_unquoted = ->
2968                 switch c = txt.charAt(cur++)
2969                         when "\t", "\n", "\u000c", ' '
2970                                 tok_state = tok_state_before_attribute_name
2971                         when '&'
2972                                 tok_cur_tag.attrs_a[0][1] += parse_character_reference '>', true
2973                         when '>'
2974                                 tok_state = tok_state_data
2975                                 tmp = tok_cur_tag
2976                                 tok_cur_tag = null
2977                                 return tmp
2978                         when "\u0000"
2979                                 tok_cur_tag.attrs_a[0][1] += "\ufffd"
2980                         when '' # EOF
2981                                 parse_error()
2982                                 tok_state = tok_state_data
2983                         else
2984                                 # Parse Error if ', <, = or ` (backtick)
2985                                 tok_cur_tag.attrs_a[0][1] += c
2986                 return null
2987
2988         # 8.2.4.42 http://www.w3.org/TR/html5/syntax.html#after-attribute-value-(quoted)-state
2989         tok_state_after_attribute_value_quoted = ->
2990                 switch c = txt.charAt(cur++)
2991                         when "\t", "\n", "\u000c", ' '
2992                                 tok_state = tok_state_before_attribute_name
2993                         when '/'
2994                                 tok_state = tok_state_self_closing_start_tag
2995                         when '>'
2996                                 tok_state = tok_state_data
2997                                 tmp = tok_cur_tag
2998                                 tok_cur_tag = null
2999                                 return tmp
3000                         when '' # EOF
3001                                 parse_error()
3002                                 tok_state = tok_state_data
3003                         else
3004                                 # Parse Error
3005                                 tok_state = tok_state_before_attribute_name
3006                                 cur -= 1 # we didn't handle that char
3007                 return null
3008
3009         # 8.2.4.43 http://www.w3.org/TR/html5/syntax.html#self-closing-start-tag-state
3010         tok_state_self_closing_start_tag = ->
3011                 c = txt.charAt(cur++)
3012                 if c is '>'
3013                         tok_cur_tag.flag 'self-closing'
3014                         tok_state = tok_state_data
3015                         return tok_cur_tag
3016                 if c is ''
3017                         parse_error()
3018                         tok_state = tok_state_data
3019                         cur -= 1 # Reconsume
3020                         return
3021                 # Anything else
3022                 parse_error()
3023                 tok_state = tok_state_before_attribute_name
3024                 cur -= 1 # Reconsume
3025                 return
3026
3027         # 8.2.4.44 http://www.w3.org/TR/html5/syntax.html#bogus-comment-state
3028         # WARNING: put a comment token in tok_cur_tag before setting this state
3029         tok_state_bogus_comment = ->
3030                 next_gt = txt.indexOf '>', cur
3031                 if next_gt is -1
3032                         val = txt.substr cur
3033                         cur = txt.length
3034                 else
3035                         val = txt.substr cur, (next_gt - cur)
3036                         cur = next_gt + 1
3037                 val = val.replace "\u0000", "\ufffd"
3038                 tok_cur_tag.text += val
3039                 tok_state = tok_state_data
3040                 return tok_cur_tag
3041
3042         # 8.2.4.45 http://www.w3.org/TR/html5/syntax.html#markup-declaration-open-state
3043         tok_state_markup_declaration_open = ->
3044                 if txt.substr(cur, 2) is '--'
3045                         cur += 2
3046                         tok_cur_tag = new_comment_token ''
3047                         tok_state = tok_state_comment_start
3048                         return
3049                 if txt.substr(cur, 7).toLowerCase() is 'doctype'
3050                         cur += 7
3051                         tok_state = tok_state_doctype
3052                         return
3053                 acn = adjusted_current_node()
3054                 if acn and acn.namespace isnt NS_HTML and txt.substr(cur, 7) is '[CDATA['
3055                         cur += 7
3056                         tok_state = tok_state_cdata_section
3057                         return
3058                 # Otherwise
3059                 parse_error()
3060                 tok_cur_tag = new_comment_token '!' # TODO test ("!" right?)
3061                 tok_state = tok_state_bogus_comment
3062                 return
3063
3064         # 8.2.4.46 http://www.w3.org/TR/html5/syntax.html#comment-start-state
3065         tok_state_comment_start = ->
3066                 switch c = txt.charAt(cur++)
3067                         when '-'
3068                                 tok_state = tok_state_comment_start_dash
3069                         when "\u0000"
3070                                 parse_error()
3071                                 return new_character_token "\ufffd"
3072                         when '>'
3073                                 parse_error()
3074                                 tok_state = tok_state_data
3075                                 return tok_cur_tag
3076                         when '' # EOF
3077                                 parse_error()
3078                                 tok_state = tok_state_data
3079                                 cur -= 1 # Reconsume
3080                                 return tok_cur_tag
3081                         else
3082                                 tok_cur_tag.text += c
3083                 return null
3084
3085         # 8.2.4.47 http://www.w3.org/TR/html5/syntax.html#comment-start-dash-state
3086         tok_state_comment_start_dash = ->
3087                 switch c = txt.charAt(cur++)
3088                         when '-'
3089                                 tok_state = tok_state_comment_end
3090                         when "\u0000"
3091                                 parse_error()
3092                                 tok_cur_tag.text += "-\ufffd"
3093                                 tok_state = tok_state_comment
3094                         when '>'
3095                                 parse_error()
3096                                 tok_state = tok_state_data
3097                                 return tok_cur_tag
3098                         when '' # EOF
3099                                 parse_error()
3100                                 tok_state = tok_state_data
3101                                 cur -= 1 # Reconsume
3102                                 return tok_cur_tag
3103                         else
3104                                 tok_cur_tag.text += "-#{c}"
3105                                 tok_state = tok_state_comment
3106                 return null
3107
3108         # 8.2.4.48 http://www.w3.org/TR/html5/syntax.html#comment-state
3109         tok_state_comment = ->
3110                 switch c = txt.charAt(cur++)
3111                         when '-'
3112                                 tok_state = tok_state_comment_end_dash
3113                         when "\u0000"
3114                                 parse_error()
3115                                 tok_cur_tag.text += "\ufffd"
3116                         when '' # EOF
3117                                 parse_error()
3118                                 tok_state = tok_state_data
3119                                 cur -= 1 # Reconsume
3120                                 return tok_cur_tag
3121                         else
3122                                 tok_cur_tag.text += c
3123                 return null
3124
3125         # 8.2.4.49 http://www.w3.org/TR/html5/syntax.html#comment-end-dash-state
3126         tok_state_comment_end_dash = ->
3127                 switch c = txt.charAt(cur++)
3128                         when '-'
3129                                 tok_state = tok_state_comment_end
3130                         when "\u0000"
3131                                 parse_error()
3132                                 tok_cur_tag.text += "-\ufffd"
3133                                 tok_state = tok_state_comment
3134                         when '' # EOF
3135                                 parse_error()
3136                                 tok_state = tok_state_data
3137                                 cur -= 1 # Reconsume
3138                                 return tok_cur_tag
3139                         else
3140                                 tok_cur_tag.text += "-#{c}"
3141                                 tok_state = tok_state_comment
3142                 return null
3143
3144         # 8.2.4.50 http://www.w3.org/TR/html5/syntax.html#comment-end-state
3145         tok_state_comment_end = ->
3146                 switch c = txt.charAt(cur++)
3147                         when '>'
3148                                 tok_state = tok_state_data
3149                                 return tok_cur_tag
3150                         when "\u0000"
3151                                 parse_error()
3152                                 tok_cur_tag.text += "--\ufffd"
3153                                 tok_state = tok_state_comment
3154                         when '!'
3155                                 parse_error()
3156                                 tok_state = tok_state_comment_end_bang
3157                         when '-'
3158                                 parse_error()
3159                                 tok_cur_tag.text += '-'
3160                         when '' # EOF
3161                                 parse_error()
3162                                 tok_state = tok_state_data
3163                                 cur -= 1 # Reconsume
3164                                 return tok_cur_tag
3165                         else
3166                                 parse_error()
3167                                 tok_cur_tag.text += "--#{c}"
3168                                 tok_state = tok_state_comment
3169                 return null
3170
3171         # 8.2.4.51 http://www.w3.org/TR/html5/syntax.html#comment-end-bang-state
3172         tok_state_comment_end_bang = ->
3173                 switch c = txt.charAt(cur++)
3174                         when '-'
3175                                 tok_cur_tag.text += "--!#{c}"
3176                                 tok_state = tok_state_comment_end_dash
3177                         when '>'
3178                                 tok_state = tok_state_data
3179                                 return tok_cur_tag
3180                         when "\u0000"
3181                                 parse_error()
3182                                 tok_cur_tag.text += "--!\ufffd"
3183                                 tok_state = tok_state_comment
3184                         when '' # EOF
3185                                 parse_error()
3186                                 tok_state = tok_state_data
3187                                 cur -= 1 # Reconsume
3188                                 return tok_cur_tag
3189                         else
3190                                 tok_cur_tag.text += "--!#{c}"
3191                                 tok_state = tok_state_comment
3192                 return null
3193
3194         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3195         tok_state_doctype = ->
3196                 switch c = txt.charAt(cur++)
3197                         when "\t", "\u000a", "\u000c", ' '
3198                                 tok_state = tok_state_before_doctype_name
3199                         when '' # EOF
3200                                 parse_error()
3201                                 tok_state = tok_state_data
3202                                 el = new_doctype_token ''
3203                                 el.flag 'force-quirks', true
3204                                 cur -= 1 # Reconsume
3205                                 return el
3206                         else
3207                                 parse_error()
3208                                 tok_state = tok_state_before_doctype_name
3209                                 cur -= 1 # Reconsume
3210                 return null
3211
3212         # 8.2.4.52 http://www.w3.org/TR/html5/syntax.html#doctype-state
3213         tok_state_before_doctype_name = ->
3214                 c = txt.charAt(cur++)
3215                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3216                         return
3217                 if is_uc_alpha(c)
3218                         tok_cur_tag = new_doctype_token c.toLowerCase()
3219                         tok_state = tok_state_doctype_name
3220                         return
3221                 if c is "\u0000"
3222                         parse_error()
3223                         tok_cur_tag = new_doctype_token "\ufffd"
3224                         tok_state = tok_state_doctype_name
3225                         return
3226                 if c is '>'
3227                         parse_error()
3228                         el = new_doctype_token ''
3229                         el.flag 'force-quirks', true
3230                         tok_state = tok_state_data
3231                         return el
3232                 if c is '' # EOF
3233                         parse_error()
3234                         tok_state = tok_state_data
3235                         el = new_doctype_token ''
3236                         el.flag 'force-quirks', true
3237                         cur -= 1 # Reconsume
3238                         return el
3239                 # Anything else
3240                 tok_cur_tag = new_doctype_token c
3241                 tok_state = tok_state_doctype_name
3242                 return null
3243
3244         # 8.2.4.54 http://www.w3.org/TR/html5/syntax.html#doctype-name-state
3245         tok_state_doctype_name = ->
3246                 c = txt.charAt(cur++)
3247                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3248                         tok_state = tok_state_after_doctype_name
3249                         return
3250                 if c is '>'
3251                         tok_state = tok_state_data
3252                         return tok_cur_tag
3253                 if is_uc_alpha(c)
3254                         tok_cur_tag.name += c.toLowerCase()
3255                         return
3256                 if c is "\u0000"
3257                         parse_error()
3258                         tok_cur_tag.name += "\ufffd"
3259                         return
3260                 if c is '' # EOF
3261                         parse_error()
3262                         tok_state = tok_state_data
3263                         tok_cur_tag.flag 'force-quirks', true
3264                         cur -= 1 # Reconsume
3265                         return tok_cur_tag
3266                 # Anything else
3267                 tok_cur_tag.name += c
3268                 return null
3269
3270         # 8.2.4.55 http://www.w3.org/TR/html5/syntax.html#after-doctype-name-state
3271         tok_state_after_doctype_name = ->
3272                 c = txt.charAt(cur++)
3273                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3274                         return
3275                 if c is '>'
3276                         tok_state = tok_state_data
3277                         return tok_cur_tag
3278                 if c is '' # EOF
3279                         parse_error()
3280                         tok_state = tok_state_data
3281                         tok_cur_tag.flag 'force-quirks', true
3282                         cur -= 1 # Reconsume
3283                         return tok_cur_tag
3284                 # Anything else
3285                 if txt.substr(cur - 1, 6).toLowerCase() is 'public'
3286                         cur += 5
3287                         tok_state = tok_state_after_doctype_public_keyword
3288                         return
3289                 if txt.substr(cur - 1, 6).toLowerCase() is 'system'
3290                         cur += 5
3291                         tok_state = tok_state_after_doctype_system_keyword
3292                         return
3293                 parse_error()
3294                 tok_cur_tag.flag 'force-quirks', true
3295                 tok_state = tok_state_bogus_doctype
3296                 return null
3297
3298         # 8.2.4.56 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-keyword-state
3299         tok_state_after_doctype_public_keyword = ->
3300                 c = txt.charAt(cur++)
3301                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3302                         tok_state = tok_state_before_doctype_public_identifier
3303                         return
3304                 if c is '"'
3305                         parse_error()
3306                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3307                         tok_state = tok_state_doctype_public_identifier_double_quoted
3308                         return
3309                 if c is "'"
3310                         parse_error()
3311                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3312                         tok_state = tok_state_doctype_public_identifier_single_quoted
3313                         return
3314                 if c is '>'
3315                         parse_error()
3316                         tok_cur_tag.flag 'force-quirks', true
3317                         tok_state = tok_state_data
3318                         return tok_cur_tag
3319                 if c is '' # EOF
3320                         parse_error()
3321                         tok_state = tok_state_data
3322                         tok_cur_tag.flag 'force-quirks', true
3323                         cur -= 1 # Reconsume
3324                         return tok_cur_tag
3325                 # Anything else
3326                 parse_error()
3327                 tok_cur_tag.flag 'force-quirks', true
3328                 tok_state = tok_state_bogus_doctype
3329                 return null
3330
3331         # 8.2.4.57 http://www.w3.org/TR/html5/syntax.html#before-doctype-public-identifier-state
3332         tok_state_before_doctype_public_identifier = ->
3333                 c = txt.charAt(cur++)
3334                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3335                         return
3336                 if c is '"'
3337                         parse_error()
3338                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3339                         tok_state = tok_state_doctype_public_identifier_double_quoted
3340                         return
3341                 if c is "'"
3342                         parse_error()
3343                         tok_cur_tag.public_identifier = '' # FIXME should this go in @attrs or @text?
3344                         tok_state = tok_state_doctype_public_identifier_single_quoted
3345                         return
3346                 if c is '>'
3347                         parse_error()
3348                         tok_cur_tag.flag 'force-quirks', true
3349                         tok_state = tok_state_data
3350                         return tok_cur_tag
3351                 if c is '' # EOF
3352                         parse_error()
3353                         tok_state = tok_state_data
3354                         tok_cur_tag.flag 'force-quirks', true
3355                         cur -= 1 # Reconsume
3356                         return tok_cur_tag
3357                 # Anything else
3358                 parse_error()
3359                 tok_cur_tag.flag 'force-quirks', true
3360                 tok_state = tok_state_bogus_doctype
3361                 return null
3362
3363
3364         # 8.2.4.58 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(double-quoted)-state
3365         tok_state_doctype_public_identifier_double_quoted = ->
3366                 c = txt.charAt(cur++)
3367                 if c is '"'
3368                         tok_state = tok_state_after_doctype_public_identifier
3369                         return
3370                 if c is "\u0000"
3371                         parse_error()
3372                         tok_cur_tag.public_identifier += "\ufffd"
3373                         return
3374                 if c is '>'
3375                         parse_error()
3376                         tok_cur_tag.flag 'force-quirks', true
3377                         tok_state = tok_state_data
3378                         return tok_cur_tag
3379                 if c is '' # EOF
3380                         parse_error()
3381                         tok_state = tok_state_data
3382                         tok_cur_tag.flag 'force-quirks', true
3383                         cur -= 1 # Reconsume
3384                         return tok_cur_tag
3385                 # Anything else
3386                 tok_cur_tag.public_identifier += c
3387                 return null
3388
3389         # 8.2.4.59 http://www.w3.org/TR/html5/syntax.html#doctype-public-identifier-(single-quoted)-state
3390         tok_state_doctype_public_identifier_single_quoted = ->
3391                 c = txt.charAt(cur++)
3392                 if c is "'"
3393                         tok_state = tok_state_after_doctype_public_identifier
3394                         return
3395                 if c is "\u0000"
3396                         parse_error()
3397                         tok_cur_tag.public_identifier += "\ufffd"
3398                         return
3399                 if c is '>'
3400                         parse_error()
3401                         tok_cur_tag.flag 'force-quirks', true
3402                         tok_state = tok_state_data
3403                         return tok_cur_tag
3404                 if c is '' # EOF
3405                         parse_error()
3406                         tok_state = tok_state_data
3407                         tok_cur_tag.flag 'force-quirks', true
3408                         cur -= 1 # Reconsume
3409                         return tok_cur_tag
3410                 # Anything else
3411                 tok_cur_tag.public_identifier += c
3412                 return null
3413
3414         # 8.2.4.60 http://www.w3.org/TR/html5/syntax.html#after-doctype-public-identifier-state
3415         tok_state_after_doctype_public_identifier = ->
3416                 c = txt.charAt(cur++)
3417                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3418                         tok_state = tok_state_between_doctype_public_and_system_identifiers
3419                         return
3420                 if c is '>'
3421                         tok_state = tok_state_data
3422                         return tok_cur_tag
3423                 if c is '"'
3424                         parse_error()
3425                         tok_cur_tag.system_identifier = ''
3426                         tok_state = tok_state_doctype_system_identifier_double_quoted
3427                         return
3428                 if c is "'"
3429                         parse_error()
3430                         tok_cur_tag.system_identifier = ''
3431                         tok_state = tok_state_doctype_system_identifier_single_quoted
3432                         return
3433                 if c is '' # EOF
3434                         parse_error()
3435                         tok_state = tok_state_data
3436                         tok_cur_tag.flag 'force-quirks', true
3437                         cur -= 1 # Reconsume
3438                         return tok_cur_tag
3439                 # Anything else
3440                 parse_error()
3441                 tok_cur_tag.flag 'force-quirks', true
3442                 tok_state = tok_state_bogus_doctype
3443                 return null
3444
3445         # 8.2.4.61 http://www.w3.org/TR/html5/syntax.html#between-doctype-public-and-system-identifiers-state
3446         tok_state_between_doctype_public_and_system_identifiers = ->
3447                 c = txt.charAt(cur++)
3448                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3449                         return
3450                 if c is '>'
3451                         tok_state = tok_state_data
3452                         return tok_cur_tag
3453                 if c is '"'
3454                         parse_error()
3455                         tok_cur_tag.system_identifier = ''
3456                         tok_state = tok_state_doctype_system_identifier_double_quoted
3457                         return
3458                 if c is "'"
3459                         parse_error()
3460                         tok_cur_tag.system_identifier = ''
3461                         tok_state = tok_state_doctype_system_identifier_single_quoted
3462                         return
3463                 if c is '' # EOF
3464                         parse_error()
3465                         tok_state = tok_state_data
3466                         tok_cur_tag.flag 'force-quirks', true
3467                         cur -= 1 # Reconsume
3468                         return tok_cur_tag
3469                 # Anything else
3470                 parse_error()
3471                 tok_cur_tag.flag 'force-quirks', true
3472                 tok_state = tok_state_bogus_doctype
3473                 return null
3474
3475         # 8.2.4.62 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-keyword-state
3476         tok_state_after_doctype_system_keyword = ->
3477                 c = txt.charAt(cur++)
3478                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3479                         tok_state = tok_state_before_doctype_system_identifier
3480                         return
3481                 if c is '"'
3482                         parse_error()
3483                         tok_cur_tag.system_identifier = ''
3484                         tok_state = tok_state_doctype_system_identifier_double_quoted
3485                         return
3486                 if c is "'"
3487                         parse_error()
3488                         tok_cur_tag.system_identifier = ''
3489                         tok_state = tok_state_doctype_system_identifier_single_quoted
3490                         return
3491                 if c is '>'
3492                         parse_error()
3493                         tok_cur_tag.flag 'force-quirks', true
3494                         tok_state = tok_state_data
3495                         return tok_cur_tag
3496                 if c is '' # EOF
3497                         parse_error()
3498                         tok_state = tok_state_data
3499                         tok_cur_tag.flag 'force-quirks', true
3500                         cur -= 1 # Reconsume
3501                         return tok_cur_tag
3502                 # Anything else
3503                 parse_error()
3504                 tok_cur_tag.flag 'force-quirks', true
3505                 tok_state = tok_state_bogus_doctype
3506                 return null
3507
3508         # 8.2.4.63 http://www.w3.org/TR/html5/syntax.html#before-doctype-system-identifier-state
3509         tok_state_before_doctype_system_identifier = ->
3510                 c = txt.charAt(cur++)
3511                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3512                         return
3513                 if c is '"'
3514                         tok_cur_tag.system_identifier = ''
3515                         tok_state = tok_state_doctype_system_identifier_double_quoted
3516                         return
3517                 if c is "'"
3518                         tok_cur_tag.system_identifier = ''
3519                         tok_state = tok_state_doctype_system_identifier_single_quoted
3520                         return
3521                 if c is '>'
3522                         parse_error()
3523                         tok_cur_tag.flag 'force-quirks', true
3524                         tok_state = tok_state_data
3525                         return tok_cur_tag
3526                 if c is '' # EOF
3527                         parse_error()
3528                         tok_state = tok_state_data
3529                         tok_cur_tag.flag 'force-quirks', true
3530                         cur -= 1 # Reconsume
3531                         return tok_cur_tag
3532                 # Anything else
3533                 parse_error()
3534                 tok_cur_tag.flag 'force-quirks', true
3535                 tok_state = tok_state_bogus_doctype
3536                 return null
3537
3538         # 8.2.4.64 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(double-quoted)-state
3539         tok_state_doctype_system_identifier_double_quoted = ->
3540                 c = txt.charAt(cur++)
3541                 if c is '"'
3542                         tok_state = tok_state_after_doctype_system_identifier
3543                         return
3544                 if c is "\u0000"
3545                         parse_error()
3546                         tok_cur_tag.system_identifier += "\ufffd"
3547                         return
3548                 if c is '>'
3549                         parse_error()
3550                         tok_cur_tag.flag 'force-quirks', true
3551                         tok_state = tok_state_data
3552                         return tok_cur_tag
3553                 if c is '' # EOF
3554                         parse_error()
3555                         tok_state = tok_state_data
3556                         tok_cur_tag.flag 'force-quirks', true
3557                         cur -= 1 # Reconsume
3558                         return tok_cur_tag
3559                 # Anything else
3560                 tok_cur_tag.system_identifier += c
3561                 return null
3562
3563         # 8.2.4.65 http://www.w3.org/TR/html5/syntax.html#doctype-system-identifier-(single-quoted)-state
3564         tok_state_doctype_system_identifier_single_quoted = ->
3565                 c = txt.charAt(cur++)
3566                 if c is "'"
3567                         tok_state = tok_state_after_doctype_system_identifier
3568                         return
3569                 if c is "\u0000"
3570                         parse_error()
3571                         tok_cur_tag.system_identifier += "\ufffd"
3572                         return
3573                 if c is '>'
3574                         parse_error()
3575                         tok_cur_tag.flag 'force-quirks', true
3576                         tok_state = tok_state_data
3577                         return tok_cur_tag
3578                 if c is '' # EOF
3579                         parse_error()
3580                         tok_state = tok_state_data
3581                         tok_cur_tag.flag 'force-quirks', true
3582                         cur -= 1 # Reconsume
3583                         return tok_cur_tag
3584                 # Anything else
3585                 tok_cur_tag.system_identifier += c
3586                 return null
3587
3588         # 8.2.4.66 http://www.w3.org/TR/html5/syntax.html#after-doctype-system-identifier-state
3589         tok_state_after_doctype_system_identifier = ->
3590                 c = txt.charAt(cur++)
3591                 if c is "\t" or c is "\u000a" or c is "\u000c" or c is ' '
3592                         return
3593                 if c is '>'
3594                         tok_state = tok_state_data
3595                         return tok_cur_tag
3596                 if c is '' # EOF
3597                         parse_error()
3598                         tok_state = tok_state_data
3599                         tok_cur_tag.flag 'force-quirks', true
3600                         cur -= 1 # Reconsume
3601                         return tok_cur_tag
3602                 # Anything else
3603                 parse_error()
3604                 # do _not_ tok_cur_tag.flag 'force-quirks', true
3605                 tok_state = tok_state_bogus_doctype
3606                 return null
3607
3608         # 8.2.4.67 http://www.w3.org/TR/html5/syntax.html#bogus-doctype-state
3609         tok_state_bogus_doctype = ->
3610                 c = txt.charAt(cur++)
3611                 if c is '>'
3612                         tok_state = tok_state_data
3613                         return tok_cur_tag
3614                 if c is '' # EOF
3615                         tok_state = tok_state_data
3616                         cur -= 1 # Reconsume
3617                         return tok_cur_tag
3618                 # Anything else
3619                 return null
3620
3621
3622         # 8.2.4.69 http://www.w3.org/TR/html5/syntax.html#consume-a-character-reference
3623         # Don't set this as a state, just call it
3624         # returns a string (NOT a text node)
3625         parse_character_reference = (allowed_char = null, in_attr = false) ->
3626                 if cur >= txt.length
3627                         return '&'
3628                 switch c = txt.charAt(cur)
3629                         when "\t", "\n", "\u000c", ' ', '<', '&', '', allowed_char
3630                                 # explicitly not a parse error
3631                                 return '&'
3632                         when ';'
3633                                 # there has to be "one or more" alnums between & and ; to be a parse error
3634                                 return '&'
3635                         when '#'
3636                                 if cur + 1 >= txt.length
3637                                         return '&'
3638                                 if txt.charAt(cur + 1).toLowerCase() is 'x'
3639                                         prefix = '#x'
3640                                         charset = hex_chars
3641                                         start = cur + 2
3642                                 else
3643                                         charset = digits
3644                                         start = cur + 1
3645                                         prefix = '#'
3646                                 i = 0
3647                                 while start + i < txt.length and charset.indexOf(txt.charAt(start + i)) > -1
3648                                         i += 1
3649                                 if i is 0
3650                                         return '&'
3651                                 if txt.charAt(start + i) is ';'
3652                                         i += 1
3653                                 # FIXME This is supposed to generate parse errors for some chars
3654                                 decoded = decode_named_char_ref(prefix + txt.substr(start, i).toLowerCase())
3655                                 if decoded?
3656                                         cur = start + i
3657                                         return decoded
3658                                 return '&'
3659                         else
3660                                 for i in [0...31]
3661                                         if alnum.indexOf(txt.charAt(cur + i)) is -1
3662                                                 break
3663                                 if i is 0
3664                                         # exit early, because parse_error() below needs at least one alnum
3665                                         return '&'
3666                                 if txt.charAt(cur + i) is ';'
3667                                         i += 1 # include ';' terminator in value
3668                                         decoded = decode_named_char_ref txt.substr(cur, i)
3669                                         if decoded?
3670                                                 cur += i
3671                                                 return decoded
3672                                         parse_error()
3673                                         return '&'
3674                                 else
3675                                         # no ';' terminator (only legacy char refs)
3676                                         max = i
3677                                         for i in [2..max] # no prefix matches, so ok to check shortest first
3678                                                 c = legacy_char_refs[txt.substr(cur, i)]
3679                                                 if c?
3680                                                         if in_attr
3681                                                                 if txt.charAt(cur + i) is '='
3682                                                                         # "because some legacy user agents will
3683                                                                         # misinterpret the markup in those cases"
3684                                                                         parse_error()
3685                                                                         return '&'
3686                                                                 if alnum.indexOf(txt.charAt(cur + i)) > -1
3687                                                                         # this makes attributes forgiving about url args
3688                                                                         return '&'
3689                                                         # ok, and besides the weird exceptions for attributes...
3690                                                         # return the matching char
3691                                                         cur += i # consume entity chars
3692                                                         parse_error() # because no terminating ";"
3693                                                         return c
3694                                         parse_error()
3695                                         return '&'
3696                 return # never reached
3697
3698         # tree constructor initialization
3699         # see comments on TYPE_TAG/etc for the structure of this data
3700         doc = new Node TYPE_TAG, name: 'html', namespace: NS_HTML
3701         open_els = []
3702         afe = [] # active formatting elements
3703         template_insertion_modes = []
3704         insertion_mode = ins_mode_initial
3705         original_insertion_mode = insertion_mode # TODO check spec
3706         flag_scripting = true # TODO might need an extra flag to get <noscript> to parse correctly
3707         flag_frameset_ok = true
3708         flag_parsing = true
3709         flag_foster_parenting = false
3710         form_element_pointer = null
3711         temporary_buffer = null
3712         pending_table_character_tokens = []
3713         head_element_pointer = null
3714         flag_fragment_parsing = false # parser originally created as part of the html fragment parsing algorithm (fragment case)
3715         context_element = null # FIXME initialize from args.fragment http://www.w3.org/TR/html5/syntax.html#parsing-html-fragments
3716
3717         # tokenizer initialization
3718         tok_state = tok_state_data
3719
3720         # proccess input
3721         while flag_parsing
3722                 t = tok_state()
3723                 if t?
3724                         insertion_mode t
3725                         # fixfull parse error if has self-closing flag, but it wasn't acknolwedged
3726         return doc.children
3727
3728 serialize_els = (els, shallow, show_ids) ->
3729         serialized = ''
3730         sep = ''
3731         for t in els
3732                 serialized += sep
3733                 sep = ','
3734                 serialized += t.serialize shallow, show_ids
3735         return serialized
3736
3737 # TODO export TYPE_*
3738 module.exports.parse_html = parse_html
3739 module.exports.debug_log_reset = debug_log_reset
3740 module.exports.debug_log_each = debug_log_each
3741 module.exports.TYPE_TAG = TYPE_TAG
3742 module.exports.TYPE_TEXT = TYPE_TEXT
3743 module.exports.TYPE_COMMENT = TYPE_COMMENT
3744 module.exports.TYPE_DOCTYPE = TYPE_DOCTYPE